mod_pottymouth: update bloom_gen_server to support arbitrary list sizes. tested up to 87M terms. update README.
This commit is contained in:
		
							parent
							
								
									bb8802a70a
								
							
						
					
					
						commit
						1b86f7ccaa
					
				
							
								
								
									
										61
									
								
								mod_pottymouth/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								mod_pottymouth/README.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,61 @@ | |||||||
|  | # mod_pottymouth | ||||||
|  | 
 | ||||||
|  | The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit' | ||||||
|  | which has disappeared from the net. It allows individual whole words of a | ||||||
|  | message to be filtered against a blacklist. It allows multiple blacklists | ||||||
|  | sharded by language. To make use of this module the client must add the xml:lang | ||||||
|  | attribute to the message xml. | ||||||
|  | 
 | ||||||
|  | #### Installation | ||||||
|  | 
 | ||||||
|  | On Ubuntu: | ||||||
|  | ```` | ||||||
|  | cd ~/.ejabberd-modules/sources | ||||||
|  | clone the git repo | ||||||
|  | cd mod_pottymouth | ||||||
|  | ejabberdctl module_install mod_pottymouth | ||||||
|  | ejabberdctl restart | ||||||
|  | ```` | ||||||
|  | 
 | ||||||
|  | module will be installed in: ~/.ejabberd-modules/mod_pottymouth | ||||||
|  | 
 | ||||||
|  | #### Config | ||||||
|  | 
 | ||||||
|  | The file format is as follows: | ||||||
|  | 
 | ||||||
|  | ```` | ||||||
|  | modules: | ||||||
|  |     mod_pottymouth: | ||||||
|  |         blacklists: | ||||||
|  |             default: /home/your_user/blacklist_en.txt | ||||||
|  |             en: /home/your_user/blacklist_en.txt | ||||||
|  |             cn: /home/your_user/blacklist_cn.txt | ||||||
|  |             fr: /home/your_user/blacklist_fr.txt | ||||||
|  | ```` | ||||||
|  | 
 | ||||||
|  | For each language (en,cn,fr,...whatever) provide a full path to a blacklist file. | ||||||
|  | The blacklist file is a plain text file with blacklisted words listed one per | ||||||
|  | line. | ||||||
|  | 
 | ||||||
|  | #### Gotchas | ||||||
|  | 
 | ||||||
|  | The language will be looked up by whatever value is passed in the xml:lang | ||||||
|  | attribute of the xml message. So, any xml:lang value to be supported will need | ||||||
|  | a corresponding entry/blacklist in the config file. If xml:lang is missing, | ||||||
|  | the 'default' entry in config will be used. | ||||||
|  | 
 | ||||||
|  | For xml:lang attribute docs, see: | ||||||
|  |     [http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message](http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message) | ||||||
|  | 
 | ||||||
|  | The internal bloomfilter used to ingest the blacklists currently requires about | ||||||
|  | 4,000 entries in the blacklist to ensure acceptable error probability. (We've | ||||||
|  | gotten around this by duplicating entries in a short list) | ||||||
|  | 
 | ||||||
|  | #### Todo | ||||||
|  | 
 | ||||||
|  | Look into acceptable error probabilities for shorter blacklists. | ||||||
|  | 
 | ||||||
|  | #### Tip of the hat | ||||||
|  | 
 | ||||||
|  | This mod makes use of the excellent 'etbloom' module: | ||||||
|  |     [https://github.com/erlangtoolbox/etbloom](https://github.com/erlangtoolbox/etbloom) | ||||||
| @ -18,24 +18,16 @@ serverName(Lang) -> | |||||||
| member({Lang, Word} = _MessageToken) -> | member({Lang, Word} = _MessageToken) -> | ||||||
|   gen_server:call(serverName(Lang), {member, Word}). |   gen_server:call(serverName(Lang), {member, Word}). | ||||||
| 
 | 
 | ||||||
| loadWordList(BlacklistFile) -> |  | ||||||
|   BlacklistExists = filelib:is_file(BlacklistFile), |  | ||||||
|   if |  | ||||||
|     BlacklistExists -> |  | ||||||
|       {ok, S} = file:read_file(BlacklistFile); |  | ||||||
|     true -> |  | ||||||
|       ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]), |  | ||||||
|       S = <<>> |  | ||||||
|   end, |  | ||||||
|   WordList = string:tokens(binary_to_list(S), "\n"), |  | ||||||
|   WordList. |  | ||||||
| 
 |  | ||||||
| start({Lang, BlacklistFile} = _Opts) -> | start({Lang, BlacklistFile} = _Opts) -> | ||||||
|   gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []). |   gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []). | ||||||
| 
 | 
 | ||||||
|  | stop() -> | ||||||
|  |   ok. | ||||||
|  | 
 | ||||||
| init([BlacklistFile]) -> | init([BlacklistFile]) -> | ||||||
|   WordList = loadWordList(BlacklistFile), |   ?INFO_MSG("Building bloom", []), | ||||||
|   {ok, etbloom:bloom(WordList)}. |   Bloom = etbloom:sbf(10000000), | ||||||
|  |   {ok, loadWordList(Bloom, BlacklistFile)}. | ||||||
| 
 | 
 | ||||||
| handle_call({member, Word}, _From, Bloom) -> | handle_call({member, Word}, _From, Bloom) -> | ||||||
|   Reply = etbloom:member(Word, Bloom), |   Reply = etbloom:member(Word, Bloom), | ||||||
| @ -45,3 +37,19 @@ handle_cast(_Msg, State) -> {noreply, State}. | |||||||
| handle_info(_Info, State) -> {noreply, State}. | handle_info(_Info, State) -> {noreply, State}. | ||||||
| terminate(_Reason, _State) -> ok. | terminate(_Reason, _State) -> ok. | ||||||
| code_change(_OldVsn, State, _Extra) -> {ok, State}. | code_change(_OldVsn, State, _Extra) -> {ok, State}. | ||||||
|  | 
 | ||||||
%% loadWordList(Bloom, BlacklistFile) -> Bloom1
%%
%% Adds every line of BlacklistFile (one word per line) to Bloom and returns
%% the updated filter. The file is read line by line so the whole list never
%% has to be held in memory at once.
%%
%% If the file cannot be opened, the problem is logged and Bloom is returned
%% unchanged, so the server still starts (with an empty filter).
loadWordList(Bloom, BlacklistFile) ->
  case file:open(BlacklistFile, read) of
    {ok, S} ->
      %% try/after guarantees the handle is released even if reading or
      %% etbloom:add/2 raises; previously the handle was never closed.
      try
        loadWordList(io:get_line(S, ''), Bloom, S)
      after
        file:close(S)
      end;
    {error, enoent} ->
      ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]),
      Bloom;
    {error, Reason} ->
      %% e.g. eacces or eisdir: the path exists but is not a readable file.
      ?ERROR_MSG("Unable to open blacklist file ~p: ~p~n", [BlacklistFile, Reason]),
      Bloom
  end.

%% loadWordList(LineOrEof, Bloom, S) -> Bloom1
%%
%% Folds the remaining lines of the open device S into Bloom. The first
%% argument is the result of the previous io:get_line/2 call.
loadWordList(eof, Bloom, _S) ->
  Bloom;
loadWordList({error, Reason}, Bloom, _S) ->
  %% io:get_line/2 may report a read error; keep what was loaded so far
  %% instead of crashing on a non-list "line".
  ?ERROR_MSG("Error while reading blacklist: ~p~n", [Reason]),
  Bloom;
loadWordList(Line, Bloom, S) ->
  loadWordList(io:get_line(S, ''), etbloom:add(stripLineEnding(Line), Bloom), S).

%% Removes a single trailing "\n" or "\r\n" from Line, if present.
%% The last line of a file may have no terminator at all, in which case the
%% line is returned untouched rather than losing its final character.
stripLineEnding(Line) ->
  case lists:reverse(Line) of
    [$\n, $\r | Rest] -> lists:reverse(Rest);
    [$\n | Rest] -> lists:reverse(Rest);
    _ -> Line
  end.
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user