mod_pottymouth: update bloom_gen_server to support arbitrary list sizes. tested up to 87M terms. update README.
This commit is contained in:
parent
bb8802a70a
commit
1b86f7ccaa
|
@ -0,0 +1,61 @@
|
|||
# mod_pottymouth
|
||||
|
||||
The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit'
|
||||
which has disappeared from the net. It allows individual whole words of a
|
||||
message to be filtered against a blacklist. It allows multiple blacklists
|
||||
sharded by language. To make use of this module the client must add the xml:lang
|
||||
attribute to the message xml.
|
||||
|
||||
#### Installation
|
||||
|
||||
On Ubuntu:
|
||||
````
|
||||
cd ~/.ejabberd-modules/sources
|
||||
clone the git repo
|
||||
cd mod_pottymouth
|
||||
ejabberdctl module_install mod_pottymouth
|
||||
ejabberdctl restart
|
||||
````
|
||||
|
||||
module will be installed in: ~/.ejabberd-modules/mod_pottymouth
|
||||
|
||||
#### Config
|
||||
|
||||
The file format is as follows:
|
||||
|
||||
````
|
||||
modules:
|
||||
mod_pottymouth:
|
||||
blacklists:
|
||||
default: /home/your_user/blacklist_en.txt
|
||||
en: /home/your_user/blacklist_en.txt
|
||||
cn: /home/your_user/blacklist_cn.txt
|
||||
fr: /home/your_user/blacklist_fr.txt
|
||||
````
|
||||
|
||||
For each language (en,cn,fr,...whatever) provide a full path to a blacklist file.
|
||||
The blacklist file is a plain text file with blacklisted words listed one per
|
||||
line.
|
||||
|
||||
#### Gotchas
|
||||
|
||||
The language will be looked up by whatever value is passed in the xml:lang
|
||||
attribute of the xml message. So, any xml:lang value to be supported will need
|
||||
a corresponding entry/blacklist in the config file. If xml:lang is missing,
|
||||
the 'default' entry in config will be used.
|
||||
|
||||
For xml:lang attribute docs, see:
|
||||
[http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message](http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message)
|
||||
|
||||
The internal bloomfilter used to ingest the blacklists currently requires about
|
||||
4,000 entries in the blacklist to ensure acceptable error probability. (We've
|
||||
gotten around this by duplicating entries in a short list)
|
||||
|
||||
#### Todo
|
||||
|
||||
Look into acceptable error probabilities for shorter blacklists.
|
||||
|
||||
#### Tip of the hat
|
||||
|
||||
This mod makes use of the excellent 'etbloom' module:
|
||||
[https://github.com/erlangtoolbox/etbloom](https://github.com/erlangtoolbox/etbloom)
|
|
@ -18,24 +18,16 @@ serverName(Lang) ->
|
|||
%% @doc Ask the bloom server registered for `Lang' whether `Word' is a
%% member of its filter. The argument is a `{Lang, Word}' tuple; the result
%% is whatever the server replies to the `{member, Word}' call.
member({Lang, Word}) ->
    Server = serverName(Lang),
    gen_server:call(Server, {member, Word}).
|
||||
|
||||
%% @doc Read the whole blacklist file and return its contents as a list of
%% strings split on "\n". If the file does not exist, an error is logged
%% and the empty list is returned.
loadWordList(BlacklistFile) ->
    Contents =
        case filelib:is_file(BlacklistFile) of
            true ->
                %% Assertive match: a read failure crashes the caller.
                {ok, Bin} = file:read_file(BlacklistFile),
                Bin;
            false ->
                ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]),
                <<>>
        end,
    string:tokens(binary_to_list(Contents), "\n").
|
||||
|
||||
%% @doc Start and link a gen_server for one language, registered locally
%% under the name produced by serverName/1. `BlacklistFile' is handed to
%% init/1 as the sole element of the argument list.
start({Lang, BlacklistFile}) ->
    RegisteredName = {local, serverName(Lang)},
    gen_server:start_link(RegisteredName, ?MODULE, [BlacklistFile], []).
|
||||
|
||||
%% @doc Returns `ok' immediately; no server process is contacted or
%% terminated by this function.
stop() -> ok.
|
||||
|
||||
init([BlacklistFile]) ->
|
||||
WordList = loadWordList(BlacklistFile),
|
||||
{ok, etbloom:bloom(WordList)}.
|
||||
?INFO_MSG("Building bloom", []),
|
||||
Bloom = etbloom:sbf(10000000),
|
||||
{ok, loadWordList(Bloom, BlacklistFile)}.
|
||||
|
||||
handle_call({member, Word}, _From, Bloom) ->
|
||||
Reply = etbloom:member(Word, Bloom),
|
||||
|
@ -45,3 +37,19 @@ handle_cast(_Msg, State) -> {noreply, State}.
|
|||
%% Ignores the incoming message and continues with the state unchanged.
handle_info(_Info, State) -> {noreply, State}.
|
||||
%% Performs no cleanup; both the reason and the state are ignored.
terminate(_Reason, _State) -> ok.
|
||||
%% Carries the state across a code change without transforming it.
code_change(_OldVsn, State, _Extra) -> {ok, State}.
|
||||
|
||||
%% @doc Stream the blacklist file line by line into `Bloom' and return the
%% resulting filter.
%%
%% `Bloom' is the filter to add words to; `BlacklistFile' is the path of a
%% text file with one word per line. If the file does not exist, an error is
%% logged and `Bloom' is returned unchanged. A file that exists but cannot be
%% opened still crashes the caller through the `{ok, S}' match.
loadWordList(Bloom, BlacklistFile) ->
    case filelib:is_file(BlacklistFile) of
        true ->
            %% file:open/2 is documented to take a list of modes.
            {ok, S} = file:open(BlacklistFile, [read]),
            try
                loadWordList(io:get_line(S, ''), Bloom, S)
            after
                %% Release the handle whether loading succeeded or raised;
                %% previously it stayed open for the life of the process.
                file:close(S)
            end;
        false ->
            ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]),
            %% Nothing to read: hand back the filter as it was given.
            Bloom
    end.
|
||||
|
||||
%% @doc Fold the lines of io device `S' into `Bloom'.
%%
%% The first argument is the most recent result of io:get_line/2. `eof'
%% ends the loop and yields the accumulated filter; any other value is
%% treated as a line whose word (minus its line ending) is added to the
%% filter before the next line is read.
loadWordList(eof, Bloom, _S) ->
    Bloom;
loadWordList(Line, Bloom, S) ->
    Word = stripLineEnding(Line),
    loadWordList(io:get_line(S, ''), etbloom:add(Word, Bloom), S).

%% Remove one trailing "\n" or "\r\n" from Line, if present. A final line
%% that has no newline is returned untouched, so its last character is no
%% longer chopped off as it was with an unconditional lists:droplast/1.
stripLineEnding(Line) ->
    case lists:reverse(Line) of
        [$\n, $\r | Rest] -> lists:reverse(Rest);
        [$\n | Rest] -> lists:reverse(Rest);
        _ -> Line
    end.
|
||||
|
|
Loading…
Reference in New Issue