From 1b86f7ccaa5ea1b69a191b0b089eeb360685aeb3 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Wed, 29 Jun 2016 21:19:12 +0000 Subject: [PATCH 1/4] mod_pottymouth: update bloom_gen_server to support arbitraty list sizes. tested up to 87M terms. update README. --- mod_pottymouth/README.md | 61 +++++++++++++++++++++++++ mod_pottymouth/src/bloom_gen_server.erl | 36 +++++++++------ 2 files changed, 83 insertions(+), 14 deletions(-) create mode 100644 mod_pottymouth/README.md diff --git a/mod_pottymouth/README.md b/mod_pottymouth/README.md new file mode 100644 index 0000000..9b21a3c --- /dev/null +++ b/mod_pottymouth/README.md @@ -0,0 +1,61 @@ +# mod_pottymouth + +The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit' +which has disappeared from the net. It allows individual whole words of a +message to be filtered against a blacklist. It allows multiple blacklists +sharded by language. To make use of this module the client must add the xml:lang +attribute to the message xml. + +#### Installation + +On Ubuntu: +```` +cd ~/.ejabberd-modules/sources +clone the git repo +cd mod_pottymouth +ejabberdctl module_install mod_pottymouth +ejabberdctl restart +```` + +module will be installed in: ~/.ejabberd-modules/mod_pottymouth + +#### Config + +The file format is as follows: + +```` +modules: + mod_pottymouth: + blacklists: + default: /home/your_user/blacklist_en.txt + en: /home/your_user/blacklist_en.txt + cn: /home/your_user/blacklist_cn.txt + fr: /home/your_user/blacklist_fr.txt +```` + +For each language (en,cn,fr,...whatever) provide a full path to a blacklist file. +The blacklist file is a plain text file with blacklisted words listed one per +line. + +#### Gotchas + +The language will be looked up by whatever value is passed in the xml:lang +attribute of the xml message. So, any xml:lang value to be supported will need +a corresponding entry/blacklist in the config file. If xml:lang is missing, +the 'default' entry in config will be used. 
+ +For xml:lang attribute docs, see: + [http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message](http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message) + +The internal bloomfilter used to ingest the blacklists currently requires about +4,000 entries in the blacklist to ensure acceptable error probability. (We've +gotten around this by duplicating entries in a short list) + +#### Todo + +Look into acceptable error probabilities for shorter blacklists. + +#### Tip of the hat + +This mod makes use of the excellent 'etbloom' module: + [https://github.com/erlangtoolbox/etbloom](https://github.com/erlangtoolbox/etbloom) diff --git a/mod_pottymouth/src/bloom_gen_server.erl b/mod_pottymouth/src/bloom_gen_server.erl index 1fb098a..cd7155e 100644 --- a/mod_pottymouth/src/bloom_gen_server.erl +++ b/mod_pottymouth/src/bloom_gen_server.erl @@ -18,24 +18,16 @@ serverName(Lang) -> member({Lang, Word} = _MessageToken) -> gen_server:call(serverName(Lang), {member, Word}). -loadWordList(BlacklistFile) -> - BlacklistExists = filelib:is_file(BlacklistFile), - if - BlacklistExists -> - {ok, S} = file:read_file(BlacklistFile); - true -> - ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]), - S = <<>> - end, - WordList = string:tokens(binary_to_list(S), "\n"), - WordList. - start({Lang, BlacklistFile} = _Opts) -> gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []). +stop() -> + ok. + init([BlacklistFile]) -> - WordList = loadWordList(BlacklistFile), - {ok, etbloom:bloom(WordList)}. + ?INFO_MSG("Building bloom", []), + Bloom = etbloom:sbf(10000000), + {ok, loadWordList(Bloom, BlacklistFile)}. handle_call({member, Word}, _From, Bloom) -> Reply = etbloom:member(Word, Bloom), @@ -45,3 +37,19 @@ handle_cast(_Msg, State) -> {noreply, State}. handle_info(_Info, State) -> {noreply, State}. terminate(_Reason, _State) -> ok. code_change(_OldVsn, State, _Extra) -> {ok, State}. 
+ +loadWordList(Bloom, BlacklistFile) -> + BlacklistExists = filelib:is_file(BlacklistFile), + if + BlacklistExists -> + {ok, S} = file:open(BlacklistFile, read), + loadWordList(io:get_line(S, ''), Bloom, S); + true -> + ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]), + loadWordList(eof, Bloom, BlacklistFile) + end. + +loadWordList(eof, Bloom, _S) -> + Bloom; +loadWordList(Line, Bloom, S) -> + loadWordList(io:get_line(S, ''), etbloom:add(lists:droplast(Line), Bloom), S). From 3a3a3f1db1f34cfce643bfc9a9fa4423b669ef6a Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Wed, 29 Jun 2016 21:23:54 +0000 Subject: [PATCH 2/4] mod_pottymouth: bring back bloom_gen_server stop() to avoid error messages --- .DS_Store | Bin 0 -> 6148 bytes mod_pottymouth/src/bloom_gen_server.erl | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..04767b967bcfeca655a83489f10aee9f1f045125 GIT binary patch literal 6148 zcmeHKJ8r^25S>XVkRl;U%DnfQ6L;Oex%6>*G0ZZ+1CTcoguqPC+`D( zlQSyqcir*pv$caI(x?CxpaN8Y3Q&Q&708x4eY?973s3A2dY0#x9s6wt|fyI$dyvbT<2&U$Tuf50v04{nCNQ?PhD k26{Wj!P@b{lOnI!9`|cvALw+%oet#BfayY`0*_YU9}2k_q5uE@ literal 0 HcmV?d00001 diff --git a/mod_pottymouth/src/bloom_gen_server.erl b/mod_pottymouth/src/bloom_gen_server.erl index cd7155e..b61ef2e 100644 --- a/mod_pottymouth/src/bloom_gen_server.erl +++ b/mod_pottymouth/src/bloom_gen_server.erl @@ -21,8 +21,8 @@ member({Lang, Word} = _MessageToken) -> start({Lang, BlacklistFile} = _Opts) -> gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []). -stop() -> - ok. +stop(_Host) -> + ok. init([BlacklistFile]) -> ?INFO_MSG("Building bloom", []), From 3a03dde3c07d7cb0d4e01f6d022027f5d13162d6 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Fri, 1 Jul 2016 15:11:22 -0400 Subject: [PATCH 3/4] update README. Remove list size TODO. Add link to permute_wordlist. 
--- mod_pottymouth/README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mod_pottymouth/README.md b/mod_pottymouth/README.md index 9b21a3c..6c2565b 100644 --- a/mod_pottymouth/README.md +++ b/mod_pottymouth/README.md @@ -3,8 +3,10 @@ The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit' which has disappeared from the net. It allows individual whole words of a message to be filtered against a blacklist. It allows multiple blacklists -sharded by language. To make use of this module the client must add the xml:lang -attribute to the message xml. +sharded by language. The internal bloomfilter can support arbitrary blacklist +sizes. Using a large list (say, 87M terms) will slow down the initial server +boot time (to about 15 minutes for a list of that size), but once loaded, +lookups are very speedy. #### Installation @@ -47,13 +49,12 @@ the 'default' entry in config will be used. For xml:lang attribute docs, see: [http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message](http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message) -The internal bloomfilter used to ingest the blacklists currently requires about -4,000 entries in the blacklist to ensure acceptable error probability. (We've -gotten around this by duplicating entries in a short list) +#### Blacklist helper -#### Todo - -Look into acceptable error probabilities for shorter blacklists. +Thinking of a bunch of swear words and all the permutations can be tough. 
We made +a helper script to take a bare wordlist and generate permutations given a +dictionary of substitution characters: + [https://github.com/madglory/permute_wordlist](https://github.com/madglory/permute_wordlist) #### Tip of the hat From 779024b9c5ac866841ca554468b34361218ebc24 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Fri, 1 Jul 2016 15:13:20 -0400 Subject: [PATCH 4/4] remove .DS_Store --- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 04767b967bcfeca655a83489f10aee9f1f045125..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKJ8r^25S>XVkRl;U%DnfQ6L;Oex%6>*G0ZZ+1CTcoguqPC+`D( zlQSyqcir*pv$caI(x?CxpaN8Y3Q&Q&708x4eY?973s3A2dY0#x9s6wt|fyI$dyvbT<2&U$Tuf50v04{nCNQ?PhD k26{Wj!P@b{lOnI!9`|cvALw+%oet#BfayY`0*_YU9}2k_q5uE@