From 612b2b78e9965b81749657c246c0302c73ffc071 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Tue, 19 Jul 2016 21:34:04 +0000 Subject: [PATCH 01/11] code cleanup. add optional normalize_leet to do dynamic substitution of one-to-one mappings (reduce blacklist size). --- mod_pottymouth/conf/mod_pottymouth.yml | 9 ++-- mod_pottymouth/mod_pottymouth.spec | 4 +- mod_pottymouth/src/bloom_gen_server.erl | 9 ++-- mod_pottymouth/src/mod_pottymouth.erl | 17 ++++--- mod_pottymouth/src/normalize_leet.erl | 38 +++++++++++++++ .../src/normalize_leet_gen_server.erl | 46 +++++++++++++++++++ 6 files changed, 105 insertions(+), 18 deletions(-) create mode 100644 mod_pottymouth/src/normalize_leet.erl create mode 100644 mod_pottymouth/src/normalize_leet_gen_server.erl diff --git a/mod_pottymouth/conf/mod_pottymouth.yml b/mod_pottymouth/conf/mod_pottymouth.yml index 458ad4f..228a22e 100644 --- a/mod_pottymouth/conf/mod_pottymouth.yml +++ b/mod_pottymouth/conf/mod_pottymouth.yml @@ -1,7 +1,8 @@ modules: mod_pottymouth: blacklists: - default: /home/vagrant/blacklist_en.txt - en: /home/vagrant/blacklist_en.txt - cn: /home/vagrant/blacklist_cn.txt - fr: /home/vagrant/blacklist_fr.txt + default: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt + en: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt + charmaps: + default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt + en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt diff --git a/mod_pottymouth/mod_pottymouth.spec b/mod_pottymouth/mod_pottymouth.spec index 4a269aa..f32eb21 100644 --- a/mod_pottymouth/mod_pottymouth.spec +++ b/mod_pottymouth/mod_pottymouth.spec @@ -1,5 +1,5 @@ author: "Tom Quackenbush " category: "data" summary: "Filter bad words in messages" -home: "https://github.com/madglory/mod_pottymouth/tree/master" -url: "git@github.com:madglory/mod_pottymouth.git" +home: "https://github.com/processone/ejabberd-contrib/tree/master/" +url: "git@github.com:processone/ejabberd-contrib.git" diff --git 
a/mod_pottymouth/src/bloom_gen_server.erl b/mod_pottymouth/src/bloom_gen_server.erl index b61ef2e..19abfc0 100644 --- a/mod_pottymouth/src/bloom_gen_server.erl +++ b/mod_pottymouth/src/bloom_gen_server.erl @@ -5,12 +5,11 @@ -include("logger.hrl"). -import(etbloom, [bloom/1, member/2]). --export([start/1]). +-export([member/1]). %% gen_server callbacks --export([init/1, handle_call/3, handle_cast/2, handle_info/2, +-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). --compile(export_all). serverName(Lang) -> list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])). @@ -21,11 +20,11 @@ member({Lang, Word} = _MessageToken) -> start({Lang, BlacklistFile} = _Opts) -> gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []). -stop(_Host) -> +stop() -> ok. init([BlacklistFile]) -> - ?INFO_MSG("Building bloom", []), + ?INFO_MSG("Building bloom ~p~n", [BlacklistFile]), Bloom = etbloom:sbf(10000000), {ok, loadWordList(Bloom, BlacklistFile)}. diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 49161a0..7ae8f77 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -2,8 +2,6 @@ -behaviour(gen_mod). --include("logger.hrl"). - -export([ start/2, stop/1, @@ -14,6 +12,7 @@ -include("ejabberd.hrl"). -import(bloom_gen_server, [start/0, stop/0, member/1]). +-import(nomalize_leet_gen_server, [normalize/1]). getMessageLang(Attrs) -> LangAttr = lists:keyfind(<<"lang">>, 1, Attrs), @@ -26,8 +25,8 @@ getMessageLang(Attrs) -> end, Lang. 
-censorWord({_Lang, Word} = MessageTerm) -> - IsBadWord = bloom_gen_server:member(MessageTerm), +censorWord({Lang, Word} = MessageTerm) -> + IsBadWord = bloom_gen_server:member({Lang, normalize_leet_gen_server:normalize(MessageTerm)}), if IsBadWord -> "****"; @@ -41,11 +40,14 @@ filterWords(L) -> start(_Host, Opts) -> Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []), lists:map(fun bloom_gen_server:start/1, Blacklists), + CharMaps = gen_mod:get_opt(charmaps, Opts, fun(A) -> A end, []), + lists:map(fun normalize_leet_gen_server:start/1, CharMaps), ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0), ok. stop(_Host) -> bloom_gen_server:stop(), + normalize_leet_gen_server:stop(), ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0), ok. @@ -54,9 +56,9 @@ on_filter_packet(drop) -> on_filter_packet({_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> Lang = getMessageLang(Attrs), - MessageWords = string:tokens(binary_to_list(MessageText), " "), + MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "), MessageTerms = [{Lang, Word} || Word <- MessageWords], - FilteredMessageWords = list_to_binary(string:join(filterWords(MessageTerms), " ")), + FilteredMessageWords = unicode:characters_to_binary(string:join(filterWords(MessageTerms), " ")), {_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; on_filter_packet(Msg) -> @@ -64,4 +66,5 @@ on_filter_packet(Msg) -> Msg. mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end; -mod_opt_type(_) -> [blacklists]. +mod_opt_type(charmaps) -> fun (A) when is_list(A) -> A end; +mod_opt_type(_) -> [blacklists, charmaps]. 
diff --git a/mod_pottymouth/src/normalize_leet.erl b/mod_pottymouth/src/normalize_leet.erl new file mode 100644 index 0000000..fbd9bec --- /dev/null +++ b/mod_pottymouth/src/normalize_leet.erl @@ -0,0 +1,38 @@ +-module(normalize_leet). + +-export([ + normalize/2 +]). + +distinctLetters([H|T]) -> + distinctLetters(T, [H]). + +distinctLetters([H|T], Letters) -> + distinctLetters(T, lists:umerge(Letters, [H])); +distinctLetters([], Letters) -> + Letters. + +checkMetaChar(Char) -> + MetaChars = ["\\", "^", "$", ".", "|", "?", "*", "+", "(", ")", "[", "{"], + lists:member(Char, MetaChars). + +replaceChar(true, Char, X, Word) -> + re:replace(Word, ["\\", Char], X, [global,{return,list}]); +replaceChar(false, Char, X, Word) -> + re:replace(Word, Char, X, [global,{return,list}]). + +replaceLetters([H|T], CharMap, Word) -> + CurChar = [H], + NormChar = maps:get(CurChar, CharMap, skip), + if + NormChar == skip -> + replaceLetters(T, CharMap, Word); + true -> + IsMetaChar = checkMetaChar(CurChar), + replaceLetters(T, CharMap, replaceChar(IsMetaChar, CurChar, NormChar, Word)) + end; +replaceLetters([], _CharMap, Word) -> + Word. + +normalize(CharMap, Word) -> + replaceLetters(distinctLetters(Word), CharMap, Word). diff --git a/mod_pottymouth/src/normalize_leet_gen_server.erl b/mod_pottymouth/src/normalize_leet_gen_server.erl new file mode 100644 index 0000000..88818cf --- /dev/null +++ b/mod_pottymouth/src/normalize_leet_gen_server.erl @@ -0,0 +1,46 @@ +-module(normalize_leet_gen_server). + +-behaviour(gen_server). + +-include("logger.hrl"). + +-import(normailize_leet, [normalize/2]). +-export([normalize/1]). + +%% gen_server callbacks +-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2, + terminate/2, code_change/3]). + +serverName(Lang) -> + list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])). 
+ +normalize({Lang, Word} = _MessageToken) -> + try gen_server:call(serverName(Lang), {normalize, Word}) + catch + exit:{noproc, _Reason} -> Word + end. + +start({Lang, CharMapFile} = _Opts) -> + gen_server:start_link({local, serverName(Lang)}, ?MODULE, [CharMapFile], []). + +stop() -> + ok. + +init([CharMapFile]) -> + ?INFO_MSG("NormalizeLeet Loading: ~p~n", [CharMapFile]), + {ok, loadCharMapConfig(file:consult(CharMapFile))}. + +loadCharMapConfig({ok, [CharMapConfig]}) -> + maps:from_list(CharMapConfig); +loadCharMapConfig({error, Reason}) -> + ?INFO_MSG("NormalizeLeet Error: ~p~n", [Reason]), + maps:new(). + +handle_call({normalize, Word}, _From, CharMap) -> + Reply = normalize_leet:normalize(CharMap, Word), + {reply, Reply, CharMap}. + +handle_cast(_Msg, State) -> {noreply, State}. +handle_info(_Info, State) -> {noreply, State}. +terminate(_Reason, _State) -> ok. +code_change(_OldVsn, State, _Extra) -> {ok, State}. From 265ff3dc70c11d97be1bc0fd298b8dd397662b66 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Tue, 2 Aug 2016 14:42:37 +0000 Subject: [PATCH 02/11] add pattern to match group chat messages for filtering --- mod_pottymouth/src/mod_pottymouth.erl | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 7ae8f77..0260344 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -2,6 +2,8 @@ -behaviour(gen_mod). +-include("logger.hrl"). + -export([ start/2, stop/1, @@ -37,6 +39,12 @@ censorWord({Lang, Word} = MessageTerm) -> filterWords(L) -> lists:map(fun censorWord/1, L). +filterMessageText(MessageAttrs, MessageText) -> + Lang = getMessageLang(MessageAttrs), + MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "), + MessageTerms = [{Lang, Word} || Word <- MessageWords], + unicode:characters_to_binary(string:join(filterWords(MessageTerms), " ")). 
+ start(_Host, Opts) -> Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []), lists:map(fun bloom_gen_server:start/1, Blacklists), @@ -55,12 +63,13 @@ on_filter_packet(drop) -> drop; on_filter_packet({_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> - Lang = getMessageLang(Attrs), - MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "), - MessageTerms = [{Lang, Word} || Word <- MessageWords], - FilteredMessageWords = unicode:characters_to_binary(string:join(filterWords(MessageTerms), " ")), + FilteredMessageWords = filterMessageText(Attrs, MessageText), {_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; +on_filter_packet({_From, _To, {xmlel, <<"message">>, Attrs, [{xmlel, <<"body">>, _BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> + FilteredMessageWords = filterMessageText(Attrs, MessageText), + {_From, _To, {xmlel, <<"message">>, Attrs, [{xmlel, <<"body">>, _BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; + on_filter_packet(Msg) -> % Handle the generic case (any packet that isn't a message with a body). Msg. From 610c0e72eb0016fad7c04c783470438cdc6c4009 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Tue, 2 Aug 2016 15:52:41 +0000 Subject: [PATCH 03/11] update README.txt with charmap description. remove README.md. --- README.md | 66 --------------------------------------- mod_pottymouth/README.txt | 36 ++++++++++++++++----- 2 files changed, 28 insertions(+), 74 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 49970e4..0000000 --- a/README.md +++ /dev/null @@ -1,66 +0,0 @@ -ejabberd-contrib -================ - -This is a collaborative development area for ejabberd module developers -and users. 
- - -For users ---------- - -To use an ejabberd module coming from this repository: - -- You need to have ejabberd installed. - -- If you have not already done it, run `ejabberdctl modules_update_specs` - to retrieve the list of available modules. - -- Run `ejabberdctl module_install <module>` to get the source code and to - compile and install the `beam` file into ejabberd's module search path. - This path is either `~/.ejabberd-modules` or defined by the - `CONTRIB_MODULES_PATH` setting in `ejabberdctl.cfg`. - -- Edit the configuration file provided in the `conf` directory of the - installed module and update it to your needs. Then apply the changes to - your main ejabberd configuration. In a future release, ejabberd will - automatically add this file to its runtime configuration without - changes. - -- Run `ejabberdctl module_uninstall <module>` to remove a module from - ejabberd. - - -For developers --------------- - -The following organization has been set up for the development: - -- Development and compilation of modules is done by ejabberd. You need - ejabberd installed. Use `ejabberdctl module_check <module>` to ensure it - compiles correctly before committing your work. The sources of your - module must be located in `$CONTRIB_MODULES_PATH/sources/<module>`. - -- Compilation can by done manually (if you know what you are doing) so you - don't need ejabberd running: - ``` - cd /path/of/module - mkdir ebin - /path/of/ejabberd's/erlc \ - -o ebin \ - -I include -I /path/of/ejabberd/lib/ejabberd-XX.YY/include \ - -DLAGER -DNO_EXT_LIB \ - src/*erl - ``` - -- The module directory structure is usually the following: - * `README.txt`: Module description. - * `COPYING`: License for the module. - * `doc/`: Documentation directory. - * `src/`: Erlang source directory. - * `lib/`: Elixir source directory. - * `priv/msgs/`: Directory with translation files (pot, po and msg). - * `conf/<module>.yml`: Configuration for your module. - * `<module>.spec`: Yaml description file for your module. 
- -- Module developers should note in the `README.txt` file whether the - module has requirements or known incompatibilities with other modules. diff --git a/mod_pottymouth/README.txt b/mod_pottymouth/README.txt index 9d7649d..809a59a 100644 --- a/mod_pottymouth/README.txt +++ b/mod_pottymouth/README.txt @@ -1,8 +1,10 @@ The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit' which has disappeared from the net. It allows individual whole words of a message to be filtered against a blacklist. It allows multiple blacklists -sharded by language. To make use of this module the client must add the xml:lang -attribute to the message xml. +sharded by language. The internal bloomfilter can support arbitrary blacklist +sizes. Using a large list (say, 87M terms) will slow down the initial server +boot time (to about 15 minutes respectively), but once loaded lookups are very +speedy. To install in ejabberd: @@ -25,11 +27,31 @@ modules: en: /home/your_user/blacklist_en.txt cn: /home/your_user/blacklist_cn.txt fr: /home/your_user/blacklist_fr.txt + charmaps: + default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt + en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt For each language (en,cn,fr,...whatever) provide a full path to a backlist file. The blacklist file is a plain text file with blacklisted words listed one per line. +You can also provide an optional 'charmap' for each language. This allows you +to specify simple substitutions that will be made on the fly so you don't need +to include those permutations in the blacklist. This keeps the blacklist small +and reduces server startup time. For example, if you included the word: +'xyza' in the blacklist, adding the following substitutions in the charmap +would filter permutations such as 'XYZA', 'xYz4', or 'Xyz@' automatically. + +charmap format: + +[ + {"X", "x"}, + {"Y", "y"}, + {"Z", "z"}, + {"@", "a"}, + {"4", "a"} +]. 
+ Gotchas: The language will be looked up by whatever value is passed in the xml:lang @@ -40,13 +62,11 @@ the 'default' entry in config will be used. For xml:lang attribute docs, see: http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message -The internal bloomfilter used to ingest the blacklists currently requires about -4,000 entries in the blacklist to ensure acceptable error probability. (We've -gotten around this by duplicating entries in a short list) +Blacklist helper -Todo: - -Look into acceptable error probabilities for shorter blacklists. +Thinking of a bunch of swear words and all the permutations can be tough. We made +a helper script to take a bare wordlist and generate permutations given a +dictionary of substitution characters: https://github.com/madglory/permute_wordlist Tip of the hat: From 06c96ad7789705b00c1fcdfc00ca6cf7d929ce6c Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Tue, 2 Aug 2016 20:41:19 +0000 Subject: [PATCH 04/11] re-add repo README.md --- README.md | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..49970e4 --- /dev/null +++ b/README.md @@ -0,0 +1,66 @@ +ejabberd-contrib +================ + +This is a collaborative development area for ejabberd module developers +and users. + + +For users +--------- + +To use an ejabberd module coming from this repository: + +- You need to have ejabberd installed. + +- If you have not already done it, run `ejabberdctl modules_update_specs` + to retrieve the list of available modules. + +- Run `ejabberdctl module_install <module>` to get the source code and to + compile and install the `beam` file into ejabberd's module search path. + This path is either `~/.ejabberd-modules` or defined by the + `CONTRIB_MODULES_PATH` setting in `ejabberdctl.cfg`. 
+ +- Edit the configuration file provided in the `conf` directory of the + installed module and update it to your needs. Then apply the changes to + your main ejabberd configuration. In a future release, ejabberd will + automatically add this file to its runtime configuration without + changes. + +- Run `ejabberdctl module_uninstall <module>` to remove a module from + ejabberd. + + +For developers +-------------- + +The following organization has been set up for the development: + +- Development and compilation of modules is done by ejabberd. You need + ejabberd installed. Use `ejabberdctl module_check <module>` to ensure it + compiles correctly before committing your work. The sources of your + module must be located in `$CONTRIB_MODULES_PATH/sources/<module>`. + +- Compilation can by done manually (if you know what you are doing) so you + don't need ejabberd running: + ``` + cd /path/of/module + mkdir ebin + /path/of/ejabberd's/erlc \ + -o ebin \ + -I include -I /path/of/ejabberd/lib/ejabberd-XX.YY/include \ + -DLAGER -DNO_EXT_LIB \ + src/*erl + ``` + +- The module directory structure is usually the following: + * `README.txt`: Module description. + * `COPYING`: License for the module. + * `doc/`: Documentation directory. + * `src/`: Erlang source directory. + * `lib/`: Elixir source directory. + * `priv/msgs/`: Directory with translation files (pot, po and msg). + * `conf/<module>.yml`: Configuration for your module. + * `<module>.spec`: Yaml description file for your module. + +- Module developers should note in the `README.txt` file whether the + module has requirements or known incompatibilities with other modules. 
From 29b4b5ae3058bb7fb44b329fa8d51c15c2eba096 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Tue, 9 Aug 2016 16:49:14 -0400 Subject: [PATCH 05/11] pull message body attribs to look for lang --- mod_pottymouth/src/mod_pottymouth.erl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 0260344..6d6ddae 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -62,13 +62,13 @@ stop(_Host) -> on_filter_packet(drop) -> drop; -on_filter_packet({_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> - FilteredMessageWords = filterMessageText(Attrs, MessageText), - {_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; +on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> + FilteredMessageWords = filterMessageText(BodyAttr, MessageText), + {_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; -on_filter_packet({_From, _To, {xmlel, <<"message">>, Attrs, [{xmlel, <<"body">>, _BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> - FilteredMessageWords = filterMessageText(Attrs, MessageText), - {_From, _To, {xmlel, <<"message">>, Attrs, [{xmlel, <<"body">>, _BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; +on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> + FilteredMessageWords = filterMessageText(BodyAttr, MessageText), + {_From, _To, {xmlel, <<"message">>, 
_Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; on_filter_packet(Msg) -> % Handle the generic case (any packet that isn't a message with a body). From 5662e1c530551a0266f3ace8563bbd950366f768 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Tue, 13 Sep 2016 15:22:37 -0400 Subject: [PATCH 06/11] fix lookup/substitution of unicode (example: chinese) characters --- .DS_Store | Bin 0 -> 6148 bytes mod_pottymouth/src/mod_pottymouth.erl | 15 ++++++++++----- mod_pottymouth/src/normalize_leet.erl | 6 ++++-- 3 files changed, 14 insertions(+), 7 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..04767b967bcfeca655a83489f10aee9f1f045125 GIT binary patch literal 6148 zcmeHKJ8r^25S>XVkRl;U%DnfQ6L;Oex%6>*G0ZZ+1CTcoguqPC+`D( zlQSyqcir*pv$caI(x?CxpaN8Y3Q&Q&708x4eY?973s3A2dY0#x9s6wt|fyI$dyvbT<2&U$Tuf50v04{nCNQ?PhD k26{Wj!P@b{lOnI!9`|cvALw+%oet#BfayY`0*_YU9}2k_q5uE@ literal 0 HcmV?d00001 diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 6d6ddae..e50c9f8 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -27,8 +27,11 @@ getMessageLang(Attrs) -> end, Lang. 
-censorWord({Lang, Word} = MessageTerm) -> - IsBadWord = bloom_gen_server:member({Lang, normalize_leet_gen_server:normalize(MessageTerm)}), +censorWord({Lang, Word} = _MessageTerm) -> + % we need unicode characters to normlize the word + NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}), + % we need bytewise format for bloom lookup + IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}), if IsBadWord -> "****"; @@ -41,9 +44,11 @@ filterWords(L) -> filterMessageText(MessageAttrs, MessageText) -> Lang = getMessageLang(MessageAttrs), + % we want to token-ize utf8 'words' MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "), MessageTerms = [{Lang, Word} || Word <- MessageWords], - unicode:characters_to_binary(string:join(filterWords(MessageTerms), " ")). + % we get back bytewise format terms (rather than utf8) + list_to_binary(string:join(filterWords(MessageTerms), " ")). 
start(_Host, Opts) -> Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []), @@ -63,11 +68,11 @@ on_filter_packet(drop) -> drop; on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> - FilteredMessageWords = filterMessageText(BodyAttr, MessageText), + FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), {_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> - FilteredMessageWords = filterMessageText(BodyAttr, MessageText), + FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), {_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; on_filter_packet(Msg) -> diff --git a/mod_pottymouth/src/normalize_leet.erl b/mod_pottymouth/src/normalize_leet.erl index fbd9bec..1f396af 100644 --- a/mod_pottymouth/src/normalize_leet.erl +++ b/mod_pottymouth/src/normalize_leet.erl @@ -1,3 +1,5 @@ +%% -*- coding: utf-8 -*- + -module(normalize_leet). -export([ @@ -17,9 +19,9 @@ checkMetaChar(Char) -> lists:member(Char, MetaChars). replaceChar(true, Char, X, Word) -> - re:replace(Word, ["\\", Char], X, [global,{return,list}]); + re:replace(Word, ["\\", Char], X, [unicode,global,{return,list}]); replaceChar(false, Char, X, Word) -> - re:replace(Word, Char, X, [global,{return,list}]). + re:replace(Word, Char, X, [unicode,global,{return,list}]). 
replaceLetters([H|T], CharMap, Word) -> CurChar = [H], From 83b32e7e4fcee8235f4744e1906f38d9e9b395f3 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Thu, 1 Sep 2016 21:00:42 +0000 Subject: [PATCH 07/11] initial multi-body parse --- mod_pottymouth/conf/mod_pottymouth.yml | 8 +-- mod_pottymouth/src/mod_pottymouth.erl | 81 +++++++++++++++++++++++--- 2 files changed, 77 insertions(+), 12 deletions(-) diff --git a/mod_pottymouth/conf/mod_pottymouth.yml b/mod_pottymouth/conf/mod_pottymouth.yml index 228a22e..d257b60 100644 --- a/mod_pottymouth/conf/mod_pottymouth.yml +++ b/mod_pottymouth/conf/mod_pottymouth.yml @@ -1,8 +1,8 @@ modules: mod_pottymouth: blacklists: - default: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt - en: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt + default: /home/vagrant/blacklist_en.txt + en: /home/vagrant/blacklist_en.txt charmaps: - default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt - en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt + default: /home/vagrant/charmap_en.txt + en: /home/vagrant/charmap_en.txt diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index e50c9f8..0c51ded 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -21,9 +21,11 @@ getMessageLang(Attrs) -> if LangAttr -> {<<"lang">>, LangBin} = LangAttr, - Lang = list_to_atom(binary_to_list(LangBin)); + Lang = list_to_atom(binary_to_list(LangBin)), + ?ERROR_MSG("LANG: ~p~n", [Lang]); true -> - Lang = default + Lang = default, + ?ERROR_MSG("LANG DEFAULT~n", []) end, Lang. @@ -50,6 +52,11 @@ filterMessageText(MessageAttrs, MessageText) -> % we get back bytewise format terms (rather than utf8) list_to_binary(string:join(filterWords(MessageTerms), " ")). +filterMessageBodyElements([H|T]) -> + lists:map +filterMessageBodyElements([], Element) -> + Element. 
+ start(_Host, Opts) -> Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []), lists:map(fun bloom_gen_server:start/1, Blacklists), @@ -67,18 +74,76 @@ stop(_Host) -> on_filter_packet(drop) -> drop; -on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> - FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), - {_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; -on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> - FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), - {_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; +% on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> +% FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), +% {_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; +% +% on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> +% FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), +% {_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; + +% chat message with chat state +% on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = 
_MessageBody] = _Els} = _Packet} = _Msg) -> +% ?ERROR_MSG("CHAT CHAT MSG W CHAT STATE: ~p~n", _Msg), +% FilteredMessageWords = filterMessageText(BodyAttr, MessageText), +% {_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; + +% chat message without chat state +on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [MessageBody] = _Els} = _Packet} = _Msg) -> + ?ERROR_MSG("CHAT MSG WITHOUT CHAT STATE: ~p~n", [_Msg]), + + # {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody + + if message type chat/group && !archived + foreach body + FilteredMessageWords = filterMessageText(BodyAttr, MessageText), + {_From, _To, {xmlel, <<"message">>, _Attrs, [MessageBody]}]}}; on_filter_packet(Msg) -> % Handle the generic case (any packet that isn't a message with a body). + ?ERROR_MSG("FILTER PACKET MSG: ~p~n", [Msg]), Msg. +% PSI one-to-one +% { +% {jid,<<"foo">>,<<"kalamari">>,<<"Psi">>,<<"foo">>,<<"kalamari">>,<<"Psi">>}, +% {jid,<<"foo">>,<<"kalamari">>,<<>>,<<"foo">>,<<"kalamari">>,<<>>}, +% {xmlel,<<"message">>, +% [{<<"xml:lang">>,<<"en">>},{<<"type">>,<<"chat">>},{<<"to">>,<<"foo@kalamari">>},{<<"id">>,<<"aacba">>}], +% [{xmlcdata,<<"\n">>},{xmlel,<<"body">>,[],[{xmlcdata,<<"hi">>}]},{xmlcdata,<<"\n">>}, +% {xmlel,<<"active">>,[{<<"xmlns">>,<<"http://jabber.org/protocol/chatstates">>}],[]},{xmlcdata,<<"\n">>}]}} + +% gloox muc +% { +% {jid,<<"#12345">>,<<"conference.kalamari">>,<<"bar">>,<<"#12345">>,<<"conference.kalamari">>,<<"bar">>}, +% {jid,<<"bar">>,<<"kalamari">>,<<"12145048529523376186799">>,<<"bar">>,<<"kalamari">>,<<"12145048529523376186799">>}, +% {xmlel,<<"message">>, +% [{<<"xml:lang">>,<<"en">>},{<<"to">>,<<"#12345@conference.kalamari">>},{<<"type">>,<<"groupchat">>},{<<"from">>,<<"bar@kalamari/12145048529523376186799">>}], +% [{xmlel,<<"archived">>, +% 
[{<<"by">>,<<"conference.kalamari">>},{<<"xmlns">>,<<"urn:xmpp:mam:tmp">>},{<<"id">>,<<"1471940767114309">>}], +% [] +% }, +% {xmlel,<<"stanza-id">>, +% [{<<"by">>,<<"conference.kalamari">>},{<<"xmlns">>,<<"urn:xmpp:sid:0">>},{<<"id">>,<<"1471940767114309">>}], +% [] +% }, +% {xmlel,<<"body">>, +% [], +% [{xmlcdata,<<"HELLO THERE">>}] +% }] +% } +% } +% +% { +% {jid,<<"bar">>,<<"kalamari">>,<<"12145048529523376186799">>,<<"bar">>,<<"kalamari">>,<<"12145048529523376186799">>}, +% {jid,<<"#12345">>,<<"conference.kalamari">>,<<>>,<<"#12345">>,<<"conference.kalamari">>,<<>>}, +% {xmlel,<<"message">>, +% [{<<"xml:lang">>,<<"en">>},{<<"to">>,<<"#12345@conference.kalamari">>},{<<"type">>,<<"groupchat">>},{<<"from">>,<<"bar@kalamari/12145048529523376186799">>}], +% [{xmlel,<<"body">>,[],[{xmlcdata,<<"HELLO THERE">>}]}] +% } +}% + mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end; mod_opt_type(charmaps) -> fun (A) when is_list(A) -> A end; mod_opt_type(_) -> [blacklists, charmaps]. From a190156d0ab312ab9e9b717864d648fd0237dc3a Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Tue, 13 Sep 2016 17:13:32 -0400 Subject: [PATCH 08/11] filter all body elemements in message --- mod_pottymouth/conf/mod_pottymouth.yml | 8 +-- mod_pottymouth/src/mod_pottymouth.erl | 92 +++++--------------------- 2 files changed, 22 insertions(+), 78 deletions(-) diff --git a/mod_pottymouth/conf/mod_pottymouth.yml b/mod_pottymouth/conf/mod_pottymouth.yml index d257b60..228a22e 100644 --- a/mod_pottymouth/conf/mod_pottymouth.yml +++ b/mod_pottymouth/conf/mod_pottymouth.yml @@ -1,8 +1,8 @@ modules: mod_pottymouth: blacklists: - default: /home/vagrant/blacklist_en.txt - en: /home/vagrant/blacklist_en.txt + default: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt + en: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt charmaps: - default: /home/vagrant/charmap_en.txt - en: /home/vagrant/charmap_en.txt + default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt + en: 
/etc/ejabberd/modules/mod_pottymouth/charmap_en.txt diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 0c51ded..1f4ce9d 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -21,11 +21,9 @@ getMessageLang(Attrs) -> if LangAttr -> {<<"lang">>, LangBin} = LangAttr, - Lang = list_to_atom(binary_to_list(LangBin)), - ?ERROR_MSG("LANG: ~p~n", [Lang]); + Lang = list_to_atom(binary_to_list(LangBin)); true -> - Lang = default, - ?ERROR_MSG("LANG DEFAULT~n", []) + Lang = default end, Lang. @@ -52,10 +50,19 @@ filterMessageText(MessageAttrs, MessageText) -> % we get back bytewise format terms (rather than utf8) list_to_binary(string:join(filterWords(MessageTerms), " ")). -filterMessageBodyElements([H|T]) -> - lists:map -filterMessageBodyElements([], Element) -> - Element. + +filterMessageBodyElements([{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}]} = _H|T], MessageElements) -> + FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), + FilteredBody = {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}, + filterMessageBodyElements(T, lists:append(MessageElements, [FilteredBody])); + +filterMessageBodyElements([H|T], MessageElements) -> + % skip this tag, but pass it on as processed + filterMessageBodyElements(T, lists:append(MessageElements, [H])); + +filterMessageBodyElements([], MessageElements) -> + MessageElements. 
+ start(_Host, Opts) -> Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []), @@ -74,76 +81,13 @@ stop(_Host) -> on_filter_packet(drop) -> drop; - -% on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> -% FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), -% {_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; -% -% on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> -% FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), -% {_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; - -% chat message with chat state -% on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> -% ?ERROR_MSG("CHAT CHAT MSG W CHAT STATE: ~p~n", _Msg), -% FilteredMessageWords = filterMessageText(BodyAttr, MessageText), -% {_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; - -% chat message without chat state -on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [MessageBody] = _Els} = _Packet} = _Msg) -> - ?ERROR_MSG("CHAT MSG WITHOUT CHAT STATE: ~p~n", [_Msg]), - - # {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody - - if message type chat/group && !archived - foreach body - FilteredMessageWords = filterMessageText(BodyAttr, MessageText), - {_From, _To, {xmlel, <<"message">>, _Attrs, [MessageBody]}]}}; - +on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, Els} = 
_Packet} = _Msg) -> + FilteredEls = filterMessageBodyElements(Els, []), + {_From, _To, {xmlel, <<"message">>, _Attrs, FilteredEls}}; on_filter_packet(Msg) -> % Handle the generic case (any packet that isn't a message with a body). - ?ERROR_MSG("FILTER PACKET MSG: ~p~n", [Msg]), Msg. -% PSI one-to-one -% { -% {jid,<<"foo">>,<<"kalamari">>,<<"Psi">>,<<"foo">>,<<"kalamari">>,<<"Psi">>}, -% {jid,<<"foo">>,<<"kalamari">>,<<>>,<<"foo">>,<<"kalamari">>,<<>>}, -% {xmlel,<<"message">>, -% [{<<"xml:lang">>,<<"en">>},{<<"type">>,<<"chat">>},{<<"to">>,<<"foo@kalamari">>},{<<"id">>,<<"aacba">>}], -% [{xmlcdata,<<"\n">>},{xmlel,<<"body">>,[],[{xmlcdata,<<"hi">>}]},{xmlcdata,<<"\n">>}, -% {xmlel,<<"active">>,[{<<"xmlns">>,<<"http://jabber.org/protocol/chatstates">>}],[]},{xmlcdata,<<"\n">>}]}} - -% gloox muc -% { -% {jid,<<"#12345">>,<<"conference.kalamari">>,<<"bar">>,<<"#12345">>,<<"conference.kalamari">>,<<"bar">>}, -% {jid,<<"bar">>,<<"kalamari">>,<<"12145048529523376186799">>,<<"bar">>,<<"kalamari">>,<<"12145048529523376186799">>}, -% {xmlel,<<"message">>, -% [{<<"xml:lang">>,<<"en">>},{<<"to">>,<<"#12345@conference.kalamari">>},{<<"type">>,<<"groupchat">>},{<<"from">>,<<"bar@kalamari/12145048529523376186799">>}], -% [{xmlel,<<"archived">>, -% [{<<"by">>,<<"conference.kalamari">>},{<<"xmlns">>,<<"urn:xmpp:mam:tmp">>},{<<"id">>,<<"1471940767114309">>}], -% [] -% }, -% {xmlel,<<"stanza-id">>, -% [{<<"by">>,<<"conference.kalamari">>},{<<"xmlns">>,<<"urn:xmpp:sid:0">>},{<<"id">>,<<"1471940767114309">>}], -% [] -% }, -% {xmlel,<<"body">>, -% [], -% [{xmlcdata,<<"HELLO THERE">>}] -% }] -% } -% } -% -% { -% {jid,<<"bar">>,<<"kalamari">>,<<"12145048529523376186799">>,<<"bar">>,<<"kalamari">>,<<"12145048529523376186799">>}, -% {jid,<<"#12345">>,<<"conference.kalamari">>,<<>>,<<"#12345">>,<<"conference.kalamari">>,<<>>}, -% {xmlel,<<"message">>, -% 
[{<<"xml:lang">>,<<"en">>},{<<"to">>,<<"#12345@conference.kalamari">>},{<<"type">>,<<"groupchat">>},{<<"from">>,<<"bar@kalamari/12145048529523376186799">>}], -% [{xmlel,<<"body">>,[],[{xmlcdata,<<"HELLO THERE">>}]}] -% } -}% - mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end; mod_opt_type(charmaps) -> fun (A) when is_list(A) -> A end; mod_opt_type(_) -> [blacklists, charmaps]. From 707321e0e06b8a9d507e2c54a41d832a87a2430c Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Tue, 13 Sep 2016 17:15:52 -0400 Subject: [PATCH 09/11] remove README.md --- mod_pottymouth/README.md | 62 ---------------------------------------- 1 file changed, 62 deletions(-) delete mode 100644 mod_pottymouth/README.md diff --git a/mod_pottymouth/README.md b/mod_pottymouth/README.md deleted file mode 100644 index 6c2565b..0000000 --- a/mod_pottymouth/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# mod_pottymouth - -The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit' -which has disappeared from the net. It allows individual whole words of a -message to be filtered against a blacklist. It allows multiple blacklists -sharded by language. The internal bloomfilter can support arbitrary blacklist -sizes. Using a large list (say, 87M terms) will slow down the initial server -boot time (to about 15 minutes respectively), but once loaded lookups are very -speedy. - -#### Installation - -On Ubuntu: -```` -cd ~/.ejabberd-modules/sources -clone the git repo -cd mod_pottymouth -ejabberdctl module_install mod_pottymouth -ejabberdctl restart -```` - -module will be installed in: ~/.ejabberd-modules/mod_pottymouth - -#### Config - -The file format is as follows: - -```` -modules: - mod_pottymouth: - blacklists: - default: /home/your_user/blacklist_en.txt - en: /home/your_user/blacklist_en.txt - cn: /home/your_user/blacklist_cn.txt - fr: /home/your_user/blacklist_fr.txt -```` - -For each language (en,cn,fr,...whatever) provide a full path to a backlist file. 
-The blacklist file is a plain text file with blacklisted words listed one per -line. - -#### Gotchas - -The language will be looked up by whatever value is passed in the xml:lang -attribute of the xml message. So, any xml:lang value to be supported will need -a corresponding entry/blacklist in the config file. If xml:lang is missing, -the 'default' entry in config will be used. - -For xml:lang attribute docs, see: - [http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message](http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message) - -#### Blacklist helper - -Thinking of a bunch of swear words and all the permutations can be tough. We made -a helper script to take a bare wordlist and generate permutations given a -dictionary of substitution characters: - [https://github.com/madglory/permute_wordlist](https://github.com/madglory/permute_wordlist) - -#### Tip of the hat - -This mod makes use of the excellent 'etbloom' module: - [https://github.com/erlangtoolbox/etbloom](https://github.com/erlangtoolbox/etbloom) From 78bb06c6aaceb59bd0a3f25cbb084450ec33d260 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Tue, 13 Sep 2016 17:23:47 -0400 Subject: [PATCH 10/11] add depends/2 to mod_pottymouth... seems to be required by gen_mod now --- mod_pottymouth/src/mod_pottymouth.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 1f4ce9d..8fa3e59 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -8,7 +8,8 @@ start/2, stop/1, on_filter_packet/1, - mod_opt_type/1 + mod_opt_type/1, + depends/2 ]). -include("ejabberd.hrl"). @@ -91,3 +92,4 @@ on_filter_packet(Msg) -> mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end; mod_opt_type(charmaps) -> fun (A) when is_list(A) -> A end; mod_opt_type(_) -> [blacklists, charmaps]. +depends(_Host, _Opts) -> []. 
From 3faa47df4573256f7a6e3b4b7239c85a486e9bca Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Fri, 30 Sep 2016 16:55:33 +0000 Subject: [PATCH 11/11] move list_to_bin on filterMessageText for clarity --- .DS_Store | Bin 6148 -> 0 bytes mod_pottymouth/src/mod_pottymouth.erl | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 04767b967bcfeca655a83489f10aee9f1f045125..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKJ8r^25S>XVkRl;U%DnfQ6L;Oex%6>*G0ZZ+1CTcoguqPC+`D( zlQSyqcir*pv$caI(x?CxpaN8Y3Q&Q&708x4eY?973s3A2dY0#x9s6wt|fyI$dyvbT<2&U$Tuf50v04{nCNQ?PhD k26{Wj!P@b{lOnI!9`|cvALw+%oet#BfayY`0*_YU9}2k_q5uE@ diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 8fa3e59..0e8bc6f 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -49,11 +49,11 @@ filterMessageText(MessageAttrs, MessageText) -> MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "), MessageTerms = [{Lang, Word} || Word <- MessageWords], % we get back bytewise format terms (rather than utf8) - list_to_binary(string:join(filterWords(MessageTerms), " ")). + string:join(filterWords(MessageTerms), " "). filterMessageBodyElements([{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}]} = _H|T], MessageElements) -> - FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), + FilteredMessageWords = binary:list_to_bin(filterMessageText(BodyAttr, binary:bin_to_list(MessageText))), FilteredBody = {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}, filterMessageBodyElements(T, lists:append(MessageElements, [FilteredBody]));