diff --git a/mod_pottymouth/README.md b/mod_pottymouth/README.md deleted file mode 100644 index 6c2565b..0000000 --- a/mod_pottymouth/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# mod_pottymouth - -The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit' -which has disappeared from the net. It allows individual whole words of a -message to be filtered against a blacklist. It allows multiple blacklists -sharded by language. The internal bloomfilter can support arbitrary blacklist -sizes. Using a large list (say, 87M terms) will slow down the initial server -boot time (to about 15 minutes respectively), but once loaded lookups are very -speedy. - -#### Installation - -On Ubuntu: -```` -cd ~/.ejabberd-modules/sources -clone the git repo -cd mod_pottymouth -ejabberdctl module_install mod_pottymouth -ejabberdctl restart -```` - -module will be installed in: ~/.ejabberd-modules/mod_pottymouth - -#### Config - -The file format is as follows: - -```` -modules: - mod_pottymouth: - blacklists: - default: /home/your_user/blacklist_en.txt - en: /home/your_user/blacklist_en.txt - cn: /home/your_user/blacklist_cn.txt - fr: /home/your_user/blacklist_fr.txt -```` - -For each language (en,cn,fr,...whatever) provide a full path to a backlist file. -The blacklist file is a plain text file with blacklisted words listed one per -line. - -#### Gotchas - -The language will be looked up by whatever value is passed in the xml:lang -attribute of the xml message. So, any xml:lang value to be supported will need -a corresponding entry/blacklist in the config file. If xml:lang is missing, -the 'default' entry in config will be used. - -For xml:lang attribute docs, see: - [http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message](http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message) - -#### Blacklist helper - -Thinking of a bunch of swear words and all the permutations can be tough. 
We made -a helper script to take a bare wordlist and generate permutations given a -dictionary of substitution characters: - [https://github.com/madglory/permute_wordlist](https://github.com/madglory/permute_wordlist) - -#### Tip of the hat - -This mod makes use of the excellent 'etbloom' module: - [https://github.com/erlangtoolbox/etbloom](https://github.com/erlangtoolbox/etbloom) diff --git a/mod_pottymouth/README.txt b/mod_pottymouth/README.txt index 9d7649d..809a59a 100644 --- a/mod_pottymouth/README.txt +++ b/mod_pottymouth/README.txt @@ -1,8 +1,10 @@ The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit' which has disappeared from the net. It allows individual whole words of a message to be filtered against a blacklist. It allows multiple blacklists -sharded by language. To make use of this module the client must add the xml:lang -attribute to the message xml. +sharded by language. The internal Bloom filter can support arbitrary blacklist +sizes. Using a large list (say, 87M terms) will lengthen the initial server +boot time (to about 15 minutes), but once loaded, lookups are very +speedy. To install in ejabberd: @@ -25,11 +27,31 @@ modules: en: /home/your_user/blacklist_en.txt cn: /home/your_user/blacklist_cn.txt fr: /home/your_user/blacklist_fr.txt + charmaps: + default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt + en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt For each language (en,cn,fr,...whatever) provide a full path to a backlist file. The blacklist file is a plain text file with blacklisted words listed one per line. +You can also provide an optional 'charmap' for each language. This allows you +to specify simple substitutions that will be made on the fly so you don't need +to include those permutations in the blacklist. This keeps the blacklist small +and reduces server startup time. 
For example, if you included the word: +'xyza' in the blacklist, adding the following substitutions in the charmap +would filter permutations such as 'XYZA', 'xYz4', or 'Xyz@' automatically. + +charmap format: + +[ + {"X", "x"}, + {"Y", "y"}, + {"Z", "z"}, + {"@", "a"}, + {"4", "a"} +]. + Gotchas: The language will be looked up by whatever value is passed in the xml:lang @@ -40,13 +62,11 @@ the 'default' entry in config will be used. For xml:lang attribute docs, see: http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message -The internal bloomfilter used to ingest the blacklists currently requires about -4,000 entries in the blacklist to ensure acceptable error probability. (We've -gotten around this by duplicating entries in a short list) +Blacklist helper -Todo: - -Look into acceptable error probabilities for shorter blacklists. +Thinking of a bunch of swear words and all the permutations can be tough. We made +a helper script to take a bare wordlist and generate permutations given a +dictionary of substitution characters: https://github.com/madglory/permute_wordlist Tip of the hat: diff --git a/mod_pottymouth/conf/mod_pottymouth.yml b/mod_pottymouth/conf/mod_pottymouth.yml index 458ad4f..228a22e 100644 --- a/mod_pottymouth/conf/mod_pottymouth.yml +++ b/mod_pottymouth/conf/mod_pottymouth.yml @@ -1,7 +1,8 @@ modules: mod_pottymouth: blacklists: - default: /home/vagrant/blacklist_en.txt - en: /home/vagrant/blacklist_en.txt - cn: /home/vagrant/blacklist_cn.txt - fr: /home/vagrant/blacklist_fr.txt + default: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt + en: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt + charmaps: + default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt + en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt diff --git a/mod_pottymouth/mod_pottymouth.spec b/mod_pottymouth/mod_pottymouth.spec index 4a269aa..f32eb21 100644 --- a/mod_pottymouth/mod_pottymouth.spec +++ b/mod_pottymouth/mod_pottymouth.spec @@ -1,5 +1,5 
@@ author: "Tom Quackenbush " category: "data" summary: "Filter bad words in messages" -home: "https://github.com/madglory/mod_pottymouth/tree/master" -url: "git@github.com:madglory/mod_pottymouth.git" +home: "https://github.com/processone/ejabberd-contrib/tree/master/" +url: "git@github.com:processone/ejabberd-contrib.git" diff --git a/mod_pottymouth/src/bloom_gen_server.erl b/mod_pottymouth/src/bloom_gen_server.erl index b61ef2e..19abfc0 100644 --- a/mod_pottymouth/src/bloom_gen_server.erl +++ b/mod_pottymouth/src/bloom_gen_server.erl @@ -5,12 +5,11 @@ -include("logger.hrl"). -import(etbloom, [bloom/1, member/2]). --export([start/1]). +-export([member/1]). %% gen_server callbacks --export([init/1, handle_call/3, handle_cast/2, handle_info/2, +-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). --compile(export_all). serverName(Lang) -> list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])). @@ -21,11 +20,11 @@ member({Lang, Word} = _MessageToken) -> start({Lang, BlacklistFile} = _Opts) -> gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []). -stop(_Host) -> +stop() -> ok. init([BlacklistFile]) -> - ?INFO_MSG("Building bloom", []), + ?INFO_MSG("Building bloom ~p~n", [BlacklistFile]), Bloom = etbloom:sbf(10000000), {ok, loadWordList(Bloom, BlacklistFile)}. diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 49161a0..0e8bc6f 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -8,12 +8,14 @@ start/2, stop/1, on_filter_packet/1, - mod_opt_type/1 + mod_opt_type/1, + depends/2 ]). -include("ejabberd.hrl"). -import(bloom_gen_server, [start/0, stop/0, member/1]). +-import(normalize_leet_gen_server, [normalize/1]). getMessageLang(Attrs) -> LangAttr = lists:keyfind(<<"lang">>, 1, Attrs), @@ -26,8 +28,11 @@ getMessageLang(Attrs) -> end, Lang. 
-censorWord({_Lang, Word} = MessageTerm) -> - IsBadWord = bloom_gen_server:member(MessageTerm), +censorWord({Lang, Word} = _MessageTerm) -> + % we need unicode characters to normalize the word + NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}), + % we need bytewise format for bloom lookup + IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}), if IsBadWord -> "****"; @@ -38,30 +43,53 @@ censorWord({_Lang, Word} = MessageTerm) -> filterWords(L) -> lists:map(fun censorWord/1, L). +filterMessageText(MessageAttrs, MessageText) -> + Lang = getMessageLang(MessageAttrs), + % we want to token-ize utf8 'words' + MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "), + MessageTerms = [{Lang, Word} || Word <- MessageWords], + % we get back bytewise format terms (rather than utf8) + string:join(filterWords(MessageTerms), " "). + +%% Censor the text of each <body> child and pass every other child through unchanged; the accumulator is built in reverse and restored by the final clause. +filterMessageBodyElements([{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}]} = _H|T], MessageElements) -> + FilteredMessageWords = binary:list_to_bin(filterMessageText(BodyAttr, binary:bin_to_list(MessageText))), + FilteredBody = {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}, + filterMessageBodyElements(T, [FilteredBody | MessageElements]); + +filterMessageBodyElements([H|T], MessageElements) -> + % skip this tag, but pass it on as processed + filterMessageBodyElements(T, [H | MessageElements]); + +filterMessageBodyElements([], MessageElements) -> + lists:reverse(MessageElements). + + start(_Host, Opts) -> Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []), lists:map(fun bloom_gen_server:start/1, Blacklists), + CharMaps = gen_mod:get_opt(charmaps, Opts, fun(A) -> A end, []), + lists:map(fun normalize_leet_gen_server:start/1, CharMaps), ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0), ok. 
stop(_Host) -> bloom_gen_server:stop(), + normalize_leet_gen_server:stop(), ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0), ok. on_filter_packet(drop) -> drop; -on_filter_packet({_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> - Lang = getMessageLang(Attrs), - MessageWords = string:tokens(binary_to_list(MessageText), " "), - MessageTerms = [{Lang, Word} || Word <- MessageWords], - FilteredMessageWords = list_to_binary(string:join(filterWords(MessageTerms), " ")), - {_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; - +on_filter_packet({From, To, {xmlel, <<"message">>, Attrs, Els} = _Packet} = _Msg) -> + FilteredEls = filterMessageBodyElements(Els, []), + {From, To, {xmlel, <<"message">>, Attrs, FilteredEls}}; on_filter_packet(Msg) -> % Handle the generic case (any packet that isn't a message with a body). Msg. mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end; -mod_opt_type(_) -> [blacklists]. +mod_opt_type(charmaps) -> fun (A) when is_list(A) -> A end; +mod_opt_type(_) -> [blacklists, charmaps]. +depends(_Host, _Opts) -> []. diff --git a/mod_pottymouth/src/normalize_leet.erl b/mod_pottymouth/src/normalize_leet.erl new file mode 100644 index 0000000..1f396af --- /dev/null +++ b/mod_pottymouth/src/normalize_leet.erl @@ -0,0 +1,40 @@ +%% -*- coding: utf-8 -*- + +-module(normalize_leet). + +-export([ + normalize/2 +]). + +%% Return the distinct characters of Word as a sorted list. +%% lists:usort/1 sorts and removes duplicates in one call, +%% and it returns [] for the empty word. +distinctLetters(Word) -> + lists:usort(Word). + +%% True when Char (a one-character string) is one of the +%% regular-expression metacharacters listed below. +checkMetaChar(Char) -> + MetaChars = ["\\", "^", "$", ".", "|", "?", "*", "+", "(", ")", "[", "{"], + lists:member(Char, MetaChars). 
+ +replaceChar(true, Char, X, Word) -> + re:replace(Word, ["\\", Char], X, [unicode,global,{return,list}]); +replaceChar(false, Char, X, Word) -> + re:replace(Word, Char, X, [unicode,global,{return,list}]). +%% For every character in the list that has an entry in CharMap, replace all of its occurrences in Word with the mapped value. +replaceLetters([H|T], CharMap, Word) -> + CurChar = [H], + NormChar = maps:get(CurChar, CharMap, skip), + if + NormChar == skip -> + replaceLetters(T, CharMap, Word); + true -> + IsMetaChar = checkMetaChar(CurChar), + replaceLetters(T, CharMap, replaceChar(IsMetaChar, CurChar, NormChar, Word)) + end; +replaceLetters([], _CharMap, Word) -> + Word. +%% Apply the CharMap substitutions to Word, visiting each distinct character of Word once. +normalize(CharMap, Word) -> + replaceLetters(distinctLetters(Word), CharMap, Word). diff --git a/mod_pottymouth/src/normalize_leet_gen_server.erl b/mod_pottymouth/src/normalize_leet_gen_server.erl new file mode 100644 index 0000000..88818cf --- /dev/null +++ b/mod_pottymouth/src/normalize_leet_gen_server.erl @@ -0,0 +1,50 @@ +-module(normalize_leet_gen_server). + +-behaviour(gen_server). + +-include("logger.hrl"). + +-import(normalize_leet, [normalize/2]). +-export([normalize/1]). + +%% gen_server callbacks +-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2, + terminate/2, code_change/3]). + +serverName(Lang) -> + list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])). + +normalize({Lang, Word} = _MessageToken) -> + try gen_server:call(serverName(Lang), {normalize, Word}) + catch + exit:{noproc, _Reason} -> Word + end. + +start({Lang, CharMapFile} = _Opts) -> + gen_server:start_link({local, serverName(Lang)}, ?MODULE, [CharMapFile], []). + +stop() -> + ok. + +init([CharMapFile]) -> + ?INFO_MSG("NormalizeLeet Loading: ~p~n", [CharMapFile]), + {ok, loadCharMapConfig(file:consult(CharMapFile))}. +%% Turn the result of file:consult/1 into the character map; any unusable result is logged and yields an empty map. +loadCharMapConfig({ok, [CharMapConfig]}) when is_list(CharMapConfig) -> + maps:from_list(CharMapConfig); +loadCharMapConfig({ok, _Terms}) -> + % file:consult/1 returns every term in the file; anything other than a single list term is not a charmap + ?INFO_MSG("NormalizeLeet Error: ~p~n", [bad_charmap_format]), + maps:new(); +loadCharMapConfig({error, Reason}) -> + ?INFO_MSG("NormalizeLeet Error: ~p~n", [Reason]), + maps:new(). 
+%% Serve a {normalize, Word} request; the server state is the character map built in init/1 and is returned unchanged. +handle_call({normalize, Word}, _From, CharMap) -> + Reply = normalize_leet:normalize(CharMap, Word), + {reply, Reply, CharMap}. +%% Casts and other messages are ignored; the remaining callbacks leave the state untouched. +handle_cast(_Msg, State) -> {noreply, State}. +handle_info(_Info, State) -> {noreply, State}. +terminate(_Reason, _State) -> ok. +code_change(_OldVsn, State, _Extra) -> {ok, State}.