Merge pull request #181 from madglory/master
dynamic character replacement / muc message filter fixes

This commit is contained in: commit da878bd75c

@@ -1,62 +0,0 @@
-# mod_pottymouth
-
-The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit'
-which has disappeared from the net. It allows individual whole words of a
-message to be filtered against a blacklist. It allows multiple blacklists
-sharded by language. The internal bloomfilter can support arbitrary blacklist
-sizes. Using a large list (say, 87M terms) will slow down the initial server
-boot time (to about 15 minutes respectively), but once loaded lookups are very
-speedy.
-
-#### Installation
-
-On Ubuntu:
-````
-cd ~/.ejabberd-modules/sources
-clone the git repo
-cd mod_pottymouth
-ejabberdctl module_install mod_pottymouth
-ejabberdctl restart
-````
-
-module will be installed in: ~/.ejabberd-modules/mod_pottymouth
-
-#### Config
-
-The file format is as follows:
-
-````
-modules:
-  mod_pottymouth:
-    blacklists:
-      default: /home/your_user/blacklist_en.txt
-      en: /home/your_user/blacklist_en.txt
-      cn: /home/your_user/blacklist_cn.txt
-      fr: /home/your_user/blacklist_fr.txt
-````
-
-For each language (en,cn,fr,...whatever) provide a full path to a backlist file.
-The blacklist file is a plain text file with blacklisted words listed one per
-line.
-
-#### Gotchas
-
-The language will be looked up by whatever value is passed in the xml:lang
-attribute of the xml message. So, any xml:lang value to be supported will need
-a corresponding entry/blacklist in the config file. If xml:lang is missing,
-the 'default' entry in config will be used.
-
-For xml:lang attribute docs, see:
-[http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message](http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message)
-
-#### Blacklist helper
-
-Thinking of a bunch of swear words and all the permutations can be tough. We made
-a helper script to take a bare wordlist and generate permutations given a
-dictionary of substitution characters:
-[https://github.com/madglory/permute_wordlist](https://github.com/madglory/permute_wordlist)
-
-#### Tip of the hat
-
-This mod makes use of the excellent 'etbloom' module:
-[https://github.com/erlangtoolbox/etbloom](https://github.com/erlangtoolbox/etbloom)

@@ -1,8 +1,10 @@
 The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit'
 which has disappeared from the net. It allows individual whole words of a
 message to be filtered against a blacklist. It allows multiple blacklists
-sharded by language. To make use of this module the client must add the xml:lang
-attribute to the message xml.
+sharded by language. The internal bloomfilter can support arbitrary blacklist
+sizes. Using a large list (say, 87M terms) will slow down the initial server
+boot time (to about 15 minutes), but once loaded lookups are very
+speedy.
 
 To install in ejabberd:
 
@@ -25,11 +27,31 @@ modules:
       en: /home/your_user/blacklist_en.txt
       cn: /home/your_user/blacklist_cn.txt
       fr: /home/your_user/blacklist_fr.txt
+    charmaps:
+      default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
+      en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
 
 For each language (en,cn,fr,...whatever) provide a full path to a blacklist file.
 The blacklist file is a plain text file with blacklisted words listed one per
 line.
 
+You can also provide an optional 'charmap' for each language. This allows you
+to specify simple substitutions that will be made on the fly so you don't need
+to include those permutations in the blacklist. This keeps the blacklist small
+and reduces server startup time. For example, if you included the word
+'xyza' in the blacklist, adding the following substitutions in the charmap
+would filter permutations such as 'XYZA', 'xYz4', or 'Xyz@' automatically.
+
+charmap format:
+
+[
+  {"X", "x"},
+  {"Y", "y"},
+  {"Z", "z"},
+  {"@", "a"},
+  {"4", "a"}
+].
+
 Gotchas:
 
 The language will be looked up by whatever value is passed in the xml:lang
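
To make the charmap behaviour concrete, here is a minimal shell sketch, assuming the normalize_leet module introduced later in this PR and the example substitutions above:

````
%% Illustrative only; at runtime the map is built from the configured charmap file.
CharMap = maps:from_list([{"X", "x"}, {"Y", "y"}, {"Z", "z"}, {"@", "a"}, {"4", "a"}]),
normalize_leet:normalize(CharMap, "Xyz@").
%% => "xyza", which is what gets checked against the blacklist instead of the raw token
````
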
@@ -40,13 +62,11 @@ the 'default' entry in config will be used.
 For xml:lang attribute docs, see:
 http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message
 
-The internal bloomfilter used to ingest the blacklists currently requires about
-4,000 entries in the blacklist to ensure acceptable error probability. (We've
-gotten around this by duplicating entries in a short list)
-
-Todo:
-
-Look into acceptable error probabilities for shorter blacklists.
+Blacklist helper
+
+Thinking of a bunch of swear words and all the permutations can be tough. We made
+a helper script to take a bare wordlist and generate permutations given a
+dictionary of substitution characters: https://github.com/madglory/permute_wordlist
 
 Tip of the hat:
 
@@ -1,7 +1,8 @@
 modules:
   mod_pottymouth:
     blacklists:
-      default: /home/vagrant/blacklist_en.txt
-      en: /home/vagrant/blacklist_en.txt
-      cn: /home/vagrant/blacklist_cn.txt
-      fr: /home/vagrant/blacklist_fr.txt
+      default: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt
+      en: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt
+    charmaps:
+      default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
+      en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
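
For reference, the charmap paths above point at plain Erlang term files read via file:consult/1 (see normalize_leet_gen_server below); a hypothetical charmap_en.txt mirroring the README example would contain:

````
[
  {"X", "x"},
  {"Y", "y"},
  {"Z", "z"},
  {"@", "a"},
  {"4", "a"}
].
````
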
@@ -1,5 +1,5 @@
 author: "Tom Quackenbush <tom at madglory.com>"
 category: "data"
 summary: "Filter bad words in messages"
-home: "https://github.com/madglory/mod_pottymouth/tree/master"
-url: "git@github.com:madglory/mod_pottymouth.git"
+home: "https://github.com/processone/ejabberd-contrib/tree/master/"
+url: "git@github.com:processone/ejabberd-contrib.git"
@@ -5,12 +5,11 @@
 -include("logger.hrl").
 
 -import(etbloom, [bloom/1, member/2]).
--export([start/1]).
+-export([member/1]).
 
 %% gen_server callbacks
--export([init/1, handle_call/3, handle_cast/2, handle_info/2,
+-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2,
   terminate/2, code_change/3]).
--compile(export_all).
 
 serverName(Lang) ->
   list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])).

@@ -21,11 +20,11 @@ member({Lang, Word} = _MessageToken) ->
 start({Lang, BlacklistFile} = _Opts) ->
   gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []).
 
-stop(_Host) ->
+stop() ->
   ok.
 
 init([BlacklistFile]) ->
-  ?INFO_MSG("Building bloom", []),
+  ?INFO_MSG("Building bloom ~p~n", [BlacklistFile]),
   Bloom = etbloom:sbf(10000000),
   {ok, loadWordList(Bloom, BlacklistFile)}.
 
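
Each blacklist runs as its own registered bloom server named after the language (bloom_gen_server_en, bloom_gen_server_cn, ...). A hedged usage sketch, with an example path and word:

````
%% Assumes the 'en' blacklist file exists and lists the word "badword".
bloom_gen_server:start({en, "/etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt"}),
bloom_gen_server:member({en, "badword"}).
%% => true if the word is in the loaded bloom filter, false otherwise
````
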
@@ -8,12 +8,14 @@
   start/2,
   stop/1,
   on_filter_packet/1,
-  mod_opt_type/1
+  mod_opt_type/1,
+  depends/2
 ]).
 
 -include("ejabberd.hrl").
 
 -import(bloom_gen_server, [start/0, stop/0, member/1]).
+-import(normalize_leet_gen_server, [normalize/1]).
 
 getMessageLang(Attrs) ->
   LangAttr = lists:keyfind(<<"lang">>, 1, Attrs),
@@ -26,8 +28,11 @@ getMessageLang(Attrs) ->
   end,
   Lang.
 
-censorWord({_Lang, Word} = MessageTerm) ->
-  IsBadWord = bloom_gen_server:member(MessageTerm),
+censorWord({Lang, Word} = _MessageTerm) ->
+  % we need unicode characters to normalize the word
+  NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}),
+  % we need bytewise format for bloom lookup
+  IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}),
   if
     IsBadWord ->
       "****";
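
A sketch of the representation round-trip the new censorWord/1 performs; the language atom and token are placeholders:

````
Word = "Xyz@",                                                    %% bytewise list, as produced by the tokenizer
Unicode = unicode:characters_to_list(list_to_binary(Word)),       %% codepoints for the charmap substitution
Normalized = normalize_leet_gen_server:normalize({en, Unicode}),  %% e.g. "xyza" with the README charmap;
                                                                  %% falls back to the raw word if no 'en' server runs
Bytewise = binary_to_list(unicode:characters_to_binary(Normalized)),
bloom_gen_server:member({en, Bytewise}).                          %% bloom lookup expects the bytewise form
````
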
@@ -38,30 +43,53 @@ censorWord({_Lang, Word} = MessageTerm) ->
 filterWords(L) ->
   lists:map(fun censorWord/1, L).
 
+filterMessageText(MessageAttrs, MessageText) ->
+  Lang = getMessageLang(MessageAttrs),
+  % we want to token-ize utf8 'words'
+  MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "),
+  MessageTerms = [{Lang, Word} || Word <- MessageWords],
+  % we get back bytewise format terms (rather than utf8)
+  string:join(filterWords(MessageTerms), " ").
+
+filterMessageBodyElements([{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}]} = _H|T], MessageElements) ->
+  FilteredMessageWords = binary:list_to_bin(filterMessageText(BodyAttr, binary:bin_to_list(MessageText))),
+  FilteredBody = {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]},
+  filterMessageBodyElements(T, lists:append(MessageElements, [FilteredBody]));
+
+filterMessageBodyElements([H|T], MessageElements) ->
+  % skip this tag, but pass it on as processed
+  filterMessageBodyElements(T, lists:append(MessageElements, [H]));
+
+filterMessageBodyElements([], MessageElements) ->
+  MessageElements.
+
 start(_Host, Opts) ->
   Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []),
   lists:map(fun bloom_gen_server:start/1, Blacklists),
+  CharMaps = gen_mod:get_opt(charmaps, Opts, fun(A) -> A end, []),
+  lists:map(fun normalize_leet_gen_server:start/1, CharMaps),
   ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0),
   ok.
 
 stop(_Host) ->
   bloom_gen_server:stop(),
+  normalize_leet_gen_server:stop(),
   ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0),
   ok.
 
 on_filter_packet(drop) ->
   drop;
 
-on_filter_packet({_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) ->
-  Lang = getMessageLang(Attrs),
-  MessageWords = string:tokens(binary_to_list(MessageText), " "),
-  MessageTerms = [{Lang, Word} || Word <- MessageWords],
-  FilteredMessageWords = list_to_binary(string:join(filterWords(MessageTerms), " ")),
-  {_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}};
+on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, Els} = _Packet} = _Msg) ->
+  FilteredEls = filterMessageBodyElements(Els, []),
+  {_From, _To, {xmlel, <<"message">>, _Attrs, FilteredEls}};
 
 on_filter_packet(Msg) ->
   % Handle the generic case (any packet that isn't a message with a body).
   Msg.
 
 mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end;
-mod_opt_type(_) -> [blacklists].
+mod_opt_type(charmaps) -> fun (A) when is_list(A) -> A end;
+mod_opt_type(_) -> [blacklists, charmaps].
+
+depends(_Host, _Opts) -> [].

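To illustrate the MUC fix: the old on_filter_packet/1 clause only matched a message whose children were exactly one chat-state element followed by the body, so any other shape slipped through unfiltered. The new clause hands every child to filterMessageBodyElements/2, which rewrites <body/> elements and passes everything else along. A sketch with made-up elements, assuming "badword" is in the 'en' blacklist and the 'en' servers are running:

````
%% Internal sketch (filterMessageBodyElements/2 is not exported from the module).
Els = [{xmlel, <<"active">>, [], []},
       {xmlel, <<"body">>, [{<<"lang">>, <<"en">>}], [{xmlcdata, <<"a badword here">>}]}],
filterMessageBodyElements(Els, []).
%% => [{xmlel, <<"active">>, [], []},
%%     {xmlel, <<"body">>, [{<<"lang">>, <<"en">>}], [{xmlcdata, <<"a **** here">>}]}]
````
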
@@ -0,0 +1,40 @@
+%% -*- coding: utf-8 -*-
+
+-module(normalize_leet).
+
+-export([
+  normalize/2
+]).
+
+distinctLetters([H|T]) ->
+  distinctLetters(T, [H]).
+
+distinctLetters([H|T], Letters) ->
+  distinctLetters(T, lists:umerge(Letters, [H]));
+distinctLetters([], Letters) ->
+  Letters.
+
+checkMetaChar(Char) ->
+  MetaChars = ["\\", "^", "$", ".", "|", "?", "*", "+", "(", ")", "[", "{"],
+  lists:member(Char, MetaChars).
+
+replaceChar(true, Char, X, Word) ->
+  re:replace(Word, ["\\", Char], X, [unicode,global,{return,list}]);
+replaceChar(false, Char, X, Word) ->
+  re:replace(Word, Char, X, [unicode,global,{return,list}]).
+
+replaceLetters([H|T], CharMap, Word) ->
+  CurChar = [H],
+  NormChar = maps:get(CurChar, CharMap, skip),
+  if
+    NormChar == skip ->
+      replaceLetters(T, CharMap, Word);
+    true ->
+      IsMetaChar = checkMetaChar(CurChar),
+      replaceLetters(T, CharMap, replaceChar(IsMetaChar, CurChar, NormChar, Word))
+  end;
+replaceLetters([], _CharMap, Word) ->
+  Word.
+
+normalize(CharMap, Word) ->
+  replaceLetters(distinctLetters(Word), CharMap, Word).
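
A quick sketch of the metacharacter handling in normalize_leet above, using a hypothetical mapping that includes a regex metacharacter:

````
CharMap = #{"$" => "s", "0" => "o"},
normalize_leet:normalize(CharMap, "c00l ca$h").
%% checkMetaChar/1 flags "$", so replaceChar/4 escapes it before calling re:replace/4;
%% the result is "cool cash"
````
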
@@ -0,0 +1,46 @@
+-module(normalize_leet_gen_server).
+
+-behaviour(gen_server).
+
+-include("logger.hrl").
+
+-import(normalize_leet, [normalize/2]).
+-export([normalize/1]).
+
+%% gen_server callbacks
+-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2,
+  terminate/2, code_change/3]).
+
+serverName(Lang) ->
+  list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])).
+
+normalize({Lang, Word} = _MessageToken) ->
+  try gen_server:call(serverName(Lang), {normalize, Word})
+  catch
+    exit:{noproc, _Reason} -> Word
+  end.
+
+start({Lang, CharMapFile} = _Opts) ->
+  gen_server:start_link({local, serverName(Lang)}, ?MODULE, [CharMapFile], []).
+
+stop() ->
+  ok.
+
+init([CharMapFile]) ->
+  ?INFO_MSG("NormalizeLeet Loading: ~p~n", [CharMapFile]),
+  {ok, loadCharMapConfig(file:consult(CharMapFile))}.
+
+loadCharMapConfig({ok, [CharMapConfig]}) ->
+  maps:from_list(CharMapConfig);
+loadCharMapConfig({error, Reason}) ->
+  ?INFO_MSG("NormalizeLeet Error: ~p~n", [Reason]),
+  maps:new().
+
+handle_call({normalize, Word}, _From, CharMap) ->
+  Reply = normalize_leet:normalize(CharMap, Word),
+  {reply, Reply, CharMap}.
+
+handle_cast(_Msg, State) -> {noreply, State}.
+handle_info(_Info, State) -> {noreply, State}.
+terminate(_Reason, _State) -> ok.
+code_change(_OldVsn, State, _Extra) -> {ok, State}.
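
Finally, a usage sketch for the per-language charmap servers: mod_pottymouth:start/2 starts one per 'charmaps' entry, and normalize/1 falls back to the untouched word when no server exists for the requested language (the path below is an example):

````
normalize_leet_gen_server:start({en, "/etc/ejabberd/modules/mod_pottymouth/charmap_en.txt"}),
normalize_leet_gen_server:normalize({en, "Xyz@"}),  %% => "xyza" with the README charmap
normalize_leet_gen_server:normalize({fr, "Xyz@"}).  %% no 'fr' server: exit {noproc,_} is caught, word returned as-is
````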