Merge pull request #181 from madglory/master

dynamic character replacement / muc message filter fixes
badlop 2016-11-10 20:21:14 +01:00 committed by GitHub
commit da878bd75c
8 changed files with 164 additions and 92 deletions

View File

@ -1,62 +0,0 @@
# mod_pottymouth
The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit'
which has disappeared from the net. It allows individual whole words of a
message to be filtered against a blacklist. It allows multiple blacklists
sharded by language. The internal bloomfilter can support arbitrary blacklist
sizes. Using a large list (say, 87M terms) will slow down the initial server
boot time (to about 15 minutes), but once loaded, lookups are very fast.
#### Installation
On Ubuntu:
````
cd ~/.ejabberd-modules/sources
git clone git@github.com:madglory/mod_pottymouth.git
cd mod_pottymouth
ejabberdctl module_install mod_pottymouth
ejabberdctl restart
````
The module will be installed in: ~/.ejabberd-modules/mod_pottymouth
#### Config
The file format is as follows:
````
modules:
mod_pottymouth:
blacklists:
default: /home/your_user/blacklist_en.txt
en: /home/your_user/blacklist_en.txt
cn: /home/your_user/blacklist_cn.txt
fr: /home/your_user/blacklist_fr.txt
````
For each language (en, cn, fr, ...whatever) provide a full path to a blacklist file.
The blacklist file is a plain text file with blacklisted words listed one per
line.
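For example, a tiny blacklist file (the entries here are purely hypothetical) might contain:
````
xyza
badword
verybadword
````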
#### Gotchas
The language is looked up from the value of the xml:lang attribute of the
message stanza, so any xml:lang value you want to support needs a
corresponding blacklist entry in the config file. If xml:lang is missing,
the 'default' entry in the config will be used.
For xml:lang attribute docs, see:
[http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message](http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message)
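For illustration, a message stanza carrying xml:lang might look like this (JIDs and body text are hypothetical):
````
<message to='romeo@example.net' type='chat' xml:lang='en'>
  <body>some message text</body>
</message>
````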
#### Blacklist helper
Thinking of a bunch of swear words and all the permutations can be tough. We made
a helper script to take a bare wordlist and generate permutations given a
dictionary of substitution characters:
[https://github.com/madglory/permute_wordlist](https://github.com/madglory/permute_wordlist)
#### Tip of the hat
This mod makes use of the excellent 'etbloom' module:
[https://github.com/erlangtoolbox/etbloom](https://github.com/erlangtoolbox/etbloom)

View File

@ -1,8 +1,10 @@
The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit'
which has disappeared from the net. It allows individual whole words of a
message to be filtered against a blacklist. It allows multiple blacklists
sharded by language. To make use of this module the client must add the xml:lang
attribute to the message xml.
sharded by language. The internal bloomfilter can support arbitrary blacklist
sizes. Using a large list (say, 87M terms) will slow down the initial server
boot time (to about 15 minutes), but once loaded, lookups are very fast.
To install in ejabberd:
@ -25,11 +27,31 @@ modules:
en: /home/your_user/blacklist_en.txt
cn: /home/your_user/blacklist_cn.txt
fr: /home/your_user/blacklist_fr.txt
charmaps:
default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
For each language (en, cn, fr, ...whatever) provide a full path to a blacklist file.
The blacklist file is a plain text file with blacklisted words listed one per
line.
You can also provide an optional 'charmap' for each language. This allows you
to specify simple substitutions that will be made on the fly so you don't need
to include those permutations in the blacklist. This keeps the blacklist small
and reduces server startup time. For example, if the blacklist contains the
word 'xyza', adding the following substitutions to the charmap filters
permutations such as 'XYZA', 'xYz4', or 'Xyz@' automatically.
charmap format:
[
{"X", "x"},
{"Y", "y"},
{"Z", "z"},
{"@", "a"},
{"4", "a"}
].
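For example, with the charmap above, the normalizer (normalize_leet.erl in this
commit) rewrites 'xYz4' to 'xyza' before the bloom filter lookup; roughly, in
the Erlang shell:
1> normalize_leet:normalize(#{"X" => "x", "Y" => "y", "Z" => "z", "@" => "a", "4" => "a"}, "xYz4").
"xyza"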
Gotchas:
The language will be looked up by whatever value is passed in the xml:lang
@ -40,13 +62,11 @@ the 'default' entry in config will be used.
For xml:lang attribute docs, see:
http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message
The internal bloomfilter used to ingest the blacklists currently requires about
4,000 entries in the blacklist to ensure acceptable error probability. (We've
gotten around this by duplicating entries in a short list)
Blacklist helper
Todo:
Look into acceptable error probabilities for shorter blacklists.
Thinking of a bunch of swear words and all the permutations can be tough. We made
a helper script to take a bare wordlist and generate permutations given a
dictionary of substitution characters: https://github.com/madglory/permute_wordlist
Tip of the hat:

View File

@ -1,7 +1,8 @@
modules:
mod_pottymouth:
blacklists:
default: /home/vagrant/blacklist_en.txt
en: /home/vagrant/blacklist_en.txt
cn: /home/vagrant/blacklist_cn.txt
fr: /home/vagrant/blacklist_fr.txt
default: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt
en: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt
charmaps:
default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt

View File

@ -1,5 +1,5 @@
author: "Tom Quackenbush <tom at madglory.com>"
category: "data"
summary: "Filter bad words in messages"
home: "https://github.com/madglory/mod_pottymouth/tree/master"
url: "git@github.com:madglory/mod_pottymouth.git"
home: "https://github.com/processone/ejabberd-contrib/tree/master/"
url: "git@github.com:processone/ejabberd-contrib.git"

View File

@ -5,12 +5,11 @@
-include("logger.hrl").
-import(etbloom, [bloom/1, member/2]).
-export([start/1]).
-export([member/1]).
%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
-compile(export_all).
serverName(Lang) ->
list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])).
@ -21,11 +20,11 @@ member({Lang, Word} = _MessageToken) ->
start({Lang, BlacklistFile} = _Opts) ->
gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []).
stop(_Host) ->
stop() ->
ok.
init([BlacklistFile]) ->
?INFO_MSG("Building bloom", []),
?INFO_MSG("Building bloom ~p~n", [BlacklistFile]),
Bloom = etbloom:sbf(10000000),
{ok, loadWordList(Bloom, BlacklistFile)}.
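For reference, the per-language lookup path this server provides — a sketch
assuming an 'en' blacklist has been configured (the path matches the sample
config above; the word is hypothetical):
%% start a bloom server for 'en' and ask whether a word is blacklisted
{ok, _Pid} = bloom_gen_server:start({en, "/etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt"}),
bloom_gen_server:member({en, "xyza"}).  %% true if "xyza" is in the blacklist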

View File

@ -8,12 +8,14 @@
start/2,
stop/1,
on_filter_packet/1,
mod_opt_type/1
mod_opt_type/1,
depends/2
]).
-include("ejabberd.hrl").
-import(bloom_gen_server, [start/0, stop/0, member/1]).
-import(normalize_leet_gen_server, [normalize/1]).
getMessageLang(Attrs) ->
LangAttr = lists:keyfind(<<"lang">>, 1, Attrs),
@ -26,8 +28,11 @@ getMessageLang(Attrs) ->
end,
Lang.
censorWord({_Lang, Word} = MessageTerm) ->
IsBadWord = bloom_gen_server:member(MessageTerm),
censorWord({Lang, Word} = _MessageTerm) ->
% we need unicode characters to normalize the word
NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}),
% we need bytewise format for bloom lookup
IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}),
if
IsBadWord ->
"****";
@ -38,30 +43,53 @@ censorWord({_Lang, Word} = MessageTerm) ->
filterWords(L) ->
lists:map(fun censorWord/1, L).
filterMessageText(MessageAttrs, MessageText) ->
Lang = getMessageLang(MessageAttrs),
% we want to token-ize utf8 'words'
MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "),
MessageTerms = [{Lang, Word} || Word <- MessageWords],
% we get back bytewise format terms (rather than utf8)
string:join(filterWords(MessageTerms), " ").
filterMessageBodyElements([{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}]} = _H|T], MessageElements) ->
FilteredMessageWords = binary:list_to_bin(filterMessageText(BodyAttr, binary:bin_to_list(MessageText))),
FilteredBody = {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]},
filterMessageBodyElements(T, lists:append(MessageElements, [FilteredBody]));
filterMessageBodyElements([H|T], MessageElements) ->
% not a body element: leave it unchanged and pass it through
filterMessageBodyElements(T, lists:append(MessageElements, [H]));
filterMessageBodyElements([], MessageElements) ->
MessageElements.
start(_Host, Opts) ->
Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []),
lists:map(fun bloom_gen_server:start/1, Blacklists),
CharMaps = gen_mod:get_opt(charmaps, Opts, fun(A) -> A end, []),
lists:map(fun normalize_leet_gen_server:start/1, CharMaps),
ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0),
ok.
stop(_Host) ->
bloom_gen_server:stop(),
normalize_leet_gen_server:stop(),
ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0),
ok.
on_filter_packet(drop) ->
drop;
on_filter_packet({_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) ->
Lang = getMessageLang(Attrs),
MessageWords = string:tokens(binary_to_list(MessageText), " "),
MessageTerms = [{Lang, Word} || Word <- MessageWords],
FilteredMessageWords = list_to_binary(string:join(filterWords(MessageTerms), " ")),
{_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}};
on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, Els} = _Packet} = _Msg) ->
FilteredEls = filterMessageBodyElements(Els, []),
{_From, _To, {xmlel, <<"message">>, _Attrs, FilteredEls}};
on_filter_packet(Msg) ->
% Handle the generic case (any packet that isn't a message with a body).
Msg.
mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end;
mod_opt_type(_) -> [blacklists].
mod_opt_type(charmaps) -> fun (A) when is_list(A) -> A end;
mod_opt_type(_) -> [blacklists, charmaps].
depends(_Host, _Opts) -> [].
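Putting the pieces together, censorWord/1 boils down to the following per-word
flow (a sketch assuming the 'en' blacklist and charmap servers are running;
'xYz4'/'xyza' as in the README example):
%% normalize the word via the per-language charmap, then check the bloom filter
Normalized = normalize_leet_gen_server:normalize({en, "xYz4"}),  %% "xyza"
IsBadWord = bloom_gen_server:member({en, Normalized}),           %% true -> censored
%% censorWord/1 returns "****" when the lookup succeeds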

View File

@ -0,0 +1,40 @@
%% -*- coding: utf-8 -*-
-module(normalize_leet).
-export([
normalize/2
]).
distinctLetters([H|T]) ->
distinctLetters(T, [H]).
distinctLetters([H|T], Letters) ->
distinctLetters(T, lists:umerge(Letters, [H]));
distinctLetters([], Letters) ->
Letters.
checkMetaChar(Char) ->
MetaChars = ["\\", "^", "$", ".", "|", "?", "*", "+", "(", ")", "[", "{"],
lists:member(Char, MetaChars).
replaceChar(true, Char, X, Word) ->
re:replace(Word, ["\\", Char], X, [unicode,global,{return,list}]);
replaceChar(false, Char, X, Word) ->
re:replace(Word, Char, X, [unicode,global,{return,list}]).
replaceLetters([H|T], CharMap, Word) ->
CurChar = [H],
NormChar = maps:get(CurChar, CharMap, skip),
if
NormChar == skip ->
replaceLetters(T, CharMap, Word);
true ->
IsMetaChar = checkMetaChar(CurChar),
replaceLetters(T, CharMap, replaceChar(IsMetaChar, CurChar, NormChar, Word))
end;
replaceLetters([], _CharMap, Word) ->
Word.
normalize(CharMap, Word) ->
replaceLetters(distinctLetters(Word), CharMap, Word).

View File

@ -0,0 +1,46 @@
-module(normalize_leet_gen_server).
-behaviour(gen_server).
-include("logger.hrl").
-import(normalize_leet, [normalize/2]).
-export([normalize/1]).
%% gen_server callbacks
-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
serverName(Lang) ->
list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])).
normalize({Lang, Word} = _MessageToken) ->
try gen_server:call(serverName(Lang), {normalize, Word})
catch
exit:{noproc, _Reason} -> Word
end.
start({Lang, CharMapFile} = _Opts) ->
gen_server:start_link({local, serverName(Lang)}, ?MODULE, [CharMapFile], []).
stop() ->
ok.
init([CharMapFile]) ->
?INFO_MSG("NormalizeLeet Loading: ~p~n", [CharMapFile]),
{ok, loadCharMapConfig(file:consult(CharMapFile))}.
loadCharMapConfig({ok, [CharMapConfig]}) ->
maps:from_list(CharMapConfig);
loadCharMapConfig({error, Reason}) ->
?INFO_MSG("NormalizeLeet Error: ~p~n", [Reason]),
maps:new().
handle_call({normalize, Word}, _From, CharMap) ->
Reply = normalize_leet:normalize(CharMap, Word),
{reply, Reply, CharMap}.
handle_cast(_Msg, State) -> {noreply, State}.
handle_info(_Info, State) -> {noreply, State}.
terminate(_Reason, _State) -> ok.
code_change(_OldVsn, State, _Extra) -> {ok, State}.