Merge pull request #181 from madglory/master
dynamic character replacement / muc message filter fixes

This commit is contained in: commit da878bd75c

@@ -1,62 +0,0 @@
-# mod_pottymouth
-
-The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit'
-which has disappeared from the net. It allows individual whole words of a
-message to be filtered against a blacklist. It allows multiple blacklists
-sharded by language. The internal bloomfilter can support arbitrary blacklist
-sizes. Using a large list (say, 87M terms) will slow down the initial server
-boot time (to about 15 minutes respectively), but once loaded lookups are very
-speedy.
-
-#### Installation
-
-On Ubuntu:
-````
-cd ~/.ejabberd-modules/sources
-clone the git repo
-cd mod_pottymouth
-ejabberdctl module_install mod_pottymouth
-ejabberdctl restart
-````
-
-module will be installed in: ~/.ejabberd-modules/mod_pottymouth
-
-#### Config
-
-The file format is as follows:
-
-````
-modules:
-  mod_pottymouth:
-    blacklists:
-      default: /home/your_user/blacklist_en.txt
-      en: /home/your_user/blacklist_en.txt
-      cn: /home/your_user/blacklist_cn.txt
-      fr: /home/your_user/blacklist_fr.txt
-````
-
-For each language (en,cn,fr,...whatever) provide a full path to a backlist file.
-The blacklist file is a plain text file with blacklisted words listed one per
-line.
-
-#### Gotchas
-
-The language will be looked up by whatever value is passed in the xml:lang
-attribute of the xml message. So, any xml:lang value to be supported will need
-a corresponding entry/blacklist in the config file. If xml:lang is missing,
-the 'default' entry in config will be used.
-
-For xml:lang attribute docs, see:
-[http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message](http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message)
-
-#### Blacklist helper
-
-Thinking of a bunch of swear words and all the permutations can be tough. We made
-a helper script to take a bare wordlist and generate permutations given a
-dictionary of substitution characters:
-[https://github.com/madglory/permute_wordlist](https://github.com/madglory/permute_wordlist)
-
-#### Tip of the hat
-
-This mod makes use of the excellent 'etbloom' module:
-[https://github.com/erlangtoolbox/etbloom](https://github.com/erlangtoolbox/etbloom)

@@ -1,8 +1,10 @@
 The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit'
 which has disappeared from the net. It allows individual whole words of a
 message to be filtered against a blacklist. It allows multiple blacklists
-sharded by language. To make use of this module the client must add the xml:lang
-attribute to the message xml.
+sharded by language. The internal bloomfilter can support arbitrary blacklist
+sizes. Using a large list (say, 87M terms) will slow down the initial server
+boot time (to about 15 minutes), but once loaded lookups are very
+speedy.
 
 To install in ejabberd:
 
@@ -25,11 +27,31 @@ modules:
       en: /home/your_user/blacklist_en.txt
       cn: /home/your_user/blacklist_cn.txt
       fr: /home/your_user/blacklist_fr.txt
+    charmaps:
+      default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
+      en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
 
 For each language (en,cn,fr,...whatever) provide a full path to a blacklist file.
 The blacklist file is a plain text file with blacklisted words listed one per
 line.
 
+You can also provide an optional 'charmap' for each language. This allows you
+to specify simple substitutions that will be made on the fly so you don't need
+to include those permutations in the blacklist. This keeps the blacklist small
+and reduces server startup time. For example, if you included the word
+'xyza' in the blacklist, adding the following substitutions in the charmap
+would filter permutations such as 'XYZA', 'xYz4', or 'Xyz@' automatically.
+
+charmap format:
+
+[
+  {"X", "x"},
+  {"Y", "y"},
+  {"Z", "z"},
+  {"@", "a"},
+  {"4", "a"}
+].
+
 Gotchas:
 
 The language will be looked up by whatever value is passed in the xml:lang
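
To make the charmap behaviour concrete, here is a minimal shell sketch, assuming the normalize_leet module introduced later in this PR and the example substitutions above:

````
%% Illustrative only; at runtime the map is built from the configured charmap file.
CharMap = maps:from_list([{"X", "x"}, {"Y", "y"}, {"Z", "z"}, {"@", "a"}, {"4", "a"}]),
normalize_leet:normalize(CharMap, "Xyz@").
%% => "xyza", which is what gets checked against the blacklist instead of the raw token
````
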
@@ -40,13 +62,11 @@ the 'default' entry in config will be used.
 For xml:lang attribute docs, see:
 http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message
 
-The internal bloomfilter used to ingest the blacklists currently requires about
-4,000 entries in the blacklist to ensure acceptable error probability. (We've
-gotten around this by duplicating entries in a short list)
-
-Todo:
-
-Look into acceptable error probabilities for shorter blacklists.
+Blacklist helper
+
+Thinking of a bunch of swear words and all the permutations can be tough. We made
+a helper script to take a bare wordlist and generate permutations given a
+dictionary of substitution characters: https://github.com/madglory/permute_wordlist
 
 Tip of the hat:
 
@@ -1,7 +1,8 @@
 modules:
   mod_pottymouth:
     blacklists:
-      default: /home/vagrant/blacklist_en.txt
-      en: /home/vagrant/blacklist_en.txt
-      cn: /home/vagrant/blacklist_cn.txt
-      fr: /home/vagrant/blacklist_fr.txt
+      default: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt
+      en: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt
+    charmaps:
+      default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
+      en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
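
For reference, the charmap paths above point at plain Erlang term files read via file:consult/1 (see normalize_leet_gen_server below); a hypothetical charmap_en.txt mirroring the README example would contain:

````
[
  {"X", "x"},
  {"Y", "y"},
  {"Z", "z"},
  {"@", "a"},
  {"4", "a"}
].
````
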
@@ -1,5 +1,5 @@
 author: "Tom Quackenbush <tom at madglory.com>"
 category: "data"
 summary: "Filter bad words in messages"
-home: "https://github.com/madglory/mod_pottymouth/tree/master"
-url: "git@github.com:madglory/mod_pottymouth.git"
+home: "https://github.com/processone/ejabberd-contrib/tree/master/"
+url: "git@github.com:processone/ejabberd-contrib.git"
@@ -5,12 +5,11 @@
 -include("logger.hrl").
 
 -import(etbloom, [bloom/1, member/2]).
--export([start/1]).
+-export([member/1]).
 
 %% gen_server callbacks
--export([init/1, handle_call/3, handle_cast/2, handle_info/2,
+-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2,
   terminate/2, code_change/3]).
--compile(export_all).
 
 serverName(Lang) ->
   list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])).

@@ -21,11 +20,11 @@ member({Lang, Word} = _MessageToken) ->
 start({Lang, BlacklistFile} = _Opts) ->
   gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []).
 
-stop(_Host) ->
+stop() ->
   ok.
 
 init([BlacklistFile]) ->
-  ?INFO_MSG("Building bloom", []),
+  ?INFO_MSG("Building bloom ~p~n", [BlacklistFile]),
   Bloom = etbloom:sbf(10000000),
   {ok, loadWordList(Bloom, BlacklistFile)}.
 
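
Each blacklist runs as its own registered bloom server named after the language (bloom_gen_server_en, bloom_gen_server_cn, ...). A hedged usage sketch, with an example path and word:

````
%% Assumes the 'en' blacklist file exists and lists the word "badword".
bloom_gen_server:start({en, "/etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt"}),
bloom_gen_server:member({en, "badword"}).
%% => true if the word is in the loaded bloom filter, false otherwise
````
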
@@ -8,12 +8,14 @@
   start/2,
   stop/1,
   on_filter_packet/1,
-  mod_opt_type/1
+  mod_opt_type/1,
+  depends/2
 ]).
 
 -include("ejabberd.hrl").
 
 -import(bloom_gen_server, [start/0, stop/0, member/1]).
+-import(normalize_leet_gen_server, [normalize/1]).
 
 getMessageLang(Attrs) ->
   LangAttr = lists:keyfind(<<"lang">>, 1, Attrs),
@@ -26,8 +28,11 @@ getMessageLang(Attrs) ->
   end,
   Lang.
 
-censorWord({_Lang, Word} = MessageTerm) ->
-  IsBadWord = bloom_gen_server:member(MessageTerm),
+censorWord({Lang, Word} = _MessageTerm) ->
+  % we need unicode characters to normalize the word
+  NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}),
+  % we need bytewise format for bloom lookup
+  IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}),
   if
     IsBadWord ->
       "****";
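
A sketch of the representation round-trip the new censorWord/1 performs; the language atom and token are placeholders:

````
Word = "Xyz@",                                                    %% bytewise list, as produced by the tokenizer
Unicode = unicode:characters_to_list(list_to_binary(Word)),       %% codepoints for the charmap substitution
Normalized = normalize_leet_gen_server:normalize({en, Unicode}),  %% e.g. "xyza" with the README charmap;
                                                                  %% falls back to the raw word if no 'en' server runs
Bytewise = binary_to_list(unicode:characters_to_binary(Normalized)),
bloom_gen_server:member({en, Bytewise}).                          %% bloom lookup expects the bytewise form
````
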
@@ -38,30 +43,53 @@ censorWord({_Lang, Word} = MessageTerm) ->
 filterWords(L) ->
   lists:map(fun censorWord/1, L).
 
+filterMessageText(MessageAttrs, MessageText) ->
+  Lang = getMessageLang(MessageAttrs),
+  % we want to token-ize utf8 'words'
+  MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "),
+  MessageTerms = [{Lang, Word} || Word <- MessageWords],
+  % we get back bytewise format terms (rather than utf8)
+  string:join(filterWords(MessageTerms), " ").
+
+filterMessageBodyElements([{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}]} = _H|T], MessageElements) ->
+  FilteredMessageWords = binary:list_to_bin(filterMessageText(BodyAttr, binary:bin_to_list(MessageText))),
+  FilteredBody = {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]},
+  filterMessageBodyElements(T, lists:append(MessageElements, [FilteredBody]));
+
+filterMessageBodyElements([H|T], MessageElements) ->
+  % skip this tag, but pass it on as processed
+  filterMessageBodyElements(T, lists:append(MessageElements, [H]));
+
+filterMessageBodyElements([], MessageElements) ->
+  MessageElements.
+
 start(_Host, Opts) ->
   Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []),
   lists:map(fun bloom_gen_server:start/1, Blacklists),
+  CharMaps = gen_mod:get_opt(charmaps, Opts, fun(A) -> A end, []),
+  lists:map(fun normalize_leet_gen_server:start/1, CharMaps),
   ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0),
   ok.
 
 stop(_Host) ->
   bloom_gen_server:stop(),
+  normalize_leet_gen_server:stop(),
   ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0),
   ok.
 
 on_filter_packet(drop) ->
   drop;
 
-on_filter_packet({_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) ->
-  Lang = getMessageLang(Attrs),
-  MessageWords = string:tokens(binary_to_list(MessageText), " "),
-  MessageTerms = [{Lang, Word} || Word <- MessageWords],
-  FilteredMessageWords = list_to_binary(string:join(filterWords(MessageTerms), " ")),
-  {_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}};
+on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, Els} = _Packet} = _Msg) ->
+  FilteredEls = filterMessageBodyElements(Els, []),
+  {_From, _To, {xmlel, <<"message">>, _Attrs, FilteredEls}};
 
 on_filter_packet(Msg) ->
   % Handle the generic case (any packet that isn't a message with a body).
   Msg.
 
 mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end;
-mod_opt_type(_) -> [blacklists].
+mod_opt_type(charmaps) -> fun (A) when is_list(A) -> A end;
+mod_opt_type(_) -> [blacklists, charmaps].
+
+depends(_Host, _Opts) -> [].

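To illustrate the MUC fix: the old on_filter_packet/1 clause only matched a message whose children were exactly one chat-state element followed by the body, so any other shape slipped through unfiltered. The new clause hands every child to filterMessageBodyElements/2, which rewrites <body/> elements and passes everything else along. A sketch with made-up elements, assuming "badword" is in the 'en' blacklist and the 'en' servers are running:

````
%% Internal sketch (filterMessageBodyElements/2 is not exported from the module).
Els = [{xmlel, <<"active">>, [], []},
       {xmlel, <<"body">>, [{<<"lang">>, <<"en">>}], [{xmlcdata, <<"a badword here">>}]}],
filterMessageBodyElements(Els, []).
%% => [{xmlel, <<"active">>, [], []},
%%     {xmlel, <<"body">>, [{<<"lang">>, <<"en">>}], [{xmlcdata, <<"a **** here">>}]}]
````
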
@@ -0,0 +1,40 @@
+%% -*- coding: utf-8 -*-
+
+-module(normalize_leet).
+
+-export([
+  normalize/2
+]).
+
+distinctLetters([H|T]) ->
+  distinctLetters(T, [H]).
+
+distinctLetters([H|T], Letters) ->
+  distinctLetters(T, lists:umerge(Letters, [H]));
+distinctLetters([], Letters) ->
+  Letters.
+
+checkMetaChar(Char) ->
+  MetaChars = ["\\", "^", "$", ".", "|", "?", "*", "+", "(", ")", "[", "{"],
+  lists:member(Char, MetaChars).
+
+replaceChar(true, Char, X, Word) ->
+  re:replace(Word, ["\\", Char], X, [unicode,global,{return,list}]);
+replaceChar(false, Char, X, Word) ->
+  re:replace(Word, Char, X, [unicode,global,{return,list}]).
+
+replaceLetters([H|T], CharMap, Word) ->
+  CurChar = [H],
+  NormChar = maps:get(CurChar, CharMap, skip),
+  if
+    NormChar == skip ->
+      replaceLetters(T, CharMap, Word);
+    true ->
+      IsMetaChar = checkMetaChar(CurChar),
+      replaceLetters(T, CharMap, replaceChar(IsMetaChar, CurChar, NormChar, Word))
+  end;
+replaceLetters([], _CharMap, Word) ->
+  Word.
+
+normalize(CharMap, Word) ->
+  replaceLetters(distinctLetters(Word), CharMap, Word).
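
A quick sketch of the metacharacter handling in normalize_leet above, using a hypothetical mapping that includes a regex metacharacter:

````
CharMap = #{"$" => "s", "0" => "o"},
normalize_leet:normalize(CharMap, "c00l ca$h").
%% checkMetaChar/1 flags "$", so replaceChar/4 escapes it before calling re:replace/4;
%% the result is "cool cash"
````
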
@@ -0,0 +1,46 @@
+-module(normalize_leet_gen_server).
+
+-behaviour(gen_server).
+
+-include("logger.hrl").
+
+-import(normalize_leet, [normalize/2]).
+-export([normalize/1]).
+
+%% gen_server callbacks
+-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2,
+  terminate/2, code_change/3]).
+
+serverName(Lang) ->
+  list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])).
+
+normalize({Lang, Word} = _MessageToken) ->
+  try gen_server:call(serverName(Lang), {normalize, Word})
+  catch
+    exit:{noproc, _Reason} -> Word
+  end.
+
+start({Lang, CharMapFile} = _Opts) ->
+  gen_server:start_link({local, serverName(Lang)}, ?MODULE, [CharMapFile], []).
+
+stop() ->
+  ok.
+
+init([CharMapFile]) ->
+  ?INFO_MSG("NormalizeLeet Loading: ~p~n", [CharMapFile]),
+  {ok, loadCharMapConfig(file:consult(CharMapFile))}.
+
+loadCharMapConfig({ok, [CharMapConfig]}) ->
+  maps:from_list(CharMapConfig);
+loadCharMapConfig({error, Reason}) ->
+  ?INFO_MSG("NormalizeLeet Error: ~p~n", [Reason]),
+  maps:new().
+
+handle_call({normalize, Word}, _From, CharMap) ->
+  Reply = normalize_leet:normalize(CharMap, Word),
+  {reply, Reply, CharMap}.
+
+handle_cast(_Msg, State) -> {noreply, State}.
+handle_info(_Info, State) -> {noreply, State}.
+terminate(_Reason, _State) -> ok.
+code_change(_OldVsn, State, _Extra) -> {ok, State}.
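
Finally, a usage sketch for the per-language charmap servers: mod_pottymouth:start/2 starts one per 'charmaps' entry, and normalize/1 falls back to the untouched word when no server exists for the requested language (the path below is an example):

````
normalize_leet_gen_server:start({en, "/etc/ejabberd/modules/mod_pottymouth/charmap_en.txt"}),
normalize_leet_gen_server:normalize({en, "Xyz@"}),  %% => "xyza" with the README charmap
normalize_leet_gen_server:normalize({fr, "Xyz@"}).  %% no 'fr' server: exit {noproc,_} is caught, word returned as-is
````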