code cleanup. add optional normalize_leet to do dynamic substitution of one-to-one mappings (reduce blacklist size).

This commit is contained in:
Tom Quackenbush 2016-07-19 21:34:04 +00:00
parent 779024b9c5
commit 612b2b78e9
6 changed files with 105 additions and 18 deletions

View File

@ -1,7 +1,8 @@
modules:
mod_pottymouth:
blacklists:
default: /home/vagrant/blacklist_en.txt
en: /home/vagrant/blacklist_en.txt
cn: /home/vagrant/blacklist_cn.txt
fr: /home/vagrant/blacklist_fr.txt
default: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt
en: /etc/ejabberd/modules/mod_pottymouth/blacklist_en.txt
charmaps:
default: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt
en: /etc/ejabberd/modules/mod_pottymouth/charmap_en.txt

View File

@ -1,5 +1,5 @@
author: "Tom Quackenbush <tom at madglory.com>"
category: "data"
summary: "Filter bad words in messages"
home: "https://github.com/madglory/mod_pottymouth/tree/master"
url: "git@github.com:madglory/mod_pottymouth.git"
home: "https://github.com/processone/ejabberd-contrib/tree/master/"
url: "git@github.com:processone/ejabberd-contrib.git"

View File

@ -5,12 +5,11 @@
-include("logger.hrl").
-import(etbloom, [bloom/1, member/2]).
-export([start/1]).
-export([member/1]).
%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
-compile(export_all).
%% Builds the registered process name for a language: the module name,
%% an underscore, then the language atom, returned as a single atom.
serverName(Lang) ->
    Prefix = atom_to_list(?MODULE),
    Suffix = atom_to_list(Lang),
    list_to_atom(Prefix ++ "_" ++ Suffix).
@ -21,11 +20,11 @@ member({Lang, Word} = _MessageToken) ->
%% Starts and links a gen_server for one {Lang, BlacklistFile} option.
%% The process is registered locally under serverName(Lang) and receives
%% [BlacklistFile] as its init argument.
start({Lang, BlacklistFile}) ->
    RegisteredName = {local, serverName(Lang)},
    gen_server:start_link(RegisteredName, ?MODULE, [BlacklistFile], []).
stop(_Host) ->
stop() ->
ok.
init([BlacklistFile]) ->
?INFO_MSG("Building bloom", []),
?INFO_MSG("Building bloom ~p~n", [BlacklistFile]),
Bloom = etbloom:sbf(10000000),
{ok, loadWordList(Bloom, BlacklistFile)}.

View File

@ -2,8 +2,6 @@
-behaviour(gen_mod).
-include("logger.hrl").
-export([
start/2,
stop/1,
@ -14,6 +12,7 @@
-include("ejabberd.hrl").
-import(bloom_gen_server, [start/0, stop/0, member/1]).
-import(normalize_leet_gen_server, [normalize/1]).
getMessageLang(Attrs) ->
LangAttr = lists:keyfind(<<"lang">>, 1, Attrs),
@ -26,8 +25,8 @@ getMessageLang(Attrs) ->
end,
Lang.
censorWord({_Lang, Word} = MessageTerm) ->
IsBadWord = bloom_gen_server:member(MessageTerm),
censorWord({Lang, Word} = MessageTerm) ->
IsBadWord = bloom_gen_server:member({Lang, normalize_leet_gen_server:normalize(MessageTerm)}),
if
IsBadWord ->
"****";
@ -41,11 +40,14 @@ filterWords(L) ->
%% gen_mod start callback.
%% Reads the `blacklists` and `charmaps` options (each defaulting to []),
%% calls bloom_gen_server:start/1 for every blacklist entry and
%% normalize_leet_gen_server:start/1 for every charmap entry, then adds
%% on_filter_packet to the global filter_packet hook. Always returns ok;
%% the results of the individual start calls are discarded.
start(_Host, Opts) ->
    Identity = fun(A) -> A end,
    Blacklists = gen_mod:get_opt(blacklists, Opts, Identity, []),
    lists:foreach(fun bloom_gen_server:start/1, Blacklists),
    CharMaps = gen_mod:get_opt(charmaps, Opts, Identity, []),
    lists:foreach(fun normalize_leet_gen_server:start/1, CharMaps),
    ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0),
    ok.
%% gen_mod stop callback.
%% Calls stop/0 on both helper server modules, then removes the
%% on_filter_packet handler from the global filter_packet hook.
%% The host argument is ignored. Always returns ok.
stop(_Host) ->
bloom_gen_server:stop(),
normalize_leet_gen_server:stop(),
ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0),
ok.
@ -54,9 +56,9 @@ on_filter_packet(drop) ->
on_filter_packet({_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) ->
Lang = getMessageLang(Attrs),
MessageWords = string:tokens(binary_to_list(MessageText), " "),
MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "),
MessageTerms = [{Lang, Word} || Word <- MessageWords],
FilteredMessageWords = list_to_binary(string:join(filterWords(MessageTerms), " ")),
FilteredMessageWords = unicode:characters_to_binary(string:join(filterWords(MessageTerms), " ")),
{_From, _To, {xmlel, <<"message">>, Attrs, [_chatState, {xmlel, <<"body">>, _BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}};
on_filter_packet(Msg) ->
@ -64,4 +66,5 @@ on_filter_packet(Msg) ->
Msg.
mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end;
mod_opt_type(_) -> [blacklists].
mod_opt_type(charmaps) -> fun (A) when is_list(A) -> A end;
mod_opt_type(_) -> [blacklists, charmaps].

View File

@ -0,0 +1,38 @@
-module(normalize_leet).
-export([
normalize/2
]).
%% Returns the distinct characters of Word as a sorted list.
%% An empty Word yields [] rather than raising function_clause, so
%% normalizing an empty string is a no-op instead of a crash.
distinctLetters(Word) ->
    lists:usort(Word).
%% True when Char (a one-character string) is one of the regular-expression
%% metacharacters \ ^ $ . | ? * + ( ) [ {, false for anything else.
checkMetaChar(Char) ->
    lists:member(Char, [[C] || C <- "\\^$.|?*+()[{"]).
%% Replaces every occurrence of Char in Word with X and returns the result
%% as a character list.
%%
%% The first argument says whether Char is a regex metacharacter; when true
%% it is prefixed with a backslash so that it is matched literally.
%%
%% The `unicode` option makes re:replace/4 treat Word, Char and X as Unicode
%% character data. Without it, a Word holding any code point above 255
%% is not valid iodata and the call fails with badarg.
replaceChar(true, Char, X, Word) ->
    re:replace(Word, ["\\", Char], X, [global, unicode, {return, list}]);
replaceChar(false, Char, X, Word) ->
    re:replace(Word, Char, X, [global, unicode, {return, list}]).
%% Walks the given characters in order. For each one that has an entry in
%% CharMap (keys are one-character strings), every occurrence in the word
%% built so far is replaced by the mapped value via replaceChar/4.
%% Characters without an entry leave the word untouched.
%% Returns the word after all replacements.
replaceLetters(Letters, CharMap, Word) ->
    Substitute =
        fun(Letter, Acc) ->
            Key = [Letter],
            case maps:get(Key, CharMap, skip) of
                skip ->
                    Acc;
                Replacement ->
                    Escape = checkMetaChar(Key),
                    replaceChar(Escape, Key, Replacement, Acc)
            end
        end,
    lists:foldl(Substitute, Word, Letters).
%% Normalizes Word: each distinct character of Word that has an entry in
%% CharMap is replaced throughout the word by its mapped value.
normalize(CharMap, Word) ->
    Letters = distinctLetters(Word),
    replaceLetters(Letters, CharMap, Word).

View File

@ -0,0 +1,46 @@
-module(normalize_leet_gen_server).
-behaviour(gen_server).
-include("logger.hrl").
-import(normalize_leet, [normalize/2]).
-export([normalize/1]).
%% gen_server callbacks
-export([start/1, stop/0, init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
%% Builds the registered process name for a language: the module name,
%% an underscore, then the language atom, returned as a single atom.
serverName(Lang) ->
    Prefix = atom_to_list(?MODULE),
    Suffix = atom_to_list(Lang),
    list_to_atom(Prefix ++ "_" ++ Suffix).
%% Asks the server registered for Lang to normalize Word and returns its
%% reply. If no such process exists (the call exits with noproc), Word is
%% returned unchanged. Any other exit or error propagates to the caller.
normalize({Lang, Word}) ->
    Server = serverName(Lang),
    try gen_server:call(Server, {normalize, Word}) of
        Normalized -> Normalized
    catch
        exit:{noproc, _} -> Word
    end.
%% Starts and links a gen_server for one {Lang, CharMapFile} option.
%% The process is registered locally under serverName(Lang) and receives
%% [CharMapFile] as its init argument.
start({Lang, CharMapFile}) ->
    RegisteredName = {local, serverName(Lang)},
    gen_server:start_link(RegisteredName, ?MODULE, [CharMapFile], []).
%% Always returns ok; no server process is contacted here.
%% NOTE(review): processes started by start/1 are presumably left running
%% after this call -- confirm whether that is intended.
stop() ->
ok.
%% gen_server init callback. Logs the charmap file name, reads the file
%% with file:consult/1 and uses the map built by loadCharMapConfig/1 as
%% the server state.
init([CharMapFile]) ->
    ?INFO_MSG("NormalizeLeet Loading: ~p~n", [CharMapFile]),
    Consulted = file:consult(CharMapFile),
    CharMap = loadCharMapConfig(Consulted),
    {ok, CharMap}.
%% Turns the result of file:consult/1 into the character map.
%%
%% On success the consulted terms are flattened before being handed to
%% maps:from_list/1. This accepts the usual layout (a single term that is
%% a list of {From, To} tuples), a file with one tuple per term, and an
%% empty file (which yields an empty map), instead of crashing on anything
%% but exactly one term.
%%
%% On a read or parse error the reason is logged and an empty map is
%% returned, so normalization becomes a no-op.
loadCharMapConfig({ok, Terms}) ->
    maps:from_list(lists:flatten(Terms));
loadCharMapConfig({error, Reason}) ->
    ?INFO_MSG("NormalizeLeet Error: ~p~n", [Reason]),
    maps:new().
%% Handles a {normalize, Word} call: replies with the result of
%% normalize_leet:normalize/2 applied to the server's character map and
%% Word. The state is left unchanged.
handle_call({normalize, Word}, _From, CharMap) ->
    {reply, normalize_leet:normalize(CharMap, Word), CharMap}.
%% Casts are ignored; the state is kept as is.
handle_cast(_Request, CharMap) ->
    {noreply, CharMap}.

%% Any other message is ignored; the state is kept as is.
handle_info(_Message, CharMap) ->
    {noreply, CharMap}.

%% Nothing to clean up on termination.
terminate(_Why, _CharMap) ->
    ok.

%% Code upgrades keep the state as is.
code_change(_FromVsn, CharMap, _Extra) ->
    {ok, CharMap}.