Use banword instead of bloom + etbloom + bitarray (thanks to ytkang)
This commit is contained in:
parent
2aea4659cc
commit
00e3fa97fc
|
@ -6,24 +6,6 @@ sizes. Using a large list (say, 87M terms) will slow down the initial server
|
||||||
boot time (to about 15 minutes respectively), but once loaded lookups are very
|
boot time (to about 15 minutes respectively), but once loaded lookups are very
|
||||||
speedy.
|
speedy.
|
||||||
|
|
||||||
Prerequisite bitarray lib:
|
|
||||||
|
|
||||||
mod_pottymouth uses a modified version of the 'etbloom' library that uses
|
|
||||||
'bitarray' to replace 'hipe_bifs'. Ejabberd doesn't handle installing
|
|
||||||
dependencies of dependencies quite so well (etbloom being a dep of mod_pottymouth
|
|
||||||
and bitarray being a dep of etbloom), so bitarray needs to be installed manually
|
|
||||||
before installation of mod_pottymouth.
|
|
||||||
|
|
||||||
This is how I got it to work... YMMV.
|
|
||||||
|
|
||||||
1. Make sure ejabberd is running
|
|
||||||
2. Get the updated ejabberd-contrib sources:
|
|
||||||
ejabberdctl modules_update_specs
|
|
||||||
3. Execute this script to get and build the dependencies:
|
|
||||||
~/.ejabberd-modules/sources/ejabberd-contrib/mod_pottymouth/make-deps.sh
|
|
||||||
4. When you install the module, its dependencies will be installed as well:
|
|
||||||
ejabberdctl module_install mod_pottymouth
|
|
||||||
|
|
||||||
Configuration file is ~/.ejabberd-modules/mod_pottymouth/conf/mod_pottymouth.yml
|
Configuration file is ~/.ejabberd-modules/mod_pottymouth/conf/mod_pottymouth.yml
|
||||||
|
|
||||||
modules:
|
modules:
|
||||||
|
@ -73,8 +55,3 @@ Blacklist helper
|
||||||
Thinking of a bunch of swear words and all the permutations can be tough. We made
|
Thinking of a bunch of swear words and all the permutations can be tough. We made
|
||||||
a helper script to take a bare wordlist and generate permutations given a
|
a helper script to take a bare wordlist and generate permutations given a
|
||||||
dictionary of substitution characters: https://github.com/madglory/permute_wordlist
|
dictionary of substitution characters: https://github.com/madglory/permute_wordlist
|
||||||
|
|
||||||
Tip of the hat:
|
|
||||||
|
|
||||||
This mod makes use of the excellent 'etbloom' module:
|
|
||||||
https://github.com/erlangtoolbox/etbloom
|
|
||||||
|
|
|
@ -1,18 +0,0 @@
|
||||||
#!/bin/sh
# Fetch and build the etbloom/bitarray dependencies of mod_pottymouth and copy
# the resulting ebin/priv directories into the module source directory.

# Abort on the first failing command instead of continuing with a half-built tree.
set -e

# Remember where we started. Do not call this variable PWD: the shell maintains
# PWD itself and rewrites it on every `cd`, so it could never be used to go back.
ORIG_DIR=$(pwd)

cd ~/.ejabberd-modules/sources/ejabberd-contrib/mod_pottymouth/
# -p: do not fail when the script is run a second time.
mkdir -p deps
cd deps
# Reuse an existing checkout on re-runs; `git clone` refuses a non-empty target.
[ -d etbloom ] || git clone https://github.com/madglory/etbloom.git
cd etbloom
rm -rf deps/
./rebar get-deps
# Replace bitarray's rebar.config with the one shipped alongside this script.
cp ../../rebar.config.bitarray deps/bitarray/rebar.config
./rebar compile
cp -R deps/bitarray/ebin ../../
cp -R deps/bitarray/priv ../../
cp -R deps/proper/ebin ../../

cd "$ORIG_DIR"
|
|
|
@ -1,14 +0,0 @@
|
||||||
{ct_extra_params, "-env ERL_LIBS deps/"}.
|
|
||||||
{port_env,
|
|
||||||
[ {"DRV_CFLAGS", "-fPIC -O2 $ERL_CFLAGS"},
|
|
||||||
{"ERL_LDFLAGS", " -L$ERL_EI_LIBDIR -lei"}
|
|
||||||
]}.
|
|
||||||
{port_specs, [{"priv/bitarray.so", ["c_src/bitarray_nif.c"]}]}.
|
|
||||||
|
|
||||||
|
|
||||||
{deps, [
|
|
||||||
%% For runtime
|
|
||||||
%% --
|
|
||||||
%% For tests
|
|
||||||
{proper, "1.2",
|
|
||||||
{git, "https://github.com/manopapad/proper.git", {tag, "v1.2"}}}]}.
|
|
|
@ -0,0 +1,77 @@
|
||||||
|
-module(banword_gen_server).
|
||||||
|
|
||||||
|
-behaviour(gen_server).
|
||||||
|
|
||||||
|
-include("logger.hrl").
|
||||||
|
|
||||||
|
-export([member/1]).
|
||||||
|
|
||||||
|
%% gen_server callbacks
|
||||||
|
-export([start/1, stop/0, stop/1, init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

%% Build the registered process name for a language: the module name, an
%% underscore and the language atom (e.g. Lang = en gives <module>_en).
serverName(Lang) ->
    list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])).

%% Ask the blacklist server registered for Lang whether Word is banned.
%% Returns the reply of the {member, Word} call (true | false).
member({Lang, Word} = _MessageToken) ->
    gen_server:call(serverName(Lang), {member, Word}).

%% Start and link the blacklist server for one {Lang, BlacklistFile} entry,
%% registering it locally under serverName(Lang).
start({Lang, BlacklistFile} = _Opts) ->
    Name = serverName(Lang),
    ?INFO_MSG("Building blacklist name ~p~n", [Name]),
    gen_server:start_link({local, Name}, ?MODULE, [BlacklistFile], []).

%% Kept for backward compatibility with the previously exported stop/0; it
%% does not stop anything.
stop() ->
    ok.

%% Stop the blacklist server of one {Lang, BlacklistFile} entry.
%% mod_pottymouth stops its servers with
%% lists:map(fun banword_gen_server:stop/1, Blacklists), so this arity must
%% exist and be exported; without it module stop fails with undef and the
%% servers are never stopped.
stop({Lang, _BlacklistFile} = _Opts) ->
    gen_server:stop(serverName(Lang)).
|
||||||
|
|
||||||
|
%% Read FileName and return its non-empty lines as a list of byte lists.
%%
%% Lines may end in "\n" or "\r\n"; the terminator is never part of a returned
%% word. Empty lines (including the one produced by a trailing newline) are
%% dropped: an empty entry can never match anything and a leftover "\r" would
%% stop a word from ever matching.
%%
%% Crashes with a badmatch if the file cannot be read.
readlines(FileName) ->
    {ok, Data} = file:read_file(FileName),
    Lines = binary:split(Data, [<<"\r\n">>, <<"\n">>], [global, trim_all]),
    [binary_to_list(Line) || Line <- Lines].
|
||||||
|
|
||||||
|
%% gen_server init callback. The server state is the word list returned by
%% loadWordList/1 for BlacklistFile.
init([BlacklistFile]) ->
    ?INFO_MSG("Building blacklist ~p~n", [BlacklistFile]),
    WordList = loadWordList(BlacklistFile),
    {ok, WordList}.
|
||||||
|
|
||||||
|
%% Return true when BlackWord occurs as a contiguous sub-list of Word,
%% false otherwise. Both arguments are lists compared element by element.
%% An empty BlackWord never matches, and neither does an empty Word.
%%
%% The previous version wrapped string:rstr/2 in `try ... catch _ -> false`;
%% that pattern only catches throws, which the call never raises, so the
%% handler was dead code. The scan below stops at the first occurrence instead
%% of always walking the whole word looking for the last one.
check_banword(_Word, []) ->
    false;
check_banword(Word, BlackWord) ->
    contains_sublist(Word, BlackWord).

%% True when Sub is a prefix of List or of any of its tails.
contains_sublist([], _Sub) ->
    false;
contains_sublist([_ | Tail] = List, Sub) ->
    lists:prefix(Sub, List) orelse contains_sublist(Tail, Sub).
|
||||||
|
|
||||||
|
%% Handle {member, Word}: reply true when at least one entry of the blacklist
%% (the server state) is reported by check_banword/2 as occurring in Word,
%% false otherwise. The state is returned unchanged.
%%
%% lists:any/2 stops at the first hit, so no throw is needed to leave the
%% traversal early.
handle_call({member, Word}, _From, BlackList) ->
    IsBanned = lists:any(fun(BlackWord) -> check_banword(Word, BlackWord) end,
                         BlackList),
    {reply, IsBanned, BlackList}.
|
||||||
|
|
||||||
|
%% Any cast is ignored; the state is kept as is.
handle_cast(_Request, WordList) ->
    {noreply, WordList}.

%% Any other message is ignored; the state is kept as is.
handle_info(_Message, WordList) ->
    {noreply, WordList}.

%% Nothing to clean up on termination.
terminate(_Why, _WordList) ->
    ok.

%% No state conversion on code upgrade.
code_change(_FromVsn, WordList, _Extra) ->
    {ok, WordList}.
|
||||||
|
|
||||||
|
%% Load the blacklist words from BlacklistFile via readlines/1.
%%
%% When BlacklistFile is not a regular file an error is logged and the empty
%% list is returned. filelib:is_regular/1 is used rather than filelib:is_file/1
%% because the latter is also true for directories, and a directory path would
%% then crash readlines/1 (badmatch on file:read_file/1) instead of taking the
%% logged branch.
loadWordList(BlacklistFile) ->
    case filelib:is_regular(BlacklistFile) of
        true ->
            readlines(BlacklistFile);
        false ->
            ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]),
            []
    end.
|
|
@ -1,54 +0,0 @@
|
||||||
-module(bloom_gen_server).
|
|
||||||
|
|
||||||
-behaviour(gen_server).
|
|
||||||
|
|
||||||
-include("logger.hrl").
|
|
||||||
|
|
||||||
-import(etbloom, [bloom/1, member/2]).
|
|
||||||
-export([member/1]).
|
|
||||||
|
|
||||||
%% gen_server callbacks
|
|
||||||
-export([start/1, stop/1, init/1, handle_call/3, handle_cast/2, handle_info/2,
|
|
||||||
terminate/2, code_change/3]).
|
|
||||||
|
|
||||||
%% Registered process name for a language: module name, "_" and the language
%% atom joined into one atom.
serverName(Lang) ->
    Prefix = atom_to_list(?MODULE),
    Suffix = atom_to_list(Lang),
    list_to_atom(Prefix ++ "_" ++ Suffix).
|
|
||||||
|
|
||||||
%% Ask the server registered for Lang whether Word is a member of its filter.
member({Lang, Word}) ->
    Server = serverName(Lang),
    gen_server:call(Server, {member, Word}).
|
|
||||||
|
|
||||||
%% Start and link the server for one {Lang, BlacklistFile} entry, registered
%% locally under serverName(Lang).
start({Lang, BlacklistFile}) ->
    Registration = {local, serverName(Lang)},
    gen_server:start_link(Registration, ?MODULE, [BlacklistFile], []).
|
|
||||||
|
|
||||||
%% Stop the server registered for the language of a {Lang, BlacklistFile} entry.
stop({Lang, _BlacklistFile}) ->
    Server = serverName(Lang),
    gen_server:stop(Server).
|
|
||||||
|
|
||||||
%% gen_server init callback. Creates a filter with etbloom:sbf(10000000) and
%% fills it from BlacklistFile; the filled filter is the server state.
init([BlacklistFile]) ->
    ?INFO_MSG("Building bloom ~p~n", [BlacklistFile]),
    EmptyFilter = etbloom:sbf(10000000),
    Filter = loadWordList(EmptyFilter, BlacklistFile),
    {ok, Filter}.
|
|
||||||
|
|
||||||
%% Handle {member, Word}: reply with the result of etbloom:member/2 for Word
%% against the filter held as state. The state is returned unchanged.
handle_call({member, Word}, _From, Filter) ->
    {reply, etbloom:member(Word, Filter), Filter}.
|
|
||||||
|
|
||||||
%% Any cast is ignored; the state is kept as is.
handle_cast(_Request, Filter) ->
    {noreply, Filter}.

%% Any other message is ignored; the state is kept as is.
handle_info(_Message, Filter) ->
    {noreply, Filter}.

%% Nothing to clean up on termination.
terminate(_Why, _Filter) ->
    ok.

%% No state conversion on code upgrade.
code_change(_FromVsn, Filter, _Extra) ->
    {ok, Filter}.
|
|
||||||
|
|
||||||
%% Add every line of BlacklistFile to Bloom and return the resulting filter.
%%
%% When BlacklistFile does not exist an error is logged and Bloom is returned
%% unchanged. The file is closed once loading finishes or fails; it was
%% previously left open.
loadWordList(Bloom, BlacklistFile) ->
    case filelib:is_file(BlacklistFile) of
        true ->
            {ok, S} = file:open(BlacklistFile, read),
            try
                loadWordList(io:get_line(S, ''), Bloom, S)
            after
                file:close(S)
            end;
        false ->
            ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]),
            Bloom
    end.

%% Fold the lines read from S into Bloom until eof.
loadWordList(eof, Bloom, _S) ->
    Bloom;
loadWordList({error, Reason}, _Bloom, _S) ->
    %% io:get_line/2 can report a read error; fail with that reason instead of
    %% passing the error tuple on as if it were a line.
    error({blacklist_read_failed, Reason});
loadWordList(Line, Bloom, S) ->
    %% Only strip a newline that is really there: the last line of a file
    %% without a trailing newline must keep its final character.
    Word = case lists:suffix("\n", Line) of
               true -> lists:droplast(Line);
               false -> Line
           end,
    loadWordList(io:get_line(S, ''), etbloom:add(Word, Bloom), S).
|
|
|
@ -15,7 +15,7 @@
|
||||||
mod_options/1
|
mod_options/1
|
||||||
]).
|
]).
|
||||||
|
|
||||||
-import(bloom_gen_server, [start/0, stop/0, member/1]).
|
-import(banword_gen_server, [start/0, stop/0, member/1]).
|
||||||
-import(nomalize_leet_gen_server, [normalize/1]).
|
-import(nomalize_leet_gen_server, [normalize/1]).
|
||||||
|
|
||||||
getMessageLang(Msg) ->
|
getMessageLang(Msg) ->
|
||||||
|
@ -31,8 +31,8 @@ getMessageLang(Msg) ->
|
||||||
censorWord({Lang, Word} = _MessageTerm) ->
|
censorWord({Lang, Word} = _MessageTerm) ->
|
||||||
% we need unicode characters to normlize the word
|
% we need unicode characters to normlize the word
|
||||||
NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}),
|
NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}),
|
||||||
% we need bytewise format for bloom lookup
|
% we need bytewise format for banword lookup
|
||||||
IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}),
|
IsBadWord = banword_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}),
|
||||||
if
|
if
|
||||||
IsBadWord ->
|
IsBadWord ->
|
||||||
"****";
|
"****";
|
||||||
|
@ -61,7 +61,7 @@ filterMessageText2(Lang, MessageText) ->
|
||||||
|
|
||||||
start(_Host, Opts) ->
|
start(_Host, Opts) ->
|
||||||
Blacklists = gen_mod:get_opt(blacklists, Opts),
|
Blacklists = gen_mod:get_opt(blacklists, Opts),
|
||||||
lists:map(fun bloom_gen_server:start/1, Blacklists),
|
lists:map(fun banword_gen_server:start/1, Blacklists),
|
||||||
CharMaps = gen_mod:get_opt(charmaps, Opts),
|
CharMaps = gen_mod:get_opt(charmaps, Opts),
|
||||||
lists:map(fun normalize_leet_gen_server:start/1, CharMaps),
|
lists:map(fun normalize_leet_gen_server:start/1, CharMaps),
|
||||||
ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0),
|
ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0),
|
||||||
|
@ -69,7 +69,7 @@ start(_Host, Opts) ->
|
||||||
|
|
||||||
stop(Host) ->
|
stop(Host) ->
|
||||||
Blacklists = gen_mod:get_module_opt(Host, ?MODULE, blacklists),
|
Blacklists = gen_mod:get_module_opt(Host, ?MODULE, blacklists),
|
||||||
lists:map(fun bloom_gen_server:stop/1, Blacklists),
|
lists:map(fun banword_gen_server:stop/1, Blacklists),
|
||||||
CharMaps = gen_mod:get_module_opt(Host, ?MODULE, charmaps),
|
CharMaps = gen_mod:get_module_opt(Host, ?MODULE, charmaps),
|
||||||
lists:map(fun normalize_leet_gen_server:stop/1, CharMaps),
|
lists:map(fun normalize_leet_gen_server:stop/1, CharMaps),
|
||||||
ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0),
|
ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0),
|
||||||
|
|
Loading…
Reference in New Issue