Use banword instead of bloom + etbloom + bitarray (thanks to ytkang)

This commit is contained in:
Badlop 2021-04-16 16:20:11 +02:00
parent 2aea4659cc
commit 00e3fa97fc
6 changed files with 82 additions and 114 deletions

View File

@ -6,24 +6,6 @@ sizes. Using a large list (say, 87M terms) will slow down the initial server
boot time (to about 15 minutes respectively), but once loaded lookups are very
speedy.
Prerequisite bitarray lib:
mod_pottymouth uses a modified version of the 'etbloom' library that uses
'bitarray' to replace 'hipe_bifs'. Ejabberd doesn't handle installing
dependencies of dependencies quite so well (etbloom being a dep of mod_pottymouth
and bitarray being a dep of etbloom), so bitarray needs to be installed manually
before installation of mod_pottymouth.
This is how I got it to work... YMMV.
1. Make sure ejabberd is running
2. Get the updated ejabberd-contrib sources:
ejabberdctl modules_update_specs
3. Execute this script to get and build the dependencies:
~/.ejabberd-modules/sources/ejabberd-contrib/mod_pottymouth/make-deps.sh
4. When you install the module, its dependencies will be installed as well:
ejabberdctl module_install mod_pottymouth
Configuration file is ~/.ejabberd-modules/mod_pottymouth/conf/mod_pottymouth.yml
modules:
@ -73,8 +55,3 @@ Blacklist helper
Thinking of a bunch of swear words and all the permutations can be tough. We made
a helper script to take a bare wordlist and generate permutations given a
dictionary of substitution characters: https://github.com/madglory/permute_wordlist
Tip of the hat:
This mod makes use of the excellent 'etbloom' module:
https://github.com/erlangtoolbox/etbloom

View File

@ -1,18 +0,0 @@
#!/bin/sh
# Fetch and build the etbloom dependency (and its own deps, bitarray and
# proper) for mod_pottymouth, then copy the built artifacts next to the module.
#
# Meant to be executed, not sourced: `set -e` aborts on the first failing step
# so a failed `cd` can never cause the later `rm -rf` / `cp` commands to run in
# the wrong directory.
set -e

# Remember where we started under our own variable name. The shell rewrites
# PWD on every `cd`, so storing the start directory in PWD would lose it.
ORIG_DIR=$(pwd)

cd ~/.ejabberd-modules/sources/ejabberd-contrib/mod_pottymouth/
# -p keeps a rerun from failing when deps/ already exists.
mkdir -p deps
cd deps
# Reuse an existing checkout instead of failing on a second run.
[ -d etbloom ] || git clone https://github.com/madglory/etbloom.git
cd etbloom
rm -rf deps/
./rebar get-deps
# Replace bitarray's build configuration with the one shipped by this module.
cp ../../rebar.config.bitarray deps/bitarray/rebar.config
./rebar compile
# Copy the compiled beams and the NIF shared object up to mod_pottymouth/.
cp -R deps/bitarray/ebin ../../
cp -R deps/bitarray/priv ../../
cp -R deps/proper/ebin ../../
cd "$ORIG_DIR"

View File

@ -1,14 +0,0 @@
%% rebar build configuration for the bitarray NIF.
%% NOTE(review): presumably this is the rebar.config.bitarray file that
%% make-deps.sh copies to deps/bitarray/rebar.config -- confirm.

%% Extra command-line arguments for Common Test runs: sets ERL_LIBS to deps/.
{ct_extra_params, "-env ERL_LIBS deps/"}.
%% Environment used when compiling and linking the C sources.
{port_env,
[ {"DRV_CFLAGS", "-fPIC -O2 $ERL_CFLAGS"},
{"ERL_LDFLAGS", " -L$ERL_EI_LIBDIR -lei"}
]}.
%% Build priv/bitarray.so from the single C source c_src/bitarray_nif.c.
{port_specs, [{"priv/bitarray.so", ["c_src/bitarray_nif.c"]}]}.
%% Dependencies: nothing is listed for runtime; proper 1.2 (git tag v1.2) is
%% fetched for the tests.
{deps, [
%% For runtime
%% --
%% For tests
{proper, "1.2",
{git, "https://github.com/manopapad/proper.git", {tag, "v1.2"}}}]}.

View File

@ -0,0 +1,77 @@
%% @doc Per-language banned-word lookup server.
%%
%% One locally registered gen_server is started per language. Its state is the
%% list of blacklisted words loaded from that language's blacklist file, and a
%% `{member, Word}' call answers whether Word contains any of them.
-module(banword_gen_server).
-behaviour(gen_server).

-include("logger.hrl").

%% API
-export([start/1, stop/0, stop/1, member/1]).

%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

%% Registered name of the server for Lang: the module name, an underscore and
%% the language atom joined together. Lang must be an atom.
serverName(Lang) ->
    list_to_atom(atom_to_list(?MODULE) ++ "_" ++ atom_to_list(Lang)).

%% @doc Ask the server registered for Lang whether Word is banned.
%% Returns the server's reply (true | false). Like any gen_server:call/2 it
%% exits when no server is registered for Lang.
member({Lang, Word} = _MessageToken) ->
    gen_server:call(serverName(Lang), {member, Word}).

%% @doc Start and link the server for Lang, loading words from BlacklistFile.
%% Returns whatever gen_server:start_link/4 returns.
start({Lang, BlacklistFile} = _Opts) ->
    Name = serverName(Lang),
    ?INFO_MSG("Building blacklist name ~p~n", [Name]),
    %% Reuse Name rather than computing the registered name a second time.
    gen_server:start_link({local, Name}, ?MODULE, [BlacklistFile], []).

%% @doc No-op kept for backward compatibility with callers of stop/0.
%% It does not stop any server; use stop/1 for that.
stop() ->
    ok.

%% @doc Stop the server for Lang. Takes the same {Lang, BlacklistFile} tuple
%% as start/1 so it can be mapped over the configured blacklists.
%% Returns ok, also when no server is registered for Lang (e.g. it was
%% already stopped).
stop({Lang, _BlacklistFile} = _Opts) ->
    case whereis(serverName(Lang)) of
        undefined -> ok;
        Pid -> gen_server:stop(Pid)
    end.
%% Read FileName and return its non-empty lines as strings (lists of bytes).
%%
%% Both "\n" and "\r\n" line endings are accepted, so a blacklist saved with
%% Windows line endings does not leave a stray "\r" glued to every word.
%% Empty lines (including the empty chunk after a final newline) are dropped:
%% they carry no word to match and would only lengthen every lookup.
%%
%% Crashes with a badmatch if the file cannot be read.
readlines(FileName) ->
    {ok, Data} = file:read_file(FileName),
    Lines = binary:split(Data, [<<"\r\n">>, <<"\n">>], [global]),
    [binary_to_list(Line) || Line <- Lines, Line =/= <<>>].
%% gen_server init callback.
%% Logs the blacklist file being loaded, then loads it with loadWordList/1;
%% the resulting word list becomes the server state.
init([BlacklistFile]) ->
    ?INFO_MSG("Building blacklist ~p~n", [BlacklistFile]),
    BlackList = loadWordList(BlacklistFile),
    {ok, BlackList}.
%% Return true when BlackWord occurs as a contiguous substring of Word,
%% false otherwise. Both arguments are strings (lists).
%%
%% string:rstr/2 yields the 1-based position of the last occurrence, or 0 when
%% there is none, so "position > 0" is the whole test. No try/catch is needed:
%% string:rstr/2 never throws, and a `catch _' clause only intercepts throws.
%% NOTE(review): string:rstr/2 is flagged obsolete in newer OTP releases; it
%% is kept because the caller appears to pass byte lists -- confirm before
%% switching to the grapheme-aware string:find/2.
check_banword(Word, BlackWord) ->
    string:rstr(Word, BlackWord) > 0.
%% gen_server call handler for {member, Word}.
%% Replies true when at least one blacklisted word is a substring of Word (see
%% check_banword/2), false otherwise. The state (the blacklist) is unchanged.
%%
%% lists:any/2 stops at the first match, which gives the same early exit the
%% previous throw-based loop had without relying on gen_server catching throws.
handle_call({member, Word}, _From, BlackList) ->
    IsBanned = lists:any(fun(BlackWord) -> check_banword(Word, BlackWord) end,
                         BlackList),
    {reply, IsBanned, BlackList}.
%% Casts are not part of this server's protocol: ignore them, keep the state.
handle_cast(_Request, BlackList) ->
    {noreply, BlackList}.

%% Any other message sent to the process is ignored as well.
handle_info(_Message, BlackList) ->
    {noreply, BlackList}.

%% Nothing to clean up on shutdown.
terminate(_Why, _BlackList) ->
    ok.

%% The state is carried over unchanged across code upgrades.
code_change(_FromVsn, BlackList, _Extra) ->
    {ok, BlackList}.
%% Load the blacklist from BlacklistFile.
%% Returns the lines of the file (via readlines/1) when the path exists;
%% otherwise logs an error and returns an empty list.
loadWordList(BlacklistFile) ->
    case filelib:is_file(BlacklistFile) of
        true ->
            readlines(BlacklistFile);
        false ->
            ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]),
            []
    end.

View File

@ -1,54 +0,0 @@
-module(bloom_gen_server).
-behaviour(gen_server).
-include("logger.hrl").
-import(etbloom, [bloom/1, member/2]).
-export([member/1]).
%% gen_server callbacks
-export([start/1, stop/1, init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
%% Registered name of the server for Lang: the module name and the language
%% atom joined by an underscore. Lang must be an atom.
serverName(Lang) ->
    Prefix = atom_to_list(?MODULE),
    Suffix = atom_to_list(Lang),
    list_to_atom(Prefix ++ "_" ++ Suffix).
%% Ask the server registered for Lang whether Word is in its filter.
%% Returns the server's reply to the {member, Word} call.
member({Lang, Word}) ->
    Server = serverName(Lang),
    gen_server:call(Server, {member, Word}).
%% Start and link the server for Lang, registered locally under
%% serverName(Lang), with BlacklistFile handed to init/1.
%% Returns whatever gen_server:start_link/4 returns.
start({Lang, BlacklistFile}) ->
    Registration = {local, serverName(Lang)},
    gen_server:start_link(Registration, ?MODULE, [BlacklistFile], []).
%% Stop the server registered for Lang. The blacklist file in the tuple is
%% ignored; the tuple has the same shape as the one given to start/1.
stop({Lang, _BlacklistFile}) ->
    Server = serverName(Lang),
    gen_server:stop(Server).
%% gen_server init callback.
%% Logs the blacklist file, creates a filter with etbloom:sbf(10000000) and
%% fills it from the file; the filled filter becomes the server state.
init([BlacklistFile]) ->
    ?INFO_MSG("Building bloom ~p~n", [BlacklistFile]),
    EmptyFilter = etbloom:sbf(10000000),
    LoadedFilter = loadWordList(EmptyFilter, BlacklistFile),
    {ok, LoadedFilter}.
%% gen_server call handler for {member, Word}: replies with the result of
%% etbloom:member/2 on the filter held as state; the state is unchanged.
handle_call({member, Word}, _From, Bloom) ->
    {reply, etbloom:member(Word, Bloom), Bloom}.
%% Casts are not part of this server's protocol: ignore them, keep the state.
handle_cast(_Request, Bloom) ->
    {noreply, Bloom}.

%% Any other message sent to the process is ignored as well.
handle_info(_Message, Bloom) ->
    {noreply, Bloom}.

%% Nothing to clean up on shutdown.
terminate(_Why, _Bloom) ->
    ok.

%% The state is carried over unchanged across code upgrades.
code_change(_FromVsn, Bloom, _Extra) ->
    {ok, Bloom}.
%% Add every line of BlacklistFile to Bloom and return the updated filter.
%% When the file does not exist an error is logged and Bloom is returned
%% unchanged. The file handle is always closed, even if loading crashes.
loadWordList(Bloom, BlacklistFile) ->
    case filelib:is_file(BlacklistFile) of
        true ->
            {ok, S} = file:open(BlacklistFile, [read]),
            try
                loadWordList(io:get_line(S, ''), Bloom, S)
            after
                file:close(S)
            end;
        false ->
            ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]),
            Bloom
    end.

%% Fold the remaining lines of the open file S into Bloom, one io:get_line/2
%% result at a time, until eof.
loadWordList(eof, Bloom, _S) ->
    Bloom;
loadWordList(Line, Bloom, S) ->
    loadWordList(io:get_line(S, ''), etbloom:add(stripNewline(Line), Bloom), S).

%% Remove a single trailing "\n" if there is one. The last line of a file may
%% lack it, so blindly dropping the final character would corrupt that word.
stripNewline(Line) ->
    case lists:reverse(Line) of
        [$\n | RevRest] -> lists:reverse(RevRest);
        _ -> Line
    end.

View File

@ -15,7 +15,7 @@
mod_options/1
]).
-import(bloom_gen_server, [start/0, stop/0, member/1]).
-import(banword_gen_server, [start/0, stop/0, member/1]).
-import(nomalize_leet_gen_server, [normalize/1]).
getMessageLang(Msg) ->
@ -31,8 +31,8 @@ getMessageLang(Msg) ->
censorWord({Lang, Word} = _MessageTerm) ->
% we need unicode characters to normalize the word
NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}),
% we need bytewise format for bloom lookup
IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}),
% we need bytewise format for banword lookup
IsBadWord = banword_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}),
if
IsBadWord ->
"****";
@ -61,7 +61,7 @@ filterMessageText2(Lang, MessageText) ->
start(_Host, Opts) ->
Blacklists = gen_mod:get_opt(blacklists, Opts),
lists:map(fun bloom_gen_server:start/1, Blacklists),
lists:map(fun banword_gen_server:start/1, Blacklists),
CharMaps = gen_mod:get_opt(charmaps, Opts),
lists:map(fun normalize_leet_gen_server:start/1, CharMaps),
ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0),
@ -69,7 +69,7 @@ start(_Host, Opts) ->
stop(Host) ->
Blacklists = gen_mod:get_module_opt(Host, ?MODULE, blacklists),
lists:map(fun bloom_gen_server:stop/1, Blacklists),
lists:map(fun banword_gen_server:stop/1, Blacklists),
CharMaps = gen_mod:get_module_opt(Host, ?MODULE, charmaps),
lists:map(fun normalize_leet_gen_server:stop/1, CharMaps),
ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0),