Use banword instead of bloom + etbloom + bitarray (thanks to ytkang)

This commit is contained in:
Badlop 2021-04-16 16:20:11 +02:00
parent 2aea4659cc
commit 00e3fa97fc
6 changed files with 82 additions and 114 deletions

View File

@ -6,24 +6,6 @@ sizes. Using a large list (say, 87M terms) will slow down the initial server
boot time (to about 15 minutes respectively), but once loaded lookups are very
speedy.
Prerequisite bitarray lib:
mod_pottymouth uses a modified version of the 'etbloom' library that uses
'bitarray' to replace 'hipe_bifs'. Ejabberd doesn't handle installing
dependencies of dependencies quite so well (etbloom being a dep of mod_pottymouth
and bitarray being a dep of etbloom), so bitarray needs to be installed manually
before installation of mod_pottymouth.
This is how I got it to work... YMMV.
1. Make sure ejabberd is running
2. Get the updated ejabberd-contrib sources:
ejabberdctl modules_update_specs
3. Execute this script to get and build the dependencies:
~/.ejabberd-modules/sources/ejabberd-contrib/mod_pottymouth/make-deps.sh
4. When you install the module, its dependencies will be installed as well:
ejabberdctl module_install mod_pottymouth
Configuration file is ~/.ejabberd-modules/mod_pottymouth/conf/mod_pottymouth.yml
modules:
@ -73,8 +55,3 @@ Blacklist helper
Thinking of a bunch of swear words and all the permutations can be tough. We made
a helper script to take a bare wordlist and generate permutations given a
dictionary of substitution characters: https://github.com/madglory/permute_wordlist
Tip of the hat:
This mod makes use of the excellent 'etbloom' module:
https://github.com/erlangtoolbox/etbloom

View File

@ -1,18 +0,0 @@
#!/bin/sh
# Fetch and build the etbloom / bitarray dependencies of mod_pottymouth and
# copy the resulting ebin/priv directories into the module's source directory.
#
# Abort on the first failing step instead of copying half-built artifacts.
set -e

# Remember where we were started from. This must not be stored in PWD: the
# shell rewrites PWD on every `cd`, so restoring from it would be a no-op.
ORIG_DIR=$(pwd)

cd ~/.ejabberd-modules/sources/ejabberd-contrib/mod_pottymouth/
# -p: do not fail when the script is run a second time
mkdir -p deps
cd deps
# Re-use an existing checkout; `git clone` refuses to clone into it
[ -d etbloom ] || git clone https://github.com/madglory/etbloom.git
cd etbloom
rm -rf deps/
./rebar get-deps
# Replace bitarray's own rebar.config with the one shipped by mod_pottymouth
cp ../../rebar.config.bitarray deps/bitarray/rebar.config
./rebar compile
cp -R deps/bitarray/ebin ../../
cp -R deps/bitarray/priv ../../
cp -R deps/proper/ebin ../../
cd "$ORIG_DIR"

View File

@ -1,14 +0,0 @@
%% NOTE(review): presumably this is rebar.config.bitarray, the file that
%% make-deps.sh copies over deps/bitarray/rebar.config -- confirm against
%% the file name, which is not visible here.

%% Extra parameters passed to Common Test runs: sets ERL_LIBS to deps/.
{ct_extra_params, "-env ERL_LIBS deps/"}.
%% Environment variables (compiler and linker flags) for the port build.
{port_env,
[ {"DRV_CFLAGS", "-fPIC -O2 $ERL_CFLAGS"},
{"ERL_LDFLAGS", " -L$ERL_EI_LIBDIR -lei"}
]}.
%% Build priv/bitarray.so from the single C source c_src/bitarray_nif.c.
{port_specs, [{"priv/bitarray.so", ["c_src/bitarray_nif.c"]}]}.
%% Dependencies: nothing is listed for runtime; proper (tag v1.2) is listed
%% under the "For tests" heading.
{deps, [
%% For runtime
%% --
%% For tests
{proper, "1.2",
{git, "https://github.com/manopapad/proper.git", {tag, "v1.2"}}}]}.

View File

@ -0,0 +1,77 @@
%% Per-language blacklist server for mod_pottymouth.
%%
%% One locally registered gen_server is started per configured language.
%% Its state is the list of banned words read from that language's
%% blacklist file; member/1 asks it whether a word contains any of them.
-module(banword_gen_server).
-behaviour(gen_server).

-include("logger.hrl").

%% API
-export([member/1]).

%% Lifecycle helpers and gen_server callbacks.
%% stop/1 is what mod_pottymouth calls (once per blacklist entry);
%% stop/0 is kept so existing references to it keep working.
-export([start/1, stop/0, stop/1, init/1, handle_call/3, handle_cast/2,
         handle_info/2, terminate/2, code_change/3]).

%% Registered name of the server for Lang: '<module>_<Lang>'.
%% Lang must be an atom; it comes from the module configuration, not from
%% user traffic, so creating an atom from it is bounded.
serverName(Lang) ->
    list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])).

%% Returns true when Word contains a banned word of language Lang,
%% false otherwise. Exits if no server is registered for Lang.
member({Lang, Word} = _MessageToken) ->
    gen_server:call(serverName(Lang), {member, Word}).

%% Starts (and links to) the server for one {Lang, BlacklistFile} entry.
%% Returns whatever gen_server:start_link/4 returns.
start({Lang, BlacklistFile} = _Opts) ->
    Name = serverName(Lang),
    ?INFO_MSG("Building blacklist name ~p~n", [Name]),
    gen_server:start_link({local, Name}, ?MODULE, [BlacklistFile], []).

%% Retained for backward compatibility; does nothing.
stop() ->
    ok.

%% Stops the server for one {Lang, BlacklistFile} entry.
%% Returns ok, also when no such server is running, so that stopping the
%% module does not crash on a server that never started or already stopped.
stop({Lang, _BlacklistFile} = _Opts) ->
    Name = serverName(Lang),
    case whereis(Name) of
        undefined -> ok;
        _Pid -> gen_server:stop(Name)
    end.
%% Reads FileName and returns its non-empty lines as strings (byte lists).
%% Both "\n" and "\r\n" line endings are accepted, so a blacklist saved with
%% Windows line endings does not end up with a trailing "\r" on every word.
%% Empty lines (including the one after a final newline) are dropped.
%% Crashes with badmatch when the file cannot be read.
readlines(FileName) ->
    {ok, Data} = file:read_file(FileName),
    Lines = binary:split(Data, [<<"\r\n">>, <<"\n">>], [global, trim_all]),
    [binary_to_list(Line) || Line <- Lines].
%% gen_server callback: logs the blacklist file name and uses the word list
%% returned by loadWordList/1 as the server state.
init([BlacklistFile]) ->
    ?INFO_MSG("Building blacklist ~p~n", [BlacklistFile]),
    BlackList = loadWordList(BlacklistFile),
    {ok, BlackList}.
%% Returns true when BlackWord occurs anywhere inside Word, false otherwise.
%% Both arguments are strings (lists). The match is a substring match, so a
%% banned word "ass" also flags "class". An empty BlackWord never matches,
%% because string:rstr/2 returns 0 for it.
check_banword(Word, BlackWord) ->
    string:rstr(Word, BlackWord) > 0.
%% gen_server callback for {member, Word}: replies true when any entry of the
%% blacklist (the server state) is found in Word, false otherwise.
%% lists:any/2 stops at the first hit, so no throw is needed to leave the
%% scan early. The state is returned unchanged.
handle_call({member, Word}, _From, BlackList) ->
    IsBanned = lists:any(fun(BlackWord) -> check_banword(Word, BlackWord) end,
                         BlackList),
    {reply, IsBanned, BlackList}.
%% Casts are not part of this server's protocol; ignore them.
handle_cast(_Request, State) ->
    {noreply, State}.

%% Any other message is ignored as well.
handle_info(_Message, State) ->
    {noreply, State}.

%% Nothing to clean up on shutdown.
terminate(_Why, _State) ->
    ok.

%% The state carries over unchanged across code upgrades.
code_change(_FromVsn, State, _Extra) ->
    {ok, State}.
%% Returns the banned words listed in BlacklistFile, one per line.
%% When the path is not a regular file (missing, or a directory), an error is
%% logged and the empty list is returned, so the server still starts and
%% simply bans nothing. filelib:is_regular/1 is used rather than is_file/1
%% because is_file/1 is also true for directories, which cannot be read.
loadWordList(BlacklistFile) ->
    case filelib:is_regular(BlacklistFile) of
        true ->
            readlines(BlacklistFile);
        false ->
            ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]),
            []
    end.

View File

@ -1,54 +0,0 @@
-module(bloom_gen_server).
-behaviour(gen_server).
-include("logger.hrl").
-import(etbloom, [bloom/1, member/2]).
-export([member/1]).
%% gen_server callbacks
-export([start/1, stop/1, init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
%% Registered name of the server for Lang: '<module>_<Lang>'. Lang is an atom.
serverName(Lang) ->
    Prefix = atom_to_list(?MODULE),
    Suffix = atom_to_list(Lang),
    list_to_atom(Prefix ++ "_" ++ Suffix).
%% Asks the server registered for Lang whether Word is in its filter and
%% returns that server's reply.
member({Lang, Word} = _MessageToken) ->
    Server = serverName(Lang),
    gen_server:call(Server, {member, Word}).
%% Starts and links the locally registered server for one
%% {Lang, BlacklistFile} entry; returns the result of gen_server:start_link/4.
start({Lang, BlacklistFile} = _Opts) ->
    Registration = {local, serverName(Lang)},
    gen_server:start_link(Registration, ?MODULE, [BlacklistFile], []).
%% Stops the server registered for the language of one
%% {Lang, BlacklistFile} entry.
stop({Lang, _BlacklistFile} = _Opts) ->
    Server = serverName(Lang),
    gen_server:stop(Server).
%% gen_server callback: creates a filter with etbloom:sbf(10000000), fills it
%% from BlacklistFile via loadWordList/2 and uses it as the server state.
init([BlacklistFile]) ->
    ?INFO_MSG("Building bloom ~p~n", [BlacklistFile]),
    EmptyFilter = etbloom:sbf(10000000),
    Filter = loadWordList(EmptyFilter, BlacklistFile),
    {ok, Filter}.
%% gen_server callback for {member, Word}: replies with the result of
%% etbloom:member/2 on the filter held as state; the state is unchanged.
handle_call({member, Word}, _From, Bloom) ->
    {reply, etbloom:member(Word, Bloom), Bloom}.
%% Casts are not part of this server's protocol; ignore them.
handle_cast(_Request, State) ->
    {noreply, State}.

%% Any other message is ignored as well.
handle_info(_Message, State) ->
    {noreply, State}.

%% Nothing to clean up on shutdown.
terminate(_Why, _State) ->
    ok.

%% The state carries over unchanged across code upgrades.
code_change(_FromVsn, State, _Extra) ->
    {ok, State}.
%% Adds every line of BlacklistFile to Bloom and returns the resulting filter.
%% When the file does not exist, an error is logged and Bloom is returned
%% unchanged. The file is always closed again, also when reading or adding
%% a word crashes.
loadWordList(Bloom, BlacklistFile) ->
    case filelib:is_file(BlacklistFile) of
        true ->
            {ok, S} = file:open(BlacklistFile, [read]),
            try
                loadWordList(io:get_line(S, ''), Bloom, S)
            after
                file:close(S)
            end;
        false ->
            ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]),
            Bloom
    end.

%% Folds the lines read from the io device S into Bloom until end of file.
loadWordList(eof, Bloom, _S) ->
    Bloom;
loadWordList(Line, Bloom, S) ->
    loadWordList(io:get_line(S, ''), etbloom:add(stripNewline(Line), Bloom), S).

%% Removes one trailing "\n" or "\r\n" from Line, if present. A final line
%% without a line terminator is returned unchanged rather than losing its
%% last character.
stripNewline(Line) ->
    case lists:reverse(Line) of
        [$\n, $\r | Rest] -> lists:reverse(Rest);
        [$\n | Rest] -> lists:reverse(Rest);
        _ -> Line
    end.

View File

@ -15,7 +15,7 @@
mod_options/1 mod_options/1
]). ]).
-import(bloom_gen_server, [start/0, stop/0, member/1]). -import(banword_gen_server, [start/0, stop/0, member/1]).
-import(nomalize_leet_gen_server, [normalize/1]). -import(nomalize_leet_gen_server, [normalize/1]).
getMessageLang(Msg) -> getMessageLang(Msg) ->
@ -31,8 +31,8 @@ getMessageLang(Msg) ->
censorWord({Lang, Word} = _MessageTerm) -> censorWord({Lang, Word} = _MessageTerm) ->
% we need unicode characters to normlize the word % we need unicode characters to normlize the word
NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}), NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}),
% we need bytewise format for bloom lookup % we need bytewise format for banword lookup
IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}), IsBadWord = banword_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}),
if if
IsBadWord -> IsBadWord ->
"****"; "****";
@ -61,7 +61,7 @@ filterMessageText2(Lang, MessageText) ->
start(_Host, Opts) -> start(_Host, Opts) ->
Blacklists = gen_mod:get_opt(blacklists, Opts), Blacklists = gen_mod:get_opt(blacklists, Opts),
lists:map(fun bloom_gen_server:start/1, Blacklists), lists:map(fun banword_gen_server:start/1, Blacklists),
CharMaps = gen_mod:get_opt(charmaps, Opts), CharMaps = gen_mod:get_opt(charmaps, Opts),
lists:map(fun normalize_leet_gen_server:start/1, CharMaps), lists:map(fun normalize_leet_gen_server:start/1, CharMaps),
ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0), ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0),
@ -69,7 +69,7 @@ start(_Host, Opts) ->
stop(Host) -> stop(Host) ->
Blacklists = gen_mod:get_module_opt(Host, ?MODULE, blacklists), Blacklists = gen_mod:get_module_opt(Host, ?MODULE, blacklists),
lists:map(fun bloom_gen_server:stop/1, Blacklists), lists:map(fun banword_gen_server:stop/1, Blacklists),
CharMaps = gen_mod:get_module_opt(Host, ?MODULE, charmaps), CharMaps = gen_mod:get_module_opt(Host, ?MODULE, charmaps),
lists:map(fun normalize_leet_gen_server:stop/1, CharMaps), lists:map(fun normalize_leet_gen_server:stop/1, CharMaps),
ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0), ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0),