From 00e3fa97fc16eb0d3ce85caf85f951026f154a24 Mon Sep 17 00:00:00 2001 From: Badlop Date: Fri, 16 Apr 2021 16:20:11 +0200 Subject: [PATCH] Use banword instead of bloom + etbloom + bitarray (thanks to ytkang) --- mod_pottymouth/README.txt | 23 ------- mod_pottymouth/make-deps.sh | 18 ------ mod_pottymouth/rebar.config.bitarray | 14 ----- mod_pottymouth/src/banword_gen_server.erl | 77 +++++++++++++++++++++++ mod_pottymouth/src/bloom_gen_server.erl | 54 ---------------- mod_pottymouth/src/mod_pottymouth.erl | 10 +-- 6 files changed, 82 insertions(+), 114 deletions(-) delete mode 100755 mod_pottymouth/make-deps.sh delete mode 100644 mod_pottymouth/rebar.config.bitarray create mode 100644 mod_pottymouth/src/banword_gen_server.erl delete mode 100644 mod_pottymouth/src/bloom_gen_server.erl diff --git a/mod_pottymouth/README.txt b/mod_pottymouth/README.txt index ee29aa8..ad25367 100644 --- a/mod_pottymouth/README.txt +++ b/mod_pottymouth/README.txt @@ -6,24 +6,6 @@ sizes. Using a large list (say, 87M terms) will slow down the initial server boot time (to about 15 minutes respectively), but once loaded lookups are very speedy. -Prerequisite bitarray lib: - -mod_pottymouth uses a modified version of the 'etbloom' library that uses -'bitarray' to replace 'hipe_bifs'. Ejabberd doesn't handle installing -dependences of dependecies quite so well (etbloom being a dep of mod_pottymouth -and bitarray being a dep of etbloom), so bitarray needs to be installed manually -before installation of mod_pottymouth. - -This is how I got it to work... YMMV. - -1. Make sure ejabberd is running -2. Get the updated ejabberd-contrib sources: - ejabberdctl modules_update_specs -3. Execute this script to get and build the dependencies: - ~/.ejabberd-modules/sources/ejabberd-contrib/mod_pottymouth/make-deps.sh -4. 
When you install the module, its dependencies will be installed as well: - ejabberdctl module_install mod_pottymouth - Configuration file is ~/.ejabberd-modules/mod_pottymouth/conf/mod_pottymouth.yml modules: @@ -73,8 +55,3 @@ Blacklist helper Thinking of a bunch of swear words and all the permutations can be tough. We made a helper script to take a bare wordlist and generate permutations given a dictionary of substitution characters: https://github.com/madglory/permute_wordlist - -Tip of the hat: - -This mod makes use of the excellent 'etbloom' module: -https://github.com/erlangtoolbox/etbloom diff --git a/mod_pottymouth/make-deps.sh b/mod_pottymouth/make-deps.sh deleted file mode 100755 index bfe0b09..0000000 --- a/mod_pottymouth/make-deps.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh - -PWD=`pwd` - -cd ~/.ejabberd-modules/sources/ejabberd-contrib/mod_pottymouth/ -mkdir deps -cd deps -git clone https://github.com/madglory/etbloom.git -cd etbloom -rm -rf deps/ -./rebar get-deps -cp ../../rebar.config.bitarray deps/bitarray/rebar.config -./rebar compile -cp -R deps/bitarray/ebin ../../ -cp -R deps/bitarray/priv ../../ -cp -R deps/proper/ebin ../../ - -cd $PWD diff --git a/mod_pottymouth/rebar.config.bitarray b/mod_pottymouth/rebar.config.bitarray deleted file mode 100644 index 3186d38..0000000 --- a/mod_pottymouth/rebar.config.bitarray +++ /dev/null @@ -1,14 +0,0 @@ -{ct_extra_params, "-env ERL_LIBS deps/"}. -{port_env, - [ {"DRV_CFLAGS", "-fPIC -O2 $ERL_CFLAGS"}, - {"ERL_LDFLAGS", " -L$ERL_EI_LIBDIR -lei"} - ]}. -{port_specs, [{"priv/bitarray.so", ["c_src/bitarray_nif.c"]}]}. - - -{deps, [ - %% For runtime - %% -- - %% For tests - {proper, "1.2", - {git, "https://github.com/manopapad/proper.git", {tag, "v1.2"}}}]}. diff --git a/mod_pottymouth/src/banword_gen_server.erl b/mod_pottymouth/src/banword_gen_server.erl new file mode 100644 index 0000000..045eb0c --- /dev/null +++ b/mod_pottymouth/src/banword_gen_server.erl @@ -0,0 +1,77 @@ +-module(banword_gen_server). 
+
+%% Per-language banned-word server: keeps the blacklist of one language in
+%% memory and reports whether a word contains any of its entries.
+%% One server is registered per configured language.
+
+-behaviour(gen_server).
+
+-include("logger.hrl").
+
+-export([member/1]).
+
+%% gen_server callbacks
+-export([start/1, stop/0, stop/1, init/1, handle_call/3, handle_cast/2,
+         handle_info/2, terminate/2, code_change/3]).
+
+%% Registered name of the server for Lang, e.g. banword_gen_server_en.
+%% NOTE(review): creates atoms from Lang - confirm callers pass a bounded set.
+serverName(Lang) ->
+    list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])).
+
+%% Returns true when Word (a string) contains a banned word of Lang.
+member({Lang, Word} = _MessageToken) ->
+    gen_server:call(serverName(Lang), {member, Word}).
+
+%% Starts and locally registers the server for one {Lang, BlacklistFile} option.
+start({Lang, BlacklistFile} = _Opts) ->
+    Name = serverName(Lang),
+    ?INFO_MSG("Building blacklist name ~p~n", [Name]),
+    gen_server:start_link({local, Name}, ?MODULE, [BlacklistFile], []).
+
+%% Kept for backward compatibility with callers of the arity-0 form;
+%% it does not stop any server.
+stop() ->
+    ok.
+
+%% Stops the server of one {Lang, BlacklistFile} option. mod_pottymouth:stop/1
+%% maps this function over the configured blacklists, so arity 1 must exist.
+stop({Lang, _BlacklistFile} = _Opts) ->
+    gen_server:stop(serverName(Lang)).
+
+%% The server state is the list of banned words, each one a string.
+init([BlacklistFile]) ->
+    ?INFO_MSG("Building blacklist ~p~n", [BlacklistFile]),
+    {ok, loadWordList(BlacklistFile)}.
+
+%% True when BlackWord occurs anywhere inside Word. string:str/2 returns the
+%% 1-based position of the first occurrence, or 0 when there is none.
+check_banword(Word, BlackWord) ->
+    string:str(Word, BlackWord) > 0.
+
+%% Replies true when at least one banned word occurs in Word, else false.
+%% lists:any/2 stops at the first hit, so no throw is needed to exit early.
+handle_call({member, Word}, _From, BlackList) ->
+    IsBanned = lists:any(fun(BlackWord) -> check_banword(Word, BlackWord) end,
+                         BlackList),
+    {reply, IsBanned, BlackList}.
+
+handle_cast(_Msg, State) -> {noreply, State}.
+handle_info(_Info, State) -> {noreply, State}.
+terminate(_Reason, _State) -> ok.
+code_change(_OldVsn, State, _Extra) -> {ok, State}.
+
+%% Reads the blacklist, one banned word per line. LF and CRLF line endings
+%% are both accepted and blank lines are dropped, so a trailing newline or a
+%% CRLF file cannot produce empty or "\r"-terminated entries.
+%% A file that cannot be read is logged and yields an empty blacklist.
+loadWordList(BlacklistFile) ->
+    case file:read_file(BlacklistFile) of
+        {ok, Data} ->
+            Lines = binary:split(Data, [<<"\r\n">>, <<"\n">>], [global, trim_all]),
+            [binary_to_list(Line) || Line <- Lines];
+        {error, Reason} ->
+            ?ERROR_MSG("Cannot read blacklist file ~p: ~p~n",
+                       [BlacklistFile, Reason]),
+            []
+    end.
diff --git a/mod_pottymouth/src/bloom_gen_server.erl b/mod_pottymouth/src/bloom_gen_server.erl deleted file mode 100644 index 7b94466..0000000 --- a/mod_pottymouth/src/bloom_gen_server.erl +++ /dev/null @@ -1,54 +0,0 @@ --module(bloom_gen_server). - --behaviour(gen_server). - --include("logger.hrl"). - --import(etbloom, [bloom/1, member/2]). --export([member/1]). - -%% gen_server callbacks --export([start/1, stop/1, init/1, handle_call/3, handle_cast/2, handle_info/2, - terminate/2, code_change/3]). - -serverName(Lang) -> - list_to_atom(lists:flatten([atom_to_list(?MODULE), "_", atom_to_list(Lang)])). - -member({Lang, Word} = _MessageToken) -> - gen_server:call(serverName(Lang), {member, Word}). - -start({Lang, BlacklistFile} = _Opts) -> - gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []). - -stop({Lang, _BlacklistFile} = _Opts) -> - gen_server:stop(serverName(Lang)). - -init([BlacklistFile]) -> - ?INFO_MSG("Building bloom ~p~n", [BlacklistFile]), - Bloom = etbloom:sbf(10000000), - {ok, loadWordList(Bloom, BlacklistFile)}. - -handle_call({member, Word}, _From, Bloom) -> - Reply = etbloom:member(Word, Bloom), - {reply, Reply, Bloom}. - -handle_cast(_Msg, State) -> {noreply, State}. -handle_info(_Info, State) -> {noreply, State}. -terminate(_Reason, _State) -> ok. -code_change(_OldVsn, State, _Extra) -> {ok, State}. - -loadWordList(Bloom, BlacklistFile) -> - BlacklistExists = filelib:is_file(BlacklistFile), - if - BlacklistExists -> - {ok, S} = file:open(BlacklistFile, read), - loadWordList(io:get_line(S, ''), Bloom, S); - true -> - ?ERROR_MSG("Blacklist file not found: ~p~n", [BlacklistFile]), - loadWordList(eof, Bloom, BlacklistFile) - end. - -loadWordList(eof, Bloom, _S) -> - Bloom; -loadWordList(Line, Bloom, S) -> - loadWordList(io:get_line(S, ''), etbloom:add(lists:droplast(Line), Bloom), S). 
diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 94d0954..7ccaa5e 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -15,7 +15,7 @@ mod_options/1 ]). --import(bloom_gen_server, [start/0, stop/0, member/1]). +-import(banword_gen_server, [start/0, stop/0, member/1]). -import(nomalize_leet_gen_server, [normalize/1]). getMessageLang(Msg) -> @@ -31,8 +31,8 @@ getMessageLang(Msg) -> censorWord({Lang, Word} = _MessageTerm) -> % we need unicode characters to normlize the word NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}), - % we need bytewise format for bloom lookup - IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}), + % we need bytewise format for banword lookup + IsBadWord = banword_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}), if IsBadWord -> "****"; @@ -61,7 +61,7 @@ filterMessageText2(Lang, MessageText) -> start(_Host, Opts) -> Blacklists = gen_mod:get_opt(blacklists, Opts), - lists:map(fun bloom_gen_server:start/1, Blacklists), + lists:map(fun banword_gen_server:start/1, Blacklists), CharMaps = gen_mod:get_opt(charmaps, Opts), lists:map(fun normalize_leet_gen_server:start/1, CharMaps), ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0), @@ -69,7 +69,7 @@ start(_Host, Opts) -> stop(Host) -> Blacklists = gen_mod:get_module_opt(Host, ?MODULE, blacklists), - lists:map(fun bloom_gen_server:stop/1, Blacklists), + lists:map(fun banword_gen_server:stop/1, Blacklists), CharMaps = gen_mod:get_module_opt(Host, ?MODULE, charmaps), lists:map(fun normalize_leet_gen_server:stop/1, CharMaps), ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0),