diff --git a/mod_pottymouth/README.txt b/mod_pottymouth/README.txt index 0703c96..af8c822 100644 --- a/mod_pottymouth/README.txt +++ b/mod_pottymouth/README.txt @@ -6,10 +6,32 @@ sizes. Using a large list (say, 87M terms) will slow down the initial server boot time (to about 15 minutes respectively), but once loaded lookups are very speedy. -To install in ejabberd: +Prerequisite bitarray lib: + +mod_pottymouth uses a modified version of the 'etbloom' library that uses +'bitarray' to replace 'hipe_bifs'. Ejabberd doesn't handle installing +dependencies of dependencies quite so well (etbloom being a dep of mod_pottymouth +and bitarray being a dep of etbloom), so bitarray needs to be installed manually +before installation of mod_pottymouth. + +This is how I got it to work... YMMV. +Given $EJABBERD_HOME is the base directory of your ejabberd install: + +mkdir -p $EJABBERD_HOME/erlang-lib/bitarray +cd $EJABBERD_HOME/erlang-lib/bitarray +clone https://github.com/ferd/bitarray git repo +run: /usr/lib/erlang/bin/escript rebar get-deps +run: /usr/lib/erlang/bin/escript rebar compile +run: /usr/bin/install -c -d /usr/local/lib/bitarray-1.0.0/ebin +run: /usr/bin/install -c -d /usr/local/lib/bitarray-1.0.0/priv +run: /usr/bin/install -c -m 644 ./ebin/bitarray.app /usr/local/lib/bitarray-1.0.0/ebin/bitarray.app +run: /usr/bin/install -c -m 644 ./ebin/bitarray.beam /usr/local/lib/bitarray-1.0.0/ebin/bitarray.beam +run: /usr/bin/install -c -m 644 ./priv/bitarray.so /usr/local/lib/bitarray-1.0.0/priv/bitarray.so + +To install mod_pottymouth in ejabberd: cd ~/.ejabberd-modules/sources -clone the git repo +clone the ejabberd-contrib git repo cd mod_pottymouth edit: ./conf/mod_pottymouth.yml @@ -18,14 +40,6 @@ run: ejabberdctl module_install mod_pottymouth run: ejabberdctl restart module will be installed in: ~/.ejabberd-modules/mod_pottymouth - -If you don't have Erlang HiPE available, it may throw errors that mention: - {undef,[{hipe_bifs,bitarray, -In such case, you can install 
this library: - https://github.com/ferd/bitarray -and edit etbloom.erl to call that library instead of hipe_bifs. - - Config file format: modules: diff --git a/mod_pottymouth/deps/etbloom/.gitignore b/mod_pottymouth/deps/etbloom/.gitignore new file mode 100644 index 0000000..7f18faa --- /dev/null +++ b/mod_pottymouth/deps/etbloom/.gitignore @@ -0,0 +1,2 @@ +workspace.xml +out diff --git a/mod_pottymouth/deps/etbloom/LICENSE.txt b/mod_pottymouth/deps/etbloom/LICENSE.txt new file mode 100644 index 0000000..729d139 --- /dev/null +++ b/mod_pottymouth/deps/etbloom/LICENSE.txt @@ -0,0 +1,286 @@ +ERLANG PUBLIC LICENSE +Version 1.1 + +1. All Rights Reserved.'' diff --git a/mod_pottymouth/deps/etbloom/README.md b/mod_pottymouth/deps/etbloom/README.md new file mode 100644 index 0000000..96c99bd --- /dev/null +++ b/mod_pottymouth/deps/etbloom/README.md @@ -0,0 +1,13 @@ +Erlang Bloom Filter +======= + +Based on Scalable Bloom Filters Paulo Sérgio Almeida, Carlos Baquero, Nuno Preguiça, David Hutchison +Information Processing Letters Volume 101, Issue 6, 31 March 2007, Pages 255-261 + +Provides scalable bloom filters that can grow indefinitely while +ensuring a desired maximum false positive probability. Also provides +standard partitioned bloom filters with a maximum capacity. Bit arrays +are dimensioned as a power of 2 to enable reusing hash values across +filters through bit operations. diff --git a/mod_pottymouth/deps/etbloom/rebar b/mod_pottymouth/deps/etbloom/rebar
new file mode 100755
index 0000000..97c77e1
Binary files /dev/null and b/mod_pottymouth/deps/etbloom/rebar differ
diff --git a/mod_pottymouth/deps/etbloom/rebar.config b/mod_pottymouth/deps/etbloom/rebar.config
new file mode 100644
index 0000000..72ab93b
--- /dev/null
+++ b/mod_pottymouth/deps/etbloom/rebar.config
@@ -0,0 +1,5 @@
+{erl_opts, [warnings_as_errors, debug_info]}.
+{deps, [
+    {bitarray, "0.1",
+     {git, "https://github.com/madglory/bitarray", {branch, "master"}}}
+]}. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved via the world wide web at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. diff --git a/mod_pottymouth/src/etbloom.erl b/mod_pottymouth/deps/etbloom/src/etbloom.erl
similarity index 96%
rename from mod_pottymouth/src/etbloom.erl
rename to mod_pottymouth/deps/etbloom/src/etbloom.erl
index 113a117..d0bd148 100644
--- a/mod_pottymouth/src/etbloom.erl
+++ b/mod_pottymouth/deps/etbloom/src/etbloom.erl
@@ -74,7 +74,7 @@ bloom(Mode, Dim, E) ->
     M = 1 bsl Mb,
     N = trunc(log(1 - P) / log(1 - 1 / M)),
     #bloom{e = E, n = N, mb = Mb, size = 0,
-           a = [hipe_bifs:bitarray(1 bsl Mb, false) || _ <- lists:seq(1, K)]}.
+           a = [bitarray:new(1 bsl Mb, false) || _ <- lists:seq(1, K)]}.
 
 log2(X) ->
     log(X) / log(2).
@@ -136,7 +136,7 @@ masked_pair(Mask, X, Y) ->
 
 all_set(_Mask, _I1, _I, []) -> true;
 all_set(Mask, I1, I, [H | T]) ->
-    case hipe_bifs:bitarray_sub(H, I) of
+    case bitarray:sub(H, I) of
         true  -> all_set(Mask, I1, (I + I1) band Mask, T);
         false -> false
     end.
@@ -170,5 +170,5 @@ hash_add(Hashes, #bloom{mb = Mb, a = A, size = Size} = B) ->
 
 set_bits(_Mask, _I1, _I, [], Acc) -> lists:reverse(Acc);
 set_bits(Mask, I1, I, [H | T], Acc) ->
-    set_bits(Mask, I1, (I + I1) band Mask, T, [hipe_bifs:bitarray_update(H, I, true) | Acc]).
+    set_bits(Mask, I1, (I + I1) band Mask, T, [bitarray:update(H, I, true) | Acc]). You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved via the world wide web at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +-module(etbloom_tests). +-author("volodymyr.kyrychenko@strikead.com"). +-include_lib("eunit/include/eunit.hrl"). + +bloom_test() -> + Values = [{xxx, binary_to_atom(base64:encode(crypto:strong_rand_bytes(10)), utf8)} || _ <- lists:seq(1, 5000)], + Bloom = etbloom:bloom(Values), + ?assert(lists:all(fun(X) -> etbloom:member(X, Bloom) end, Values)), + ?assertNot(etbloom:member(wtf, Bloom)). diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 0e8bc6f..7c31534 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -2,27 +2,27 @@ -behaviour(gen_mod). +-include("ejabberd.hrl"). -include("logger.hrl"). +-include("xmpp.hrl"). -export([ start/2, stop/1, on_filter_packet/1, mod_opt_type/1, - depends/2 + depends/2, + reload/3 ]). --include("ejabberd.hrl"). - -import(bloom_gen_server, [start/0, stop/0, member/1]). -import(nomalize_leet_gen_server, [normalize/1]). -getMessageLang(Attrs) -> - LangAttr = lists:keyfind(<<"lang">>, 1, Attrs), +getMessageLang(Msg) -> + LangAttr = xmpp:get_lang(Msg), if - LangAttr -> - {<<"lang">>, LangBin} = LangAttr, - Lang = list_to_atom(binary_to_list(LangBin)); + (LangAttr /= <<>>) -> + Lang = list_to_atom(binary_to_list(LangAttr)); true -> Lang = default end, @@ -43,28 +43,13 @@ censorWord({Lang, Word} = _MessageTerm) -> filterWords(L) -> lists:map(fun censorWord/1, L). 
-filterMessageText(MessageAttrs, MessageText) -> - Lang = getMessageLang(MessageAttrs), +filterMessageText(Lang, MessageText) -> % we want to token-ize utf8 'words' MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "), MessageTerms = [{Lang, Word} || Word <- MessageWords], % we get back bytewise format terms (rather than utf8) string:join(filterWords(MessageTerms), " "). - -filterMessageBodyElements([{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}]} = _H|T], MessageElements) -> - FilteredMessageWords = binary:list_to_bin(filterMessageText(BodyAttr, binary:bin_to_list(MessageText))), - FilteredBody = {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}, - filterMessageBodyElements(T, lists:append(MessageElements, [FilteredBody])); - -filterMessageBodyElements([H|T], MessageElements) -> - % skip this tag, but pass it on as processed - filterMessageBodyElements(T, lists:append(MessageElements, [H])); - -filterMessageBodyElements([], MessageElements) -> - MessageElements. - - start(_Host, Opts) -> Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []), lists:map(fun bloom_gen_server:start/1, Blacklists), @@ -82,14 +67,28 @@ stop(_Host) -> on_filter_packet(drop) -> drop; -on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, Els} = _Packet} = _Msg) -> - FilteredEls = filterMessageBodyElements(Els, []), - {_From, _To, {xmlel, <<"message">>, _Attrs, FilteredEls}}; on_filter_packet(Msg) -> - % Handle the generic case (any packet that isn't a message with a body). - Msg. 
+ Type = xmpp:get_type(Msg), + if + (Type == chat) orelse (Type == groupchat) -> + BodyText = xmpp:get_text(Msg#message.body), + if + (BodyText /= <<>>) -> + Lang = getMessageLang(Msg), + FilteredMessageWords = binary:list_to_bin(filterMessageText(Lang, binary:bin_to_list(BodyText))), + [BodyObject|_] = Msg#message.body, + NewBodyObject = setelement(3, BodyObject, FilteredMessageWords), + NewMsg = Msg#message{body = [NewBodyObject]}, + NewMsg; + true -> + Msg + end; + true -> + Msg + end. mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end; mod_opt_type(charmaps) -> fun (A) when is_list(A) -> A end; mod_opt_type(_) -> [blacklists, charmaps]. depends(_Host, _Opts) -> []. +reload(_Host, _NewOpts, _OldOpts) -> ok.