From 6c22e0fa0c0622722d972d8cf5a2a420ce84c380 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Fri, 24 Jun 2016 15:34:37 +0000 Subject: [PATCH] adding mod_pottymouth --- mod_pottymouth/COPYING | 202 ++++++++++++++++++++++++ mod_pottymouth/README.txt | 54 +++++++ mod_pottymouth/conf/mod_pottymouth.yml | 7 + mod_pottymouth/mod_pottymouth.spec | 5 + mod_pottymouth/src/bloom_gen_server.erl | 47 ++++++ mod_pottymouth/src/etbloom.erl | 174 ++++++++++++++++++++ mod_pottymouth/src/mod_pottymouth.erl | 67 ++++++++ 7 files changed, 556 insertions(+) create mode 100644 mod_pottymouth/COPYING create mode 100644 mod_pottymouth/README.txt create mode 100644 mod_pottymouth/conf/mod_pottymouth.yml create mode 100644 mod_pottymouth/mod_pottymouth.spec create mode 100644 mod_pottymouth/src/bloom_gen_server.erl create mode 100644 mod_pottymouth/src/etbloom.erl create mode 100644 mod_pottymouth/src/mod_pottymouth.erl diff --git a/mod_pottymouth/COPYING b/mod_pottymouth/COPYING new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/mod_pottymouth/COPYING @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/mod_pottymouth/README.txt b/mod_pottymouth/README.txt new file mode 100644 index 0000000..9d7649d --- /dev/null +++ b/mod_pottymouth/README.txt @@ -0,0 +1,54 @@ +The 'mod_pottymouth' ejabberd module aims to fill the void left by 'mod_shit' +which has disappeared from the net. It allows individual whole words of a +message to be filtered against a blacklist. It allows multiple blacklists +sharded by language. To make use of this module the client must add the xml:lang +attribute to the message xml. + +To install in ejabberd: + +cd ~/.ejabberd-modules/sources +clone the git repo +cd mod_pottymouth +edit: ./conf/mod_pottymouth.yml + +make sure ejabberd is running +run: ejabberdctl module_install mod_pottymouth +run: ejabberdctl restart +module will be installed in: ~/.ejabberd-modules/mod_pottymouth + +Config file format: + +modules: + mod_pottymouth: + blacklists: + default: /home/your_user/blacklist_en.txt + en: /home/your_user/blacklist_en.txt + cn: /home/your_user/blacklist_cn.txt + fr: /home/your_user/blacklist_fr.txt + +For each language (en,cn,fr,...whatever) provide a full path to a blacklist file. +The blacklist file is a plain text file with blacklisted words listed one per +line. + +Gotchas: + +The language will be looked up by whatever value is passed in the xml:lang +attribute of the xml message.
So, any xml:lang value to be supported will need +a corresponding entry/blacklist in the config file. If xml:lang is missing, +the 'default' entry in config will be used. + +For xml:lang attribute docs, see: +http://wiki.xmpp.org/web/Programming_XMPP_Clients#Sending_a_message + +The internal bloomfilter used to ingest the blacklists currently requires about +4,000 entries in the blacklist to ensure acceptable error probability. (We've +gotten around this by duplicating entries in a short list) + +Todo: + +Look into acceptable error probabilities for shorter blacklists. + +Tip of the hat: + +This mod makes use of the excellent 'etbloom' module: +https://github.com/erlangtoolbox/etbloom diff --git a/mod_pottymouth/conf/mod_pottymouth.yml b/mod_pottymouth/conf/mod_pottymouth.yml new file mode 100644 index 0000000..458ad4f --- /dev/null +++ b/mod_pottymouth/conf/mod_pottymouth.yml @@ -0,0 +1,7 @@ +modules: + mod_pottymouth: + blacklists: + default: /home/vagrant/blacklist_en.txt + en: /home/vagrant/blacklist_en.txt + cn: /home/vagrant/blacklist_cn.txt + fr: /home/vagrant/blacklist_fr.txt diff --git a/mod_pottymouth/mod_pottymouth.spec b/mod_pottymouth/mod_pottymouth.spec new file mode 100644 index 0000000..4a269aa --- /dev/null +++ b/mod_pottymouth/mod_pottymouth.spec @@ -0,0 +1,5 @@ +author: "Tom Quackenbush " +category: "data" +summary: "Filter bad words in messages" +home: "https://github.com/madglory/mod_pottymouth/tree/master" +url: "git@github.com:madglory/mod_pottymouth.git" diff --git a/mod_pottymouth/src/bloom_gen_server.erl b/mod_pottymouth/src/bloom_gen_server.erl new file mode 100644 index 0000000..1fb098a --- /dev/null +++ b/mod_pottymouth/src/bloom_gen_server.erl @@ -0,0 +1,47 @@ +-module(bloom_gen_server). + +-behaviour(gen_server). + +-include("logger.hrl"). + +-import(etbloom, [bloom/1, member/2]). +-export([start/1]). + +%% gen_server callbacks +-export([init/1, handle_call/3, handle_cast/2, handle_info/2, + terminate/2, code_change/3]). 
+%% Explicit export list in place of export_all. These three functions were
+%% previously reachable only through export_all and stay exported so that
+%% existing callers keep working.
+-export([serverName/1, member/1, loadWordList/1]).
+
+%% Smallest capacity etbloom:bloom/1 accepts with its default error
+%% probability of 0.001 (its guard requires N >= 4 / E).
+-define(MIN_BLOOM_CAPACITY, 4000).
+
+%% Registered name of the filter process that serves language Lang,
+%% e.g. bloom_gen_server_en for en.
+%% Lang must be an atom; a new atom is created for the name.
+serverName(Lang) ->
+    list_to_atom(atom_to_list(?MODULE) ++ "_" ++ atom_to_list(Lang)).
+
+%% Asks the filter process for Lang whether Word is blacklisted.
+%% Returns true | false. The call exits (noproc) when no process is
+%% registered for Lang.
+member({Lang, Word} = _MessageToken) ->
+    gen_server:call(serverName(Lang), {member, Word}).
+
+%% Reads BlacklistFile and returns its words (one per line) as a list of
+%% strings. Any read failure (missing file, directory, permissions, ...) is
+%% logged and yields an empty list instead of crashing the caller.
+loadWordList(BlacklistFile) ->
+    case file:read_file(BlacklistFile) of
+        {ok, Contents} ->
+            %% Split on both CR and LF so Windows-style files do not leave a
+            %% trailing "\r" on every word.
+            string:tokens(binary_to_list(Contents), "\r\n");
+        {error, Reason} ->
+            ?ERROR_MSG("Unable to read blacklist file ~p: ~p~n", [BlacklistFile, Reason]),
+            []
+    end.
+
+%% Starts (and links to) the filter process for one {Lang, BlacklistFile}
+%% configuration entry, registered locally under serverName(Lang).
+start({Lang, BlacklistFile} = _Opts) ->
+    gen_server:start_link({local, serverName(Lang)}, ?MODULE, [BlacklistFile], []).
+
+%% gen_server callback: the process state is the bloom filter holding every
+%% word of the blacklist.
+init([BlacklistFile]) ->
+    WordList = loadWordList(BlacklistFile),
+    %% Size the filter explicitly: short or empty lists would otherwise be
+    %% rejected by etbloom's minimum-capacity guard and crash init/1.
+    Capacity = max(length(WordList), ?MIN_BLOOM_CAPACITY),
+    Bloom = lists:foldl(fun etbloom:add/2, etbloom:bloom(Capacity), WordList),
+    {ok, Bloom}.
+
+%% gen_server callback: answers {member, Word} with the filter's verdict.
+handle_call({member, Word}, _From, Bloom) ->
+    Reply = etbloom:member(Word, Bloom),
+    {reply, Reply, Bloom}.
+
+%% Remaining gen_server callbacks: nothing to do, state is kept as is.
+handle_cast(_Msg, State) -> {noreply, State}.
+handle_info(_Info, State) -> {noreply, State}.
+terminate(_Reason, _State) -> ok.
+code_change(_OldVsn, State, _Extra) -> {ok, State}.
diff --git a/mod_pottymouth/src/etbloom.erl b/mod_pottymouth/src/etbloom.erl
new file mode 100644
index 0000000..113a117
--- /dev/null
+++ b/mod_pottymouth/src/etbloom.erl
@@ -0,0 +1,174 @@
+%% The contents of this file are subject to the Erlang Public License,
+%% Version 1.1, (the "License"); you may not use this file except in
+%% compliance with the License. You should have received a copy of the
+%% Erlang Public License along with this software. If not, it can be
+%% retrieved via the world wide web at http://www.erlang.org/.
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and limitations
+%% under the License.
+%%
+-module(etbloom).
+-author("Paulo Sergio Almeida ").
+
+-export([sbf/1, sbf/2, sbf/3, sbf/4,
+         bloom/1, bloom/2,
+         member/2, add/2,
+         size/1, capacity/1]).
+-export([is_element/2, add_element/2]). % alternative names
+-import(math, [log/1, pow/2]).
+
+%% Aliases for member/2 and add/2.
+is_element(E, B) -> member(E, B).
+add_element(E, B) -> add(E, B).
+
+
+%% Based on
+%% Scalable Bloom Filters
+%% Paulo Sérgio Almeida, Carlos Baquero, Nuno Preguiça, David Hutchison
+%% Information Processing Letters
+%% Volume 101, Issue 6, 31 March 2007, Pages 255-261
+%%
+%% Provides scalable bloom filters that can grow indefinitely while
+%% ensuring a desired maximum false positive probability. Also provides
+%% standard partitioned bloom filters with a maximum capacity. Bit arrays
+%% are dimensioned as a power of 2 to enable reusing hash values across
+%% filters through bit operations. Double hashing is used (no need for
+%% enhanced double hashing for partitioned bloom filters).
+%%
+
+-record(bloom, {
+    e, % error probability
+    n, % maximum number of elements
+    mb, % 2^mb = m, the size of each slice (bitvector)
+    size, % number of elements
+    a % list of bitvectors
+}).
+
+-record(sbf, {
+    e, % error probability
+    r, % error probability ratio
+    s, % log 2 of growth ratio
+    size, % number of elements
+    b % list of plain bloom filters
+}).
+
+%% Constructors for (fixed capacity) bloom filters
+%%
+%% L - list of elements to insert into a new filter
+%% N - capacity
+%% E - error probability
+%%
+%% bloom(L) builds a filter with error probability 0.001 that already
+%% contains every element of L. bloom(N) is bloom(N, 0.001).
+
+bloom(L) when is_list(L) ->
+    %% bloom/2 rejects capacities below 4 / E (4000 for the default E of
+    %% 0.001), so short or empty lists are given that minimum capacity
+    %% instead of failing with function_clause.
+    Capacity = max(length(L), 4000),
+    lists:foldl(fun add/2, bloom(Capacity), L);
+bloom(N) when is_integer(N) -> bloom(N, 0.001).
+bloom(N, E) when is_number(N), N > 0,
+                 is_float(E), E > 0, E < 1,
+                 N >= 4 / E -> % rule of thumb; due to double hashing
+    bloom(size, N, E).
+
+%% Internal constructor shared by the fixed and scalable filters.
+%%
+%% Mode = size: Dim is the wanted capacity and the slice size is derived
+%%              from it.
+%% Mode = bits: Dim is used directly as Mb (log2 of the slice size).
+%% E is the error probability.
+bloom(Mode, Dim, E) ->
+    %% K is the number of slices (one bit array is allocated per slice below).
+    K = 1 + trunc(log2(1 / E)),
+    P = pow(E, 1 / K),
+    case Mode of
+        size -> Mb = 1 + trunc(-log2(1 - pow(1 - P, 1 / Dim)));
+        bits -> Mb = Dim
+    end,
+    M = 1 bsl Mb,
+    %% The stored capacity is recomputed from the slice size actually chosen.
+    N = trunc(log(1 - P) / log(1 - 1 / M)),
+    #bloom{e = E, n = N, mb = Mb, size = 0,
+           a = [hipe_bifs:bitarray(1 bsl Mb, false) || _ <- lists:seq(1, K)]}.
+
+%% Base-2 logarithm.
+log2(X) -> log(X) / log(2).
+
+%% Constructors for scalable bloom filters
+%%
+%% N - initial capacity before expanding
+%% E - error probability
+%% S - growth ratio when full (log 2) can be 1, 2 or 3
+%% R - tightening ratio of error probability
+%%
+%% Defaults: E = 0.001, S = 1, and R = 0.85 / 0.75 / 0.65 for S = 1 / 2 / 3.
+
+sbf(N) -> sbf(N, 0.001).
+sbf(N, E) -> sbf(N, E, 1).
+sbf(N, E, 1) -> sbf(N, E, 1, 0.85);
+sbf(N, E, 2) -> sbf(N, E, 2, 0.75);
+sbf(N, E, 3) -> sbf(N, E, 3, 0.65).
+sbf(N, E, S, R) when is_number(N), N > 0,
+                     is_float(E), E > 0, E < 1,
+                     is_integer(S), S > 0, S < 4,
+                     is_float(R), R > 0, R < 1,
+                     N >= 4 / (E * (1 - R)) -> % rule of thumb; due to double hashing
+    #sbf{e = E, s = S, r = R, size = 0, b = [bloom(N, E * (1 - R))]}.
+
+%% Returns number of elements
+%%
+size(#bloom{size = Size}) -> Size;
+size(#sbf{size = Size}) -> Size.
+
+%% Returns capacity
+%%
+capacity(#bloom{n = N}) -> N;
+capacity(#sbf{}) -> infinity.
+
+%% Test for membership
+%%
+%% For a scalable filter the hashes are computed once, using the slice size
+%% of the first filter in the list, and reused for every filter it contains.
+member(Elem, #bloom{mb = Mb} = B) ->
+    Hashes = make_hashes(Mb, Elem),
+    hash_member(Hashes, B);
+member(Elem, #sbf{b = [H | _]} = Sbf) ->
+    Hashes = make_hashes(H#bloom.mb, Elem),
+    hash_member(Hashes, Sbf).
+
+%% Membership test on precomputed hashes. A scalable filter contains the
+%% element when any of its plain filters does.
+hash_member(Hashes, #bloom{mb = Mb, a = A}) ->
+    %% bsl and - share one precedence level and associate to the left, so
+    %% this is (1 bsl Mb) - 1: a mask of the Mb low bits.
+    Mask = 1 bsl Mb - 1,
+    {I1, I0} = make_indexes(Mask, Hashes),
+    all_set(Mask, I1, I0, A);
+hash_member(Hashes, #sbf{b = B}) ->
+    lists:any(fun(X) -> hash_member(Hashes, X) end, B).
+
+%% Hash material for an element: a single 32-bit hash when Mb =< 16, a pair
+%% of 32-bit hashes when Mb =< 32. There is no clause for Mb > 32, so larger
+%% slice sizes fail with function_clause.
+make_hashes(Mb, E) when Mb =< 16 ->
+    erlang:phash2({E}, 1 bsl 32);
+make_hashes(Mb, E) when Mb =< 32 ->
+    {erlang:phash2({E}, 1 bsl 32), erlang:phash2([E], 1 bsl 32)}.
+
+%% Turns hash material into the pair of masked indexes used for double
+%% hashing. A hash pair is used as is only when the mask is wider than 16
+%% bits; otherwise a single 32-bit hash is split into its high part
+%% (bsr 16) and the hash itself, both masked.
+make_indexes(Mask, {H0, H1}) when Mask > 1 bsl 16 -> masked_pair(Mask, H0, H1);
+make_indexes(Mask, {H0, _}) -> make_indexes(Mask, H0);
+make_indexes(Mask, H0) -> masked_pair(Mask, H0 bsr 16, H0).
+
+%% Applies Mask to both values.
+masked_pair(Mask, X, Y) -> {X band Mask, Y band Mask}.
+
+%% True when the probed bit is set in every slice. The index starts at I and
+%% advances by I1 (wrapped with Mask) from one slice to the next; the walk
+%% stops at the first unset bit.
+all_set(_Mask, _I1, _I, []) -> true;
+all_set(Mask, I1, I, [H | T]) ->
+    case hipe_bifs:bitarray_sub(H, I) of
+        true -> all_set(Mask, I1, (I + I1) band Mask, T);
+        false -> false
+    end.
+
+%% Adds element to set
+%%
+%% Scalable filter: an element already reported as present is not added
+%% again. Otherwise it goes into the first filter of the list while that
+%% filter is below its capacity; once it is full, a new filter with
+%% Mb + S bits per slice and error probability E * R is created, receives
+%% the element and is put at the front of the list.
+add(Elem, #bloom{mb = Mb} = B) ->
+    Hashes = make_hashes(Mb, Elem),
+    hash_add(Hashes, B);
+add(Elem, #sbf{size = Size, r = R, s = S, b = [H | T] = Bs} = Sbf) ->
+    #bloom{mb = Mb, e = E, n = N, size = HSize} = H,
+    Hashes = make_hashes(Mb, Elem),
+    case hash_member(Hashes, Sbf) of
+        true -> Sbf;
+        false ->
+            case HSize < N of
+                true -> Sbf#sbf{size = Size + 1, b = [hash_add(Hashes, H) | T]};
+                false ->
+                    B = add(Elem, bloom(bits, Mb + S, E * R)),
+                    Sbf#sbf{size = Size + 1, b = [B | Bs]}
+            end
+    end.
+
+%% Adds precomputed hashes to a plain filter. The filter is returned
+%% unchanged (size included) when all probed bits were already set.
+hash_add(Hashes, #bloom{mb = Mb, a = A, size = Size} = B) ->
+    %% (1 bsl Mb) - 1: mask of the Mb low bits.
+    Mask = 1 bsl Mb - 1,
+    {I1, I0} = make_indexes(Mask, Hashes),
+    case all_set(Mask, I1, I0, A) of
+        true -> B;
+        false -> B#bloom{size = Size + 1, a = set_bits(Mask, I1, I0, A, [])}
+    end.
+
+%% Sets the probed bit in every slice, using the same index progression as
+%% all_set/4, and returns the slices in their original order.
+set_bits(_Mask, _I1, _I, [], Acc) -> lists:reverse(Acc);
+set_bits(Mask, I1, I, [H | T], Acc) ->
+    set_bits(Mask, I1, (I + I1) band Mask, T, [hipe_bifs:bitarray_update(H, I, true) | Acc]).
+
diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl
new file mode 100644
index 0000000..49161a0
--- /dev/null
+++ b/mod_pottymouth/src/mod_pottymouth.erl
@@ -0,0 +1,67 @@
+-module(mod_pottymouth).
+
+-behaviour(gen_mod).
+
+-include("logger.hrl").
+
+-export([
+  start/2,
+  stop/1,
+  on_filter_packet/1,
+  mod_opt_type/1
+]).
+
+-include("ejabberd.hrl").
+
+-import(bloom_gen_server, [start/0, stop/0, member/1]).
+
+%% Picks the blacklist language for a message from its attribute list.
+%% Returns the language as an atom, or default when the message carries no
+%% language attribute or names a language without a running blacklist server.
+getMessageLang(Attrs) ->
+    %% The README documents xml:lang; the bare <<"lang">> key the module used
+    %% to look up is still honoured as a fallback.
+    case findFirstAttr([<<"xml:lang">>, <<"lang">>], Attrs) of
+        false -> default;
+        LangBin -> knownLang(LangBin)
+    end.
+
+%% Value of the first attribute in Names that is present in Attrs, or false.
+findFirstAttr([], _Attrs) ->
+    false;
+findFirstAttr([Name | Rest], Attrs) ->
+    case lists:keyfind(Name, 1, Attrs) of
+        {Name, Value} -> Value;
+        false -> findFirstAttr(Rest, Attrs)
+    end.
+
+%% Maps a client-supplied language tag onto a language that has a blacklist
+%% server, falling back to default.
+knownLang(LangBin) ->
+    %% The tag is untrusted input: only reuse atoms that already exist so a
+    %% client cannot grow the atom table with made-up language tags.
+    try binary_to_existing_atom(LangBin, utf8) of
+        Lang ->
+            case whereis(bloom_gen_server:serverName(Lang)) of
+                undefined -> default;
+                _Pid -> Lang
+            end
+    catch
+        error:badarg -> default
+    end.
+
+%% Returns "****" for a blacklisted word and the word itself otherwise.
+censorWord({_Lang, Word} = MessageTerm) ->
+    case bloom_gen_server:member(MessageTerm) of
+        true -> "****";
+        false -> Word
+    end.
+
+%% Censors every {Lang, Word} term of the list, keeping the order.
+filterWords(L) ->
+    lists:map(fun censorWord/1, L).
+
+%% gen_mod callback: starts one blacklist server per configured
+%% {Lang, BlacklistFile} entry and installs the global packet filter.
+start(_Host, Opts) ->
+    Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []),
+    lists:foreach(fun bloom_gen_server:start/1, Blacklists),
+    ejabberd_hooks:add(filter_packet, global, ?MODULE, on_filter_packet, 0),
+    ok.
+
+%% gen_mod callback: removes the packet filter.
+%% The blacklist servers are left running: bloom_gen_server offers no stop
+%% function, and calling the non-existent bloom_gen_server:stop/0 used to
+%% fail with undef before the hook could be removed.
+stop(_Host) ->
+    ejabberd_hooks:delete(filter_packet, global, ?MODULE, on_filter_packet, 0),
+    ok.
+
+%% filter_packet hook. Censors the text of every <body/> child of a message
+%% stanza, whatever other children the stanza has; everything else (dropped
+%% packets, non-message stanzas) is passed through untouched.
+on_filter_packet(drop) ->
+    drop;
+on_filter_packet({From, To, {xmlel, <<"message">>, Attrs, Els}}) ->
+    Lang = getMessageLang(Attrs),
+    {From, To, {xmlel, <<"message">>, Attrs, [filterBody(Lang, El) || El <- Els]}};
+on_filter_packet(Msg) ->
+    % Handle the generic case (any packet that isn't a message).
+    Msg.
+
+%% Censors the character data of a <body/> element; other elements are
+%% returned unchanged.
+filterBody(Lang, {xmlel, <<"body">>, BodyAttrs, Children}) ->
+    {xmlel, <<"body">>, BodyAttrs, [filterCData(Lang, Child) || Child <- Children]};
+filterBody(_Lang, El) ->
+    El.
+
+%% Censors the space-separated words of one character-data node.
+filterCData(Lang, {xmlcdata, Text} = CData) ->
+    Words = string:tokens(binary_to_list(Text), " "),
+    case filterWords([{Lang, Word} || Word <- Words]) of
+        %% Nothing was censored: keep the node as is so the original spacing
+        %% of clean messages is preserved.
+        Words -> CData;
+        Filtered -> {xmlcdata, list_to_binary(string:join(Filtered, " "))}
+    end;
+filterCData(_Lang, Other) ->
+    Other.
+
+%% gen_mod option validation: blacklists must be a list.
+mod_opt_type(blacklists) -> fun (A) when is_list(A) -> A end;
+mod_opt_type(_) -> [blacklists].