From 83117240d2b07af4ca5be098060f2c46c52b74a8 Mon Sep 17 00:00:00 2001 From: Holger Weiss Date: Tue, 9 Apr 2019 00:51:33 +0200 Subject: [PATCH] Import mod_spam_filter Add a module that allows for filtering spam messages and subscription requests based on lists of known spammer JIDs and/or URLs mentioned in spam messages. Thanks to Georg Lukas for his suggestions. --- mod_spam_filter/COPYING | 342 +++++++++++++ mod_spam_filter/README.txt | 72 +++ mod_spam_filter/conf/mod_spam_filter.yml | 2 + mod_spam_filter/mod_spam_filter.spec | 5 + mod_spam_filter/src/mod_spam_filter.erl | 592 +++++++++++++++++++++++ 5 files changed, 1013 insertions(+) create mode 100644 mod_spam_filter/COPYING create mode 100644 mod_spam_filter/README.txt create mode 100644 mod_spam_filter/conf/mod_spam_filter.yml create mode 100644 mod_spam_filter/mod_spam_filter.spec create mode 100644 mod_spam_filter/src/mod_spam_filter.erl diff --git a/mod_spam_filter/COPYING b/mod_spam_filter/COPYING new file mode 100644 index 0000000..cc498bd --- /dev/null +++ b/mod_spam_filter/COPYING @@ -0,0 +1,342 @@ +As a special exception, the authors give permission to link this program +with the OpenSSL library and distribute the resulting binary. + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. 
(Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. 
To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/mod_spam_filter/README.txt b/mod_spam_filter/README.txt new file mode 100644 index 0000000..f2657ff --- /dev/null +++ b/mod_spam_filter/README.txt @@ -0,0 +1,72 @@ + + mod_spam_filter - Filter spam messages based on JID/content + + Author: Holger Weiss + + + DESCRIPTION + ----------- + +This module allows for filtering spam messages and subscription requests +received from remote servers based on lists of known spammer JIDs and/or +URLs mentioned in spam messages. Traffic classified as spam is rejected +with an error (and an [info] message is logged) unless the sender is +subscribed to the recipient's presence. An access rule can be specified +to control which recipients are subject to spam filtering. + + + CONFIGURATION + ------------- + +In order to use this module, add a configuration snippet such as the +following: + + modules: + # [...] + mod_spam_filter: + spam_jids_file: "/etc/ejabberd/spam-filter/jids.txt" + spam_urls_file: "/etc/ejabberd/spam-filter/urls.txt" + +The configurable mod_spam_filter options are: + +- spam_jids_file (default: none) + + This option specifies the full path to a plain text file containing a + list of known spammer JIDs, one JID per line. Messages and subscription + requests sent from one of the listed JIDs will be classified as spam. + +- spam_urls_file (default: none) + + This option specifies the full path to a plain text file containing a + list of URLs known to be mentioned in spam message bodies. Messages + containing at least one of the listed URLs will be classified as spam. + Furthermore, the sender's JID will be cached, so that future traffic + originating from that JID will be classified as spam as well. + +- access_spam (default: none) + + This option defines the access rule to control who will be subject to + spam filtering. If the rule returns 'allow' for a given recipient, spam + messages aren't rejected for that recipient. By default, all recipients + are subject to spam filtering. 
+ +- cache_size (default: 10000) + + This option specifies the maximum number of JIDs that will be cached due + to sending spam URLs (see above). If that limit is exceeded, the least + recently used entries are removed from the cache. Setting this option + to 0 disables the caching feature. Note that separate caches are used + for each virtual host, and that the caches aren't distributed across + cluster nodes. + + + ejabberd COMMANDS + ----------------- + +This module provides ejabberdctl/API calls to reread the spam JID/URL +files, to show the JID cache, and to expire old entries from that cache. +See: + +$ ejabberdctl help reload-spam-filter-files +$ ejabberdctl help show-spam-filter-cache +$ ejabberdctl help expire-spam-filter-cache diff --git a/mod_spam_filter/conf/mod_spam_filter.yml b/mod_spam_filter/conf/mod_spam_filter.yml new file mode 100644 index 0000000..feba1e4 --- /dev/null +++ b/mod_spam_filter/conf/mod_spam_filter.yml @@ -0,0 +1,2 @@ +modules: + mod_spam_filter: {} diff --git a/mod_spam_filter/mod_spam_filter.spec b/mod_spam_filter/mod_spam_filter.spec new file mode 100644 index 0000000..fb6e656 --- /dev/null +++ b/mod_spam_filter/mod_spam_filter.spec @@ -0,0 +1,5 @@ +author: "Holger Weiss " +category: "data" +summary: "Filter spam messages based on sender JID and content" +home: "https://github.com/processone/ejabberd-contrib/tree/master/" +url: "git@github.com:processone/ejabberd-contrib.git" diff --git a/mod_spam_filter/src/mod_spam_filter.erl b/mod_spam_filter/src/mod_spam_filter.erl new file mode 100644 index 0000000..20d9926 --- /dev/null +++ b/mod_spam_filter/src/mod_spam_filter.erl @@ -0,0 +1,592 @@ +%%%---------------------------------------------------------------------- +%%% File : mod_spam_filter.erl +%%% Author : Holger Weiss +%%% Purpose : Filter spam messages based on sender JID and content +%%% Created : 31 Mar 2019 by Holger Weiss +%%% +%%% +%%% ejabberd, Copyright (C) 2019 ProcessOne +%%% +%%% This program is free software; you 
can redistribute it and/or +%%% modify it under the terms of the GNU General Public License as +%%% published by the Free Software Foundation; either version 2 of the +%%% License, or (at your option) any later version. +%%% +%%% This program is distributed in the hope that it will be useful, +%%% but WITHOUT ANY WARRANTY; without even the implied warranty of +%%% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +%%% General Public License for more details. +%%% +%%% You should have received a copy of the GNU General Public License along +%%% with this program; if not, write to the Free Software Foundation, Inc., +%%% 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +%%% +%%%---------------------------------------------------------------------- + +-module(mod_spam_filter). +-author('holger@zedat.fu-berlin.de'). + +-behaviour(gen_server). +-behaviour(gen_mod). + +%% gen_mod callbacks. +-export([start/2, + stop/1, + reload/3, + depends/2, + mod_opt_type/1, + mod_options/1]). + +%% gen_server callbacks. +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3]). + +%% ejabberd_hooks callbacks. +-export([s2s_in_handle_info/2, + s2s_receive_packet/1]). + +%% ejabberd_commands callbacks. +-export([get_commands_spec/0, reload_spam_filter_files/1, + show_spam_filter_cache/1, expire_spam_filter_cache/2]). + +-include("ejabberd_commands.hrl"). +-include("logger.hrl"). +-include("xmpp.hrl"). + +-define(SPAM_FILTER_CACHE, spam_filter_cache). +-define(COMMAND_TIMEOUT, timer:seconds(30)). + +-type url() :: binary(). +-type filename() :: binary() | none. +-type s2s_in_state() :: ejabberd_s2s_in:state(). + +-record(state, + {host = <<>> :: binary(), + url_set = sets:new() :: sets:set(url()), + jid_set = sets:new() :: sets:set(ljid()), + jid_cache = #{} :: map(), + max_cache_size = 0 :: non_neg_integer() | unlimited}). + +-type state() :: #state{}. 
+
+%%--------------------------------------------------------------------
+%% gen_mod callbacks.
+%%--------------------------------------------------------------------
+
+%% Register the module's ejabberd commands, then start the worker
+%% process for Host through gen_mod.
+-spec start(binary(), gen_mod:opts()) -> ok | {error, any()}.
+start(Host, Opts) ->
+    ejabberd_commands:register_commands(get_commands_spec()),
+    gen_mod:start_child(?MODULE, Host, Opts).
+
+%% Stop the worker process for Host.  The commands are unregistered only
+%% when gen_mod reports that the module is not loaded elsewhere.
+-spec stop(binary()) -> ok | {error, any()}.
+stop(Host) ->
+    case gen_mod:is_loaded_elsewhere(Host, ?MODULE) of
+        false ->
+            ejabberd_commands:unregister_commands(get_commands_spec());
+        true ->
+            ok
+    end,
+    gen_mod:stop_child(?MODULE, Host).
+
+%% Hand the old and new option lists to the worker process, which
+%% applies them asynchronously (see the {reload, _, _} cast).
+-spec reload(binary(), gen_mod:opts(), gen_mod:opts()) -> ok.
+reload(Host, NewOpts, OldOpts) ->
+    Proc = get_proc_name(Host),
+    gen_server:cast(Proc, {reload, NewOpts, OldOpts}).
+
+%% This module declares no dependencies on other modules.
+-spec depends(binary(), gen_mod:opts()) -> [{module(), hard | soft}].
+depends(_Host, _Opts) ->
+    [].
+
+%% Return the validator fun for a module option.
+-spec mod_opt_type(atom()) -> fun((term()) -> term()) | [atom()].
+mod_opt_type(Opt) when Opt =:= spam_jids_file; Opt =:= spam_urls_file ->
+    %% Both file options share one validator: 'none' is passed through,
+    %% any other value must name a file that can be opened for reading
+    %% (a failing open makes the match below crash, rejecting the value).
+    %% The accepted path is normalized to a binary.
+    fun(none) -> none;
+       (File) ->
+            {ok, Fd} = file:open(File, [read, raw]),
+            ok = file:close(Fd),
+            iolist_to_binary(File)
+    end;
+mod_opt_type(access_spam) ->
+    fun acl:access_rules_validator/1;
+mod_opt_type(cache_size) ->
+    %% 0 is accepted: the README documents it as the way to disable the
+    %% JID cache, and #state.max_cache_size is typed non_neg_integer().
+    %% NOTE(review): confirm the cache insertion code treats a limit of
+    %% 0 as "never cache".
+    fun(I) when is_integer(I), I >= 0 -> I;
+       (infinity) -> unlimited
+    end.
+
+%% Default values of all module options.
+-spec mod_options(binary()) -> [{atom(), any()}].
+mod_options(_Host) ->
+    [{spam_jids_file, none},
+     {spam_urls_file, none},
+     {access_spam, none},
+     {cache_size, 10000}].
+
+%%--------------------------------------------------------------------
+%% gen_server callbacks.
+%%--------------------------------------------------------------------
+-spec init(list()) -> {ok, state()} | {stop, term()}.
+init([Host, Opts]) ->
+    %% Load both spam lists first; the hooks are only installed once the
+    %% files could be read.  A read/open failure stops the process with
+    %% 'config_error' after logging the reason.
+    process_flag(trap_exit, true),
+    MaxSize = proplists:get_value(cache_size, Opts),
+    JIDsPath = proplists:get_value(spam_jids_file, Opts),
+    URLsPath = proplists:get_value(spam_urls_file, Opts),
+    try read_files(JIDsPath, URLsPath) of
+        {SpamJIDs, SpamURLs} ->
+            ejabberd_hooks:add(s2s_in_handle_info, Host, ?MODULE,
+                               s2s_in_handle_info, 90),
+            ejabberd_hooks:add(s2s_receive_packet, Host, ?MODULE,
+                               s2s_receive_packet, 50),
+            {ok, #state{host = Host,
+                        jid_set = SpamJIDs,
+                        url_set = SpamURLs,
+                        max_cache_size = MaxSize}}
+    catch
+        {Op, File, Reason} when Op == open; Op == read ->
+            ?CRITICAL_MSG("Cannot ~s ~s: ~s",
+                          [Op, File, format_error(Reason)]),
+            {stop, config_error}
+    end.
+
+%% Synchronous requests.  Every reply is wrapped in {spam_filter, _} so
+%% that a late reply can be recognized by s2s_in_handle_info/2.
+%%
+%%   {check_jid, LJID}         -> spam | ham (static list, then cache)
+%%   {check_urls, URLs, LJID}  -> spam | ham (a hit caches the sender)
+%%   {reload_files, J, U}      -> result of reload_files/3
+%%   {expire_cache, Age}       -> result of expire_cache/2
+%%   get_cache                 -> cache contents as a list
+-spec handle_call(term(), {pid(), term()}, state())
+      -> {reply, {spam_filter, term()}, state()} | {noreply, state()}.
+handle_call({check_jid, LJID}, _From, #state{jid_set = SpamJIDs} = State) ->
+    %% The cache is consulted only when the static list has no entry;
+    %% the lookup may hand back an updated state.
+    {Listed, State1} =
+        case sets:is_element(LJID, SpamJIDs) of
+            true -> {true, State};
+            false -> cache_lookup(LJID, State)
+        end,
+    Verdict =
+        case Listed of
+            true ->
+                ?DEBUG("Spam JID found: ~s", [jid:encode(LJID)]),
+                spam;
+            false ->
+                ?DEBUG("JID not listed: ~s", [jid:encode(LJID)]),
+                ham
+        end,
+    {reply, {spam_filter, Verdict}, State1};
+handle_call({check_urls, URLs, LJID}, _From,
+            #state{url_set = SpamURLs} = State) ->
+    IsListed = fun(URL) -> sets:is_element(URL, SpamURLs) end,
+    case lists:any(IsListed, URLs) of
+        true ->
+            ?DEBUG("Spam URL(s) found: ~p", [URLs]),
+            {reply, {spam_filter, spam}, cache_insert(LJID, State)};
+        false ->
+            ?DEBUG("URL(s) not listed: ~p", [URLs]),
+            {reply, {spam_filter, ham}, State}
+    end;
+handle_call({reload_files, JIDsFile, URLsFile}, _From, State) ->
+    {Outcome, NewState} = reload_files(JIDsFile, URLsFile, State),
+    {reply, {spam_filter, Outcome}, NewState};
+handle_call({expire_cache, Age}, _From, State) ->
+    {Outcome, NewState} = expire_cache(Age, State),
+    {reply, {spam_filter, Outcome}, NewState};
+handle_call(get_cache, _From, #state{jid_cache = JIDCache} = State) ->
+    {reply, {spam_filter, maps:to_list(JIDCache)}, State};
+handle_call(Request, From, State) ->
+    %% Unknown requests are logged and left unanswered.
+    ?ERROR_MSG("Got unexpected request from ~p: ~p", [From, Request]),
+    {noreply, State}.
+
+%% Asynchronous requests; {reload, NewOpts, OldOpts} is sent by reload/3.
+-spec handle_cast(term(), state()) -> {noreply, state()}.
+handle_cast({reload, NewOpts, OldOpts}, State) ->
+    JIDsFile = proplists:get_value(spam_jids_file, NewOpts),
+    URLsFile = proplists:get_value(spam_urls_file, NewOpts),
+    OldMax = proplists:get_value(cache_size, OldOpts),
+    NewMax = proplists:get_value(cache_size, NewOpts),
+    %% Plain term comparison: in Erlang's term order every integer sorts
+    %% below any atom, so an integer limit compares as smaller than the
+    %% atom 'unlimited' if that is what the option holds.
+    Resized =
+        if NewMax < OldMax ->
+               shrink_cache(State#state{max_cache_size = NewMax});
+           NewMax > OldMax ->
+               State#state{max_cache_size = NewMax};
+           true ->
+               State
+        end,
+    %% The reload outcome is already logged by reload_files/3.
+    {_Outcome, Reloaded} = reload_files(JIDsFile, URLsFile, Resized),
+    {noreply, Reloaded};
+handle_cast(Request, State) ->
+    ?ERROR_MSG("Got unexpected request from: ~p", [Request]),
+    {noreply, State}.
+
+%% No plain messages are expected; anything received is logged.
+-spec handle_info(term(), state()) -> {noreply, state()}.
+handle_info(Info, State) ->
+    ?ERROR_MSG("Got unexpected info: ~p", [Info]),
+    {noreply, State}.
+
+%% Remove the two hooks that init/1 installed for this host.
+-spec terminate(normal | shutdown | {shutdown, term()} | term(), state()) -> ok.
+terminate(Reason, #state{host = Host}) ->
+    ?DEBUG("Stopping spam filter process for ~s: ~p", [Host, Reason]),
+    ejabberd_hooks:delete(s2s_receive_packet, Host, ?MODULE,
+                          s2s_receive_packet, 50),
+    ejabberd_hooks:delete(s2s_in_handle_info, Host, ?MODULE,
+                          s2s_in_handle_info, 90).
+
+%% The state is carried over unchanged on code upgrade.
+-spec code_change({down, term()} | term(), state(), term()) -> {ok, state()}.
+code_change(_OldVsn, #state{host = Host} = State, _Extra) ->
+    ?DEBUG("Updating spam filter process for ~s", [Host]),
+    {ok, State}.
+
+%%--------------------------------------------------------------------
+%% Hook callbacks.
+%%--------------------------------------------------------------------
+
+%% s2s_receive_packet hook handler.
+%%
+%% Non-error messages and presence subscription requests are checked,
+%% provided needs_checking/2 says the recipient is subject to filtering.
+%% A stanza classified as spam is rejected and the hook chain is stopped
+%% with a 'drop' accumulator, as declared in the spec; everything else,
+%% including an accumulator that is already 'drop', passes through
+%% unchanged.
+-spec s2s_receive_packet({stanza() | drop, s2s_in_state()})
+      -> {stanza() | drop, s2s_in_state()} | {stop, {drop, s2s_in_state()}}.
+s2s_receive_packet({drop, _State} = Acc) ->
+    Acc;
+s2s_receive_packet({#message{from = From,
+                             to = #jid{lserver = LServer} = To,
+                             type = Type, body = Body} = Msg,
+                    State} = Acc) when Type =/= error ->
+    %% andalso keeps the original evaluation order: the sender and body
+    %% are only classified when the recipient is subject to filtering.
+    case needs_checking(From, To)
+        andalso message_verdict(LServer, From, Body) =:= spam of
+        true -> reject_and_drop(Msg, State);
+        false -> Acc
+    end;
+s2s_receive_packet({#presence{from = From,
+                              to = #jid{lserver = LServer} = To,
+                              type = subscribe} = Presence, State} = Acc) ->
+    case needs_checking(From, To)
+        andalso check_from(LServer, From) =:= spam of
+        true -> reject_and_drop(Presence, State);
+        false -> Acc
+    end;
+s2s_receive_packet({_Stanza, _State} = Acc) ->
+    Acc.
+
+%% Classify a message: the sender JID is checked first, and the body
+%% text is inspected for URLs only when the sender itself is not listed.
+-spec message_verdict(binary(), jid(), term()) -> ham | spam.
+message_verdict(LServer, From, Body) ->
+    case check_from(LServer, From) of
+        ham -> check_body(LServer, From, xmpp:get_text(Body));
+        spam -> spam
+    end.
+
+%% Reject a spam stanza and build the hook result that stops the hook
+%% chain with a 'drop' accumulator.
+-spec reject_and_drop(stanza(), s2s_in_state())
+      -> {stop, {drop, s2s_in_state()}}.
+reject_and_drop(Stanza, State) ->
+    reject(Stanza),
+    {stop, {drop, State}}.
+
+%% s2s_in_handle_info hook handler.
+%%
+%% check_from/2 and check_body/3 give up on a gen_server:call timeout;
+%% the reply may still arrive afterwards as a plain {Ref, {spam_filter, _}}
+%% message in the s2s process.  Such messages are consumed here; any
+%% other message is left for other handlers.
+-spec s2s_in_handle_info(s2s_in_state(), any())
+      -> s2s_in_state() | {stop, s2s_in_state()}.
+s2s_in_handle_info(State, {_Ref, {spam_filter, _}}) ->
+    ?DEBUG("Dropping expired spam filter result", []),
+    {stop, State};
+s2s_in_handle_info(State, _) ->
+    State.
+
+%%--------------------------------------------------------------------
+%% Internal functions.
+%%--------------------------------------------------------------------
+
+%% Decide whether a stanza from From to To has to be spam-checked.
+%% Recipients matching the 'access_spam' rule with 'allow' are exempt;
+%% for all others the check is skipped only when mod_roster reports that
+%% the sender is subscribed to the recipient.
+-spec needs_checking(jid(), jid()) -> boolean().
+needs_checking(From, #jid{lserver = LServer} = To) ->
+    Access = gen_mod:get_module_opt(LServer, ?MODULE, access_spam),
+    case acl:match_rule(LServer, Access, To) of
+        allow ->
+            ?DEBUG("Spam not filtered for ~s", [jid:encode(To)]),
+            false;
+        deny ->
+            ?DEBUG("Spam is filtered for ~s", [jid:encode(To)]),
+            not mod_roster:is_subscribed(From, To)
+    end.
+
+-spec check_from(binary(), jid()) -> ham | spam.
+%% Ask the spam filter process of Host whether the sender is a known spammer.
+%% The lookup uses the lower-cased bare JID.  If the call times out, a
+%% warning is logged and the sender is treated as ham.
+check_from(Host, From) ->
+    BareJID = jid:remove_resource(jid:tolower(From)),
+    Server = get_proc_name(Host),
+    try gen_server:call(Server, {check_jid, BareJID}) of
+        {spam_filter, Verdict} ->
+            Verdict
+    catch
+        exit:{timeout, _} ->
+            ?WARNING_MSG("Timeout while checking ~s against list of spammers",
+                         [jid:encode(From)]),
+            ham
+    end.
+
+%% Check the HTTP(S) URLs found in a message body against the spam URL list
+%% held by the spam filter process of Host.  Bodies without any URL are ham
+%% and do not involve the filter process at all; a timed-out call is logged
+%% and also treated as ham.
+-spec check_body(binary(), jid(), binary()) -> ham | spam.
+check_body(Host, From, Body) ->
+    case extract_urls(Body) of
+        none ->
+            ?DEBUG("No URL(s) found in message", []),
+            ham;
+        {urls, Found} ->
+            Server = get_proc_name(Host),
+            BareJID = jid:remove_resource(jid:tolower(From)),
+            try gen_server:call(Server, {check_urls, Found, BareJID}) of
+                {spam_filter, Verdict} ->
+                    Verdict
+            catch
+                exit:{timeout, _} ->
+                    ?WARNING_MSG("Timeout while checking body for spam URLs",
+                                 []),
+                    ham
+            end
+    end.
+
+%% Collect every `http://' or `https://' URL in Body; a URL runs up to the
+%% next whitespace character.  Returns `none' when there is no match.
+-spec extract_urls(binary()) -> {urls, [url()]} | none.
+extract_urls(Body) ->
+    Pattern = <<"https?://\\S+">>,
+    case re:run(Body, Pattern, [global, {capture, all, binary}]) of
+        nomatch ->
+            none;
+        {match, Matches} when is_list(Matches) ->
+            %% With `global', re:run/3 returns one list per match.
+            {urls, lists:flatten(Matches)}
+    end.
+
+-spec reload_files(filename(), filename(), state())
+      -> {{ok | error, binary()}, state()}.
+%% Re-read the spam JID and spam URL files and install the resulting sets in
+%% the state.  For each list an info message records whether its contents
+%% changed.  If a file cannot be opened or read (this includes lines that
+%% fail to parse), the error is logged and the previous state is returned
+%% unchanged together with `{error, Text}'.
+reload_files(JIDsFile, URLsFile, #state{host = Host} = State) ->
+    try read_files(JIDsFile, URLsFile) of
+        {JIDsSet, URLsSet} ->
+            case sets_equal(JIDsSet, State#state.jid_set) of
+                true ->
+                    ?INFO_MSG("Reloaded spam JIDs for ~s (unchanged)", [Host]);
+                false ->
+                    ?INFO_MSG("Reloaded spam JIDs for ~s (changed)", [Host])
+            end,
+            case sets_equal(URLsSet, State#state.url_set) of
+                true ->
+                    ?INFO_MSG("Reloaded spam URLs for ~s (unchanged)", [Host]);
+                false ->
+                    ?INFO_MSG("Reloaded spam URLs for ~s (changed)", [Host])
+            end,
+            Txt = <<"Reloaded spam JID/URL files">>,
+            {{ok, Txt}, State#state{jid_set = JIDsSet, url_set = URLsSet}}
+    catch {Op, File, Reason} when Op == open;
+                                  Op == read ->
+            %% Thrown by read_file/2; Op tells which step failed.
+            Txt = format("Cannot ~s ~s for ~s: ~s",
+                         [Op, File, Host, format_error(Reason)]),
+            ?ERROR_MSG("~s", [Txt]),
+            {{error, Txt}, State}
+    end.
+
+%% Load both list files.  Throws `{open | read, File, Reason}' on failure
+%% (see read_file/2).
+-spec read_files(filename(), filename()) -> {sets:set(ljid()), sets:set(url())}.
+read_files(JIDsFile, URLsFile) ->
+    {read_file(JIDsFile, fun parse_jid/1),
+     read_file(URLsFile, fun parse_url/1)}.
+
+%% Read File line by line, converting each line with ParseLine, and return
+%% the set of parsed entries.  `none' stands for "no file" and yields an
+%% empty set.
+%%
+%% Throws `{open, File, Reason}' if the file cannot be opened and
+%% `{read, File, Reason}' if reading or parsing a line fails.  The file is
+%% closed in either case.
+-spec read_file(filename(), fun((binary()) -> ljid() | url()))
+      -> sets:set(ljid()) | sets:set(url()).
+read_file(none, _ParseLine) ->
+    sets:new();
+read_file(File, ParseLine) ->
+    case file:open(File, [read, binary, raw, {read_ahead, 65536}]) of
+        {ok, Fd} ->
+            try read_line(Fd, ParseLine, sets:new())
+            catch throw:E -> throw({read, File, E})
+            after ok = file:close(Fd)
+            end;
+        {error, Reason} ->
+            throw({open, File, Reason})
+    end.
+
+%% Accumulate the parsed lines of Fd into Set until end of file.  A read
+%% error is thrown as the bare reason; read_file/2 adds the file name.
+-spec read_line(file:io_device(), fun((binary()) -> ljid() | url()),
+                sets:set(ljid()) | sets:set(url()))
+      -> sets:set(ljid()) | sets:set(url()).
+read_line(Fd, ParseLine, Set) ->
+    case file:read_line(Fd) of
+        {ok, Line} ->
+            read_line(Fd, ParseLine, sets:add_element(ParseLine(Line), Set));
+        {error, Reason} ->
+            throw(Reason);
+        eof ->
+            Set
+    end.
+
+-spec parse_jid(binary()) -> ljid().
+%% Parse one line of the spam JID file into a lower-cased bare JID.
+%% Trailing whitespace is ignored.  Throws `{bad_jid, Line}' (with the
+%% untrimmed line) if the text is not a valid JID.
+parse_jid(Line) ->
+    Trimmed = trim(Line),
+    try jid:decode(Trimmed) of
+        #jid{} = Decoded ->
+            jid:remove_resource(jid:tolower(Decoded))
+    catch
+        _:{bad_jid, _} ->
+            throw({bad_jid, Line})
+    end.
+
+%% Parse one line of the spam URL file.  After trailing whitespace has been
+%% removed, the line must consist of a single `http://' or `https://' URL
+%% (scheme matched case-insensitively); otherwise `{bad_url, Line}' is
+%% thrown with the untrimmed line.
+-spec parse_url(binary()) -> url().
+parse_url(Line) ->
+    Candidate = trim(Line),
+    Pattern = <<"https?://\\S+$">>,
+    case re:run(Candidate, Pattern, [anchored, caseless, {capture, none}]) of
+        nomatch ->
+            throw({bad_url, Line});
+        match ->
+            Candidate
+    end.
+
+%% Strip trailing whitespace (including the line terminator) from a binary.
+-spec trim(binary()) -> binary().
+trim(Bin) ->
+    TrailingSpace = <<"\\s+$">>,
+    re:replace(Bin, TrailingSpace, <<>>, [{return, binary}]).
+
+%% React to a stanza that was classified as spam.
+%%
+%% Messages other than groupchat/error ones are logged and bounced with a
+%% policy-violation error; subscription requests are only logged; anything
+%% else is ignored silently.
+-spec reject(stanza()) -> ok.
+reject(#message{from = Sender, type = Type, lang = Lang} = Msg)
+  when Type /= groupchat,
+       Type /= error ->
+    ?INFO_MSG("Rejecting unsolicited message from ~s", [jid:encode(Sender)]),
+    Bounce = xmpp:err_policy_violation(<<"Your traffic is unsolicited">>, Lang),
+    ejabberd_router:route_error(Msg, Bounce);
+reject(#presence{from = Sender}) ->
+    ?INFO_MSG("Rejecting unsolicited presence from ~s", [jid:encode(Sender)]);
+reject(_) ->
+    ok.
+
+%% Registered name of the spam filter process serving Host.
+-spec get_proc_name(binary()) -> atom().
+get_proc_name(Host) ->
+    gen_mod:get_module_proc(Host, ?MODULE).
+
+%% Two sets are equal when each one is a subset of the other.
+-spec sets_equal(sets:set(), sets:set()) -> boolean().
+sets_equal(A, B) ->
+    case sets:is_subset(A, B) of
+        true ->
+            sets:is_subset(B, A);
+        false ->
+            false
+    end.
+
+%% io_lib:format/2 with the result collapsed into a single binary.
+-spec format(io:format(), [term()]) -> binary().
+format(Format, Data) ->
+    Chars = io_lib:format(Format, Data),
+    iolist_to_binary(Chars).
+
+%% Human-readable text for the reasons thrown while reading the list files:
+%% the two parse errors raised in this module, or else a reason understood
+%% by file:format_error/1.
+-spec format_error(atom() | tuple()) -> binary().
+format_error({bad_jid, Text}) ->
+    <<"Not a valid JID: ", Text/binary>>;
+format_error({bad_url, Text}) ->
+    <<"Not an HTTP(S) URL: ", Text/binary>>;
+format_error(Posix) ->
+    list_to_binary(file:format_error(Posix)).
+
+%%--------------------------------------------------------------------
+%% Caching.
+%%--------------------------------------------------------------------
+-spec cache_insert(ljid(), state()) -> state().
+%% Remember LJID as a spammer, stamped with the current monotonic time in
+%% seconds.
+%%
+%% * With a maximum cache size of 0 nothing is cached.
+%% * If the size is limited and the cache is full, the oldest entries are
+%%   evicted by shrink_cache/1 before inserting.  shrink_cache/1 always
+%%   leaves fewer than MaxSize entries, so this recursion runs at most once.
+cache_insert(_LJID, #state{max_cache_size = 0} = State) ->
+    State;
+cache_insert(LJID, #state{jid_cache = Cache, max_cache_size = MaxSize} = State)
+  when MaxSize /= unlimited, map_size(Cache) >= MaxSize ->
+    cache_insert(LJID, shrink_cache(State));
+cache_insert(LJID, #state{jid_cache = Cache} = State) ->
+    ?INFO_MSG("Caching spam JID: ~s", [jid:encode(LJID)]),
+    Cache1 = Cache#{LJID => erlang:monotonic_time(second)},
+    State#state{jid_cache = Cache1}.
+
+%% Look LJID up in the cache.  A hit refreshes the entry's timestamp, so
+%% that entries are ordered by last use rather than by insertion.
+-spec cache_lookup(ljid(), state()) -> {boolean(), state()}.
+cache_lookup(LJID, #state{jid_cache = Cache} = State) ->
+    case Cache of
+        #{LJID := _Timestamp} ->
+            Cache1 = Cache#{LJID => erlang:monotonic_time(second)},
+            State1 = State#state{jid_cache = Cache1},
+            {true, State1};
+        #{} ->
+            {false, State}
+    end.
+
+%% Evict the least recently used entries so that at most half of
+%% `max_cache_size' (rounded down) remain.  Must only be called with an
+%% integer `max_cache_size'.
+%%
+%% Rounding down matters: with a maximum of 1, rounding to nearest would
+%% keep one entry, the cache would stay full and cache_insert/2 would loop
+%% forever.  The number of entries to drop is clamped at 0 because
+%% lists:nthtail/2 fails for a negative count, which happens whenever the
+%% cache is already at or below the target size.
+-spec shrink_cache(state()) -> state().
+shrink_cache(#state{jid_cache = Cache, max_cache_size = MaxSize} = State) ->
+    ShrinkedSize = MaxSize div 2,
+    N = max(0, map_size(Cache) - ShrinkedSize),
+    %% Ascending by timestamp: the first N elements are the oldest ones.
+    L = lists:keysort(2, maps:to_list(Cache)),
+    Cache1 = maps:from_list(lists:nthtail(N, L)),
+    State#state{jid_cache = Cache1}.
+
+%% Drop all cache entries that were not inserted or looked up during the
+%% last Age seconds.  Returns a status text with the number of expired
+%% entries along with the updated state.
+-spec expire_cache(integer(), state()) -> {{ok, binary()}, state()}.
+expire_cache(Age, #state{jid_cache = Cache} = State) ->
+    Threshold = erlang:monotonic_time(second) - Age,
+    Cache1 = maps:filter(fun(_, TS) -> TS >= Threshold end, Cache),
+    NumExp = map_size(Cache) - map_size(Cache1),
+    Txt = format("Expired ~B cache entries", [NumExp]),
+    {{ok, Txt}, State#state{jid_cache = Cache1}}.
+
+%%--------------------------------------------------------------------
+%% ejabberd command callbacks.
+%%--------------------------------------------------------------------
+-spec get_commands_spec() -> [ejabberd_commands()].
+%% Descriptions of the three commands implemented by this module:
+%% reloading the list files, showing the cache and expiring cache entries.
+get_commands_spec() ->
+    Reload =
+        #ejabberd_commands{name = reload_spam_filter_files, tags = [filter],
+                           desc = "Reload spam JID/URL files",
+                           module = ?MODULE,
+                           function = reload_spam_filter_files,
+                           args = [{host, binary}],
+                           result = {res, restuple}},
+    Show =
+        #ejabberd_commands{name = show_spam_filter_cache, tags = [filter],
+                           desc = "Show spam filter cache contents",
+                           module = ?MODULE,
+                           function = show_spam_filter_cache,
+                           args = [{host, binary}],
+                           result = {spammers, {list, {spammer, {tuple,
+                                     [{jid, string}, {timestamp, integer}]}}}}},
+    Expire =
+        #ejabberd_commands{name = expire_spam_filter_cache, tags = [filter],
+                           desc = "Remove old/unused spam JIDs from cache",
+                           module = ?MODULE,
+                           function = expire_spam_filter_cache,
+                           args = [{host, binary}, {seconds, integer}],
+                           result = {res, restuple}},
+    [Reload, Show, Expire].
+
+%% Command callback: reload the spam JID/URL files of one host, or of every
+%% local host when Host is <<"global">>.
+%%
+%% In the global case the hosts are processed in order and the first error
+%% is returned immediately; remaining hosts are not reloaded.  For a single
+%% host the request is sent to that host's spam filter process and its
+%% answer is returned as a string; a missing process or a timeout yields an
+%% error tuple.
+-spec reload_spam_filter_files(binary()) -> {ok | error, string()}.
+reload_spam_filter_files(<<"global">>) ->
+    ReloadHost = fun(Host) ->
+                         %% Crash with badmatch on the first failing host.
+                         {ok, _} = reload_spam_filter_files(Host)
+                 end,
+    try lists:foreach(ReloadHost, ejabberd_config:get_myhosts()) of
+        ok ->
+            {ok, "Reloaded spam JID/URL files"}
+    catch
+        error:{badmatch, {error, _Reason} = Error} ->
+            Error
+    end;
+reload_spam_filter_files(Host) ->
+    LServer = jid:nameprep(Host),
+    JIDsFile = gen_mod:get_module_opt(LServer, ?MODULE, spam_jids_file),
+    URLsFile = gen_mod:get_module_opt(LServer, ?MODULE, spam_urls_file),
+    Proc = get_proc_name(LServer),
+    Request = {reload_files, JIDsFile, URLsFile},
+    try gen_server:call(Proc, Request, ?COMMAND_TIMEOUT) of
+        {spam_filter, {Status, Txt}} ->
+            {Status, binary_to_list(Txt)}
+    catch
+        exit:{noproc, _} ->
+            {error, "Not configured for " ++ binary_to_list(Host)};
+        exit:{timeout, _} ->
+            {error, "Timeout while querying ejabberd"}
+    end.
+
+-spec show_spam_filter_cache(binary())
+      -> [{binary(), integer()}] | {error, string()}.
+%% Command callback: list the cached spam JIDs of Host with their
+%% timestamps.  The cache stores monotonic timestamps; adding
+%% erlang:time_offset(second) converts them to Erlang system time in
+%% seconds.  A missing filter process or a timeout yields an error tuple.
+show_spam_filter_cache(Host) ->
+    LServer = jid:nameprep(Host),
+    Proc = get_proc_name(LServer),
+    ToRow = fun({JID, TS}) ->
+                    {jid:encode(JID), TS + erlang:time_offset(second)}
+            end,
+    try gen_server:call(Proc, get_cache, ?COMMAND_TIMEOUT) of
+        {spam_filter, Entries} ->
+            lists:map(ToRow, Entries)
+    catch
+        exit:{noproc, _} ->
+            {error, "Not configured for " ++ binary_to_list(Host)};
+        exit:{timeout, _} ->
+            {error, "Timeout while querying ejabberd"}
+    end.
+
+%% Command callback: remove cache entries that have not been used for Age
+%% seconds, for one host or, with <<"global">>, for every local host.
+%%
+%% In the global case the hosts are processed in order and the first error
+%% is returned immediately; remaining hosts are left untouched.
+-spec expire_spam_filter_cache(binary(), integer()) -> {ok | error, string()}.
+expire_spam_filter_cache(<<"global">>, Age) ->
+    ExpireHost = fun(Host) ->
+                         %% Crash with badmatch on the first failing host.
+                         {ok, _} = expire_spam_filter_cache(Host, Age)
+                 end,
+    try lists:foreach(ExpireHost, ejabberd_config:get_myhosts()) of
+        ok ->
+            {ok, "Expired cache filter entries"}
+    catch
+        error:{badmatch, {error, _Reason} = Error} ->
+            Error
+    end;
+expire_spam_filter_cache(Host, Age) ->
+    LServer = jid:nameprep(Host),
+    Proc = get_proc_name(LServer),
+    Request = {expire_cache, Age},
+    try gen_server:call(Proc, Request, ?COMMAND_TIMEOUT) of
+        {spam_filter, {Status, Txt}} ->
+            {Status, binary_to_list(Txt)}
+    catch
+        exit:{noproc, _} ->
+            {error, "Not configured for " ++ binary_to_list(Host)};
+        exit:{timeout, _} ->
+            {error, "Timeout while querying ejabberd"}
+    end.