mod_spam_filter: Also check body for listed JIDs

Also check whether the message body contains a listed JID (usually
mentioned as a contact address), rather than just using the JID list to
check the 'from' address of incoming stanzas.
This commit is contained in:
Holger Weiss 2019-04-09 20:46:12 +02:00
parent 8f63b2cbae
commit f459a7e57c
2 changed files with 84 additions and 42 deletions

View File

@ -34,6 +34,10 @@ The configurable mod_spam_filter options are:
This option specifies the full path to a plain text file containing a This option specifies the full path to a plain text file containing a
list of known spammer JIDs, one JID per line. Messages and subscription list of known spammer JIDs, one JID per line. Messages and subscription
requests sent from one of the listed JIDs will be classified as spam. requests sent from one of the listed JIDs will be classified as spam.
Messages containing at least one of the listed JIDs will be classified
as spam as well. Furthermore, the sender's JID will be cached, so that
future traffic originating from that JID will also be classified as
spam.
- spam_urls_file (default: none) - spam_urls_file (default: none)

View File

@ -62,12 +62,14 @@
-type url() :: binary(). -type url() :: binary().
-type filename() :: binary() | none. -type filename() :: binary() | none.
-type jid_set() :: sets:set(ljid()).
-type url_set() :: sets:set(url()).
-type s2s_in_state() :: ejabberd_s2s_in:state(). -type s2s_in_state() :: ejabberd_s2s_in:state().
-record(state, -record(state,
{host = <<>> :: binary(), {host = <<>> :: binary(),
url_set = sets:new() :: sets:set(url()), url_set = sets:new() :: url_set(),
jid_set = sets:new() :: sets:set(ljid()), jid_set = sets:new() :: jid_set(),
jid_cache = #{} :: map(), jid_cache = #{} :: map(),
max_cache_size = 0 :: non_neg_integer() | unlimited}). max_cache_size = 0 :: non_neg_integer() | unlimited}).
@ -156,34 +158,18 @@ init([Host, Opts]) ->
-spec handle_call(term(), {pid(), term()}, state()) -spec handle_call(term(), {pid(), term()}, state())
-> {reply, {spam_filter, term()}, state()} | {noreply, state()}. -> {reply, {spam_filter, term()}, state()} | {noreply, state()}.
handle_call({check_jid, From}, _From, #state{jid_set = JIDsSet} = State) -> handle_call({check_jid, From}, _From, #state{jid_set = JIDsSet} = State) ->
{Result, State2} = {Result, State1} = filter_jid(From, JIDsSet, State),
case sets:is_element(From, JIDsSet) of
true ->
?DEBUG("Spam JID found: ~s", [jid:encode(From)]),
{spam, State};
false ->
case cache_lookup(From, State) of
{true, State1} ->
?DEBUG("Spam JID found: ~s", [jid:encode(From)]),
{spam, State1};
{false, State1} ->
?DEBUG("JID not listed: ~s", [jid:encode(From)]),
{ham, State1}
end
end,
{reply, {spam_filter, Result}, State2};
handle_call({check_urls, URLs, From}, _From,
#state{url_set = URLsSet} = State) ->
{Result, State1} =
case lists:any(fun(URL) -> sets:is_element(URL, URLsSet) end, URLs) of
true ->
?DEBUG("Spam URL(s) found: ~p", [URLs]),
{spam, cache_insert(From, State)};
false ->
?DEBUG("URL(s) not listed: ~p", [URLs]),
{ham, State}
end,
{reply, {spam_filter, Result}, State1}; {reply, {spam_filter, Result}, State1};
handle_call({check_body, URLs, JIDs, From}, _From,
#state{url_set = URLsSet, jid_set = JIDsSet} = State) ->
{Result1, State1} = filter_body(URLs, URLsSet, From, State),
{Result2, State2} = filter_body(JIDs, JIDsSet, From, State1),
Result = if Result1 == spam ->
Result1;
true ->
Result2
end,
{reply, {spam_filter, Result}, State2};
handle_call({reload_files, JIDsFile, URLsFile}, _From, State) -> handle_call({reload_files, JIDsFile, URLsFile}, _From, State) ->
{Result, State1} = reload_files(JIDsFile, URLsFile, State), {Result, State1} = reload_files(JIDsFile, URLsFile, State),
{reply, {spam_filter, Result}, State1}; {reply, {spam_filter, Result}, State1};
@ -324,21 +310,20 @@ check_from(Host, From) ->
-spec check_body(binary(), jid(), binary()) -> ham | spam. -spec check_body(binary(), jid(), binary()) -> ham | spam.
check_body(Host, From, Body) -> check_body(Host, From, Body) ->
case extract_urls(Body) of case {extract_urls(Body), extract_jids(Body)} of
{urls, URLs} -> {none, none} ->
?DEBUG("No JIDs/URLs found in message", []),
ham;
{URLs, JIDs} ->
Proc = get_proc_name(Host), Proc = get_proc_name(Host),
LFrom = jid:remove_resource(jid:tolower(From)), LFrom = jid:remove_resource(jid:tolower(From)),
try gen_server:call(Proc, {check_urls, URLs, LFrom}) of try gen_server:call(Proc, {check_body, URLs, JIDs, LFrom}) of
{spam_filter, Result} -> {spam_filter, Result} ->
Result Result
catch exit:{timeout, _} -> catch exit:{timeout, _} ->
?WARNING_MSG("Timeout while checking body for spam URLs", ?WARNING_MSG("Timeout while checking body", []),
[]),
ham ham
end; end
none ->
?DEBUG("No URL(s) found in message", []),
ham
end. end.
-spec extract_urls(binary()) -> {urls, [url()]} | none. -spec extract_urls(binary()) -> {urls, [url()]} | none.
@ -352,6 +337,59 @@ extract_urls(Body) ->
none none
end. end.
%% Scan a message body for anything that looks like a JID.
%%
%% Every run of non-whitespace characters containing an "@" is treated as a
%% candidate and handed to try_decode_jid/1; candidates that fail to decode
%% are dropped.  Returns `none' when the body contains no candidate at all,
%% and `{jids, List}' otherwise (List may be empty if no candidate decoded).
-spec extract_jids(binary()) -> {jids, [ljid()]} | none.
extract_jids(Body) ->
    case re:run(Body, <<"\\S+@\\S+">>, [global, {capture, all, binary}]) of
        nomatch ->
            none;
        {match, Matches} when is_list(Matches) ->
            %% With `global', re:run/3 yields one sub-list per match; flatten
            %% them into a plain list of candidate binaries.
            Candidates = lists:flatten(Matches),
            {jids, [LJID || Candidate <- Candidates,
                            {true, LJID} <- [try_decode_jid(Candidate)]]}
    end.
%% Try to turn a candidate string into a normalised JID.
%%
%% Returns `{true, LJID}' when jid:decode/1 yields a #jid{} record; the
%% result is passed through jid:tolower/1 and jid:remove_resource/1 before
%% being returned.  Returns `false' when decoding raises an exception whose
%% reason matches `{bad_jid, _}'.  Any other exception is not caught here.
%% The `{true, Value} | false' shape suits lists:filtermap/2.
-spec try_decode_jid(binary()) -> {true, ljid()} | false.
try_decode_jid(S) ->
    try jid:decode(S) of
        #jid{} = Decoded ->
            Lowered = jid:tolower(Decoded),
            {true, jid:remove_resource(Lowered)}
    catch
        _Class:{bad_jid, _Input} ->
            false
    end.
%% Classify a sender address as spam or ham.
%%
%% The address is spam if it is a member of Set or, failing that, if
%% cache_lookup/2 reports a hit.  The cache is only consulted when the
%% address is not in Set, and the state returned by cache_lookup/2 is the
%% one handed back to the caller.
-spec filter_jid(ljid(), jid_set(), state()) -> {ham | spam, state()}.
filter_jid(From, Set, State) ->
    %% First decide whether the address is known, then log and report once.
    {Known, State1} =
        case sets:is_element(From, Set) of
            true ->
                {true, State};
            false ->
                cache_lookup(From, State)
        end,
    case Known of
        true ->
            ?DEBUG("Spam JID found: ~s", [jid:encode(From)]),
            {spam, State1};
        false ->
            ?DEBUG("JID not listed: ~s", [jid:encode(From)]),
            {ham, State1}
    end.
%% Check the addresses extracted from a message body against a blocklist.
%%
%% The first argument is the result of one of the body extractors: a tagged
%% list of URLs or JIDs, or `none' when nothing was extracted.  Set is the
%% matching blocklist.  From is the sender's address in ljid() form (the
%% caller passes the result of jid:tolower/1 and jid:remove_resource/1).
%%
%% Returns `{spam, State1}' if at least one extracted address is a member of
%% Set; in that case the sender is also handed to cache_insert/2.  Otherwise
%% returns `{ham, State}' with the state untouched.
-spec filter_body({urls, [url()]} | {jids, [ljid()]} | none,
                  url_set() | jid_set(), ljid(), state())
      -> {ham | spam, state()}.
filter_body({_, Addrs}, Set, From, State) ->
    case lists:any(fun(Addr) -> sets:is_element(Addr, Set) end, Addrs) of
        true ->
            ?DEBUG("Spam addresses found: ~p", [Addrs]),
            %% Cache the sender so it is recorded alongside the listed JIDs.
            {spam, cache_insert(From, State)};
        false ->
            ?DEBUG("Addresses not listed: ~p", [Addrs]),
            {ham, State}
    end;
filter_body(none, _Set, _From, State) ->
    %% Nothing was extracted from the body, so there is nothing to check.
    {ham, State}.
-spec reload_files(filename(), filename(), state()) -spec reload_files(filename(), filename(), state())
-> {{ok | error, binary()}, state()}. -> {{ok | error, binary()}, state()}.
reload_files(JIDsFile, URLsFile, #state{host = Host} = State) -> reload_files(JIDsFile, URLsFile, #state{host = Host} = State) ->
@ -379,13 +417,13 @@ reload_files(JIDsFile, URLsFile, #state{host = Host} = State) ->
{{error, Txt}, State} {{error, Txt}, State}
end. end.
-spec read_files(filename(), filename()) -> {sets:set(ljid()), sets:set(url())}. -spec read_files(filename(), filename()) -> {jid_set(), url_set()}.
read_files(JIDsFile, URLsFile) -> read_files(JIDsFile, URLsFile) ->
{read_file(JIDsFile, fun parse_jid/1), {read_file(JIDsFile, fun parse_jid/1),
read_file(URLsFile, fun parse_url/1)}. read_file(URLsFile, fun parse_url/1)}.
-spec read_file(filename(), fun((binary()) -> ljid() | url())) -spec read_file(filename(), fun((binary()) -> ljid() | url()))
-> sets:set(ljid()) | sets:set(url()). -> jid_set() | url_set().
read_file(none, _ParseLine) -> read_file(none, _ParseLine) ->
sets:new(); sets:new();
read_file(File, ParseLine) -> read_file(File, ParseLine) ->
@ -400,8 +438,8 @@ read_file(File, ParseLine) ->
end. end.
-spec read_line(file:io_device(), fun((binary()) -> ljid() | url()), -spec read_line(file:io_device(), fun((binary()) -> ljid() | url()),
sets:set(ljid()) | sets:set(url())) jid_set() | url_set())
-> sets:set(ljid()) | sets:set(url()). -> jid_set() | url_set().
read_line(Fd, ParseLine, Set) -> read_line(Fd, ParseLine, Set) ->
case file:read_line(Fd) of case file:read_line(Fd) of
{ok, Line} -> {ok, Line} ->