Fix lookup/substitution of Unicode (e.g. Chinese) characters
This commit is contained in:
parent
29b4b5ae30
commit
5662e1c530
|
@ -27,8 +27,11 @@ getMessageLang(Attrs) ->
|
||||||
end,
|
end,
|
||||||
Lang.
|
Lang.
|
||||||
|
|
||||||
censorWord({Lang, Word} = MessageTerm) ->
|
censorWord({Lang, Word} = _MessageTerm) ->
|
||||||
IsBadWord = bloom_gen_server:member({Lang, normalize_leet_gen_server:normalize(MessageTerm)}),
|
% we need unicode characters to normalize the word
|
||||||
|
NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}),
|
||||||
|
% we need bytewise format for bloom lookup
|
||||||
|
IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}),
|
||||||
if
|
if
|
||||||
IsBadWord ->
|
IsBadWord ->
|
||||||
"****";
|
"****";
|
||||||
|
@ -41,9 +44,11 @@ filterWords(L) ->
|
||||||
|
|
||||||
filterMessageText(MessageAttrs, MessageText) ->
|
filterMessageText(MessageAttrs, MessageText) ->
|
||||||
Lang = getMessageLang(MessageAttrs),
|
Lang = getMessageLang(MessageAttrs),
|
||||||
|
% we want to token-ize utf8 'words'
|
||||||
MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "),
|
MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "),
|
||||||
MessageTerms = [{Lang, Word} || Word <- MessageWords],
|
MessageTerms = [{Lang, Word} || Word <- MessageWords],
|
||||||
unicode:characters_to_binary(string:join(filterWords(MessageTerms), " ")).
|
% we get back bytewise format terms (rather than utf8)
|
||||||
|
list_to_binary(string:join(filterWords(MessageTerms), " ")).
|
||||||
|
|
||||||
start(_Host, Opts) ->
|
start(_Host, Opts) ->
|
||||||
Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []),
|
Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []),
|
||||||
|
@ -63,11 +68,11 @@ on_filter_packet(drop) ->
|
||||||
drop;
|
drop;
|
||||||
|
|
||||||
on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) ->
|
on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) ->
|
||||||
FilteredMessageWords = filterMessageText(BodyAttr, MessageText),
|
FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)),
|
||||||
{_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}};
|
{_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}};
|
||||||
|
|
||||||
on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) ->
|
on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) ->
|
||||||
FilteredMessageWords = filterMessageText(BodyAttr, MessageText),
|
FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)),
|
||||||
{_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}};
|
{_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}};
|
||||||
|
|
||||||
on_filter_packet(Msg) ->
|
on_filter_packet(Msg) ->
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
%% -*- coding: utf-8 -*-
|
||||||
|
|
||||||
-module(normalize_leet).
|
-module(normalize_leet).
|
||||||
|
|
||||||
-export([
|
-export([
|
||||||
|
@ -17,9 +19,9 @@ checkMetaChar(Char) ->
|
||||||
lists:member(Char, MetaChars).
|
lists:member(Char, MetaChars).
|
||||||
|
|
||||||
replaceChar(true, Char, X, Word) ->
|
replaceChar(true, Char, X, Word) ->
|
||||||
re:replace(Word, ["\\", Char], X, [global,{return,list}]);
|
re:replace(Word, ["\\", Char], X, [unicode,global,{return,list}]);
|
||||||
replaceChar(false, Char, X, Word) ->
|
replaceChar(false, Char, X, Word) ->
|
||||||
re:replace(Word, Char, X, [global,{return,list}]).
|
re:replace(Word, Char, X, [unicode,global,{return,list}]).
|
||||||
|
|
||||||
replaceLetters([H|T], CharMap, Word) ->
|
replaceLetters([H|T], CharMap, Word) ->
|
||||||
CurChar = [H],
|
CurChar = [H],
|
||||||
|
|
Loading…
Reference in New Issue