Fix lookup/substitution of Unicode (example: Chinese) characters

This commit is contained in:
Tom Quackenbush 2016-09-13 15:22:37 -04:00
parent 29b4b5ae30
commit 5662e1c530
No known key found for this signature in database
GPG Key ID: F08C0F59E57F9F5E
3 changed files with 14 additions and 7 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

View File

@ -27,8 +27,11 @@ getMessageLang(Attrs) ->
end, end,
Lang. Lang.
censorWord({Lang, Word} = MessageTerm) -> censorWord({Lang, Word} = _MessageTerm) ->
IsBadWord = bloom_gen_server:member({Lang, normalize_leet_gen_server:normalize(MessageTerm)}), % we need unicode characters to normalize the word
NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}),
% we need bytewise format for bloom lookup
IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}),
if if
IsBadWord -> IsBadWord ->
"****"; "****";
@ -41,9 +44,11 @@ filterWords(L) ->
filterMessageText(MessageAttrs, MessageText) -> filterMessageText(MessageAttrs, MessageText) ->
Lang = getMessageLang(MessageAttrs), Lang = getMessageLang(MessageAttrs),
% we want to tokenize UTF-8 'words'
MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "), MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "),
MessageTerms = [{Lang, Word} || Word <- MessageWords], MessageTerms = [{Lang, Word} || Word <- MessageWords],
unicode:characters_to_binary(string:join(filterWords(MessageTerms), " ")). % we get back bytewise format terms (rather than utf8)
list_to_binary(string:join(filterWords(MessageTerms), " ")).
start(_Host, Opts) -> start(_Host, Opts) ->
Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []), Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []),
@ -63,11 +68,11 @@ on_filter_packet(drop) ->
drop; drop;
on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) ->
FilteredMessageWords = filterMessageText(BodyAttr, MessageText), FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)),
{_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; {_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}};
on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) ->
FilteredMessageWords = filterMessageText(BodyAttr, MessageText), FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)),
{_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; {_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}};
on_filter_packet(Msg) -> on_filter_packet(Msg) ->

View File

@ -1,3 +1,5 @@
%% -*- coding: utf-8 -*-
-module(normalize_leet). -module(normalize_leet).
-export([ -export([
@ -17,9 +19,9 @@ checkMetaChar(Char) ->
lists:member(Char, MetaChars). lists:member(Char, MetaChars).
replaceChar(true, Char, X, Word) -> replaceChar(true, Char, X, Word) ->
re:replace(Word, ["\\", Char], X, [global,{return,list}]); re:replace(Word, ["\\", Char], X, [unicode,global,{return,list}]);
replaceChar(false, Char, X, Word) -> replaceChar(false, Char, X, Word) ->
re:replace(Word, Char, X, [global,{return,list}]). re:replace(Word, Char, X, [unicode,global,{return,list}]).
replaceLetters([H|T], CharMap, Word) -> replaceLetters([H|T], CharMap, Word) ->
CurChar = [H], CurChar = [H],