From 5662e1c530551a0266f3ace8563bbd950366f768 Mon Sep 17 00:00:00 2001 From: Tom Quackenbush Date: Tue, 13 Sep 2016 15:22:37 -0400 Subject: [PATCH] fix lookup/substitution of unicode (example: chinese) characters --- .DS_Store | Bin 0 -> 6148 bytes mod_pottymouth/src/mod_pottymouth.erl | 15 ++++++++++----- mod_pottymouth/src/normalize_leet.erl | 6 ++++-- 3 files changed, 14 insertions(+), 7 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..04767b967bcfeca655a83489f10aee9f1f045125 GIT binary patch literal 6148 zcmeHKJ8r^25S>XVkRl;U%DnfQ6L;Oex%6>*G0ZZ+1CTcoguqPC+`D( zlQSyqcir*pv$caI(x?CxpaN8Y3Q&Q&708x4eY?973s3A2dY0#x9s6wt|fyI$dyvbT<2&U$Tuf50v04{nCNQ?PhD k26{Wj!P@b{lOnI!9`|cvALw+%oet#BfayY`0*_YU9}2k_q5uE@ literal 0 HcmV?d00001 diff --git a/mod_pottymouth/src/mod_pottymouth.erl b/mod_pottymouth/src/mod_pottymouth.erl index 6d6ddae..e50c9f8 100644 --- a/mod_pottymouth/src/mod_pottymouth.erl +++ b/mod_pottymouth/src/mod_pottymouth.erl @@ -27,8 +27,11 @@ getMessageLang(Attrs) -> end, Lang. -censorWord({Lang, Word} = MessageTerm) -> - IsBadWord = bloom_gen_server:member({Lang, normalize_leet_gen_server:normalize(MessageTerm)}), +censorWord({Lang, Word} = _MessageTerm) -> + % we need unicode characters to normlize the word + NormalizedWord = normalize_leet_gen_server:normalize({Lang, unicode:characters_to_list(list_to_binary(Word))}), + % we need bytewise format for bloom lookup + IsBadWord = bloom_gen_server:member({Lang, binary_to_list(unicode:characters_to_binary(NormalizedWord))}), if IsBadWord -> "****"; @@ -41,9 +44,11 @@ filterWords(L) -> filterMessageText(MessageAttrs, MessageText) -> Lang = getMessageLang(MessageAttrs), + % we want to token-ize utf8 'words' MessageWords = string:tokens(unicode:characters_to_list(MessageText, utf8), " "), MessageTerms = [{Lang, Word} || Word <- MessageWords], - unicode:characters_to_binary(string:join(filterWords(MessageTerms), " ")). + % we get back bytewise format terms (rather than utf8) + list_to_binary(string:join(filterWords(MessageTerms), " ")). start(_Host, Opts) -> Blacklists = gen_mod:get_opt(blacklists, Opts, fun(A) -> A end, []), @@ -63,11 +68,11 @@ on_filter_packet(drop) -> drop; on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> - FilteredMessageWords = filterMessageText(BodyAttr, MessageText), + FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), {_From, _To, {xmlel, <<"message">>, _Attrs, [_chatState, {xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; on_filter_packet({_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, MessageText}] = _BodyCData} = _MessageBody] = _Els} = _Packet} = _Msg) -> - FilteredMessageWords = filterMessageText(BodyAttr, MessageText), + FilteredMessageWords = filterMessageText(BodyAttr, binary:bin_to_list(MessageText)), {_From, _To, {xmlel, <<"message">>, _Attrs, [{xmlel, <<"body">>, BodyAttr, [{xmlcdata, FilteredMessageWords}]}]}}; on_filter_packet(Msg) -> diff --git a/mod_pottymouth/src/normalize_leet.erl b/mod_pottymouth/src/normalize_leet.erl index fbd9bec..1f396af 100644 --- a/mod_pottymouth/src/normalize_leet.erl +++ b/mod_pottymouth/src/normalize_leet.erl @@ -1,3 +1,5 @@ +%% -*- coding: utf-8 -*- + -module(normalize_leet). -export([ @@ -17,9 +19,9 @@ checkMetaChar(Char) -> lists:member(Char, MetaChars). replaceChar(true, Char, X, Word) -> - re:replace(Word, ["\\", Char], X, [global,{return,list}]); + re:replace(Word, ["\\", Char], X, [unicode,global,{return,list}]); replaceChar(false, Char, X, Word) -> - re:replace(Word, Char, X, [global,{return,list}]). + re:replace(Word, Char, X, [unicode,global,{return,list}]). replaceLetters([H|T], CharMap, Word) -> CurChar = [H],