%% Erlang: text validation
%%%-------------------------------------------------------------------
%%% @author suyang
%%% @copyright (C) 2020, <COMPANY>
%%% @doc Text validation: sensitive-word detection and replacement.
%%%
%%% @end
%%% Created : 27. May 2020 12:28
%%%-------------------------------------------------------------------
-module(valid_test).
-author("suyang").

%% API
-export([valid_init/0, word_valid/1, test_word/0, word_valid_replace/1]).

-define(ETS_VALID_CONTENT, ets_valid_content).

%% =============================================================================
%% API
%% =============================================================================
%% @doc Initialise the sensitive-word filter.
%%
%% Creates the named, public ETS set used by this module if it does not
%% exist yet, then loads the built-in filter words into it. Calling this
%% function again is safe: an existing table is reused instead of raising
%% badarg from ets:new/2, and words that are already stored are skipped by
%% the loader.
%%
%% Note: a newly created table is owned by the calling process and
%% disappears when that process exits.
-spec valid_init() -> ok.
valid_init() ->
    %% ets:info/1 returns undefined when no table with this name exists.
    case ets:info(?ETS_VALID_CONTENT) of
        undefined ->
            ets:new(?ETS_VALID_CONTENT, [named_table, public, set, {read_concurrency, true}]);
        _Info ->
            ?ETS_VALID_CONTENT
    end,
    import_words(?ETS_VALID_CONTENT),
    ok.

%% @doc Check whether a text contains a filtered word.
%%
%% Accepts either a character list or a UTF-8 binary. A list is first
%% encoded to a binary and then handled by the binary clause, which decodes
%% it to a list of code points and scans it against the filter table.
%% Returns true when a filtered word is found, false otherwise (an empty
%% list is always false).
-spec word_valid(unicode:chardata()) -> boolean().
word_valid([]) ->
    false;
word_valid(Text) when is_binary(Text) ->
    CodePoints = unicode:characters_to_list(Text, unicode),
    word_valid(CodePoints, ?ETS_VALID_CONTENT);
word_valid(Text) when is_list(Text) ->
    word_valid(unicode:characters_to_binary(Text)).

%% Scan UniString (a list of code points) from left to right and report
%% whether any filtered word starts at some position in it.
%%
%% For the current position, the candidate words are those stored in the
%% ETS table EtsName under the first character. A candidate matches when it
%% is a prefix of the remaining text; lists:prefix/2 already yields false
%% for candidates longer than the remaining text, so no explicit length
%% comparison is needed. If nothing matches here, the scan continues one
%% character further.
%%
%% Returns true on the first match, false when the text is exhausted.
word_valid([], _EtsName) ->
    false;
word_valid([HeadChar | TailString] = UniString, EtsName) ->
    WordList = get_key_word(HeadChar, EtsName),
    IsPrefix = fun(Word) -> lists:prefix(Word, UniString) end,
    lists:any(IsPrefix, WordList) orelse word_valid(TailString, EtsName).

%% @doc Replace filtered words in a text.
%%
%% The input is decoded to a list of code points and handed to the
%% replacement scanner together with an empty result accumulator and the
%% filter table. The result is the accumulated character list; every
%% detected word is represented by two asterisks in it.
-spec word_valid_replace(unicode:chardata()) -> string().
word_valid_replace(Utf8String) ->
    CodePoints = unicode:characters_to_list(Utf8String, unicode),
    replace_sensitive(CodePoints, [], ?ETS_VALID_CONTENT).

%% Dispatch one step of the replacement scan.
%%
%% - Empty input: the scan is finished, return what has been accumulated.
%% - Non-empty list: process the next position.
%% - Anything that is not a list (for example an error tuple from a failed
%%   Unicode conversion): stop and return what has been accumulated so far.
replace_sensitive([], LastReplaced, _EtsName) ->
    LastReplaced;
replace_sensitive(Remaining, LastReplaced, EtsName) when is_list(Remaining) ->
    private_replace_sensitive(Remaining, LastReplaced, EtsName);
replace_sensitive(_NotAList, LastReplaced, _EtsName) ->
    LastReplaced.

%% @doc Manual smoke test: prints detection and replacement results for a
%% few sample texts via io:format/2.
%%
%% The filter table must already exist (see valid_init/0), because the
%% checks look words up in it.
%%
%% The sample strings are passed directly as literals. Destructuring the
%% result of io_lib:format/2 as a one-element list depends on the internal
%% shape of its chars() return value, which is not part of its contract.
-spec test_word() -> ok.
test_word() ->
    io:format("~p ~p~n", ["测试", word_valid_replace("测试")]),
    io:format("~p~n", [word_valid("测试")]),
    io:format("~p~n", [word_valid("毛泽东")]),
    io:format("~p~n", [word_valid("测试毛泽东")]),
    io:format("~p ~p~n", ["测试毛泽东陈毅", word_valid_replace("测试毛泽东陈毅")]).

%% =============================================================================
%% Internal Functions
%% =============================================================================
%% @doc Load the built-in filter words into the ETS table EtsName.
%%
%% All words are first encoded to binaries; only after every conversion
%% has completed are they added to the table one by one. Returns ok.
import_words(EtsName) ->
    Encoded = [unicode:characters_to_binary(Term) || Term <- get_filter_content()],
    Store = fun(WordBin) -> add_word_to_ets(WordBin, EtsName) end,
    lists:foreach(Store, Encoded),
    ok.

%% Add one filter word to the ETS table EtsName.
%%
%% Word is decoded to a list of code points. The table maps the first
%% character of a word to the list of all stored words starting with that
%% character, so the decoded word is prepended to the list found under its
%% first character.
%%
%% Returns the atom ignore for an empty word or a word that is already
%% stored; otherwise returns the result of ets:insert/2 (true).
add_word_to_ets(Word, EtsName) ->
    case unicode:characters_to_list(Word, unicode) of
        [] ->
            ignore;
        [HeadChar | _] = UniString ->
            insert_word_under_key(HeadChar, UniString, EtsName)
    end.

%% Store UniString in the word list kept under HeadChar, unless it is
%% already present there.
insert_word_under_key(HeadChar, UniString, EtsName) ->
    case ets:lookup(EtsName, HeadChar) of
        [] ->
            ets:insert(EtsName, {HeadChar, [UniString]});
        [{_Key, OldList}] ->
            case lists:member(UniString, OldList) of
                true -> ignore;
                false -> ets:insert(EtsName, {HeadChar, [UniString | OldList]})
            end
    end.

%% Fetch the list of filter words stored in the ETS table EtsName under
%% the character KeyChar. Returns an empty list when there is no entry for
%% that character.
get_key_word(KeyChar, EtsName) ->
    Rows = ets:lookup(EtsName, KeyChar),
    case Rows of
        [{_Key, Words}] -> Words;
        [] -> []
    end.

%% Built-in list of filter words, each given as a string literal.
%% The order of the entries is kept exactly as originally written.
get_filter_content() ->
    [
        "毛泽东", "毛主席", "主席", "习主席", "副主席",
        "周恩来", "刘少奇", "朱德", "彭德怀", "林彪",
        "刘伯承", "陈毅", "贺龙", "聂荣臻", "徐向前",
        "罗荣桓", "叶剑英", "李大钊", "陈独秀", "孙中山",
        "孙文", "孙逸仙",
        "邓小平", "陈云", "江泽民", "李鹏", "朱镕基",
        "李瑞环", "尉健行", "李岚清", "胡锦涛", "罗干",
        "温家宝", "吴邦国", "曾庆红"
    ].

%% @doc Process one position of the replacement scan.
%%
%% InputString is the non-empty remainder of the text (a list of code
%% points), LastReplaced is the output accumulated so far, and EtsName is
%% the filter table.
%%
%% The candidate words for the first character are folded through
%% match_replace/4, starting from 0. A fold result of 0 is treated as no
%% match: the first character is appended to the output and the scan moves
%% on by one character. Any other result is used as the number of leading
%% characters to drop from the input, and two asterisks are appended to the
%% output in their place.
%%
%% NOTE(review): match_replace/4 is not defined in the visible part of this
%% file; it presumably returns the length of the longest matching word seen
%% so far. Confirm against its definition.
private_replace_sensitive([HeadChar | TailString] = InputString, LastReplaced, EtsName) ->
    WordList = get_key_word(HeadChar, EtsName),
    InputStrLen = length(InputString),
    Match = fun(Word, Last) ->
        match_replace(Word, Last, InputString, InputStrLen)
    end,
    case lists:foldl(Match, 0, WordList) of
        0 ->
            NewReplaced = LastReplaced ++ [HeadChar],
            replace_sensitive(TailString, NewReplaced, EtsName);
        SensWordLen ->
            %% Skip the matched word: keep everything after its last character.
            LeftString = lists:sublist(InputString, SensWordLen + 1, InputStrLen - SensWordLen),
            NewReplaced = LastReplaced ++ "**",
            replace_sensitive(LeftString, NewReplaced, EtsName)
    end.