本文转载自微信公众号「UP技术控」,作者conan 。转载本文请联系UP技术控公众号。
敏感词、文字过滤是一个网站必不可少的功能,因此设计一个好的、高效的过滤算法是非常有必要的。
在实现文字过滤的算法中,DFA是唯一比较好的实现算法。DFA即Deterministic Finite Automaton,也就是确定有穷自动机,它是通过event和当前的state得到下一个state,即event+state=nextstate。在实现敏感词过滤的算法中,我们必须要减少运算,而DFA算法中几乎没有什么计算,有的只是状态的转换。
下面看一下在C#下的实现方式:
1、构建敏感词库类
/// <summary>
/// Builds the in-memory lexicon on first use. Each sensitive word (plus its
/// Traditional Chinese variant when that differs) is indexed by its first
/// character; the remainder of the word is stored in that character's
/// <c>WordGroup</c>.
/// </summary>
/// <returns>
/// <c>true</c> when the lexicon is available (already built, or built by this
/// call); <c>false</c> when the word source returned <c>null</c>. On failure the
/// lexicon is left unset so that a later call retries the load.
/// </returns>
private bool LoadDictionary()
{
    // Already built by an earlier call; nothing to do.
    if (_memoryLexicon != null)
    {
        return true;
    }

    var words = new SensitiveWordBll().GetAllWords();
    if (words == null)
    {
        // Do not publish an empty lexicon here: that would make every later
        // call report success while matching nothing.
        return false;
    }

    var wordList = new List<string>();
    foreach (string word in words)
    {
        // Null or empty entries cannot be indexed by a first character.
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        wordList.Add(word);

        // Also register the Traditional Chinese form when it differs.
        var chineseWord = Microsoft.VisualBasic.Strings.StrConv(
            word,
            Microsoft.VisualBasic.VbStrConv.TraditionalChinese,
            0);
        if (word != chineseWord)
        {
            wordList.Add(chineseWord);
        }
    }

    // One slot per possible char value, including '\uFFFF' itself
    // (char.MaxValue is the largest index, so the length must be one more).
    var lexicon = new WordGroup[char.MaxValue + 1];
    foreach (var word in wordList)
    {
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        var group = lexicon[word[0]];
        if (group == null)
        {
            group = new WordGroup();
            lexicon[word[0]] = group;
        }

        // The first character is the index key; store only what follows it.
        group.Add(word.Substring(1));
    }

    // Publish only once the lexicon is fully populated.
    _memoryLexicon = lexicon;
    return true;
}
2、构建敏感词检测类
/// <summary>
/// Tests whether <paramref name="blackWord"/> (the part of a sensitive word
/// after its first character) occurs in the source text starting right after
/// the current cursor. Characters that are not Chinese, digits or letters are
/// stepped over, and a mismatching character may be skipped up to four times
/// in a row before the match is abandoned.
/// </summary>
/// <param name="blackWord">Remaining characters of the candidate word.</param>
/// <returns><c>true</c> if every character of the word was matched.</returns>
/// <remarks>
/// Resets and then updates <c>_wordlenght</c> and <c>_nextCursor</c> as it scans.
/// </remarks>
private bool Check(string blackWord)
{
    _wordlenght = 0;
    // Scanning starts at the position following the current cursor.
    _nextCursor = _cursor + 1;

    var matched = false;
    var consecutiveMisses = 0;
    var wordIndex = 0;

    while (wordIndex < blackWord.Length)
    {
        // Source text exhausted before the whole word was matched.
        if (_nextCursor >= _sourceText.Length)
        {
            matched = false;
            break;
        }

        // Count how many special characters sit in front of the next
        // comparable character.
        var skipped = 0;
        while (true)
        {
            var candidate = _sourceText[_nextCursor + skipped];
            if (IsChs(candidate) || IsNum(candidate) || IsAlphabet(candidate))
            {
                break;
            }

            skipped++;
            // Ran off the end while stepping over special characters.
            if (_nextCursor + skipped >= _sourceText.Length)
            {
                break;
            }

            _wordlenght++;
        }

        if (_nextCursor + skipped >= _sourceText.Length)
        {
            matched = false;
            break;
        }

        if (blackWord[wordIndex] == _sourceText[_nextCursor + skipped])
        {
            matched = true;
            consecutiveMisses = 0;
            wordIndex++;
        }
        else if (consecutiveMisses < 4 && _nextCursor < _sourceText.Length - 1)
        {
            // On a mismatch, retry the same word character against the next
            // source position (at most four consecutive times).
            consecutiveMisses++;
        }
        else
        {
            matched = false;
            break;
        }

        _nextCursor = _nextCursor + 1 + skipped;
        _wordlenght++;
    }

    return matched;
}
- }
3、测试与使用方法
- _illegalWords = new List
(); - if (string.IsNullOrEmpty(sourceText) && string.IsNullOrEmpty(_sourceText))
- {
- return sourceText;
- }
-
- if (!string.IsNullOrEmpty(sourceText))
- _sourceText = sourceText;
- _cursor = 0;
- if (!LoadDictionary())
- {
- return _sourceText;
- }
-
- var tempString = _sourceText.ToCharArray();
- var sourceTextDbc = ToDBC(SourceText);
- for (var i = 0; i < SourceText.Length; i++)
- {
- //查询以该字为首字符的词组
- var group = _memoryLexicon[sourceTextDbc[i]];
- if (group != null)
- {
- for (var z = 0; z < group.Count(); z++)
- {
- string word = group.GetWord(z);
- if (word.Length == 0 || Check(word))
- {
- if (isFirstCheckedReturn)
- {
- return null;
- }
-
- var blackword = string.Empty;
- for (var pos = 0; pos < _wordlenght + 1; pos++)
- {
- blackword += tempString[pos + _cursor].ToString();
- tempString[pos + _cursor] = ReplaceChar;
- }
- _illegalWords.Add(blackword);
-
- _cursor = _cursor + _wordlenght;
- i = i + _wordlenght;
- break;
- }
- }
- }
- _cursor++;
- }
- return new string(tempString);
// Usage demo: run the lexicon-based filter over a sample text, then run a
// naive String.Replace pass over the same text, printing the elapsed
// milliseconds and the result of each so the two approaches can be compared.
var filter = new SensitiveWordFilter();
filter.SourceText = "dddddd";
var sourceText = filter.SourceText;
filter.ResetMemoryLexicon();

// Stopwatch is used instead of DateTime.Now: it is monotonic and has far
// better resolution for short intervals.
var filterTimer = System.Diagnostics.Stopwatch.StartNew();
var ss = filter.Filter();
filterTimer.Stop();
Console.WriteLine(filterTimer.Elapsed.TotalMilliseconds);
Console.WriteLine(ss);

// Baseline: replace every word from the word-list file one by one.
// Reading the file is deliberately kept outside the timed region.
var words = System.IO.File.ReadAllLines(@"D:\Recv\敏感词库大全.txt", System.Text.Encoding.UTF8);
var ssx = sourceText;
var replaceTimer = System.Diagnostics.Stopwatch.StartNew();
foreach (var word in words)
{
    if (word.Length > 0)
    {
        // Mask the word with the same number of '*' characters.
        ssx = ssx.Replace(word, new string('*', word.Length));
    }
}
replaceTimer.Stop();
Console.WriteLine(replaceTimer.Elapsed.TotalMilliseconds);
Console.WriteLine(ssx);