算法简介
将关键词构造成一颗树,每个字都是一个节点。
遍历需要过滤的语句,将语句的每个字都去树中查找,看看是否存在。
实现难点
构造一棵树简单,关键点是php
中遍历字符串需要自己正确的得到单个字符的长度。
简单遍历字符串的方法如下:
$strLen = mb_strlen($str); | |
for ($i = 0; $i < $strLen; $i++) { | |
echo mb_substr($str, $i, 1, "utf8"),PHP_EOL; | |
} |
该方法是利用mb_*
系列函数来正确截取每个字符,处理大量字符串时速度非常慢,我猜测是:mb_substr
每截取一个字符,都要计算该字符串之前,有多少个字符。
正确的遍历字符串的方式是按utf8
的编码规律来截取字符串,具体请看下文。
算法实现
/** | |
* 非法关键词检查 | |
*/ | |
class SensitiveWords | |
{ | |
protected $tree = null; | |
protected $callIsNumeric = true; | |
/** | |
* 非法词汇列表,一个非法词汇占用一行 | |
*/ | |
public function __construct($path = __DIR__ . '/sensitiveWords.txt') | |
{ | |
$this->tree = new WordNode(); | |
$file = fopen($path, "r"); | |
while (!feof($file)) { | |
$words = trim(fgets($file)); | |
if ($words == '') { | |
continue; | |
} | |
//存在纯数字的非法词汇 | |
if (is_numeric($words)) { | |
$this->callIsNumeric = false; | |
} | |
$this->setTree($words); | |
} | |
fclose($file); | |
} | |
protected function setTree($words) | |
{ | |
$array = $this->strToArr($words); | |
$tree = $this->tree; | |
$l = count($array) - 1; | |
foreach ($array as $k => $item) { | |
$tree = $tree->getChildAlways($item); | |
if ($l == $k) { | |
$tree->end = true; | |
} | |
} | |
} | |
/** | |
* 返回包含的非法词汇 | |
* @param string $str | |
* @return array | |
*/ | |
public function check($str) | |
{ | |
//先压缩字符串 | |
$str = trim(str_replace([' ', "\n", "\r"], ['', '', ''], $str)); | |
$ret = []; | |
loop: | |
$strLen = strlen($str); | |
if ($strLen === 0) { | |
return array_unique($ret); | |
} | |
//非法词汇中没有纯数字的非法词汇,待检测字符串又是纯数字的,则跳过不再检查 | |
if ($this->callIsNumeric && is_numeric($str)) { | |
return array_unique($ret); | |
} | |
//挨个字符进行判断 | |
$tree = $this->tree; | |
$words = ''; | |
for ($i = 0; $i < $strLen; $i++) { | |
//unicode范围 --> ord 范围 | |
//一字节 0-127 --> 0 - 127 | |
//二字节 128-2047 --> 194 - 223 | |
//三字节 2048-65535 --> 224 - 239 | |
//四字节 65536-1114111 --> 240 - 244 | |
//@see http://shouce.jb51.net/gopl-zh/ch3/ch3-05.html | |
$ord = ord($str[$i]); | |
if ($ord <= 127) { | |
$word = $str[$i]; | |
} elseif ($ord <= 223) { | |
$word = $str[$i] . $str[$i + 1]; | |
$i += 1; | |
} elseif ($ord <= 239) { | |
$word = $str[$i] . $str[$i + 1] . $str[$i + 2]; | |
$i += 2; | |
} elseif ($ord <= 244) { | |
//四字节 | |
$word = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3]; | |
$i += 3; | |
} else { | |
//五字节php都溢出了 | |
//Parse error: Invalid UTF-8 codepoint escape sequence: Codepoint too large | |
continue; | |
} | |
//判断当前字符 | |
$tree = $tree->getChild($word); | |
if (is_null($tree)) { | |
//当前字不存在,则截取后再次循环 | |
$str = substr($str, $i + 1); | |
goto loop; | |
} else { | |
$words .= $word; | |
if ($tree->end) { | |
$ret[] = $words; | |
} | |
} | |
} | |
return array_unique($ret); | |
} | |
protected function strToArr($str) | |
{ | |
$array = []; | |
$strLen = mb_strlen($str); | |
for ($i = 0; $i < $strLen; $i++) { | |
$array[] = mb_substr($str, $i, 1, "utf8"); | |
} | |
return $array; | |
} | |
} | |
/** | |
* 单个字符的节点 | |
*/ | |
class WordNode | |
{ | |
//是否为非法词汇末级节点 | |
public $end = false; | |
//子节点 | |
protected $child = []; | |
/** | |
* @param string $word | |
* @return WordNode | |
*/ | |
public function getChildAlways($word) | |
{ | |
if (!isset($this->child[$word])) { | |
$this->child[$word] = new self(); | |
} | |
return $this->child[$word]; | |
} | |
/** | |
* @param string $word | |
* @return WordNode|null | |
*/ | |
public function getChild($word) | |
{ | |
if ($word === '') { | |
return null; | |
} | |
if (isset($this->child[$word])) { | |
return $this->child[$word]; | |
} | |
return null; | |
} | |
} |