非法词汇过滤

算法简介

将关键词构造成一颗树，每个字都是一个节点。

遍历需要过滤的语句，将语句的每个字都去树中查找，看看是否存在。

实现难点

构造一棵树简单，关键点是php中遍历字符串需要自己正确的得到单个字符的长度。

简单遍历字符串的方法如下：

 
$strLen = mb_strlen($str);
for ($i = 0; $i < $strLen; $i++) {
    echo mb_substr($str, $i, 1, "utf8"),PHP_EOL;
}

该方法是利用mb_*系列函数来正确截取每个字符，处理大量字符串时速度非常慢，我猜测是：mb_substr每截取一个字符，都要计算该字符串之前，有多少个字符。

正确的遍历字符串的方式是按utf8的编码规律来截取字符串，具体请看下文。

算法实现

 
<?php
/**
 * 非法关键词检查
 */
class SensitiveWords
{
    protected $tree = null;
    protected $callIsNumeric = true;
    /**
     * 非法词汇列表，一个非法词汇占用一行
     */ 
    public function __construct($path = __DIR__ . '/sensitiveWords.txt')
    {
        $this->tree = new WordNode();
        $file = fopen($path, "r");
        while (!feof($file)) {
            $words = trim(fgets($file));
            if ($words == '') {
                continue;
            }
            //存在纯数字的非法词汇 
            if (is_numeric($words)) {
                $this->callIsNumeric = false;
            }
            $this->setTree($words);
        }
        fclose($file);
    }
 
    protected function setTree($words)
    {
        $array = $this->strToArr($words);
        $tree = $this->tree;
        $l = count($array) - 1;
        foreach ($array as $k => $item) {
            $tree = $tree->getChildAlways($item);
            if ($l == $k) {
                $tree->end = true;
            }
        }
    }
 
    /**
     * 返回包含的非法词汇
     * @param string $str
     * @return array
     */ 
    public function check($str)
    {
        //先压缩字符串 
        $str = trim(str_replace([' ', "\n", "\r"], ['', '', ''], $str));
        $ret = [];
        loop:
        $strLen = strlen($str);
        if ($strLen === 0) {
            return array_unique($ret);
        }
        //非法词汇中没有纯数字的非法词汇，待检测字符串又是纯数字的，则跳过不再检查 
        if ($this->callIsNumeric && is_numeric($str)) {
            return array_unique($ret);
        }
        //挨个字符进行判断 
        $tree = $this->tree;
        $words = '';
        for ($i = 0; $i < $strLen; $i++) {
            //unicode范围 --> ord 范围 
            //一字节 0-127 --> 0 - 127 
            //二字节 128-2047 --> 194 - 223 
            //三字节 2048-65535 --> 224 - 239 
            //四字节 65536-1114111 --> 240 - 244 
            //@see http://shouce.jb51.net/gopl-zh/ch3/ch3-05.html 
            $ord = ord($str[$i]);
            if ($ord <= 127) {
                $word = $str[$i];
            } elseif ($ord <= 223) {
                $word = $str[$i] . $str[$i + 1];
                $i += 1;
            } elseif ($ord <= 239) {
                $word = $str[$i] . $str[$i + 1] . $str[$i + 2];
                $i += 2;
            } elseif ($ord <= 244) {
                //四字节 
                $word = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];
                $i += 3;
            } else {
                //五字节php都溢出了 
                //Parse error: Invalid UTF-8 codepoint escape sequence: Codepoint too large 
                continue;
            }
            //判断当前字符 
            $tree = $tree->getChild($word);
            if (is_null($tree)) {
                //当前字不存在，则截取后再次循环 
                $str = substr($str, $i + 1);
                goto loop;
            } else {
                $words .= $word;
                if ($tree->end) {
                    $ret[] = $words;
                }
            }
        }
        return array_unique($ret);
    }
 
    protected function strToArr($str)
    {
        $array = [];
        $strLen = mb_strlen($str);
        for ($i = 0; $i < $strLen; $i++) {
            $array[] = mb_substr($str, $i, 1, "utf8");
        }
        return $array;
    }
}
/**
 * 单个字符的节点
 */
class WordNode
{
    //是否为非法词汇末级节点 
    public $end = false;
    //子节点 
    protected $child = [];
 
    /**
     * @param string $word
     * @return WordNode
     */ 
    public function getChildAlways($word)
    {
        if (!isset($this->child[$word])) {
            $this->child[$word] = new self();
        }
        return $this->child[$word];
    }
 
    /**
     * @param string $word
     * @return WordNode|null
     */ 
    public function getChild($word)
    {
        if ($word === '') {
            return null;
        }
        if (isset($this->child[$word])) {
            return $this->child[$word];
        }
        return null;
    }
}

	$strLen = mb_strlen($str);
	for ($i = 0; $i < $strLen; $i++) {
	echo mb_substr($str, $i, 1, "utf8"),PHP_EOL;
	}

	<?php
	/**
	* 非法关键词检查
	*/
	class SensitiveWords
	{
	protected $tree = null;
	protected $callIsNumeric = true;
	/**
	* 非法词汇列表，一个非法词汇占用一行
	*/
	public function __construct($path = __DIR__ . '/sensitiveWords.txt')
	{
	$this->tree = new WordNode();
	$file = fopen($path, "r");
	while (!feof($file)) {
	$words = trim(fgets($file));
	if ($words == '') {
	continue;
	}
	//存在纯数字的非法词汇
	if (is_numeric($words)) {
	$this->callIsNumeric = false;
	}
	$this->setTree($words);
	}
	fclose($file);
	}

	protected function setTree($words)
	{
	$array = $this->strToArr($words);
	$tree = $this->tree;
	$l = count($array) - 1;
	foreach ($array as $k => $item) {
	$tree = $tree->getChildAlways($item);
	if ($l == $k) {
	$tree->end = true;
	}
	}
	}

	/**
	* 返回包含的非法词汇
	* @param string $str
	* @return array
	*/
	public function check($str)
	{
	//先压缩字符串
	$str = trim(str_replace([' ', "\n", "\r"], ['', '', ''], $str));
	$ret = [];
	loop:
	$strLen = strlen($str);
	if ($strLen === 0) {
	return array_unique($ret);
	}
	//非法词汇中没有纯数字的非法词汇，待检测字符串又是纯数字的，则跳过不再检查
	if ($this->callIsNumeric && is_numeric($str)) {
	return array_unique($ret);
	}
	//挨个字符进行判断
	$tree = $this->tree;
	$words = '';
	for ($i = 0; $i < $strLen; $i++) {
	//unicode范围 --> ord 范围
	//一字节 0-127 --> 0 - 127
	//二字节 128-2047 --> 194 - 223
	//三字节 2048-65535 --> 224 - 239
	//四字节 65536-1114111 --> 240 - 244
	//@see http://shouce.jb51.net/gopl-zh/ch3/ch3-05.html
	$ord = ord($str[$i]);
	if ($ord <= 127) {
	$word = $str[$i];
	} elseif ($ord <= 223) {
	$word = $str[$i] . $str[$i + 1];
	$i += 1;
	} elseif ($ord <= 239) {
	$word = $str[$i] . $str[$i + 1] . $str[$i + 2];
	$i += 2;
	} elseif ($ord <= 244) {
	//四字节
	$word = $str[$i] . $str[$i + 1] . $str[$i + 2] . $str[$i + 3];
	$i += 3;
	} else {
	//五字节php都溢出了
	//Parse error: Invalid UTF-8 codepoint escape sequence: Codepoint too large
	continue;
	}
	//判断当前字符
	$tree = $tree->getChild($word);
	if (is_null($tree)) {
	//当前字不存在，则截取后再次循环
	$str = substr($str, $i + 1);
	goto loop;
	} else {
	$words .= $word;
	if ($tree->end) {
	$ret[] = $words;
	}
	}
	}
	return array_unique($ret);
	}

	protected function strToArr($str)
	{
	$array = [];
	$strLen = mb_strlen($str);
	for ($i = 0; $i < $strLen; $i++) {
	$array[] = mb_substr($str, $i, 1, "utf8");
	}
	return $array;
	}
	}
	/**
	* 单个字符的节点
	*/
	class WordNode
	{
	//是否为非法词汇末级节点
	public $end = false;
	//子节点
	protected $child = [];

	/**
	* @param string $word
	* @return WordNode
	*/
	public function getChildAlways($word)
	{
	if (!isset($this->child[$word])) {
	$this->child[$word] = new self();
	}
	return $this->child[$word];
	}

	/**
	* @param string $word
	* @return WordNode\|null
	*/
	public function getChild($word)
	{
	if ($word === '') {
	return null;
	}
	if (isset($this->child[$word])) {
	return $this->child[$word];
	}
	return null;
	}
	}