字串 - PHP 敏感詞違法關鍵字檢測 演算法方案

來源:互聯網
上載者:User
已有6000條關鍵字,分3批次。
一批為替換 replace,一批為遇到需要審核 censor,最後一批為遇到就禁止發布banned。
設計資料表如下:

mysql> desc tbl_censor;
+-------------+----------------------+------+-----+---------+----------------+
| Field       | Type                 | Null | Key | Default | Extra          |
+-------------+----------------------+------+-----+---------+----------------+
| id          | smallint(6) unsigned | NO   | PRI | NULL    | auto_increment |
| censortype  | smallint(6)          | NO   |     | 1       |                |
| find        | varchar(120)         | NO   | UNI |         |                |
| replacement | varchar(255)         | NO   |     |         |                |
| extra       | varchar(255)         | NO   |     |         |                |
| uptime      | int(11)              | YES  |     | NULL    |                |
| enable      | int(1)               | NO   |     | 1       |                |
+-------------+----------------------+------+-----+---------+----------------+
7 rows in set (0.01 sec)

由於有6000多關鍵字,使用 foreach 的 strstr?還是preg_match ?
追求效率,每小時提交量為10萬多文章。

剛剛寫的一種:

namespace app\helpers;

use app\models\other\Censor;
use app\models\other\CensorLog;

/**
 * Sensitive-keyword filter backed by the Censor table and the Yii cache.
 *
 * Each enabled Censor row falls into one of three groups, decided by its
 * `replacement` column:
 *  - '{banned}'  : the text must be rejected when `find` occurs in it;
 *  - '{censor}'  : the text needs manual review when `find` occurs in it;
 *  - anything else: `find` is rewritten to `replacement` by replace().
 *
 * NOTE: rule sets cached by earlier versions of this class have a different
 * content (inverted replace lists, match-everything patterns for empty
 * groups); call flush() once after deploying this version.
 */
class CensorHelper
{
    /** @var string Cache key under which the compiled rule set is stored. */
    public $id;

    /** @var array|mixed Rule set with the keys 'censor', 'banned' (regex strings) and 'replace' (from/to lists). */
    public $data;

    /** @var string[] Distinct banned keywords matched so far. */
    public $match_banned;

    /** @var string[] Distinct review-required keywords matched so far. */
    public $match_censor;

    /**
     * @param string $id Cache key used to store and retrieve the rule set.
     */
    public function __construct($id = 'censor')
    {
        $this->id = $id;
        $this->match_banned = [];
        $this->match_censor = [];
        $this->data = $this->getData();
    }

    /**
     * Returns the rule set, loading it from the cache or rebuilding it from
     * the Censor table on a cache miss.
     *
     * @return array|mixed The rule set, or the raw cache-miss value when the
     *                     table holds no usable rule at all.
     */
    public function getData()
    {
        $data = \Yii::$app->cache->get($this->id);
        if (!empty($data)) {
            return $data;
        }

        // `find` is sorted descending so that, for keywords sharing a prefix,
        // the longer one is tried before the shorter one.
        $words = Censor::find()
            ->where(['enable' => 1])
            ->andWhere(['!=', 'replacement', ''])
            ->orderBy(['replacement' => SORT_ASC, 'find' => SORT_DESC])
            ->asArray()
            ->all();

        $censor = [];
        $banned = [];
        $replace = ['from' => [], 'to' => []];
        foreach ($words as $row) {
            switch ($row['replacement']) {
                case '{censor}':
                    $censor[] = $row['find'];
                    break;
                case '{banned}':
                    $banned[] = $row['find'];
                    break;
                default:
                    // Search for the keyword, substitute its replacement
                    // (the original code had these two lists swapped).
                    $replace['from'][] = $row['find'];
                    $replace['to'][] = $row['replacement'];
                    break;
            }
        }

        // Build and cache as soon as ANY group has entries; a replace-only
        // configuration must not be dropped.
        if ($censor || $banned || $replace['from']) {
            $data = [
                'censor' => $this->generateRegularExpression($censor),
                'banned' => $this->generateRegularExpression($banned),
                'replace' => $replace,
            ];
            \Yii::$app->cache->set($this->id, $data);
        }

        return $data;
    }

    /**
     * Compiles a keyword list into one case-insensitive alternation pattern.
     *
     * NOTE(review): with several thousand keywords the single alternation
     * may approach PCRE's compiled-pattern size limit - verify against the
     * production word list.
     *
     * @param array $words Literal keywords; empty strings are ignored.
     * @return string A '/.../i' pattern, or '' when there is no keyword
     *                (an empty alternation would match every input).
     */
    public function generateRegularExpression(array $words)
    {
        $words = array_filter($words, function ($word) {
            return (string) $word !== '';
        });
        if (!$words) {
            return '';
        }

        // Pass the delimiter so that a '/' inside a keyword is escaped too.
        $quoted = array_map(function ($word) {
            return preg_quote($word, '/');
        }, $words);

        return '/' . implode('|', $quoted) . '/i';
    }

    /**
     * Runs both checks on a text, the strictest (banned) first.
     *
     * @param string $string Text to inspect.
     */
    public function check($string)
    {
        $this->banned($string);
        $this->censor($string);
    }

    /**
     * Records every review-required keyword found in the text.
     *
     * @param string $string Text to inspect.
     */
    public function censor($string)
    {
        $pattern = !empty($this->data['censor']) ? $this->data['censor'] : '';
        $this->match_censor = $this->collectMatches($pattern, $string, $this->match_censor);
    }

    /**
     * Records every banned keyword found in the text.
     *
     * @param string $string Text to inspect.
     */
    public function banned($string)
    {
        $pattern = !empty($this->data['banned']) ? $this->data['banned'] : '';
        $this->match_banned = $this->collectMatches($pattern, $string, $this->match_banned);
    }

    /**
     * Drops the cached rule set and reloads it into this instance.
     */
    public function flush()
    {
        \Yii::$app->cache->delete($this->id);
        // Refresh the in-memory copy too, otherwise this instance would keep
        // filtering with the stale rules.
        $this->data = $this->getData();
    }

    /**
     * Rewrites every replaceable keyword in the text with its substitute.
     *
     * @param string $string Text to rewrite.
     * @return mixed The rewritten text, or the input unchanged when no
     *               replace rule is loaded.
     */
    public function replace($string)
    {
        if (empty($this->data['replace']['from'])) {
            return $string;
        }

        return str_replace($this->data['replace']['from'], $this->data['replace']['to'], $string);
    }

    /**
     * Summarises the matches collected so far.
     *
     * @return string 'banned', 'censor' or 'pass', in that order of precedence.
     */
    public function getLevel()
    {
        if (!empty($this->match_banned)) {
            return 'banned';
        }
        if (!empty($this->match_censor)) {
            return 'censor';
        }

        return 'pass';
    }

    /**
     * Stores the collected matches as a CensorLog record.
     *
     * @param mixed $tableId Value written to the log's `datatb` field.
     * @param mixed $dataId  Value written to the log's `dataid` field.
     * @return bool Result of CensorLog::save(), so callers can detect a failed write.
     */
    public function addLog($tableId, $dataId)
    {
        $log = new CensorLog();
        $log->datatb = $tableId;
        $log->dataid = $dataId;
        $log->matchcensor = implode(',', $this->match_censor);
        $log->matchbanned = implode(',', $this->match_banned);
        $log->addtime = time();
        if (!\Yii::$app->user->isGuest) {
            $log->uid = \Yii::$app->user->getId();
            $log->uname = \Yii::$app->user->getUname();
        }
        $log->ip = IpHelper::getIP();
        $log->iploc = IpHelper::getLocation($log->ip);

        return $log->save();
    }

    /**
     * Appends all occurrences of $pattern in $string to $found, without duplicates.
     *
     * @param string   $pattern Compiled pattern; '' means "no rule".
     * @param string   $string  Text to inspect.
     * @param string[] $found   Matches collected by earlier calls.
     * @return string[] The merged, de-duplicated match list.
     */
    private function collectMatches($pattern, $string, array $found)
    {
        // preg_match_all yields the full list of hits in $matches[0];
        // it returns false on a PCRE error, which is treated as "no match".
        if ($pattern === '' || !(preg_match_all($pattern, $string, $matches) > 0)) {
            return $found;
        }

        return array_values(array_unique(array_merge($found, $matches[0])));
    }
}

回複內容:

已有6000條關鍵字,分3批次。
一批為替換 replace,一批為遇到需要審核 censor,最後一批為遇到就禁止發布banned。
設計資料表如下:

mysql> desc tbl_censor;
+-------------+----------------------+------+-----+---------+----------------+
| Field       | Type                 | Null | Key | Default | Extra          |
+-------------+----------------------+------+-----+---------+----------------+
| id          | smallint(6) unsigned | NO   | PRI | NULL    | auto_increment |
| censortype  | smallint(6)          | NO   |     | 1       |                |
| find        | varchar(120)         | NO   | UNI |         |                |
| replacement | varchar(255)         | NO   |     |         |                |
| extra       | varchar(255)         | NO   |     |         |                |
| uptime      | int(11)              | YES  |     | NULL    |                |
| enable      | int(1)               | NO   |     | 1       |                |
+-------------+----------------------+------+-----+---------+----------------+
7 rows in set (0.01 sec)

由於有6000多關鍵字,使用 foreach 的 strstr?還是preg_match ?
追求效率,每小時提交量為10萬多文章。

剛剛寫的一種:

namespace app\helpers;

use app\models\other\Censor;
use app\models\other\CensorLog;

/**
 * Sensitive-keyword filter backed by the Censor table and the Yii cache.
 *
 * Each enabled Censor row falls into one of three groups, decided by its
 * `replacement` column:
 *  - '{banned}'  : the text must be rejected when `find` occurs in it;
 *  - '{censor}'  : the text needs manual review when `find` occurs in it;
 *  - anything else: `find` is rewritten to `replacement` by replace().
 *
 * NOTE: rule sets cached by earlier versions of this class have a different
 * content (inverted replace lists, match-everything patterns for empty
 * groups); call flush() once after deploying this version.
 */
class CensorHelper
{
    /** @var string Cache key under which the compiled rule set is stored. */
    public $id;

    /** @var array|mixed Rule set with the keys 'censor', 'banned' (regex strings) and 'replace' (from/to lists). */
    public $data;

    /** @var string[] Distinct banned keywords matched so far. */
    public $match_banned;

    /** @var string[] Distinct review-required keywords matched so far. */
    public $match_censor;

    /**
     * @param string $id Cache key used to store and retrieve the rule set.
     */
    public function __construct($id = 'censor')
    {
        $this->id = $id;
        $this->match_banned = [];
        $this->match_censor = [];
        $this->data = $this->getData();
    }

    /**
     * Returns the rule set, loading it from the cache or rebuilding it from
     * the Censor table on a cache miss.
     *
     * @return array|mixed The rule set, or the raw cache-miss value when the
     *                     table holds no usable rule at all.
     */
    public function getData()
    {
        $data = \Yii::$app->cache->get($this->id);
        if (!empty($data)) {
            return $data;
        }

        // `find` is sorted descending so that, for keywords sharing a prefix,
        // the longer one is tried before the shorter one.
        $words = Censor::find()
            ->where(['enable' => 1])
            ->andWhere(['!=', 'replacement', ''])
            ->orderBy(['replacement' => SORT_ASC, 'find' => SORT_DESC])
            ->asArray()
            ->all();

        $censor = [];
        $banned = [];
        $replace = ['from' => [], 'to' => []];
        foreach ($words as $row) {
            switch ($row['replacement']) {
                case '{censor}':
                    $censor[] = $row['find'];
                    break;
                case '{banned}':
                    $banned[] = $row['find'];
                    break;
                default:
                    // Search for the keyword, substitute its replacement
                    // (the original code had these two lists swapped).
                    $replace['from'][] = $row['find'];
                    $replace['to'][] = $row['replacement'];
                    break;
            }
        }

        // Build and cache as soon as ANY group has entries; a replace-only
        // configuration must not be dropped.
        if ($censor || $banned || $replace['from']) {
            $data = [
                'censor' => $this->generateRegularExpression($censor),
                'banned' => $this->generateRegularExpression($banned),
                'replace' => $replace,
            ];
            \Yii::$app->cache->set($this->id, $data);
        }

        return $data;
    }

    /**
     * Compiles a keyword list into one case-insensitive alternation pattern.
     *
     * NOTE(review): with several thousand keywords the single alternation
     * may approach PCRE's compiled-pattern size limit - verify against the
     * production word list.
     *
     * @param array $words Literal keywords; empty strings are ignored.
     * @return string A '/.../i' pattern, or '' when there is no keyword
     *                (an empty alternation would match every input).
     */
    public function generateRegularExpression(array $words)
    {
        $words = array_filter($words, function ($word) {
            return (string) $word !== '';
        });
        if (!$words) {
            return '';
        }

        // Pass the delimiter so that a '/' inside a keyword is escaped too.
        $quoted = array_map(function ($word) {
            return preg_quote($word, '/');
        }, $words);

        return '/' . implode('|', $quoted) . '/i';
    }

    /**
     * Runs both checks on a text, the strictest (banned) first.
     *
     * @param string $string Text to inspect.
     */
    public function check($string)
    {
        $this->banned($string);
        $this->censor($string);
    }

    /**
     * Records every review-required keyword found in the text.
     *
     * @param string $string Text to inspect.
     */
    public function censor($string)
    {
        $pattern = !empty($this->data['censor']) ? $this->data['censor'] : '';
        $this->match_censor = $this->collectMatches($pattern, $string, $this->match_censor);
    }

    /**
     * Records every banned keyword found in the text.
     *
     * @param string $string Text to inspect.
     */
    public function banned($string)
    {
        $pattern = !empty($this->data['banned']) ? $this->data['banned'] : '';
        $this->match_banned = $this->collectMatches($pattern, $string, $this->match_banned);
    }

    /**
     * Drops the cached rule set and reloads it into this instance.
     */
    public function flush()
    {
        \Yii::$app->cache->delete($this->id);
        // Refresh the in-memory copy too, otherwise this instance would keep
        // filtering with the stale rules.
        $this->data = $this->getData();
    }

    /**
     * Rewrites every replaceable keyword in the text with its substitute.
     *
     * @param string $string Text to rewrite.
     * @return mixed The rewritten text, or the input unchanged when no
     *               replace rule is loaded.
     */
    public function replace($string)
    {
        if (empty($this->data['replace']['from'])) {
            return $string;
        }

        return str_replace($this->data['replace']['from'], $this->data['replace']['to'], $string);
    }

    /**
     * Summarises the matches collected so far.
     *
     * @return string 'banned', 'censor' or 'pass', in that order of precedence.
     */
    public function getLevel()
    {
        if (!empty($this->match_banned)) {
            return 'banned';
        }
        if (!empty($this->match_censor)) {
            return 'censor';
        }

        return 'pass';
    }

    /**
     * Stores the collected matches as a CensorLog record.
     *
     * @param mixed $tableId Value written to the log's `datatb` field.
     * @param mixed $dataId  Value written to the log's `dataid` field.
     * @return bool Result of CensorLog::save(), so callers can detect a failed write.
     */
    public function addLog($tableId, $dataId)
    {
        $log = new CensorLog();
        $log->datatb = $tableId;
        $log->dataid = $dataId;
        $log->matchcensor = implode(',', $this->match_censor);
        $log->matchbanned = implode(',', $this->match_banned);
        $log->addtime = time();
        if (!\Yii::$app->user->isGuest) {
            $log->uid = \Yii::$app->user->getId();
            $log->uname = \Yii::$app->user->getUname();
        }
        $log->ip = IpHelper::getIP();
        $log->iploc = IpHelper::getLocation($log->ip);

        return $log->save();
    }

    /**
     * Appends all occurrences of $pattern in $string to $found, without duplicates.
     *
     * @param string   $pattern Compiled pattern; '' means "no rule".
     * @param string   $string  Text to inspect.
     * @param string[] $found   Matches collected by earlier calls.
     * @return string[] The merged, de-duplicated match list.
     */
    private function collectMatches($pattern, $string, array $found)
    {
        // preg_match_all yields the full list of hits in $matches[0];
        // it returns false on a PCRE error, which is treated as "no match".
        if ($pattern === '' || !(preg_match_all($pattern, $string, $matches) > 0)) {
            return $found;
        }

        return array_values(array_unique(array_merge($found, $matches[0])));
    }
}

trie 樹演算法最適合。

PHP 關鍵詞過濾擴充,該擴充依賴於 libdatrie(Double-Array Trie 演算法的 C 語言實現)。

你這個敏感詞匹配,不需要用到正則,只用簡單的匹配或者替換就行了。

關鍵字分成三類存memcached。

然後對文章進行匹配,應該從最嚴厲的banned來匹配,接著是要censor的關鍵字,最後才是可以replace的敏感詞。

1 遇到就禁止發布 => strpos
2 遇到需要審核 => strpos
3 替換 => str_replace

  • 聯繫我們

    該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

    如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.