已有6000條關鍵字,分3批次。
一批為替換 replace,一批為遇到需要審核 censor,最後一批為遇到就禁止發布banned。
設計資料表如下:
mysql> desc tbl_censor;
+-------------+----------------------+------+-----+---------+----------------+
| Field       | Type                 | Null | Key | Default | Extra          |
+-------------+----------------------+------+-----+---------+----------------+
| id          | smallint(6) unsigned | NO   | PRI | NULL    | auto_increment |
| censortype  | smallint(6)          | NO   |     | 1       |                |
| find        | varchar(120)         | NO   | UNI |         |                |
| replacement | varchar(255)         | NO   |     |         |                |
| extra       | varchar(255)         | NO   |     |         |                |
| uptime      | int(11)              | YES  |     | NULL    |                |
| enable      | int(1)               | NO   |     | 1       |                |
+-------------+----------------------+------+-----+---------+----------------+
7 rows in set (0.01 sec)
由於有 6000 多個關鍵字,應該使用 foreach 搭配 strstr,還是使用 preg_match?
追求效率,每小時提交量為10萬多文章。
剛剛寫的一種:
<?php

namespace app\helpers;

use app\models\other\Censor;
use app\models\other\CensorLog;

/**
 * Keyword filter for submitted text.
 *
 * Enabled rows of the Censor model are split into three groups by their
 * `replacement` column:
 *  - '{banned}' : matches are collected in $match_banned (level 'banned');
 *  - '{censor}' : matches are collected in $match_censor (level 'censor');
 *  - other      : occurrences of `find` are substituted with `replacement`.
 *
 * The compiled patterns and the replacement lists are kept in the
 * application cache under the key given to the constructor.
 */
class CensorHelper
{
    /** @var string Cache key under which the compiled data is stored. */
    public $id;

    /** @var array|mixed Compiled data: 'censor' and 'banned' patterns plus the 'replace' lists. */
    public $data;

    /** @var string[] Banned keywords found so far by banned()/check(). */
    public $match_banned = [];

    /** @var string[] Review keywords found so far by censor()/check(). */
    public $match_censor = [];

    /**
     * @param string $id Cache key for the compiled keyword data.
     */
    public function __construct($id = 'censor')
    {
        $this->id = $id;
        $this->match_banned = [];
        $this->match_censor = [];
        $this->data = $this->getData();
    }

    /**
     * Return the compiled keyword data, building and caching it on a cache miss.
     *
     * @return array|mixed ['censor' => string, 'banned' => string, 'replace' => ['from' => string[], 'to' => string[]]]
     */
    public function getData()
    {
        $data = \Yii::$app->cache->get($this->id);
        if (!empty($data)) {
            return $data;
        }

        $words = Censor::find()
            ->where(['enable' => 1])
            ->andWhere(['!=', 'replacement', ''])
            ->orderBy(['replacement' => SORT_ASC, 'find' => SORT_DESC])
            ->asArray()
            ->all();

        $censor = [];
        $banned = [];
        $replace = ['from' => [], 'to' => []];
        foreach ($words as $row) {
            // An empty keyword would match everywhere, so it is never usable.
            if ((string) $row['find'] === '') {
                continue;
            }
            switch ($row['replacement']) {
                case '{censor}':
                    $censor[] = $row['find'];
                    break;
                case '{banned}':
                    $banned[] = $row['find'];
                    break;
                default:
                    // Search for the keyword, substitute the configured replacement.
                    $replace['from'][] = $row['find'];
                    $replace['to'][] = $row['replacement'];
                    break;
            }
        }

        // Built unconditionally so that a replace-only configuration works too.
        $data = [
            'censor' => $this->generateRegularExpression($censor),
            'banned' => $this->generateRegularExpression($banned),
            'replace' => $replace,
        ];
        \Yii::$app->cache->set($this->id, $data);

        return $data;
    }

    /**
     * Build one case-insensitive alternation pattern from a keyword list.
     *
     * @param array $words Literal keywords; empty strings are skipped.
     * @return string The pattern, or '' when there is nothing to match
     *                (an empty alternation would match every input).
     */
    public function generateRegularExpression(array $words)
    {
        $quoted = [];
        foreach ($words as $word) {
            $word = (string) $word;
            if ($word !== '') {
                // The delimiter must be passed, otherwise '/' inside a keyword breaks the pattern.
                $quoted[] = preg_quote($word, '/');
            }
        }

        if (!$quoted) {
            return '';
        }

        // 'u' makes 'i' fold non-ASCII letters; assumes keywords and text are UTF-8 - TODO confirm.
        return '/' . implode('|', $quoted) . '/iu';
    }

    /**
     * Run both the banned and the censor scan over a string.
     *
     * @param string $string Text to scan.
     * @throws \RuntimeException When a keyword pattern cannot be evaluated.
     */
    public function check($string)
    {
        $this->banned($string);
        $this->censor($string);
    }

    /**
     * Add every review keyword found in the string to $match_censor.
     *
     * @param string $string Text to scan.
     * @throws \RuntimeException When the keyword pattern cannot be evaluated.
     */
    public function censor($string)
    {
        $pattern = isset($this->data['censor']) ? $this->data['censor'] : '';
        $this->match_censor = $this->mergeMatches($this->match_censor, $this->findMatches($pattern, $string));
    }

    /**
     * Add every banned keyword found in the string to $match_banned.
     *
     * @param string $string Text to scan.
     * @throws \RuntimeException When the keyword pattern cannot be evaluated.
     */
    public function banned($string)
    {
        $pattern = isset($this->data['banned']) ? $this->data['banned'] : '';
        $this->match_banned = $this->mergeMatches($this->match_banned, $this->findMatches($pattern, $string));
    }

    /**
     * Drop the cached data and reload it into this instance.
     */
    public function flush()
    {
        \Yii::$app->cache->delete($this->id);
        $this->data = $this->getData();
    }

    /**
     * Substitute every replaceable keyword in the string.
     *
     * @param string $string Text to process.
     * @return mixed The processed text, or the input unchanged when no replacements are configured.
     */
    public function replace($string)
    {
        if (empty($this->data['replace']['from']) || !isset($this->data['replace']['to'])) {
            return $string;
        }

        return str_replace($this->data['replace']['from'], $this->data['replace']['to'], $string);
    }

    /**
     * Summarise the scans run so far.
     *
     * @return string 'banned', 'censor' or 'pass' (banned takes precedence).
     */
    public function getLevel()
    {
        if (!empty($this->match_banned)) {
            return 'banned';
        }
        if (!empty($this->match_censor)) {
            return 'censor';
        }

        return 'pass';
    }

    /**
     * Store a CensorLog record describing the matches collected so far.
     *
     * @param mixed $tableId Stored in the log's `datatb` attribute.
     * @param mixed $dataId  Stored in the log's `dataid` attribute.
     * @return mixed Result of CensorLog::save() (presumably a bool, as for Yii ActiveRecord - confirm).
     */
    public function addLog($tableId, $dataId)
    {
        $log = new CensorLog();
        $log->datatb = $tableId;
        $log->dataid = $dataId;
        $log->matchcensor = implode(',', $this->match_censor);
        $log->matchbanned = implode(',', $this->match_banned);
        $log->addtime = time();
        if (!\Yii::$app->user->isGuest) {
            $log->uid = \Yii::$app->user->getId();
            $log->uname = \Yii::$app->user->getUname();
        }
        $log->ip = IpHelper::getIP();
        $log->iploc = IpHelper::getLocation($log->ip);

        return $log->save();
    }

    /**
     * Return every substring of $string matched by $pattern.
     *
     * @param string $pattern Pattern from generateRegularExpression(); '' means "no keywords".
     * @param string $string  Text to scan.
     * @return string[]
     * @throws \RuntimeException When PCRE reports a failure, so a broken pattern or
     *                           undecodable input is never mistaken for "no match".
     */
    private function findMatches($pattern, $string)
    {
        if (empty($pattern)) {
            return [];
        }

        $count = preg_match_all($pattern, (string) $string, $matches);
        if ($count === false) {
            throw new \RuntimeException(
                'Keyword pattern could not be evaluated (PCRE error code ' . preg_last_error() . ').'
            );
        }

        return $count > 0 ? $matches[0] : [];
    }

    /**
     * Append new matches to an existing list without duplicates.
     *
     * @param array $existing Matches collected earlier.
     * @param array $found    Matches from the latest scan.
     * @return string[]
     */
    private function mergeMatches(array $existing, array $found)
    {
        return array_values(array_unique(array_merge($existing, $found)));
    }
}
回覆內容:
已有6000條關鍵字,分3批次。
一批為替換 replace,一批為遇到需要審核 censor,最後一批為遇到就禁止發布banned。
設計資料表如下:
mysql> desc tbl_censor;
+-------------+----------------------+------+-----+---------+----------------+
| Field       | Type                 | Null | Key | Default | Extra          |
+-------------+----------------------+------+-----+---------+----------------+
| id          | smallint(6) unsigned | NO   | PRI | NULL    | auto_increment |
| censortype  | smallint(6)          | NO   |     | 1       |                |
| find        | varchar(120)         | NO   | UNI |         |                |
| replacement | varchar(255)         | NO   |     |         |                |
| extra       | varchar(255)         | NO   |     |         |                |
| uptime      | int(11)              | YES  |     | NULL    |                |
| enable      | int(1)               | NO   |     | 1       |                |
+-------------+----------------------+------+-----+---------+----------------+
7 rows in set (0.01 sec)
由於有 6000 多個關鍵字,應該使用 foreach 搭配 strstr,還是使用 preg_match?
追求效率,每小時提交量為10萬多文章。
剛剛寫的一種:
<?php

namespace app\helpers;

use app\models\other\Censor;
use app\models\other\CensorLog;

/**
 * Keyword filter for submitted text.
 *
 * Enabled rows of the Censor model are split into three groups by their
 * `replacement` column:
 *  - '{banned}' : matches are collected in $match_banned (level 'banned');
 *  - '{censor}' : matches are collected in $match_censor (level 'censor');
 *  - other      : occurrences of `find` are substituted with `replacement`.
 *
 * The compiled patterns and the replacement lists are kept in the
 * application cache under the key given to the constructor.
 */
class CensorHelper
{
    /** @var string Cache key under which the compiled data is stored. */
    public $id;

    /** @var array|mixed Compiled data: 'censor' and 'banned' patterns plus the 'replace' lists. */
    public $data;

    /** @var string[] Banned keywords found so far by banned()/check(). */
    public $match_banned = [];

    /** @var string[] Review keywords found so far by censor()/check(). */
    public $match_censor = [];

    /**
     * @param string $id Cache key for the compiled keyword data.
     */
    public function __construct($id = 'censor')
    {
        $this->id = $id;
        $this->match_banned = [];
        $this->match_censor = [];
        $this->data = $this->getData();
    }

    /**
     * Return the compiled keyword data, building and caching it on a cache miss.
     *
     * @return array|mixed ['censor' => string, 'banned' => string, 'replace' => ['from' => string[], 'to' => string[]]]
     */
    public function getData()
    {
        $data = \Yii::$app->cache->get($this->id);
        if (!empty($data)) {
            return $data;
        }

        $words = Censor::find()
            ->where(['enable' => 1])
            ->andWhere(['!=', 'replacement', ''])
            ->orderBy(['replacement' => SORT_ASC, 'find' => SORT_DESC])
            ->asArray()
            ->all();

        $censor = [];
        $banned = [];
        $replace = ['from' => [], 'to' => []];
        foreach ($words as $row) {
            // An empty keyword would match everywhere, so it is never usable.
            if ((string) $row['find'] === '') {
                continue;
            }
            switch ($row['replacement']) {
                case '{censor}':
                    $censor[] = $row['find'];
                    break;
                case '{banned}':
                    $banned[] = $row['find'];
                    break;
                default:
                    // Search for the keyword, substitute the configured replacement.
                    $replace['from'][] = $row['find'];
                    $replace['to'][] = $row['replacement'];
                    break;
            }
        }

        // Built unconditionally so that a replace-only configuration works too.
        $data = [
            'censor' => $this->generateRegularExpression($censor),
            'banned' => $this->generateRegularExpression($banned),
            'replace' => $replace,
        ];
        \Yii::$app->cache->set($this->id, $data);

        return $data;
    }

    /**
     * Build one case-insensitive alternation pattern from a keyword list.
     *
     * @param array $words Literal keywords; empty strings are skipped.
     * @return string The pattern, or '' when there is nothing to match
     *                (an empty alternation would match every input).
     */
    public function generateRegularExpression(array $words)
    {
        $quoted = [];
        foreach ($words as $word) {
            $word = (string) $word;
            if ($word !== '') {
                // The delimiter must be passed, otherwise '/' inside a keyword breaks the pattern.
                $quoted[] = preg_quote($word, '/');
            }
        }

        if (!$quoted) {
            return '';
        }

        // 'u' makes 'i' fold non-ASCII letters; assumes keywords and text are UTF-8 - TODO confirm.
        return '/' . implode('|', $quoted) . '/iu';
    }

    /**
     * Run both the banned and the censor scan over a string.
     *
     * @param string $string Text to scan.
     * @throws \RuntimeException When a keyword pattern cannot be evaluated.
     */
    public function check($string)
    {
        $this->banned($string);
        $this->censor($string);
    }

    /**
     * Add every review keyword found in the string to $match_censor.
     *
     * @param string $string Text to scan.
     * @throws \RuntimeException When the keyword pattern cannot be evaluated.
     */
    public function censor($string)
    {
        $pattern = isset($this->data['censor']) ? $this->data['censor'] : '';
        $this->match_censor = $this->mergeMatches($this->match_censor, $this->findMatches($pattern, $string));
    }

    /**
     * Add every banned keyword found in the string to $match_banned.
     *
     * @param string $string Text to scan.
     * @throws \RuntimeException When the keyword pattern cannot be evaluated.
     */
    public function banned($string)
    {
        $pattern = isset($this->data['banned']) ? $this->data['banned'] : '';
        $this->match_banned = $this->mergeMatches($this->match_banned, $this->findMatches($pattern, $string));
    }

    /**
     * Drop the cached data and reload it into this instance.
     */
    public function flush()
    {
        \Yii::$app->cache->delete($this->id);
        $this->data = $this->getData();
    }

    /**
     * Substitute every replaceable keyword in the string.
     *
     * @param string $string Text to process.
     * @return mixed The processed text, or the input unchanged when no replacements are configured.
     */
    public function replace($string)
    {
        if (empty($this->data['replace']['from']) || !isset($this->data['replace']['to'])) {
            return $string;
        }

        return str_replace($this->data['replace']['from'], $this->data['replace']['to'], $string);
    }

    /**
     * Summarise the scans run so far.
     *
     * @return string 'banned', 'censor' or 'pass' (banned takes precedence).
     */
    public function getLevel()
    {
        if (!empty($this->match_banned)) {
            return 'banned';
        }
        if (!empty($this->match_censor)) {
            return 'censor';
        }

        return 'pass';
    }

    /**
     * Store a CensorLog record describing the matches collected so far.
     *
     * @param mixed $tableId Stored in the log's `datatb` attribute.
     * @param mixed $dataId  Stored in the log's `dataid` attribute.
     * @return mixed Result of CensorLog::save() (presumably a bool, as for Yii ActiveRecord - confirm).
     */
    public function addLog($tableId, $dataId)
    {
        $log = new CensorLog();
        $log->datatb = $tableId;
        $log->dataid = $dataId;
        $log->matchcensor = implode(',', $this->match_censor);
        $log->matchbanned = implode(',', $this->match_banned);
        $log->addtime = time();
        if (!\Yii::$app->user->isGuest) {
            $log->uid = \Yii::$app->user->getId();
            $log->uname = \Yii::$app->user->getUname();
        }
        $log->ip = IpHelper::getIP();
        $log->iploc = IpHelper::getLocation($log->ip);

        return $log->save();
    }

    /**
     * Return every substring of $string matched by $pattern.
     *
     * @param string $pattern Pattern from generateRegularExpression(); '' means "no keywords".
     * @param string $string  Text to scan.
     * @return string[]
     * @throws \RuntimeException When PCRE reports a failure, so a broken pattern or
     *                           undecodable input is never mistaken for "no match".
     */
    private function findMatches($pattern, $string)
    {
        if (empty($pattern)) {
            return [];
        }

        $count = preg_match_all($pattern, (string) $string, $matches);
        if ($count === false) {
            throw new \RuntimeException(
                'Keyword pattern could not be evaluated (PCRE error code ' . preg_last_error() . ').'
            );
        }

        return $count > 0 ? $matches[0] : [];
    }

    /**
     * Append new matches to an existing list without duplicates.
     *
     * @param array $existing Matches collected earlier.
     * @param array $found    Matches from the latest scan.
     * @return string[]
     */
    private function mergeMatches(array $existing, array $found)
    {
        return array_values(array_unique(array_merge($existing, $found)));
    }
}
trie 樹演算法最適合。
PHP 關鍵詞過濾擴充,該擴充依賴於 libdatrie(Trie 演算法的 C 語言實現)。
你這個敏感詞匹配,不需要用到正則,只用簡單的匹配或者替換就行了。
關鍵字分成三類存memcached。
然後對文章進行匹配,應該從最嚴厲的banned來匹配,接著是要censor的關鍵字,最後才是可以replace的敏感詞。
1 遇到就禁止發布 => strpos
2 遇到需要審核 => strpos
3 替換 => str_replace