PHP sensitive word filtering
-
/**
 * Banned-word (sensitive word) filter.
 *
 * Banned words are loaded from Model_BlackWord_BlackWord in id ranges
 * ("pages"), each page is turned into a character tree (trie) and cached
 * through Rds, and the input text is scanned against every page in turn.
 *
 * Author's note: roughly 0.05 seconds per article.
 *
 * @author liuxu
 */
class Logic_BlackWord
{

    const APP_FORUM = 1;
    const APP_BLOG = 2;
    const APP_VOTE = 3;

    /**
     * Find every banned word that occurs in a text.
     *
     * @param string $txt Raw text to check; HTML tags are stripped before matching.
     * @return array Map of word type => list of matched words (strings).
     */
    public function getHitList($txt)
    {
        $hitList = array();

        // Scan the text against the banned words one id-range page at a time.
        $max = $this->getMax();
        if ($max)
        {
            $size = 1000;
            $last = ceil($max / $size);
            for ($page = 1; $page <= $last; $page++)
            {
                $result = $this->getHitListByPage($txt, $page, $size);
                // array_replace, not array_merge: the keys are the matched words,
                // and a purely numeric word ("123") is stored as an integer key
                // that array_merge would renumber, losing the word itself.
                if ($result) $hitList = array_replace($hitList, $result);
            }
        }

        // Regroup from word => type to type => list of words.
        $hitList2 = array();
        foreach ($hitList as $hit => $type)
        {
            // Cast back to string: numeric words come out of the array as integers.
            $hitList2[$type][] = (string) $hit;
        }

        return $hitList2;
    }

    /**
     * Return the highest banned-word id, which determines how many pages to scan.
     *
     * The value is cached under 'blackword_max' with an expiry argument of 300
     * (presumably seconds, as with Redis SETEX -- confirm against Rds).
     *
     * @return int Highest id, or 0 when the query yields no 'max' column.
     */
    private function getMax()
    {
        $redis = Rds::factory();
        $memKey = 'blackword_max';
        $max = $redis->get($memKey);
        // Strict comparison: a cached 0 must not be mistaken for a cache miss.
        // NOTE(review): assumes Rds::get() returns false on a miss -- confirm.
        if ($max === false)
        {
            $max = 0;
            $blackWord = new Model_BlackWord_BlackWord();
            $para = array('field' => 'MAX(id) AS max');
            $result = $blackWord->search($para);
            if (isset($result[0]['max'])) $max = $result[0]['max'];

            $redis->setex($memKey, 300, $max);
        }

        return (int) $max;
    }

    /**
     * Find the banned words of one page that occur in a text.
     *
     * @param string $txt  Raw text to check.
     * @param int    $page 1-based page number of the id range.
     * @param int    $size Number of ids per page.
     * @return array Map of matched word => word type.
     */
    private function getHitListByPage($txt, $page = 1, $size = 1000)
    {
        $hitList = array();

        // Load the word tree for this page.
        $wordTree = $this->getWordTreeByPage($page, $size);
        if (!$wordTree) return $hitList;

        // Keep only ASCII letters, digits and CJK ideographs (U+4E00..U+9FA5) so
        // that punctuation or markup inserted between characters cannot hide a word.
        $txt = strip_tags((string) $txt);
        $txt = preg_replace('/[^a-zA-Z0-9\x{4e00}-\x{9fa5}]/iu', '', $txt);
        // preg_replace returns null on failure (e.g. input that is not valid UTF-8).
        if ($txt === null || $txt === '') return $hitList;

        $len = mb_strlen($txt, 'utf-8');
        for ($i = 0; $i < $len; $i++)
        {
            $char = mb_substr($txt, $i, 1, 'utf-8');
            // Only walk the tree from positions whose character starts some word.
            if (isset($wordTree[$char]))
            {
                // At most 50 characters are examined from each start position.
                $result = $this->getHitListByTree(mb_substr($txt, $i, 50, 'utf-8'), $wordTree);
                foreach ($result as $hit => $type)
                {
                    $hitList[$hit] = $type;
                }
            }
        }

        return $hitList;
    }

    /**
     * Collect every banned word that is a prefix of the given text.
     *
     * @param string $txt      Text whose beginning is matched against the tree.
     * @param array  $wordTree Word tree as built by getWordTreeByPage(); taken by
     *                         reference as in the original signature, never modified.
     * @return array Map of matched word => word type.
     */
    private function getHitListByTree($txt, &$wordTree)
    {
        $len = mb_strlen($txt, 'utf-8');
        $point = $wordTree;
        $hit = '';
        $hitList = array();
        for ($i = 0; $i < $len; $i++)
        {
            $char = mb_substr($txt, $i, 1, 'utf-8');
            if (!isset($point[$char]))
            {
                // No word continues with this character: stop walking.
                break;
            }

            $hit .= $char;
            $point = $point[$char];

            // A 'type' entry marks the end of a complete banned word.
            if (isset($point['type']))
            {
                $hitList[$hit] = $point['type'];
            }
        }

        return $hitList;
    }

    /**
     * Build (or fetch from cache) the word tree for one page of banned words.
     *
     * Each tree level is keyed by a single character; the node reached by the
     * last character of a word carries that word's type under the key 'type'.
     *
     * @param int $page 1-based page number of the id range.
     * @param int $size Number of ids per page.
     * @return array Word tree for rows with status = 1 and
     *               ($page - 1) * $size < id <= $page * $size.
     */
    private function getWordTreeByPage($page = 1, $size = 1000)
    {
        $redis = Rds::factory();
        $memKey = 'blackword_tree_' . $page . '_' . $size;
        $wordTree = $redis->get($memKey);
        // NOTE(review): assumes Rds::get() returns false on a miss and that Rds
        // serializes array values on setex()/get() -- confirm.
        if ($wordTree === false)
        {
            $wordTree = array();
            $blackWord = new Model_BlackWord_BlackWord();
            // Integer casts keep the concatenated WHERE clause free of stray input.
            $start = ((int) $page - 1) * (int) $size;
            $end = $start + (int) $size;
            $para = array('where' => 'status = 1 AND id > ' . $start . ' AND id <= ' . $end);
            $result = $blackWord->search($para);
            if ($result)
            {
                foreach ($result as $value)
                {
                    if ($value['word'])
                    {
                        // Split the word into UTF-8 characters. The split call was
                        // truncated in the recovered source; this is the standard
                        // equivalent of the per-character split it performed.
                        $chars = preg_split('//u', $value['word'], -1, PREG_SPLIT_NO_EMPTY);
                        if (!$chars) continue;

                        // Descend from the root, creating nodes on the way down.
                        $point = &$wordTree;
                        foreach ($chars as $char)
                        {
                            $point = &$point[$char];
                        }

                        $point['type'] = $value['type'];
                        // Drop the reference so it cannot alias a node later on.
                        unset($point);
                    }
                }
            }

            $redis->setex($memKey, 300, $wordTree);
        }

        return $wordTree;
    }

}
-
|
PHP