Recently, for a data-analysis task, I needed to match many keywords against a large number of short articles. With a brute-force find for every keyword the CPU becomes the bottleneck, so I turned to the Aho-Corasick (AC) automaton.
The AC automaton is a classic data structure for multi-pattern matching. Like KMP, it works by constructing fail pointers; the difference is that the AC automaton builds them on a trie rather than on a single pattern, but the principle is the same.
To match Unicode text, each string is first encoded as UTF-8 and every byte is then split into two 4-bit halves, each of which is used as an index, so the trie becomes a 16-way tree.
In fact, this kind of thing should really be written in C/C++, but I am not very familiar with calling C from Python, so I wrote it in pure Python instead.
Below is the code
[Python]
# coding=utf-8
KIND = 16  # alphabet size: one trie branch per 4-bit symbol (0..15)
# BASE = ord('A')


class Node(object):
    """One state of the automaton (a node of the underlying trie).

    Attributes are documented inline; all of them are read and written
    directly by ``AC_Automachine``.
    """

    def __init__(self):
        # Fail link: state to fall back to on a mismatch (None until built,
        # and always None for the root).
        self.fail = None
        # Child states, indexed by symbol value.
        self.next = [None] * KIND
        # True when a keyword ends at this state.  After
        # ``build_automachine`` it is also True when a keyword ends at any
        # state reachable through the fail links (i.e. a keyword is a
        # suffix of the path leading here).
        self.end = False


class AC_Automachine(object):
    """Aho-Corasick automaton over symbols whose ``ord`` is below ``KIND``.

    Usage: ``insert`` every keyword, call ``build_automachine`` once, then
    call ``matchOne`` for each text.  Keywords inserted after the build are
    not matched reliably until ``build_automachine`` is called again.
    """

    def __init__(self):
        self.root = Node()
        # Kept for backward compatibility; the build now uses a local deque
        # and leaves this list empty, as the original did after a build.
        self.queue = []

    def getIndex(self, char):
        """Return the child-slot index of symbol ``char`` (its code point)."""
        return ord(char)  # - BASE

    def insert(self, string):
        """Add the keyword ``string`` to the trie.

        An empty keyword is ignored: marking the root as terminal would
        make every text match.  Raises ``IndexError`` if a symbol's code
        point is not below ``KIND``.
        """
        if not string:
            return
        p = self.root
        for char in string:
            index = self.getIndex(char)
            if p.next[index] is None:
                p.next[index] = Node()
            p = p.next[index]
        p.end = True

    def build_automachine(self):
        """Compute the fail links with a breadth-first walk of the trie.

        Also propagates ``end`` along the fail links so that ``matchOne``
        detects a keyword that is a proper suffix of the current path
        (e.g. keyword "bc" while walking the path of keyword "abcd").
        """
        from collections import deque  # O(1) pops from the left

        root = self.root
        root.fail = None
        pending = deque([root])
        while pending:
            parent = pending.popleft()
            for i, child in enumerate(parent.next):
                if child is None:
                    continue
                # Follow the parent's fail chain until a state with an
                # outgoing edge for symbol i is found.  For children of
                # the root the chain is empty, so they fail to the root.
                failp = parent.fail
                while failp is not None and failp.next[i] is None:
                    failp = failp.fail
                child.fail = root if failp is None else failp.next[i]
                # The fail target is shallower than child, so breadth-first
                # order guarantees its ``end`` flag is already final.
                if child.fail.end:
                    child.end = True
                pending.append(child)

    def matchOne(self, string, stride=1):
        """Return True if any inserted keyword occurs in ``string``.

        ``build_automachine`` must have been called after the last insert.

        ``stride`` (a positive integer, default 1) restricts matches to
        those ending after a multiple of ``stride`` symbols.  When every
        keyword's length is a multiple of ``stride`` this means only
        matches that also *start* on a ``stride`` boundary are reported.
        """
        root = self.root
        p = root
        for pos, char in enumerate(string, 1):
            index = self.getIndex(char)
            # Fall back along fail links until the symbol can be consumed
            # or the root is reached.
            while p.next[index] is None and p is not root:
                p = p.fail
            if p.next[index] is not None:
                p = p.next[index]
            if p.end and pos % stride == 0:
                return True
        return False


class UnicodeAC_AutoMachine(object):
    """Keyword matcher for unicode text built on ``AC_Automachine``.

    Text is encoded as UTF-8 and every byte is split into two 4-bit
    symbols (low half first, then high half), which keeps the trie
    fan-out at ``KIND`` = 16.
    """

    # ``unicode`` on Python 2, ``str`` on Python 3.
    _TEXT_TYPE = type(u'')

    def __init__(self):
        self.ac = AC_Automachine()

    def getAcString(self, string):
        """Return ``string`` as a sequence of 4-bit symbols.

        Each UTF-8 byte yields two characters: ``chr(byte % 16)`` followed
        by ``chr(byte // 16)``.
        """
        symbols = []
        for byte in bytearray(string.encode('utf-8')):
            symbols.append(chr(byte % 16))
            # Floor division: plain ``/`` yields a float on Python 3.
            symbols.append(chr(byte // 16))
        return ''.join(symbols)

    def insert(self, string):
        """Add the unicode keyword ``string``.

        Raises ``TypeError`` (a subclass of the ``Exception`` raised
        previously) if ``string`` is not a unicode string.
        """
        if not isinstance(string, self._TEXT_TYPE):
            raise TypeError('UnicodeAC_AutoMachine: insert type not unicode')
        self.ac.insert(self.getAcString(string))

    def build_automachine(self):
        """Build the fail links of the underlying automaton."""
        self.ac.build_automachine()

    def matchOne(self, string):
        """Return True if any inserted keyword occurs in ``string``.

        Raises ``TypeError`` if ``string`` is not a unicode string.
        """
        if not isinstance(string, self._TEXT_TYPE):
            raise TypeError('UnicodeAC_AutoMachine: matchOne type not unicode')
        # Every byte contributes two symbols, so only matches aligned to a
        # two-symbol boundary correspond to whole bytes.  Without this the
        # high half of one byte plus the low half of the next could be
        # mistaken for a keyword byte.
        return self.ac.matchOne(self.getAcString(string), 2)
def main2():
    """Demo: insert three keywords, build the automaton, probe five texts.

    Prints the boolean result of each ``matchOne`` call.
    """
    # NOTE(review): the sample literals are kept exactly as found; they look
    # machine-translated, so keywords and texts may no longer overlap the
    # way the author intended -- confirm against the original post.
    ac = UnicodeAC_AutoMachine()
    ac.insert(u'dianguang ')
    ac.insert(u'tasty ')
    ac.insert(u'play ')
    ac.build_automachine()
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(ac.matchOne(u'hi, what is Ding yaguang doing '))
    print(ac.matchOne(u' AB '))
    print(ac.matchOne(u'cannot eat '))
    print(ac.matchOne(u'the meal is delicious and there are plenty of delicious meals ,'))
    print(ac.matchOne(u' has a lot of fun '))


if __name__ == '__main__':
    main2()
# coding=utf-8
KIND = 16  # alphabet size: one trie branch per 4-bit symbol (0..15)
# BASE = ord('A')


class Node(object):
    """One state of the automaton (a node of the underlying trie).

    Attributes are documented inline; all of them are read and written
    directly by ``AC_Automachine``.
    """

    def __init__(self):
        # Fail link: state to fall back to on a mismatch (None until built,
        # and always None for the root).
        self.fail = None
        # Child states, indexed by symbol value.
        self.next = [None] * KIND
        # True when a keyword ends at this state.  After
        # ``build_automachine`` it is also True when a keyword ends at any
        # state reachable through the fail links (i.e. a keyword is a
        # suffix of the path leading here).
        self.end = False


class AC_Automachine(object):
    """Aho-Corasick automaton over symbols whose ``ord`` is below ``KIND``.

    Usage: ``insert`` every keyword, call ``build_automachine`` once, then
    call ``matchOne`` for each text.  Keywords inserted after the build are
    not matched reliably until ``build_automachine`` is called again.
    """

    def __init__(self):
        self.root = Node()
        # Kept for backward compatibility; the build now uses a local deque
        # and leaves this list empty, as the original did after a build.
        self.queue = []

    def getIndex(self, char):
        """Return the child-slot index of symbol ``char`` (its code point)."""
        return ord(char)  # - BASE

    def insert(self, string):
        """Add the keyword ``string`` to the trie.

        An empty keyword is ignored: marking the root as terminal would
        make every text match.  Raises ``IndexError`` if a symbol's code
        point is not below ``KIND``.
        """
        if not string:
            return
        p = self.root
        for char in string:
            index = self.getIndex(char)
            if p.next[index] is None:
                p.next[index] = Node()
            p = p.next[index]
        p.end = True

    def build_automachine(self):
        """Compute the fail links with a breadth-first walk of the trie.

        Also propagates ``end`` along the fail links so that ``matchOne``
        detects a keyword that is a proper suffix of the current path
        (e.g. keyword "bc" while walking the path of keyword "abcd").
        """
        from collections import deque  # O(1) pops from the left

        root = self.root
        root.fail = None
        pending = deque([root])
        while pending:
            parent = pending.popleft()
            for i, child in enumerate(parent.next):
                if child is None:
                    continue
                # Follow the parent's fail chain until a state with an
                # outgoing edge for symbol i is found.  For children of
                # the root the chain is empty, so they fail to the root.
                failp = parent.fail
                while failp is not None and failp.next[i] is None:
                    failp = failp.fail
                child.fail = root if failp is None else failp.next[i]
                # The fail target is shallower than child, so breadth-first
                # order guarantees its ``end`` flag is already final.
                if child.fail.end:
                    child.end = True
                pending.append(child)

    def matchOne(self, string, stride=1):
        """Return True if any inserted keyword occurs in ``string``.

        ``build_automachine`` must have been called after the last insert.

        ``stride`` (a positive integer, default 1) restricts matches to
        those ending after a multiple of ``stride`` symbols.  When every
        keyword's length is a multiple of ``stride`` this means only
        matches that also *start* on a ``stride`` boundary are reported.
        """
        root = self.root
        p = root
        for pos, char in enumerate(string, 1):
            index = self.getIndex(char)
            # Fall back along fail links until the symbol can be consumed
            # or the root is reached.
            while p.next[index] is None and p is not root:
                p = p.fail
            if p.next[index] is not None:
                p = p.next[index]
            if p.end and pos % stride == 0:
                return True
        return False


class UnicodeAC_AutoMachine(object):
    """Keyword matcher for unicode text built on ``AC_Automachine``.

    Text is encoded as UTF-8 and every byte is split into two 4-bit
    symbols (low half first, then high half), which keeps the trie
    fan-out at ``KIND`` = 16.
    """

    # ``unicode`` on Python 2, ``str`` on Python 3.
    _TEXT_TYPE = type(u'')

    def __init__(self):
        self.ac = AC_Automachine()

    def getAcString(self, string):
        """Return ``string`` as a sequence of 4-bit symbols.

        Each UTF-8 byte yields two characters: ``chr(byte % 16)`` followed
        by ``chr(byte // 16)``.
        """
        symbols = []
        for byte in bytearray(string.encode('utf-8')):
            symbols.append(chr(byte % 16))
            # Floor division: plain ``/`` yields a float on Python 3.
            symbols.append(chr(byte // 16))
        return ''.join(symbols)

    def insert(self, string):
        """Add the unicode keyword ``string``.

        Raises ``TypeError`` (a subclass of the ``Exception`` raised
        previously) if ``string`` is not a unicode string.
        """
        if not isinstance(string, self._TEXT_TYPE):
            raise TypeError('UnicodeAC_AutoMachine: insert type not unicode')
        self.ac.insert(self.getAcString(string))

    def build_automachine(self):
        """Build the fail links of the underlying automaton."""
        self.ac.build_automachine()

    def matchOne(self, string):
        """Return True if any inserted keyword occurs in ``string``.

        Raises ``TypeError`` if ``string`` is not a unicode string.
        """
        if not isinstance(string, self._TEXT_TYPE):
            raise TypeError('UnicodeAC_AutoMachine: matchOne type not unicode')
        # Every byte contributes two symbols, so only matches aligned to a
        # two-symbol boundary correspond to whole bytes.  Without this the
        # high half of one byte plus the low half of the next could be
        # mistaken for a keyword byte.
        return self.ac.matchOne(self.getAcString(string), 2)
def main2():
    """Demo: insert three keywords, build the automaton, probe five texts.

    Prints the boolean result of each ``matchOne`` call.
    """
    # NOTE(review): the sample literals are kept exactly as found; they look
    # machine-translated, so keywords and texts may no longer overlap the
    # way the author intended -- confirm against the original post.
    ac = UnicodeAC_AutoMachine()
    ac.insert(u'dianguang ')
    ac.insert(u'tasty ')
    ac.insert(u'play ')
    ac.build_automachine()
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(ac.matchOne(u'hi, what is Ding yaguang doing '))
    print(ac.matchOne(u' AB '))
    print(ac.matchOne(u'cannot eat '))
    print(ac.matchOne(u'the meal is delicious and there are plenty of delicious meals ,'))
    print(ac.matchOne(u' has a lot of fun '))


if __name__ == '__main__':
    main2()
The test results are as follows:
There are 2,000 keywords, each 2-12 characters long.
Each text is 60-80 characters long; I measured the time needed to match 4,000 texts and 300,000 texts.
The runs took 5.5 s and 619.8 s, respectively.
Overall, that is pretty good.
From New Day New Plan