#!/usr/bin/python
#encoding =GBK
import sys
# Length, in characters, of the longest word the matchers will try.
# Starts at 5 and is raised by INITDCT when the dictionary holds longer words.
Dictmaxlength = 5
# Dictionary entries: encoded word (byte string) -> the dictionary line it
# was read from.
Dctdict = {}
# Codec used to decode input bytes and to encode the segmented output.
encoding = 'GBK'
'''
Initialize the dictionary and the maximum word length.
'''
def INITDCT(DCT):
    """Load the dictionary file into ``Dctdict`` and update ``Dictmaxlength``.

    Every non-blank line contributes one entry.  The word is the text before
    the first tab, stripped of surrounding whitespace; it is stored as an
    encoded byte string that maps to the whole stripped line.
    ``Dictmaxlength`` is raised to the character length of the longest word
    seen and is never lowered.

    Args:
        DCT: Path of the dictionary file, one entry per line, encoded with
            the module-level ``encoding``.

    Raises:
        IOError / OSError: If the file cannot be opened.
        UnicodeDecodeError: If a word is not valid in ``encoding``.
    """
    global Dctdict
    global Dictmaxlength
    # Binary mode keeps the keys as encoded byte strings, which is what the
    # matchers look up.
    with open(DCT, "rb") as dict_file:
        for raw_line in dict_file:
            entry = raw_line.strip()
            if not entry:
                # A blank line would otherwise register an empty "word".
                continue
            word = entry.split(b"\t")[0].strip()
            Dctdict[word] = entry
            # Length is measured in characters, not bytes, hence the decode.
            Dictmaxlength = max(Dictmaxlength, len(word.decode(encoding)))
'''
Forward maximum matching algorithm
'''
def maximunmathching(Sent):
    """Segment ``Sent`` with the forward maximum matching algorithm.

    Scanning left to right, the longest dictionary word (at most
    ``Dictmaxlength`` characters, at least two) starting at the current
    position is emitted.  When no such word exists, the single character at
    that position is emitted instead, unless it is a space.

    Args:
        Sent: Sentence as a byte string encoded with the module-level
            ``encoding``.  Surrounding whitespace is ignored.

    Returns:
        The tokens, each encoded with ``encoding``, joined by single spaces
        into one byte string.  Empty input yields an empty byte string.

    Raises:
        UnicodeDecodeError: If ``Sent`` is not valid in ``encoding``.
    """
    text = Sent.strip().decode(encoding)
    text_len = len(text)
    tokens = []
    index = 0
    while index < text_len:
        # Never look past the end of the sentence.
        longest = min(Dictmaxlength, text_len - index)
        for size in range(longest, 1, -1):
            candidate = text[index:index + size].encode(encoding)
            if candidate in Dctdict:
                tokens.append(candidate)
                index += size
                break
        else:
            # No multi-character word starts here: emit one character.  This
            # branch always advances, so the loop ends even if
            # Dictmaxlength is below 1.
            char = text[index]
            if char != " ":
                tokens.append(char.encode(encoding))
            index += 1
    return b" ".join(tokens)
'''
Reverse maximum matching algorithm
'''
def reversemaximunmathching(Sent):
    """Segment ``Sent`` with the reverse maximum matching algorithm.

    Scanning right to left, the longest dictionary word (at most
    ``Dictmaxlength`` characters, at least two) ending at the current
    position is emitted.  When no such word exists, the single character
    before that position is emitted instead, unless it is a space.

    Args:
        Sent: Sentence as a byte string encoded with the module-level
            ``encoding``.  Surrounding whitespace is ignored.

    Returns:
        The tokens in reading order, each encoded with ``encoding``, joined
        by single spaces into one byte string.  Empty input yields an empty
        byte string.

    Raises:
        UnicodeDecodeError: If ``Sent`` is not valid in ``encoding``.
    """
    text = Sent.strip().decode(encoding)
    tokens = []
    end = len(text)
    # Stop once the start of the sentence is reached; an extra pass at
    # position 0 would only see an empty slice.
    while end > 0:
        longest = min(Dictmaxlength, end)
        for size in range(longest, 1, -1):
            candidate = text[end - size:end].encode(encoding)
            if candidate in Dctdict:
                tokens.append(candidate)
                end -= size
                break
        else:
            # No multi-character word ends here: emit one character.
            char = text[end - 1]
            if char != " ":
                tokens.append(char.encode(encoding))
            end -= 1
    # Tokens were collected back to front.
    tokens.reverse()
    return b" ".join(tokens)
'''
Prefer the segmentation with fewer out-of-dictionary words, fewer single-character words, and fewer words in total.
'''
def segmenter(Sent):
    """Segment ``Sent`` with bidirectional maximum matching.

    Runs the forward and the reverse matcher.  When both produce the same
    segmentation it is returned directly; otherwise ``bmmresult`` picks one
    of the two.

    Args:
        Sent: Sentence in the form accepted by ``maximunmathching`` and
            ``reversemaximunmathching``.

    Returns:
        The chosen segmentation, with tokens separated by single spaces.
    """
    forward = maximunmathching(Sent).strip()
    backward = reversemaximunmathching(Sent).strip()
    if forward == backward:
        return forward
    return bmmresult(forward, backward)
'''
Prefer the segmentation with fewer out-of-dictionary words, fewer single-character words, and fewer words in total.
'''
def _segmentation_stats(words):
    """Return (out-of-dictionary count, single-character count, word count)."""
    oov_count = sum(1 for word in words if word not in Dctdict)
    single_count = sum(
        1 for word in words if len(word.decode(encoding)) == 1
    )
    return oov_count, single_count, len(words)


def bmmresult(mm, RMM):
    """Choose between a forward and a reverse segmentation.

    The two candidates are compared on three counts: words missing from
    ``Dctdict``, single-character words, and total words.  For each count
    the candidate with the smaller value scores one point.  The forward
    result is returned only when it scores strictly more points; ties go to
    the reverse result.

    Args:
        mm: Forward segmentation, an encoded byte string with tokens
            separated by single spaces.
        RMM: Reverse segmentation in the same format.

    Returns:
        Either ``mm`` or ``RMM``, unchanged.
    """
    mm_stats = _segmentation_stats(mm.split(b" "))
    rmm_stats = _segmentation_stats(RMM.split(b" "))
    mm_score = 0
    rmm_score = 0
    for mm_count, rmm_count in zip(mm_stats, rmm_stats):
        if mm_count < rmm_count:
            mm_score += 1
        elif mm_count > rmm_count:
            rmm_score += 1
    # Compare the two scores against each other.  Comparing the forward
    # score with itself would always be false and always pick RMM.
    if mm_score > rmm_score:
        return mm
    return RMM
def handlefile(input, Output):
    """Segment every line of one file and write the results to another.

    Each input line is stripped, its ASCII letters are lower-cased, and the
    line is passed to ``segmenter``.  The result is written as one output
    line.  The running line number is printed every 100000 lines.

    Args:
        input: Path of the file to read, encoded with the module-level
            ``encoding``.
        Output: Path of the file to write; it is created or truncated.

    Raises:
        IOError / OSError: If either file cannot be opened.
        UnicodeDecodeError: If a line is not valid in ``encoding``.
    """
    with open(input, "rb") as source, open(Output, "wb") as sink:
        for line_number, raw_line in enumerate(source, 1):
            if line_number % 100000 == 0:
                # Progress indicator for large inputs.
                print(str(line_number) + "\r")
            text = raw_line.strip().decode(encoding)
            # Lower-case on decoded text, ASCII letters only.  A byte-level
            # lower() would also alter trail bytes of multi-byte characters
            # that fall in the range A-Z.
            lowered = "".join(
                char.lower() if ord(char) < 128 else char for char in text
            )
            segmented = segmenter(lowered.encode(encoding))
            sink.write(segmented.strip() + b"\n")
# Command-line entry point: <script> dict infile outfile
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: %s dict[in] infile[in] outfile[out]." % sys.argv[0])
        sys.exit(-1)
    # Load the dictionary first; the segmenters read it from module globals.
    INITDCT(sys.argv[1])
    handlefile(sys.argv[2], sys.argv[3])
# Bidirectional (two-way) maximum matching word segmentation algorithm