Two-way maximal matching segmentation algorithm

Source: Internet
Author: User

#!/usr/bin/python
# Dictionary words and sentences are handled as GBK text (see `encoding`).

import sys

# Longest candidate word, in characters, that the matchers will try.
# INITDCT raises this value when the dictionary contains a longer word.
Dictmaxlength = 5

# Maps each dictionary word to the full dictionary line it was read from.
Dctdict = {}

# Codec used to convert between encoded words/sentences and text.
encoding = 'GBK'

def INITDCT(DCT):
    """Load the word dictionary stored in the file ``DCT``.

    Every line holds one entry whose first tab-separated field is the word.
    The file is read as raw bytes, so the keys added to ``Dctdict`` are the
    words exactly as encoded in the file and each value is the whole
    stripped line.  ``Dictmaxlength`` is raised, when necessary, to the
    length in characters of the longest word loaded.

    Args:
        DCT: Path of the dictionary file.

    Raises:
        IOError / OSError: If the file cannot be opened.
        UnicodeDecodeError: If a word is not valid text in ``encoding``.
    """
    global Dictmaxlength

    # Binary mode keeps the keys as encoded bytes on both Python 2 and 3,
    # which is the form the matchers use when they look a word up.
    with open(DCT, "rb") as dct_file:
        for raw_line in dct_file:
            entry = raw_line.strip()
            word = entry.split(b"\t")[0].strip()
            if not word:
                # Blank line or empty first field: nothing to index.
                continue
            Dctdict[word] = entry
            # The limit is measured in characters, not in encoded bytes.
            Dictmaxlength = max(Dictmaxlength, len(word.decode(encoding)))

def maximunmathching(Sent):
    """Segment ``Sent`` with the forward maximum matching algorithm.

    Scanning left to right, the longest candidate (at most ``Dictmaxlength``
    characters) that is a key of ``Dctdict`` is emitted as a word.  When no
    multi-character candidate matches, the single character at the current
    position is emitted instead; a lone space character is skipped.

    Args:
        Sent: The sentence, as bytes encoded in ``encoding`` (already
            decoded text is accepted as well).

    Returns:
        The words, each encoded in ``encoding``, joined by single spaces.
    """
    text = Sent.strip()
    if isinstance(text, bytes):
        text = text.decode(encoding)

    text_len = len(text)
    # Guard against a non-positive limit, which would never advance.
    max_len = max(Dictmaxlength, 1)
    pieces = []
    index = 0
    while index < text_len:
        # Never try candidates longer than what is left of the sentence:
        # they would all be the same clamped substring.
        for size in range(min(max_len, text_len - index), 0, -1):
            encoded = text[index:index + size].encode(encoding)
            if size > 1:
                if encoded in Dctdict:
                    pieces.append(encoded)
                    index += size
                    break
            else:
                # Single character: always consumed, emitted unless a space.
                if encoded != b" ":
                    pieces.append(encoded)
                index += size
                break
    return b" ".join(pieces).strip()

def reversemaximunmathching(Sent):
    """Segment ``Sent`` with the reverse (right-to-left) maximum matching algorithm.

    Scanning from the end of the sentence towards its start, the longest
    candidate (at most ``Dictmaxlength`` characters) ending at the current
    position that is a key of ``Dctdict`` is emitted as a word.  When no
    multi-character candidate matches, the single character is emitted
    instead; a lone space character is skipped.

    Args:
        Sent: The sentence, as bytes encoded in ``encoding`` (already
            decoded text is accepted as well).

    Returns:
        The words in sentence order, each encoded in ``encoding``, joined
        by single spaces.
    """
    text = Sent.strip()
    if isinstance(text, bytes):
        text = text.decode(encoding)

    # Guard against a non-positive limit, which would never advance.
    max_len = max(Dictmaxlength, 1)
    pieces = []
    end = len(text)
    # Stop as soon as the start is reached; iterating once more at
    # position 0 would append an empty token and a stray leading space.
    while end > 0:
        for size in range(min(max_len, end), 0, -1):
            encoded = text[end - size:end].encode(encoding)
            if size > 1:
                if encoded in Dctdict:
                    pieces.append(encoded)
                    end -= size
                    break
            else:
                # Single character: always consumed, emitted unless a space.
                if encoded != b" ":
                    pieces.append(encoded)
                end -= size
                break
    # Words were collected right to left; restore sentence order.
    pieces.reverse()
    return b" ".join(pieces)

def segmenter(Sent):
    """Segment ``Sent`` with bidirectional maximum matching.

    The sentence is segmented both forwards and backwards.  When the two
    results agree that result is returned; otherwise ``bmmresult`` picks
    between them (fewer non-dictionary words, fewer single-character words
    and fewer words in total are preferred).

    Args:
        Sent: The sentence to segment, in the form accepted by
            ``maximunmathching`` and ``reversemaximunmathching``.

    Returns:
        The chosen segmentation: words separated by single spaces.
    """
    forward = maximunmathching(Sent).strip()
    backward = reversemaximunmathching(Sent).strip()
    if forward == backward:
        return forward
    return bmmresult(forward, backward)

def _segmentation_counts(segmentation):
    """Return (non-dictionary words, single-character words, total words)."""
    words = segmentation.split(b" ")
    unknown = sum(1 for word in words if word not in Dctdict)
    # NOTE(review): the original indentation was lost; single-character
    # words are counted for every word, whether or not it is in the
    # dictionary -- confirm against the original source if available.
    single = sum(1 for word in words if len(word.decode(encoding)) == 1)
    return unknown, single, len(words)


def bmmresult(mm, RMM):
    """Choose between a forward and a reverse segmentation of one sentence.

    Three counts are compared: words that are not keys of ``Dctdict``,
    single-character words, and the total number of words.  For each count,
    the segmentation with the strictly lower value earns one point.

    Args:
        mm: Forward segmentation, space-separated encoded bytes.
        RMM: Reverse segmentation, space-separated encoded bytes.

    Returns:
        ``mm`` when it earns strictly more points than ``RMM``; otherwise
        ``RMM`` (so ties go to the reverse segmentation).
    """
    mm_points = 0
    rmm_points = 0
    for mm_count, rmm_count in zip(_segmentation_counts(mm),
                                   _segmentation_counts(RMM)):
        if mm_count > rmm_count:
            rmm_points += 1
        elif mm_count < rmm_count:
            mm_points += 1

    if rmm_points < mm_points:
        return mm
    return RMM

def handlefile(input, Output):
    """Segment every line of the file ``input`` and write the results to ``Output``.

    Each input line is stripped, lower-cased and passed to ``segmenter``;
    the segmentation is written as one output line.  The running line
    number is printed after every 100000 lines as a progress indicator.

    Args:
        input: Path of the file to read, one sentence per line, encoded
            in ``encoding``.
        Output: Path of the file to write; it is overwritten.

    Raises:
        IOError / OSError: If either file cannot be opened.
        UnicodeDecodeError: If a line is not valid text in ``encoding``.
    """
    # Binary mode on both sides: sentences travel as encoded bytes.
    with open(input, "rb") as source, open(Output, "wb") as target:
        for index, raw_line in enumerate(source, 1):
            if index % 100000 == 0:
                print(str(index) + "\r")
            # Lower-case the decoded text, not the raw bytes: the second
            # byte of a double-byte character may fall in the ASCII A-Z
            # range and would be corrupted by a byte-wise lower().
            text = raw_line.strip().decode(encoding).lower()
            segmented = segmenter(text.encode(encoding))
            target.write(segmented.strip() + b"\n")

# Command-line entry point: load the dictionary, then segment a file.
if __name__ == '__main__':
    import sys

    if len(sys.argv) != 4:
        print("Usage: %s dict[in] infile[in] outfile[out]." % sys.argv[0])
        sys.exit(-1)

    # argv[1]: dictionary file, argv[2]: input file, argv[3]: output file.
    INITDCT(sys.argv[1])
    handlefile(sys.argv[2], sys.argv[3])

Two-way maximal matching segmentation algorithm

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please email us, and we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support; 6 Free Tickets per Quarter; Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.