Python MMSEG Word Segmentation Implementation

The implementation follows the one at http://blog.csdn.net/acceptedxukai/article/details/7390300

I fixed several issues in it:

1. Taking the chunk with the largest sum of degree of morphemic freedom

This should be the sum of the logarithms of the frequencies of the single-character words in the chunk, not simply the sum of the raw frequencies of every word in the chunk (see the sketch below).
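A minimal standalone sketch of why the logarithm matters (the frequencies below are invented purely for illustration): with raw sums a single very frequent character dominates the comparison, while summing logarithms favours chunks whose single-character words are all reasonably free:

import math

chunk_a = [9000, 2]    # one very free character plus one nearly bound one
chunk_b = [500, 600]   # two moderately free characters

# the raw sum prefers chunk_a (9002 > 1100) ...
print sum(chunk_a) > sum(chunk_b)                                              # True
# ... but the log sum prefers chunk_b (~9.8 < ~12.6)
print sum(math.log(f) for f in chunk_a) > sum(math.log(f) for f in chunk_b)   # False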

2. Once a unique chunk has been determined, its first word should be taken as the next segmented word; the steps above are then repeated on the text that remains after removing that word, until the whole sentence is segmented. The chunk's words should not all be emitted at once.
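In code, fix 2 amounts to the outer loop below. Here best_chunk is a hypothetical stand-in for the four-rule cascade, and the string slicing is a simplification of the position bookkeeping the real code does; it is shown only to make the control flow concrete:

def segment(sentence, best_chunk):
    tokens = []
    while sentence:
        # keep only the first word of the winning chunk ...
        first = best_chunk(sentence).words[0]
        tokens.append(first.text)
        # ... and rerun the whole procedure on the remainder
        sentence = sentence[len(first.text):]
    return tokens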

For reference, the four MMSEG disambiguation rules, applied in order (a sketch of the cascade follows the list):

  • Rule 1: Maximum matching (take the chunk containing the most characters)
  • Rule 2: Largest average word length (take the chunk with the largest average word length)
  • Rule 3: Smallest variance of word lengths (take the chunk whose word lengths vary the least)
  • Rule 4: Largest sum of degree of morphemic freedom of one-character words (take the chunk with the largest morphemic-freedom sum)
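Each rule is applied only while more than one candidate chunk survives; the sketch below mirrors the mmFilter/lawlFilter/svmlFilter/logFreqFilter calls made by the code that follows:

def pick_chunk(chunks, cc):
    # cc is a ComplexCompare instance from the code below; the filters run in rule order 1-4
    for rule in (cc.mmFilter, cc.lawlFilter, cc.svmlFilter, cc.logFreqFilter):
        if len(chunks) <= 1:
            break
        chunks = rule(chunks)
    return chunks[0] if chunks else None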

The modified code is as follows:

#coding=utf-8
# NOTE: this is Python 2 code (it relies on file(), unicode and print statements).
import math
import string


class Word:
    def __init__(self, text='', freq=0):
        self.text = text
        self.freq = freq
        self.length = len(text)


class Chunk:
    def __init__(self, w1, w2=None, w3=None):
        self.words = []
        self.words.append(w1)
        if w2:
            self.words.append(w2)
        if w3:
            self.words.append(w3)

    # total number of characters in the chunk
    def totalWordLength(self):
        length = 0
        for word in self.words:
            length += len(word.text)
        return length

    # average word length
    def averageWordLength(self):
        return float(self.totalWordLength()) / float(len(self.words))

    # sum of squared deviations of the word lengths; rules 1 and 2 leave only
    # chunks with the same word count, so this ranks the same as the variance
    def standardDeviation(self):
        average = self.averageWordLength()
        total = 0.0
        for word in self.words:
            tmp = len(word.text) - average
            total += float(tmp) * float(tmp)
        return total

    # degree of morphemic freedom: the sum of log frequencies (fix 1),
    # not the raw frequency sum
    def wordFrequency(self):
        total = 0.0
        for word in self.words:
            total += math.log(word.freq)
        return total


class ComplexCompare:

    # keep every chunk that ranks highest under comparator
    def takeHightest(self, chunks, comparator):
        i = 1
        for j in range(1, len(chunks)):
            rlt = comparator(chunks[j], chunks[0])
            if rlt > 0:
                i = 0
            if rlt >= 0:
                chunks[i], chunks[j] = chunks[j], chunks[i]
                i += 1
        return chunks[0:i]

    # the next four methods are the four MMSEG filtering rules, the core of the algorithm
    def mmFilter(self, chunks):
        def comparator(a, b):
            return a.totalWordLength() - b.totalWordLength()
        return self.takeHightest(chunks, comparator)

    def lawlFilter(self, chunks):
        def comparator(a, b):
            return a.averageWordLength() - b.averageWordLength()
        return self.takeHightest(chunks, comparator)

    def svmlFilter(self, chunks):
        def comparator(a, b):
            return b.standardDeviation() - a.standardDeviation()
        return self.takeHightest(chunks, comparator)

    def logFreqFilter(self, chunks):
        def comparator(a, b):
            return a.wordFrequency() - b.wordFrequency()
        return self.takeHightest(chunks, comparator)


# the word and character dictionaries
dictWord = {}
maxWordLength = 0


def loadDictChars(filepath):
    global maxWordLength
    fsock = file(filepath)
    for line in fsock.readlines():
        freq, word = line.split(' ')
        word = unicode(word.strip(), 'utf-8')
        dictWord[word] = (len(word), int(freq))
        maxWordLength = max(maxWordLength, len(word))
    fsock.close()


def loadDictWords(filepath):
    global maxWordLength
    fsock = file(filepath)
    for line in fsock.readlines():
        txt = unicode(line.strip(), 'utf-8')
        word = txt.split(" ")[1]
        # multi-character words get frequency 1, so they contribute
        # log(1) = 0 to the morphemic-freedom sum
        dictWord[word] = (len(word), 1)
        maxWordLength = max(maxWordLength, len(word))
    fsock.close()


# look the word up in dictWord
def getDictWord(word):
    result = dictWord.get(word)
    if result:
        return Word(word, result[1])
    return None


# load the dictionaries
def run():
    from os.path import join, dirname
    loadDictChars(join(dirname(__file__), 'data', 'chars.dic'))
    loadDictWords(join(dirname(__file__), 'data', 'words.dic'))


class Analysis:

    def __init__(self, text):
        if isinstance(text, unicode):
            self.text = text
        else:
            # fixed: decode the byte string (the original called encode('utf-8'),
            # which fails on non-ASCII input)
            self.text = unicode(text, 'utf-8')
        self.cacheSize = 3
        self.pos = 0
        self.textLength = len(self.text)
        self.cache = []
        self.cacheIndex = 0
        self.complexCompare = ComplexCompare()

        # a small cache of recent dictionary lookups
        for i in range(self.cacheSize):
            self.cache.append([-1, Word()])

        # make sure the dictionaries are loaded only once
        if not dictWord:
            run()

    def __iter__(self):
        while True:
            token = self.getNextToken()
            if token is None:
                raise StopIteration
            yield token

    def getNextChar(self):
        return self.text[self.pos]

    # is the character a Chinese character (Chinese punctuation excluded)?
    def isChineseChar(self, charater):
        return 0x4e00 <= ord(charater) < 0x9fa6

    # is the character printable ASCII (excluding whitespace and punctuation)?
    def isASCIIChar(self, ch):
        if ch in string.whitespace:
            return False
        if ch in string.punctuation:
            return False
        return ch in string.printable

    # produce the next token
    def getNextToken(self):
        while self.pos < self.textLength:
            if self.isChineseChar(self.getNextChar()):
                token = self.getChineseWords()
            else:
                token = self.getASCIIWords() + '/'
            if len(token) > 0:
                return token
        return None

    # cut out a non-Chinese token
    def getASCIIWords(self):
        # skip leading whitespace and punctuation (Chinese or English)
        while self.pos < self.textLength:
            ch = self.getNextChar()
            if self.isASCIIChar(ch) or self.isChineseChar(ch):
                break
            self.pos += 1
        # start of the ASCII word
        start = self.pos

        # find the end of the ASCII word
        while self.pos < self.textLength:
            ch = self.getNextChar()
            if not self.isASCIIChar(ch):
                break
            self.pos += 1
        end = self.pos

        # skip trailing whitespace and punctuation (Chinese or English)
        while self.pos < self.textLength:
            ch = self.getNextChar()
            if self.isASCIIChar(ch) or self.isChineseChar(ch):
                break
            self.pos += 1

        return self.text[start:end]

    # cut out a Chinese token, applying the four rules above
    def getChineseWords(self):
        chunks = self.createChunks()
        if len(chunks) > 1:
            chunks = self.complexCompare.mmFilter(chunks)
        if len(chunks) > 1:
            chunks = self.complexCompare.lawlFilter(chunks)
        if len(chunks) > 1:
            chunks = self.complexCompare.svmlFilter(chunks)
        if len(chunks) > 1:
            chunks = self.complexCompare.logFreqFilter(chunks)
        if len(chunks) == 0:
            return ''

        # exactly one chunk is left; emit only its first word (fix 2) and let
        # the caller rescan from the position right after it
        word = chunks[0].words
        token = ""
        length = 0
        x = word[0]
        if x.length != -1:
            token += x.text + "/"
            length += len(x.text)

        self.pos += length
        return token

    # a triple loop that enumerates the candidate chunks (this could also be
    # written recursively)
    def createChunks(self):
        chunks = []
        originalPos = self.pos
        words1 = self.getMatchChineseWords()

        for word1 in words1:
            self.pos += len(word1.text)
            if self.pos < self.textLength:
                words2 = self.getMatchChineseWords()
                for word2 in words2:
                    self.pos += len(word2.text)
                    if self.pos < self.textLength:
                        words3 = self.getMatchChineseWords()
                        for word3 in words3:
                            if word3.length == -1:
                                chunk = Chunk(word1, word2)
                            else:
                                chunk = Chunk(word1, word2, word3)
                            chunks.append(chunk)
                    elif self.pos == self.textLength:
                        chunks.append(Chunk(word1, word2))
                    self.pos -= len(word2.text)
            elif self.pos == self.textLength:
                chunks.append(Chunk(word1))
            self.pos -= len(word1.text)

        self.pos = originalPos
        return chunks

    # forward maximum matching against the dictionary
    def getMatchChineseWords(self):
        # check the cache first
        for i in range(self.cacheSize):
            if self.cache[i][0] == self.pos:
                return self.cache[i][1]

        originalPos = self.pos
        words = []
        index = 0
        while self.pos < self.textLength:
            if index >= maxWordLength:
                break
            if not self.isChineseChar(self.getNextChar()):
                break
            self.pos += 1
            index += 1

            text = self.text[originalPos:self.pos]
            word = getDictWord(text)
            if word:
                words.append(word)

        self.pos = originalPos
        # no dictionary match: insert an 'X' placeholder with length -1
        if not words:
            word = Word()
            word.length = -1
            word.text = 'X'
            words.append(word)

        self.cache[self.cacheIndex] = (self.pos, words)
        self.cacheIndex += 1
        if self.cacheIndex >= self.cacheSize:
            self.cacheIndex = 0
        return words
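For completeness, a minimal usage sketch (Python 2, matching the code above). It assumes data/chars.dic and data/words.dic exist next to the script in the line formats the loaders expect; the example sentence and its segmentation are illustrative only, since the actual output depends on the dictionary contents:

#coding=utf-8
def demo():
    sentence = u'研究生命起源'
    # each yielded token already carries a trailing '/'
    tokens = [token for token in Analysis(sentence)]
    print ''.join(tokens)   # e.g. 研究/生命/起源/ with a suitable dictionary

if __name__ == '__main__':
    demo()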