The implementation is based on http://blog.csdn.net/acceptedxukai/article/details/7390300
I fixed a few problems in it (a minimal sketch of both fixes follows this list):
1. Rule 4, "take the chunk with the largest degree of morphemic freedom": the score must be the sum of the logarithms of the frequencies of the single-character words in the chunk, not the plain sum of every word's raw frequency.
2. Once the four rules leave a unique chunk, only its first word should be emitted as the next segmented word; the remainder of the text (with that first word removed) is then segmented again by repeating the steps above, until the whole sentence is consumed. The original code instead emitted every word in the winning chunk at once.
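A minimal sketch of the two fixes, with hypothetical helper names (`morphemic_freedom`, `best_chunk_of`) that are illustrative and not part of the code below. Note that the full code below achieves fix 1 differently: multi-character words are loaded with frequency 1, so their log contribution is 0 and only single-character frequencies matter.

```python
import math

# Fix 1: the rule-4 score is the sum of log(frequency) over the
# single-character words in a chunk, not the raw frequency sum.
def morphemic_freedom(chunk_words):
    # chunk_words: list of (text, freq) pairs; freq > 0 is assumed
    return sum(math.log(freq) for text, freq in chunk_words if len(text) == 1)

# Fix 2: once the filters leave a single chunk, emit only its first
# word and re-segment the remainder, instead of emitting the whole chunk.
def segment(sentence, best_chunk_of):
    # best_chunk_of: hypothetical function applying rules 1-4 and
    # returning the winning chunk as a list of word strings
    tokens = []
    while sentence:
        first = best_chunk_of(sentence)[0]   # first word of the winning chunk
        tokens.append(first)
        sentence = sentence[len(first):]     # repeat on what is left
    return tokens
```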
- Rule 1: Maximum matching (take the chunk covering the largest total number of characters)
- Rule 2: Largest average word length (take the chunk with the largest average word length)
- Rule 3: Smallest variance of word lengths (take the chunk with the smallest variance of word lengths)
- Rule 4: Largest sum of degree of morphemic freedom of one-character words (take the chunk with the largest degree of morphemic freedom)
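The rules act as tie-breakers applied in order: each filter runs only while more than one candidate chunk survives. A minimal sketch of the cascade (the method names match the `ComplexCompare` class in the code below; `pickBestChunk` itself is illustrative):

```python
def pickBestChunk(chunks, cc):
    # cc is a ComplexCompare instance; apply rules 1-4 in order
    # and stop as soon as a single chunk remains.
    for ruleFilter in (cc.mmFilter, cc.lawlFilter,
                       cc.svmlFilter, cc.logFreqFilter):
        if len(chunks) <= 1:
            break
        chunks = ruleFilter(chunks)
    return chunks[0] if chunks else None
```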
The modified code (Python 2):
```python
#coding=utf-8
import math

class Word:
    def __init__(self, text='', freq=0):
        self.text = text
        self.freq = freq
        self.length = len(text)

class Chunk:
    def __init__(self, w1, w2=None, w3=None):
        self.words = [w1]
        if w2:
            self.words.append(w2)
        if w3:
            self.words.append(w3)

    # Total number of characters covered by the chunk (rule 1).
    def totalWordLength(self):
        length = 0
        for word in self.words:
            length += len(word.text)
        return length

    # Average word length (rule 2).
    def averageWordLength(self):
        return float(self.totalWordLength()) / float(len(self.words))

    # Rule-3 score. The name is kept from the original code, but this
    # actually returns the sum of squared deviations from the mean.
    def standardDeviation(self):
        average = self.averageWordLength()
        total = 0.0
        for word in self.words:
            tmp = len(word.text) - average
            total += tmp * tmp
        return total

    # Degree of morphemic freedom (rule 4): sum of log word frequencies.
    # Words from words.dic are loaded with freq 1, so log(1) == 0 and
    # only single-character frequencies from chars.dic contribute.
    def wordFrequency(self):
        total = 0.0
        for word in self.words:
            #total += word.freq
            total += math.log(word.freq)
        return total

class ComplexCompare:
    # Keep every chunk that compares highest under the given comparator.
    def takeHightest(self, chunks, comparator):
        i = 1
        for j in range(1, len(chunks)):
            rlt = comparator(chunks[j], chunks[0])
            if rlt > 0:
                i = 0
            if rlt >= 0:
                chunks[i], chunks[j] = chunks[j], chunks[i]
                i += 1
        return chunks[0:i]

    # The four MMSEG filtering rules; this is the core of the algorithm.
    def mmFilter(self, chunks):        # rule 1: maximum matching
        def comparator(a, b):
            return a.totalWordLength() - b.totalWordLength()
        return self.takeHightest(chunks, comparator)

    def lawlFilter(self, chunks):      # rule 2: largest average word length
        def comparator(a, b):
            return a.averageWordLength() - b.averageWordLength()
        return self.takeHightest(chunks, comparator)

    def svmlFilter(self, chunks):      # rule 3: smallest variance of word lengths
        def comparator(a, b):
            return b.standardDeviation() - a.standardDeviation()
        return self.takeHightest(chunks, comparator)

    def logFreqFilter(self, chunks):   # rule 4: largest degree of morphemic freedom
        def comparator(a, b):
            return a.wordFrequency() - b.wordFrequency()
        return self.takeHightest(chunks, comparator)

# Word and character dictionaries.
dictWord = {}
maxWordLength = 0

def loadDictChars(filepath):
    global maxWordLength
    fsock = open(filepath)
    for line in fsock.readlines():
        freq, word = line.split(' ')
        word = unicode(word.strip(), 'utf-8')
        dictWord[word] = (len(word), int(freq))
        maxWordLength = max(maxWordLength, len(word))
    fsock.close()

def loadDictWords(filepath):
    global maxWordLength
    fsock = open(filepath)
    for line in fsock.readlines():
        txt = unicode(line.strip(), 'utf-8')
        word = txt.split(' ')[1]
        # Use frequency 1 (not 0) so that math.log() in
        # Chunk.wordFrequency() is defined and contributes 0.
        #dictWord[word] = (len(word), 0)
        dictWord[word] = (len(word), 1)
        maxWordLength = max(maxWordLength, len(word))
    fsock.close()

# Look up word in dictWord; return a Word, or None if absent.
def getDictWord(word):
    result = dictWord.get(word)
    if result:
        return Word(word, result[1])
    return None

# Load the dictionaries.
def run():
    from os.path import join, dirname
    loadDictChars(join(dirname(__file__), 'data', 'chars.dic'))
    loadDictWords(join(dirname(__file__), 'data', 'words.dic'))

class Analysis:

    def __init__(self, text):
        if isinstance(text, unicode):
            self.text = text
        else:
            # Decode byte strings to unicode so indexing and ord()
            # work per character (the original encoded here, a bug).
            self.text = text.decode('utf-8')
        self.cacheSize = 3
        self.pos = 0
        self.textLength = len(self.text)
        self.cache = []
        self.cacheIndex = 0
        self.complexCompare = ComplexCompare()

        # A small position-keyed cache for getMatchChineseWords()
        # (the original author was unsure how much it helps).
        for i in range(self.cacheSize):
            self.cache.append([-1, Word()])

        # Load the dictionaries only once.
        if not dictWord:
            run()

    def __iter__(self):
        while True:
            token = self.getNextToken()
            if token is None:
                raise StopIteration
            yield token

    def getNextChar(self):
        return self.text[self.pos]

    # Is the character a Chinese character (excluding Chinese punctuation)?
    def isChineseChar(self, charater):
        return 0x4e00 <= ord(charater) < 0x9fa6

    # Is the character printable ASCII, excluding whitespace and punctuation?
    def isASCIIChar(self, ch):
        import string
        if ch in string.whitespace:
            return False
        if ch in string.punctuation:
            return False
        return ch in string.printable

    # Produce the next segmented token.
    def getNextToken(self):
        while self.pos < self.textLength:
            if self.isChineseChar(self.getNextChar()):
                token = self.getChineseWords()
            else:
                token = self.getASCIIWords() + '/'
            if len(token) > 0:
                return token
        return None

    # Cut out a non-Chinese token.
    def getASCIIWords(self):
        # Skip whitespace and punctuation (Chinese or English) before the word.
        while self.pos < self.textLength:
            ch = self.getNextChar()
            if self.isASCIIChar(ch) or self.isChineseChar(ch):
                break
            self.pos += 1
        # Start of the ASCII word.
        start = self.pos
        # Find the end of the ASCII word.
        while self.pos < self.textLength:
            ch = self.getNextChar()
            if not self.isASCIIChar(ch):
                break
            self.pos += 1
        end = self.pos
        # Skip whitespace and punctuation after the word.
        while self.pos < self.textLength:
            ch = self.getNextChar()
            if self.isASCIIChar(ch) or self.isChineseChar(ch):
                break
            self.pos += 1
        return self.text[start:end]

    # Cut out Chinese words, applying the four rules above in order.
    def getChineseWords(self):
        chunks = self.createChunks()
        if len(chunks) > 1:
            chunks = self.complexCompare.mmFilter(chunks)
        if len(chunks) > 1:
            chunks = self.complexCompare.lawlFilter(chunks)
        if len(chunks) > 1:
            chunks = self.complexCompare.svmlFilter(chunks)
        if len(chunks) > 1:
            chunks = self.complexCompare.logFreqFilter(chunks)
        if len(chunks) == 0:
            return ''

        # Only one segmentation is left; emit just the first word of
        # the winning chunk (modification 2 above), then advance.
        word = chunks[0].words
        token = ""
        length = 0
        #for x in word:
        #    if x.length != -1:
        #        token += x.text + "/"
        #        length += len(x.text)
        x = word[0]
        if x.length != -1:
            token += x.text + "/"
            length += len(x.text)
        self.pos += length
        return token

    # Enumerate candidate chunks of up to three words with a triple
    # loop (this could also be written recursively).
    def createChunks(self):
        chunks = []
        originalPos = self.pos
        words1 = self.getMatchChineseWords()

        for word1 in words1:
            self.pos += len(word1.text)
            if self.pos < self.textLength:
                words2 = self.getMatchChineseWords()
                for word2 in words2:
                    self.pos += len(word2.text)
                    if self.pos < self.textLength:
                        words3 = self.getMatchChineseWords()
                        for word3 in words3:
                            if word3.length == -1:
                                chunk = Chunk(word1, word2)
                            else:
                                chunk = Chunk(word1, word2, word3)
                            chunks.append(chunk)
                    elif self.pos == self.textLength:
                        chunks.append(Chunk(word1, word2))
                    self.pos -= len(word2.text)
            elif self.pos == self.textLength:
                chunks.append(Chunk(word1))
            self.pos -= len(word1.text)
        self.pos = originalPos
        return chunks

    # Collect all dictionary words starting at the current position
    # (forward maximum matching against the dictionary).
    def getMatchChineseWords(self):
        # Check the position-keyed cache first.
        for i in range(self.cacheSize):
            if self.cache[i][0] == self.pos:
                return self.cache[i][1]

        originalPos = self.pos
        words = []
        index = 0
        while self.pos < self.textLength:
            if index >= maxWordLength:
                break
            if not self.isChineseChar(self.getNextChar()):
                break
            self.pos += 1
            index += 1
            text = self.text[originalPos:self.pos]
            word = getDictWord(text)
            if word:
                words.append(word)
        self.pos = originalPos

        # If nothing matched, insert a placeholder 'X' word with its
        # length marked as -1.
        if not words:
            word = Word()
            word.length = -1
            word.text = 'X'
            words.append(word)

        self.cache[self.cacheIndex] = (self.pos, words)
        self.cacheIndex += 1
        if self.cacheIndex >= self.cacheSize:
            self.cacheIndex = 0
        return words
```
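A minimal usage sketch, assuming the code above is saved as a script with `data/chars.dic` and `data/words.dic` next to it; the sample sentence is only illustrative:

```python
#coding=utf-8
# Analysis is iterable and yields one '/'-terminated token at a time.
if __name__ == '__main__':
    text = u"研究生命的起源"
    for token in Analysis(text):
        print token,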