mmseg演算法是對最大匹配演算法的擴充。簡單來說,mmseg每次匹配時,總會多向後匹配兩個單詞,然後選擇這三個單詞總體匹配最優的組合。
mmseg 主要做了以下幾方面的擴充:
假設對字串C1C2...Cn進行分割
匹配時,從小到大,逐個匹配字典中以C1開頭的詞
每次連續匹配三個詞語(three-word chunk ),並列出所有可能的分割
選擇最匹配的three-word chunk(依次運用以下規則,一旦可以選出唯一結果則返回):
a.三個單詞的總長度最大
b.單詞平均長度最大
c.單詞長度的方差最小
d.單詞的詞頻總和最大
選取three-word chunk中的第一個單詞,然後重複1-4這四個步驟
代碼如下(結合上一篇一起看 http://hi.baidu.com/bithigher/item/cbd098c52123df0a5050584d ):
由於選用的語料庫中沒有單個漢字的資訊,所以分詞效果還不是非常理想。下面代碼中的函數 mmseg 和上一篇中的函數 maxmatch 以及 maxmatch_back 的用法是一樣的,都可以作為參數傳給 solve 函數
git 地址: https://github.com/BitHigher/hfseg
# -*- coding: UTF-8 -*-
"""Dictionary-based word segmentation: forward/backward maximum matching
and an MMSEG-style three-word-chunk segmenter.

All segmenters look words up in the module-level dictionary ``d`` (filled by
``init``) and return a ``{word: count}`` mapping of the multi-character words
they found; single characters are consumed but not counted.
"""
import re
import sys

# Longest candidate word, in characters, that any matcher will try.
MAX_WORD_LEN = 5

# Global word -> frequency table. Mutated in place by init().
d = {}


def init(filename="SogouLabDic.dic", encoding="gbk"):
    """Load a tab-separated dictionary file into the global table ``d``.

    Each non-blank line must hold at least two tab-separated fields: the word
    and its integer frequency. Extra fields are ignored.

    Args:
        filename: Path of the dictionary file.
        encoding: Encoding used to decode each word (defaults to GBK).

    Raises:
        ValueError: If a non-blank line has fewer than two fields or the
            frequency field is not an integer.
    """
    # Binary mode keeps decoding explicit and identical on Python 2 and 3;
    # the context manager guarantees the file is closed.
    with open(filename, 'rb') as f:
        for line in f:
            if not line.strip():
                continue
            word, freq = line.split(b'\t')[0:2]
            try:
                word = word.decode(encoding)
            except UnicodeDecodeError:
                # Best effort: keep the undecoded word rather than abort.
                pass
            d[word] = int(freq)


def _count(result, word):
    """Increment the occurrence counter of ``word`` in ``result``."""
    result[word] = result.get(word, 0) + 1


def maxmatch(s):
    """Segment ``s`` by forward maximum matching.

    At each position the longest dictionary word (up to ``MAX_WORD_LEN``
    characters) is taken; if none matches, a single character is consumed.

    Returns:
        dict mapping each matched multi-character word to its count.
    """
    total = len(s)
    pos = 0
    result = {}
    while pos < total:
        wlen = 1
        for size in range(min(MAX_WORD_LEN, total - pos), 0, -1):
            if s[pos:pos + size] in d:
                wlen = size
                break
        if wlen > 1:
            _count(result, s[pos:pos + wlen])
        pos += wlen
    return result


def maxmatch_back(s):
    """Segment ``s`` by backward maximum matching (scanning from the end).

    Returns:
        dict mapping each matched multi-character word to its count.
    """
    end = len(s)
    result = {}
    while end > 0:
        wlen = 1
        for size in range(min(MAX_WORD_LEN, end), 0, -1):
            if s[end - size:end] in d:
                wlen = size
                break
        if wlen > 1:
            _count(result, s[end - wlen:end])
        end -= wlen
    return result


def one_word(s, start, rest=3):
    """Extend every partial chunk in ``start`` by up to ``rest`` more words.

    A chunk is a list of boundary offsets into ``s``; consecutive offsets
    delimit one word. Each chunk is extended with every dictionary word that
    begins at its last offset, or with a single character when no dictionary
    word matches there. Chunks that already reach the end of ``s`` are kept
    unchanged.

    Args:
        s: The text being segmented.
        start: List of chunks (lists of offsets) to extend.
        rest: Number of words still to append to each chunk.

    Returns:
        List of extended chunks, in discovery order.
    """
    result = []
    total = len(s)
    for former in start:
        p = former[-1]
        if p >= total:
            # This chunk is complete; keep it and go on with the others.
            # (Stopping the loop here would silently drop later candidates.)
            result.append(former)
            continue
        ends = [p + size
                for size in range(1, min(MAX_WORD_LEN, total - p) + 1)
                if s[p:p + size] in d]
        if not ends:
            # No dictionary word starts here: fall back to one character.
            ends = [p + 1]
        result.extend(former + [e] for e in ends)
    if rest > 1:
        return one_word(s, result, rest - 1)
    return result


def _total_length(s, chunk):
    """Number of characters covered by the chunk."""
    return chunk[-1] - chunk[0]


def _average_length(s, chunk):
    """Mean word length of the chunk."""
    return (chunk[-1] - chunk[0]) / float(len(chunk) - 1)


def _length_spread(s, chunk):
    """Sum of squared deviations of the word lengths from their mean."""
    avg = _average_length(s, chunk)
    return sum((b - a - avg) ** 2 for a, b in zip(chunk, chunk[1:]))


def _frequency_sum(s, chunk):
    """Sum of dictionary frequencies of the chunk's words (unknown = 0)."""
    return sum(d.get(s[a:b], 0) for a, b in zip(chunk, chunk[1:]))


# Disambiguation rules, applied in order until one candidate remains:
# (scoring function, builtin selecting the preferred score).
_RULES = (
    (_total_length, max),
    (_average_length, max),
    (_length_spread, min),
    (_frequency_sum, max),
)


def three_word_chunk(s, start):
    """Pick the best chunk of up to three words beginning at ``start``.

    Candidates come from ``one_word`` and are filtered by, in order: largest
    total length, largest average word length, smallest spread of word
    lengths, largest sum of word frequencies. If several candidates still
    tie, the first one found is returned.

    Returns:
        The chosen chunk as a list of boundary offsets. When ``start`` is at
        or past the end of ``s`` this is just ``[start]``.
    """
    candidates = one_word(s, [[start]], 3)
    for score, pick in _RULES:
        if len(candidates) == 1:
            break
        scores = [score(s, chunk) for chunk in candidates]
        best = pick(scores)
        candidates = [chunk for chunk, value in zip(candidates, scores)
                      if value == best]
    return candidates[0]


# look ahead two more words
def mmseg(s):
    """Segment ``s`` with the three-word-chunk (MMSEG-style) strategy.

    At each position the best three-word chunk is chosen and only its first
    word is committed; scanning then resumes right after that word.

    Returns:
        dict mapping each committed multi-character word to its count.
    """
    total = len(s)
    pos = 0
    result = {}
    while pos < total:
        chunk = three_word_chunk(s, pos)
        if len(chunk) < 2:
            break
        if chunk[1] - chunk[0] > 1:
            _count(result, s[chunk[0]:chunk[1]])
        pos = chunk[1]
    return result


def solve(s, segment=maxmatch):
    """Segment ``s`` with the given segmenter.

    Args:
        s: Text to segment; UTF-8 encoded bytes are decoded first, text
            strings are used as they are.
        segment: One of ``maxmatch``, ``maxmatch_back`` or ``mmseg`` (any
            callable taking the text and returning a word-count dict).

    Returns:
        Whatever ``segment`` returns for the decoded text.
    """
    if isinstance(s, bytes):
        s = s.decode("utf8")
    return segment(s)