解析搜狗詞庫(python)

最後更新：2015-04-17 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

標籤：

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Parser for Sogou input-method dictionaries (.scel).
#
# A .scel file stores its text as 16-bit little-endian code units, two bytes
# per character (Chinese or Latin).  Parsing is only a matter of knowing the
# offset of each section.  There are two main sections:
#
# 1. Global pinyin table -- apparently every pinyin syllable, in dictionary
#    order.  It is a list of (index, len, pinyin) records:
#        index:  2-byte integer, the index of this syllable
#        len:    2-byte integer, byte length of the syllable text
#        pinyin: the syllable text, two bytes per character, `len` bytes
#
# 2. Chinese phrase table.  It is a list of
#    (same, py_table_len, py_table, {word_len, word, ext_len, ext}) records:
#        same:         2-byte integer, number of homophones
#        py_table_len: 2-byte integer, byte length of py_table
#        py_table:     list of 2-byte integers, each one a pinyin index
#        word_len:     2-byte integer, byte length of the phrase
#        word:         the phrase, two bytes per character, `word_len` bytes
#        ext_len:      2-byte integer, length of the extension data
#                      (seems to always be 10)
#        ext:          extension data; the first two bytes are an integer
#                      (possibly the word frequency -- unconfirmed), the
#                      remaining eight bytes are all zero
#    The {word_len, word, ext_len, ext} group repeats `same` times:
#    homophones that share one pinyin table.

import binascii
import io
import pdb
import struct
import sys

# Offset of the pinyin table.
startPy = 0x1540

# Offset of the Chinese phrase table.
startChinese = 0x2628

# Global pinyin table: syllable index -> syllable text.
GPy_Table = {}

# Parse result: list of (frequency, pinyin, phrase) tuples.
GTable = []


def byte2str(data):
    """Decode raw dictionary bytes into a unicode string.

    `data` is interpreted as UTF-16 little-endian.  Every carriage return is
    turned into a line feed and every space character is dropped.  A dangling
    odd byte at the end is ignored and undecodable units become U+FFFD.
    """
    usable = len(data) - (len(data) % 2)
    text = data[:usable].decode('utf-16-le', 'replace')
    return text.replace(u'\r', u'\n').replace(u' ', u'')


def getPyTable(data):
    """Fill GPy_Table from the pinyin-table section.

    `data` must start with the 4-byte marker 9D 01 00 00; otherwise nothing
    is parsed.  Always returns None.
    """
    if data[0:4] != b"\x9D\x01\x00\x00":
        return None
    data = data[4:]
    pos = 0
    length = len(data)
    # Each record needs at least its 4-byte (index, length) header.
    while pos + 4 <= length:
        index, size = struct.unpack_from('<HH', data, pos)
        pos += 4
        GPy_Table[index] = byte2str(data[pos:pos + size])
        pos += size


def getWordPy(data):
    """Return the pinyin of one phrase.

    `data` is a sequence of 2-byte little-endian pinyin indexes; the matching
    GPy_Table entries are concatenated in order.  Raises KeyError when an
    index is not present in GPy_Table.
    """
    count = len(data) // 2
    indexes = struct.unpack_from('<%dH' % count, data)
    return u''.join(GPy_Table[index] for index in indexes)


def getWord(data):
    """Same lookup as getWordPy; kept under this name for existing callers."""
    return getWordPy(data)


def getChinese(data):
    """Parse the phrase-table section and append its entries to GTable.

    Each appended item is a (frequency, pinyin, phrase) tuple.  Raises
    struct.error when a record is cut short and KeyError when a pinyin index
    is missing from GPy_Table.
    """
    pos = 0
    length = len(data)
    while pos + 4 <= length:
        # Number of homophones, then byte length of the pinyin index table.
        same, py_table_len = struct.unpack_from('<HH', data, pos)
        pos += 4
        py = getWordPy(data[pos:pos + py_table_len])
        pos += py_table_len
        for _ in range(same):
            # Byte length of the phrase, followed by the phrase itself.
            (c_len,) = struct.unpack_from('<H', data, pos)
            pos += 2
            word = byte2str(data[pos:pos + c_len])
            pos += c_len
            # Extension length, then the integer that opens the extension.
            ext_len, count = struct.unpack_from('<HH', data, pos)
            pos += 2
            GTable.append((count, py, word))
            # Jump over the extension to reach the next phrase.
            pos += ext_len


def _print_field(label, value):
    """Print one metadata line as '<label> <value>'."""
    print(u"%s %s" % (label, value))


def deal(file_name):
    """Parse one .scel file, print its metadata and extend GTable.

    Exits the process with status 1 (after printing a hint) when the file
    does not start with the expected 12-byte .scel header.
    """
    print('-' * 60)
    with open(file_name, 'rb') as f:
        data = f.read()

    if data[0:12] != b"\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00":
        print("確認你選擇的是搜狗(.scel)詞庫?")
        sys.exit(1)

    # Fixed-offset header fields: name, type, description, sample words.
    _print_field(u"詞庫名：", byte2str(data[0x130:0x338]))
    _print_field(u"詞庫類型：", byte2str(data[0x338:0x540]))
    _print_field(u"描述資訊：", byte2str(data[0x540:0xd40]))
    _print_field(u"詞庫樣本：", byte2str(data[0xd40:startPy]))

    getPyTable(data[startPy:startChinese])
    getChinese(data[startChinese:])


def _save_results(path):
    """Write GTable to `path`, one '{frequency}pinyin phrase' line per entry.

    Entries are not sorted: they keep the order in which the input files
    were parsed.  The output encoding is GB18030; change it here if needed.
    """
    with io.open(path, 'w', encoding='gb18030') as out:
        for count, py, word in GTable:
            out.write(u'{%s}%s %s\n' % (count, py, word))


def _main(argv):
    """Convert every dictionary named on the command line into sougou.txt."""
    if len(argv) < 2:
        sys.stderr.write("usage: %s FILE.scel [FILE.scel ...]\n" % argv[0])
        return 2
    for path in argv[1:]:
        print(path)
        deal(path)
    _save_results('sougou.txt')
    return 0


if __name__ == '__main__':
    sys.exit(_main(sys.argv))

解析搜狗詞庫(python)

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More