#!/usr/bin/python
# -*- coding: utf-8 -*-
import binascii
import io
import pdb
import struct
import sys

# A Sogou .scel dictionary stores its text as Unicode, two bytes per character
# (Chinese character or English letter); parsing it only requires locating the
# offset of each section.  There are two main sections:
#
# 1. Global pinyin table -- apparently every pinyin combination, in dictionary
#    order.  A list of (index, len, pinyin) records:
#      index:  two-byte integer, the index of this pinyin
#      len:    two-byte integer, byte length of the pinyin
#      pinyin: the pinyin itself, two bytes per character, `len` bytes in total
#
# 2. Chinese phrase table.  A list of records shaped like
#    (same, py_table_len, py_table, {word_len, word, ext_len, ext}):
#      same:         two-byte integer, number of homophones
#      py_table_len: two-byte integer
#      py_table:     list of two-byte integers, each one a pinyin index
#      word_len:     two-byte integer, byte length of the phrase
#      word:         the phrase, two bytes per character, word_len bytes in total
#      ext_len:      two-byte integer, length of the extension data (seems to
#                    always be 10)
#      ext:          extension data; the first two bytes are an integer
#                    (possibly the word frequency), the following eight bytes
#                    are all 0
#    {word_len, word, ext_len, ext} is repeated `same` times for one pinyin
#    table.
# Offset of the pinyin table within a .scel file.
Startpy = 0x1540

# Offset of the Chinese phrase table within a .scel file.
Startchinese = 0x2628

# Global pinyin table: maps a pinyin index to its pinyin string.
Gpy_table = {}
# Parse results: a list of (word frequency, pinyin, Chinese phrase) tuples.
Gtable = []

# Python 2 spells the "code point -> text" builtin ``unichr``; Python 3 ``chr``.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


def byte2str(data):
    """Convert raw dictionary bytes into a text string.

    ``data`` is read as consecutive 16-bit code units, one character each.
    A carriage return becomes a newline and space characters are dropped.
    A dangling final byte (odd-length input) is ignored.

    The byte order is pinned to little-endian rather than the host's native
    order: the file magic ``40 15 00 00`` equals the 0x1540 pinyin-table
    offset only when read little-endian.

    Args:
        data: bytes (``str`` on Python 2) holding two-byte characters.

    Returns:
        The decoded text (``unicode`` on Python 2, ``str`` on Python 3).
    """
    unit_count = len(data) // 2
    units = struct.unpack_from('<%dH' % unit_count, data)
    pieces = []
    for unit in units:
        char = _unichr(unit)
        if char == u'\r':
            pieces.append(u'\n')
        elif char != u' ':
            pieces.append(char)
    return u''.join(pieces)


def getpytable(data):
    """Load the global pinyin table from ``data`` into ``Gpy_table``.

    ``data`` must start with the four-byte marker ``9d 01 00 00``; otherwise
    nothing is loaded.  After the marker come (index, len, pinyin) records:
    two two-byte integers followed by ``len`` bytes of two-byte characters.
    Parsing stops once fewer than four bytes (one record header) remain.

    Args:
        data: bytes of the pinyin-table section of a .scel file.

    Returns:
        None in every case; the result is stored in ``Gpy_table``.
    """
    if data[0:4] != b"\x9d\x01\x00\x00":
        return None
    data = data[4:]
    pos = 0
    length = len(data)
    while pos + 4 <= length:
        index, size = struct.unpack_from('<HH', data, pos)
        pos += 4
        Gpy_table[index] = byte2str(data[pos:pos + size])
        pos += size
def getwordpy(data):
    """Return the pinyin of one phrase.

    ``data`` is a sequence of two-byte little-endian pinyin indexes; each one
    is looked up in ``Gpy_table`` and the results are concatenated in order.
    A dangling final byte (odd-length input) is ignored.

    Args:
        data: bytes of a phrase's pinyin index table.

    Returns:
        The concatenated pinyin text.

    Raises:
        KeyError: if an index is not present in ``Gpy_table``.
    """
    index_count = len(data) // 2
    indexes = struct.unpack_from('<%dH' % index_count, data)
    return u''.join(Gpy_table[index] for index in indexes)
def getword(data):
    """Return the text for one phrase by looking up two-byte indexes.

    Each two-byte little-endian integer in ``data`` is looked up in
    ``Gpy_table`` and the results are concatenated in order.  A dangling
    final byte (odd-length input) is ignored.

    NOTE(review): this performs the same pinyin-table lookup as ``getwordpy``
    and is not called anywhere in the visible source -- confirm whether it is
    still needed.

    Args:
        data: bytes holding two-byte indexes.

    Returns:
        The concatenated text.

    Raises:
        KeyError: if an index is not present in ``Gpy_table``.
    """
    index_count = len(data) // 2
    indexes = struct.unpack_from('<%dH' % index_count, data)
    return u''.join(Gpy_table[index] for index in indexes)
def getchinese(data):
    """Parse the Chinese phrase table and append its entries to ``Gtable``.

    The table is a sequence of groups.  Each group starts with two two-byte
    integers (number of homophones, byte length of the pinyin index table),
    followed by the pinyin index table and then one
    (word_len, word, ext_len, ext) record per homophone.  For every record a
    ``(count, pinyin, word)`` tuple is appended to ``Gtable``, where ``count``
    is the first two-byte integer of the extension data.

    Parsing stops once fewer than four bytes (one group header) remain.

    Args:
        data: bytes of the phrase-table section of a .scel file.

    Raises:
        struct.error: if the data ends in the middle of a record.
        KeyError: if a pinyin index is not present in ``Gpy_table``.
    """
    pos = 0
    length = len(data)
    while pos + 4 <= length:
        # Number of homophones, then byte length of the pinyin index table.
        same, py_table_len = struct.unpack_from('<HH', data, pos)
        pos += 4

        # Pinyin index table, shared by every homophone in this group.
        py = getwordpy(data[pos:pos + py_table_len])
        pos += py_table_len

        for _ in range(same):
            # Byte length of the phrase, then the phrase itself.
            (c_len,) = struct.unpack_from('<H', data, pos)
            pos += 2
            word = byte2str(data[pos:pos + c_len])
            pos += c_len

            # Extension length, immediately followed by the word frequency
            # (the first two bytes of the extension data).
            ext_len, count = struct.unpack_from('<HH', data, pos)
            pos += 2

            Gtable.append((count, py, word))

            # ext_len counts from the start of the extension data, so this
            # lands on the next record.
            pos += ext_len
def deal(file_name):
    """Parse one Sogou .scel dictionary file and collect its entries.

    Prints a separator line and the dictionary's header fields, then loads
    the pinyin table into ``Gpy_table`` and appends every phrase to
    ``Gtable``.

    Args:
        file_name: path of the .scel file to read.

    Raises:
        SystemExit: (code 0) if the file does not start with the expected
            12-byte .scel signature.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('-' * 60)
    with open(file_name, 'rb') as f:
        data = f.read()

    if data[0:12] != b"\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00":
        print("Confirm that you selected the Sogou (. scel) thesaurus?")
        # NOTE(review): exits with status 0 even though the input was
        # rejected; kept as-is because callers may rely on it.
        sys.exit(0)

    print("Thesaurus name: %s" % byte2str(data[0x130:0x338]))
    print("Thesaurus type: %s" % byte2str(data[0x338:0x540]))
    print("Descriptive information: %s" % byte2str(data[0x540:0xd40]))
    print("Thesaurus example: %s" % byte2str(data[0xd40:Startpy]))

    getpytable(data[Startpy:Startchinese])
    getchinese(data[Startchinese:])
if __name__ == '__main__':
    # Add the dictionaries to be converted here.  Paths given on the command
    # line take precedence over this built-in list.
    # NOTE(review): the default names below were reconstructed from a garbled
    # list -- replace them with the real .scel file names before relying on
    # them.
    o = sys.argv[1:] or [
        'Computer Glossary (official recommendation).scel',
        'It computer.scel',
        'Computer Vocabulary encyclopedia (official recommendation).scel',
        'Beijing city information selection.scel',
        'Common catering vocabulary.scel',
        'Idiom.scel',
        'The idiom (official recommendation).scel',
        'Legal lexicon (official recommendation).scel',
        'Real Estate Glossary (official recommendation).scel',
        'Mobile Vocabulary Encyclopedia (official recommendation).scel',
        'The new word (official recommendation).scel',
        'Allegorical Sayings Collection (official recommendation).scel',
        'Food Encyclopedia (official recommendation).scel',
    ]

    for f in o:
        deal(f)

    # Save the results.  Gtable is a list of (word frequency, pinyin, Chinese
    # phrase) tuples; write it in whatever format you need.  Nothing is
    # sorted, so entries appear in the order the files were processed.
    # The output encoding (GB18030) can be changed to suit.
    with io.open('Sougou.txt', 'w', encoding='GB18030') as out:
        for count, py, word in Gtable:
            out.write(u'{%s}%s %s\n' % (count, py, word))