#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Convert Sogou ``.scel`` dictionaries into a plain-text word list.

Usage::

    python <this script> FILE.scel [FILE.scel ...]

A ``.scel`` file stores its text as two-byte Unicode code units (Chinese
characters and English letters alike).  Two sections matter here:

1. Global pinyin table -- apparently every pinyin syllable, in dictionary
   order.  Each entry is ``(index, len, pinyin)``:

   * ``index``  two-byte integer, the index of this syllable
   * ``len``    two-byte integer, byte length of ``pinyin``
   * ``pinyin`` the syllable, two bytes per character, ``len`` bytes total

2. Chinese phrase table.  Each record is
   ``(same, py_table_len, py_table, {word_len, word, ext_len, ext})``:

   * ``same``          two-byte integer, number of homophones
   * ``py_table_len``  two-byte integer, byte length of ``py_table``
   * ``py_table``      two-byte integers, each a pinyin-table index
   * ``word_len``      two-byte integer, byte length of ``word``
   * ``word``          the phrase, two bytes per character
   * ``ext_len``       two-byte integer, byte length of ``ext`` (the
                       original author notes it seems to always be 10)
   * ``ext``           first two bytes are an integer (the original author
                       was not sure whether it is the word frequency),
                       followed by eight zero bytes

   The ``{word_len, word, ext_len, ext}`` group repeats ``same`` times; all
   repetitions share the same ``py_table``.

Only single-argument ``print(...)`` calls and ``b''``/``u''`` literals are
used, so the module avoids constructs specific to one major Python version.
"""

# binascii and pdb are not used below; they are kept from the original
# import list.
import binascii
import io
import pdb
import struct
import sys

# Offset of the pinyin table.
startpy = 0x1540

# Offset of the Chinese phrase table.
startchinese = 0x2628

# Global pinyin table: pinyin index -> pinyin string.
gpy_table = {}

# Parse results: list of (word frequency, pinyin, Chinese phrase) tuples.
GTable = []


def _read_u16(data, pos):
    """Return the unsigned 16-bit integer stored at ``data[pos:pos + 2]``.

    Little-endian is requested explicitly instead of the platform's native
    order: the file header checked in ``deal`` starts with ``40 15 00 00``,
    which is 0x1540 (``startpy``) stored little-endian.

    Raises:
        struct.error: fewer than two bytes are available at ``pos``.
    """
    return struct.unpack_from('<H', data, pos)[0]


def byte2str(data):
    """Convert raw two-byte-per-character data to a text string.

    Carriage returns become newlines and space characters are dropped; every
    other character is kept.  Input that cannot be decoded (for example a
    dangling odd byte) is replaced with U+FFFD instead of raising.
    """
    text = data.decode('utf-16-le', 'replace')
    return text.replace(u'\r', u'\n').replace(u' ', u'')


def getpytable(data):
    """Fill ``gpy_table`` from the pinyin-table section of a file.

    ``data`` must start with the four-byte marker ``9d 01 00 00``; if it does
    not, nothing is parsed and ``None`` is returned.  Otherwise
    ``(index, len, pinyin)`` entries are read until the data is exhausted.

    Raises:
        struct.error: the data ends in the middle of an entry header.
    """
    if data[0:4] != b"\x9d\x01\x00\x00":
        return None
    data = data[4:]
    pos = 0
    length = len(data)
    while pos < length:
        index = _read_u16(data, pos)
        pos += 2
        py_len = _read_u16(data, pos)
        pos += 2
        gpy_table[index] = byte2str(data[pos:pos + py_len])
        pos += py_len


def getwordpy(data):
    """Return the pinyin of a phrase.

    ``data`` is a sequence of two-byte pinyin indexes; the matching
    ``gpy_table`` entries are concatenated in order.

    Raises:
        struct.error: ``len(data)`` is odd.
        KeyError: an index is not present in ``gpy_table``.
    """
    indexes = struct.unpack('<%dH' % (len(data) // 2), data)
    return u''.join(gpy_table[index] for index in indexes)


def getword(data):
    """Return the concatenated ``gpy_table`` entries for ``data``.

    The original body was a verbatim copy of ``getwordpy``, so this simply
    delegates to it.  Nothing else in this file calls it.
    """
    return getwordpy(data)


def getchinese(data):
    """Parse the Chinese phrase table and append the results to ``GTable``.

    One ``(count, pinyin, word)`` tuple is appended per phrase, where
    ``count`` is the integer in the first two bytes of the phrase's extended
    data.

    Raises:
        struct.error: the data ends in the middle of a record.
        KeyError: a record references a pinyin index missing from
            ``gpy_table``.
    """
    pos = 0
    length = len(data)
    while pos < length:
        # Number of homophones sharing the pinyin that follows.
        same = _read_u16(data, pos)
        pos += 2
        # Byte length of the pinyin index table.
        py_table_len = _read_u16(data, pos)
        pos += 2
        py = getwordpy(data[pos:pos + py_table_len])
        pos += py_table_len
        for _ in range(same):
            # Byte length of the phrase, then the phrase itself.
            c_len = _read_u16(data, pos)
            pos += 2
            word = byte2str(data[pos:pos + c_len])
            pos += c_len
            # Byte length of the extended data.
            ext_len = _read_u16(data, pos)
            pos += 2
            # The count sits at the start of the extended data; it is read
            # without advancing so that skipping ext_len bytes lands on the
            # next phrase.
            count = _read_u16(data, pos)
            GTable.append((count, py, word))
            pos += ext_len


def deal(file_name):
    """Parse one ``.scel`` file, adding its contents to the module tables.

    Prints a separator line and the four header text fields, then loads the
    pinyin table into ``gpy_table`` and the phrases into ``GTable``.

    If the file does not start with the expected 12-byte header, a message is
    printed and the process exits via ``sys.exit(0)`` (status 0 is kept from
    the original script).
    """
    print('-' * 60)
    with open(file_name, 'rb') as f:
        data = f.read()

    if data[0:12] != b"\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00":
        print("confirm that you choose the Sogou (. scel) thesaurus?")
        sys.exit(0)

    header_fields = (
        ("Word Bank Name:", 0x130, 0x338),
        ("Thesaurus Type:", 0x338, 0x540),
        ("Description Information:", 0x540, 0xd40),
        ("Thesaurus Example:", 0xd40, startpy),
    )
    for label, start, end in header_fields:
        print(u"{0} {1}".format(label, byte2str(data[start:end])))

    getpytable(data[startpy:startchinese])
    getchinese(data[startchinese:])


def main(argv=None):
    """Convert the dictionaries named on the command line.

    Every path in ``argv[1:]`` is parsed with ``deal``; the accumulated
    ``GTable`` is then written to ``Sougou.txt`` in the current directory,
    one ``{count}pinyin word`` line per phrase, encoded as GB18030.  Results
    are not sorted, so they appear in the order of the input files.

    Returns:
        2 if no input file was given, otherwise ``None``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write("usage: %s FILE.scel [FILE.scel ...]\n" % argv[0])
        return 2

    for path in argv[1:]:
        print(path)
        deal(path)

    # The output encoding can be changed here if another one is preferred.
    with io.open('Sougou.txt', 'w', encoding='gb18030') as out:
        for count, py, word in GTable:
            # Each element is (word frequency, pinyin, Chinese phrase);
            # adapt this line to emit a different format.
            out.write(u'{{{0}}}{1} {2}\n'.format(count, py, word))
    return None


if __name__ == '__main__':
    sys.exit(main())
# Parsing Sogou Thesaurus (Python)