Converting a Sogou Pinyin input dictionary (.scel) to a Google Pinyin input dictionary, with the Python script it is based on

Source: Internet
Author: User

Class

The code is as follows:

/*
Reader for Sogou Pinyin cell dictionaries (.scel).

sogouData is either the raw file content or a file path. An argument
shorter than _MAX_PATH bytes is first tried as a path with string.load;
if loading fails the argument itself is used as the data.
On a signature mismatch the constructor returns null plus an error message.
*/
class sogou{
	ctor( sogouData ){
		if( #sogouData < 0x104/*_MAX_PATH*/ ){
			var loaded = ..string.load(sogouData);
			if( loaded ) sogouData = loaded;
		}

		// Reject anything that does not begin with the 12-byte header checked here.
		if( ..string.left(sogouData,12) != '\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00' )
			return null,"data or path is not Sogou (. scel) Thesaurus";

		// Per-instance state, set whether the argument was a path or raw data.
		this.scelData = sogouData;
		this.pytable = {};
	};

	// Returns a table with the name, type, description and sample text of the dictionary.
	GetDescription = function(){
		var s = this.scelData;
		return {
			Name = fromUnicode( sub(s,0x131,0x338) );
			Dictype = fromUnicode( sub(s,0x339,0x540) );
			Description = fromUnicode( sub(s,0x541,0xD40) );
			Demo = fromUnicode( sub(s,0xD41,0x1540) );
		}
	};

	// Fills this.pytable (pinyin index -> pinyin text). Returns null when the table marker is missing.
	getpytable = function(){
		var data = sub(this.scelData,startPy,startChinese-1);
		if( ..string.left(data,4) != '\x9d\x01\x00\x00' )
			return null;

		data = sub(data,5,-1);
		var pos = 1;
		var len = #data;
		while( pos < len ){
			var index = readWord(data,pos);
			pos += 2;
			var size = readWord(data,pos);
			pos += 2;
			// string.sub uses an inclusive end index, so size bytes end at pos+size-1.
			this.pytable[index] = fromUnicode( sub(data,pos,pos+size-1) );
			pos += size;
		}
	};

	// Converts a block of two-byte pinyin indexes into the pinyin text of one phrase.
	Getwordpy = function(data){
		var pos = 1;
		var len = #data;
		var ret = "";
		while( pos < len ){
			var index = readWord(data,pos);
			// NOTE(review): separator assumed to be a single space (the result is right-trimmed below) - confirm.
			ret = concat(ret,this.pytable[index]," ");
			pos += 2;
		}
		return trimRight(ret);
	};

	// Returns an iterator that yields count,py,word for every phrase, and null at the end of the data.
	NextEntry = function(){
		this.getpytable();
		var data = sub(this.scelData,startChinese,-1);
		var pos = 1;
		var len = #data;
		var same = 0; // phrases still to be read for the current pinyin
		var py;       // pinyin shared by the current group of phrases

		return function(){
			if( pos > len ){
				return null;
			}

			// Start of a new group: read its phrase count and its pinyin index list.
			if( same == 0 ){
				same = readWord(data,pos);
				pos += 2;
				var pyTableLen = readWord(data,pos);
				pos += 2;
				py = this.Getwordpy( sub(data,pos,pos+pyTableLen-1) );
				pos += pyTableLen;
			}

			var cLen = readWord(data,pos);
			pos += 2;
			var word = fromUnicode( sub(data,pos,pos+cLen-1) );
			pos += cLen;

			var extLen = readWord(data,pos);
			pos += 2;
			// The count is read from the first two bytes of the extension data.
			var count = readWord(data,pos);
			pos += extLen;

			same--;
			return count,py,word;
		};
	};
}

namespace sogou{
	fromUnicode = ..string.fromUnicode;
	sub = ..string.sub;
	concat = ..string.concat;
	trimRight = ..string.trimright;

	// 1-based offsets of the pinyin table and the phrase table.
	startPy = 0x1540+1;
	startChinese = 0x2628+1;

	// Reads the two-byte little-endian integer stored at data[pos], data[pos+1].
	readWord = function(data,pos){
		return (data[pos+1]<<8) + data[pos];
	}
}

Call Method:

import console;

// Get the word library description information.
// NOTE(review): the file name below is a placeholder; the original name was garbled in SOURCE - use a real .scel path.
var sg,err = sogou("E:\Computer Vocabulary Encyclopedia (official recommendation).scel");

if( sg ){
	console.varDump( sg.GetDescription() );

	// Traverse the Sogou word library and save it as a Google input word library.
	var file = ..io.open("E:\google.txt","w");
	for( count,py,word in sg.NextEntry() ){
		file.write(word,'\t',count,'\t',py,'\n');
	}
	file.close();
}
else {
	// The constructor returns null plus a message when the data is not a .scel dictionary.
	console.log(err);
}

console.pause(true);

The code above is a port of the following Python script.

Python code (kept for reference):

The code is as follows Copy Code

#!/usr/bin/python
# -*- coding: utf-8 -*-


import binascii
import io
import pdb
import struct
import sys
# A Sogou .scel dictionary stores its text in Unicode, two bytes per character (Chinese character or English letter).
# Parsing it only requires knowing the offset of each section.
# There are two main sections:
# 1. Global pinyin table: apparently every pinyin combination, in dictionary order.
#    Format: a list of (index, len, pinyin) records
#    index:  two-byte integer, the index of this pinyin
#    len:    two-byte integer, the byte length of the pinyin
#    pinyin: the pinyin itself, two bytes per character, len bytes in total
#
# 2. Chinese phrase table
#    Format: a list of (same, py_table_len, py_table, {word_len, word, ext_len, ext}) records
#    same:         two-byte integer, the number of homophones
#    py_table_len: two-byte integer
#    py_table:     list of integers, two bytes each, each one a pinyin index
#
#    word_len: two-byte integer, the byte length of the Chinese phrase
#    word:     the Chinese phrase, two bytes per character, word_len bytes in total
#    ext_len:  two-byte integer, the length of the extension data (seems to always be 10)
#    ext:      extension data; the first two bytes are an integer (possibly the word frequency), the remaining eight bytes are all 0
#
#    {word_len, word, ext_len, ext} repeats `same` times, once per phrase sharing the same pinyin

# Offset of the pinyin table.
Startpy = 0x1540

# Offset of the Chinese phrase table.
Startchinese = 0x2628

# Global pinyin table: pinyin index -> pinyin string (filled by getpytable).
Gpy_table = {}

# Parse results: a list of (word frequency, pinyin, Chinese phrase) tuples.
Gtable = []

def byte2str(data):
    """Convert raw two-byte-per-character data into a text string.

    Carriage returns are turned into newlines and space characters are
    dropped. A trailing odd byte, if any, is ignored instead of raising.

    data: byte string holding little-endian 16-bit code units.
    Returns the decoded text (unicode).
    """
    # Only whole two-byte units can be decoded.
    usable = len(data) - (len(data) % 2)
    text = data[:usable].decode('utf-16-le', 'replace')
    return text.replace(u'\r', u'\n').replace(u' ', u'')
# Read the pinyin table.
def getpytable(data):
    """Parse the pinyin table section and store it in Gpy_table.

    data: the bytes between the pinyin table offset and the phrase table
    offset. It must start with the marker b"\\x9d\\x01\\x00\\x00";
    otherwise nothing is parsed and None is returned.

    Each record is (index, length, pinyin): two 16-bit integers followed
    by `length` bytes of two-byte-per-character text.
    """
    if data[0:4] != b"\x9d\x01\x00\x00":
        return None
    data = data[4:]
    pos = 0
    length = len(data)
    # Stop when fewer than the four header bytes of a record remain.
    while pos + 4 <= length:
        index, size = struct.unpack_from('<HH', data, pos)
        pos += 4
        Gpy_table[index] = byte2str(data[pos:pos + size])
        pos += size

# Get the pinyin of one phrase.
def getwordpy(data):
    """Return the pinyin for a list of pinyin indexes.

    data: byte string of 16-bit little-endian indexes into Gpy_table.
    Returns the looked-up pinyin strings concatenated in order.
    Raises KeyError if an index is not present in Gpy_table.
    """
    count = len(data) // 2
    indexes = struct.unpack_from('<%dH' % count, data)
    return u''.join(Gpy_table[index] for index in indexes)

# Get one phrase.
def getword(data):
    """Look up each 16-bit index of `data` in Gpy_table and concatenate the results.

    The body in SOURCE is identical to getwordpy.
    NOTE(review): no call to this function is visible in this file.

    Raises KeyError if an index is not present in Gpy_table.
    """
    count = len(data) // 2
    indexes = struct.unpack_from('<%dH' % count, data)
    return u''.join(Gpy_table[index] for index in indexes)

# Read the Chinese phrase table.
def getchinese(data):
    """Parse the phrase table and append (count, pinyin, word) tuples to Gtable.

    data: the bytes from the phrase table offset to the end of the file.
    The table is a sequence of groups; each group holds a homophone count,
    a pinyin index list, and then that many phrase records.
    """
    pos = 0
    length = len(data)
    # Stop when fewer than the four header bytes of a group remain.
    while pos + 4 <= length:
        # Number of homophones, then byte length of the pinyin index list.
        same, py_table_len = struct.unpack_from('<HH', data, pos)
        pos += 4

        # Pinyin index list shared by every phrase of this group.
        py = getwordpy(data[pos:pos + py_table_len])
        pos += py_table_len

        for _ in range(same):
            # Byte length of the phrase, then the phrase itself.
            c_len = struct.unpack_from('<H', data, pos)[0]
            pos += 2
            word = byte2str(data[pos:pos + c_len])
            pos += c_len

            # Extension length, followed by the extension data whose
            # first two bytes are read as the word frequency.
            ext_len, count = struct.unpack_from('<HH', data, pos)
            pos += 2

            # Save the result.
            Gtable.append((count, py, word))

            # Move to the next phrase.
            pos += ext_len


def deal(file_name):
    """Parse one .scel file, print its header fields and add its phrases to Gtable.

    file_name: path of the dictionary file to read.
    Exits the program with status 0 when the file does not start with the
    expected 12-byte header.
    """
    print('-' * 60)
    with open(file_name, 'rb') as f:
        data = f.read()

    if data[0:12] != b"\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00":
        print("Confirm that you selected the Sogou (. scel) thesaurus?")
        sys.exit(0)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("Thesaurus name: %s" % byte2str(data[0x130:0x338]))
    print("Thesaurus type: %s" % byte2str(data[0x338:0x540]))
    print("Descriptive information: %s" % byte2str(data[0x540:0xd40]))
    print("Thesaurus example: %s" % byte2str(data[0xd40:Startpy]))

    getpytable(data[Startpy:Startchinese])
    getchinese(data[Startchinese:])


if __name__ == '__main__':

    # Add the dictionaries to convert here.
    # NOTE(review): the file names in SOURCE are garbled (broken quoting);
    # the entries below are placeholders - replace them with the real .scel
    # file names, or pass the paths on the command line.
    o = ['Computer Glossary (official recommendation).scel',
         'IT computer.scel',
         'Computer Vocabulary Encyclopedia (official recommendation).scel',
         'Beijing city information selection.scel',
         'Common catering vocabulary.scel',
         'Idiom.scel',
         'Idiom (official recommendation).scel',
         'Legal lexicon (official recommendation).scel',
         'Real Estate Glossary (official recommendation).scel',
         'Mobile Vocabulary Encyclopedia (official recommendation).scel',
         'New word (official recommendation).scel',
         'Allegorical Sayings Collection (official recommendation).scel',
         'Food Encyclopedia (official recommendation).scel',
         ]

    # Paths given on the command line take precedence over the built-in list.
    if len(sys.argv) > 1:
        o = sys.argv[1:]

    for f in o:
        deal(f)

    # Save the results.
    # Gtable holds one (word frequency, pinyin, Chinese phrase) tuple per
    # phrase; it is not sorted, so the output follows the input file order.
    # The output encoding can be changed as needed.
    with io.open('Sougou.txt', 'w', encoding='GB18030') as f:
        for count, py, word in Gtable:
            f.write(u'{%s}%s %s' % (count, py, word))
            f.write(u'\n')

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any content on this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.