Use Go to read a StarDict dictionary .idx file and output its entries line by line

Source: Internet
Author: User
This is a creation in Article, where the information may have evolved or changed.

StarDict dictionary idx file format:


Each entry in the word list contains three fields, stored one after the other:

word_str; a UTF-8 string terminated by '\0'.

A UTF-8 encoded string that ends with a '\0' terminator. The length of word_str must be less than 256 bytes (not counting the terminator).

word_data_offset; the word data's offset in the .dict file.

The offset of the word data in the .dict file.

If the version is "3.0.0" and "idxoffsetbits=64",

word_data_offset is a 64-bit unsigned number in network byte order; otherwise it is a 32-bit unsigned number in network byte order.


word_data_size; the word data's total size in the .dict file.

The total size of the word data in the .dict file. word_data_size is always a 32-bit unsigned number

in network byte order.


// Command main converts a StarDict .idx word index into a plain-text listing
// with one "word<TAB>offset<TAB>size" line per index entry.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"os"
	"strconv"
)

// idxPath is the StarDict index file that main reads.
const idxPath = "Gaojihanyudacidian_fix.idx"

// outPath is the text file that main writes.
const outPath = "output.txt"

// idxOffsetBits is the width of word_data_offset in idxPath. It is 32 unless
// the dictionary's .ifo file declares version 3.0.0 with idxoffsetbits=64.
const idxOffsetBits = 32

// sizeFieldLen is the byte width of word_data_size, which is always a 32-bit
// unsigned number.
const sizeFieldLen = 4

// main reads idxPath, renders every entry as a text line and writes the
// result to outPath. On any error it reports to stderr and exits with status
// 1 without producing a partial output file.
func main() {
	data, err := os.ReadFile(idxPath)
	if err != nil {
		fail(err)
	}
	out, err := renderIndex(data, idxOffsetBits)
	if err != nil {
		fail(fmt.Errorf("parsing %s: %w", idxPath, err))
	}
	// 0666 matches the mode os.Create would have used for the output file.
	if err := os.WriteFile(outPath, out, 0666); err != nil {
		fail(err)
	}
	fmt.Printf("\nfinish read\n")
}

// fail prints err to stderr and terminates the program with a non-zero
// exit status.
func fail(err error) {
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}

// renderIndex decodes the raw contents of a StarDict .idx file and returns
// its text form: for each entry, the word, a tab, the decimal data offset, a
// tab, the decimal data size and a newline.
//
// offsetBits is the width of the word_data_offset field and must be 32 or 64.
// Both numeric fields are read in network (big-endian) byte order.
//
// An error is returned when offsetBits is unsupported, when a word has no
// NUL terminator, or when the data ends in the middle of an entry. Empty
// input yields empty output.
func renderIndex(data []byte, offsetBits int) ([]byte, error) {
	if offsetBits != 32 && offsetBits != 64 {
		return nil, fmt.Errorf("unsupported idxoffsetbits value %d", offsetBits)
	}
	offsetLen := offsetBits / 8
	tailLen := offsetLen + sizeFieldLen

	// The text form is usually somewhat larger than the binary form; this is
	// only a capacity hint, append grows the slice if it is too small.
	out := make([]byte, 0, 2*len(data))

	for pos, n := 0, 1; pos < len(data); n++ {
		// The word is an opaque byte string ending at the first NUL. UTF-8
		// never uses a zero byte inside a multi-byte sequence, so no decoding
		// is needed to find where the word ends.
		end := bytes.IndexByte(data[pos:], 0)
		if end < 0 {
			return nil, fmt.Errorf("entry %d at byte %d: word has no NUL terminator", n, pos)
		}
		word := data[pos : pos+end]
		pos += end + 1

		if len(data)-pos < tailLen {
			return nil, fmt.Errorf("entry %d (%q): truncated offset/size fields", n, word)
		}
		var offset uint64
		if offsetBits == 64 {
			offset = binary.BigEndian.Uint64(data[pos:])
		} else {
			offset = uint64(binary.BigEndian.Uint32(data[pos:]))
		}
		size := binary.BigEndian.Uint32(data[pos+offsetLen:])
		pos += tailLen

		out = append(out, word...)
		out = append(out, '\t')
		out = strconv.AppendUint(out, offset, 10)
		out = append(out, '\t')
		out = strconv.AppendUint(out, uint64(size), 10)
		out = append(out, '\n')
	}
	return out, nil
}


Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.