使用golang讀取StarDict 詞典idx檔案並按行輸出

來源:互聯網
上載者:User
這是一篇建立時間較早的文章,其中的資訊可能已經有所發展或是發生改變。

StarDict 詞典idx檔案格式:


單詞清單中的每一個條目依序包含三個欄位:

word_str;                 // a utf-8 string terminated by '\0'.

                                  // 一個 utf-8 編碼字串,以 '\0' 終止符結束。word_str 的長度應小於 256。

word_data_offset; // word data's offset in .dict file

                                  // 單詞資料在 .dict 檔案中的位移,

                                  //If the version is "3.0.0" and "idxoffsetbits=64", 

                                  //word_data_offset will be 64-bits unsigned number in network byte order. 


word_data_size;   // word data's total size in .dict file

                                  // 單詞資料在 .dict 檔案中的總大小,word_data_size should be 32-bits unsigned number

                                  // in network byte order.


// Command main converts a StarDict .idx file into a plain-text listing with
// one index entry per line.
//
// Each .idx entry consists of three consecutive fields:
//
//	word_str          UTF-8 headword terminated by a NUL byte
//	word_data_offset  big-endian unsigned offset into the .dict file
//	                  (32 bits, or 64 bits when idxoffsetbits=64)
//	word_data_size    big-endian unsigned 32-bit size of the .dict data
//
// Each output line has the form "word<TAB>offset<TAB>size<LF>", with offset
// and size printed in decimal.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"os"
	"strconv"
	"unicode/utf8"
)

const (
	// idxPath is the StarDict index file that is read.
	idxPath = "gaojihanyudacidian_fix.idx"
	// outPath is the text file that receives one line per index entry.
	outPath = "output.txt"
	// sizeLen is the byte width of the word_data_size field.
	sizeLen = 4
)

// main converts idxPath to outPath assuming 32-bit offsets. Failures are
// reported on stderr with exit status 1.
func main() {
	if err := run(idxPath, outPath, 32); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Printf("\nfinish read\n")
}

// run reads the index file at in, formats its entries and writes the result
// to out. offsetBits is the width of word_data_offset (32 or 64).
//
// If the index is malformed part-way through, the entries decoded before the
// damaged record are still written to out and the parse error is returned.
func run(in, out string, offsetBits int) error {
	data, err := os.ReadFile(in)
	if err != nil {
		return fmt.Errorf("reading index: %w", err)
	}

	lines, parseErr := formatIndex(data, offsetBits)

	// 0666 matches the mode os.Create would use (before the umask is applied).
	if err := os.WriteFile(out, lines, 0666); err != nil {
		return fmt.Errorf("writing listing: %w", err)
	}
	if parseErr != nil {
		return fmt.Errorf("parsing %s: %w", in, parseErr)
	}
	return nil
}

// formatIndex decodes the raw contents of a StarDict .idx file and returns
// the text listing, one "word\toffset\tsize\n" line per entry.
//
// offsetBits selects the width of word_data_offset and must be 32 or 64.
// On a malformed entry the lines for all preceding entries are returned
// together with a non-nil error naming the byte position of the bad entry.
func formatIndex(data []byte, offsetBits int) ([]byte, error) {
	if offsetBits != 32 && offsetBits != 64 {
		return nil, fmt.Errorf("unsupported idxoffsetbits %d (want 32 or 64)", offsetBits)
	}
	offsetLen := offsetBits / 8

	out := make([]byte, 0, len(data)+len(data)/2)
	for pos := 0; pos < len(data); {
		// The headword runs up to, but not including, the next NUL byte.
		end := bytes.IndexByte(data[pos:], 0)
		if end < 0 {
			return out, fmt.Errorf("entry at byte %d: word is not NUL-terminated", pos)
		}
		word := data[pos : pos+end]
		if !utf8.Valid(word) {
			return out, fmt.Errorf("entry at byte %d: word is not valid UTF-8", pos)
		}

		fields := data[pos+end+1:]
		if len(fields) < offsetLen+sizeLen {
			return out, fmt.Errorf("entry at byte %d: truncated offset/size fields", pos)
		}

		// Both numeric fields are stored in network (big-endian) byte order.
		var offset uint64
		if offsetBits == 64 {
			offset = binary.BigEndian.Uint64(fields)
		} else {
			offset = uint64(binary.BigEndian.Uint32(fields))
		}
		size := binary.BigEndian.Uint32(fields[offsetLen:])

		out = append(out, word...)
		out = append(out, '\t')
		out = strconv.AppendUint(out, offset, 10)
		out = append(out, '\t')
		out = strconv.AppendUint(out, uint64(size), 10)
		out = append(out, '\n')

		pos += end + 1 + offsetLen + sizeLen
	}
	return out, nil
}


聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.