Go語言源碼中Replacer尋找部份的筆記

來源:互聯網
上載者:User
這是一篇先前建立的文章,其中的資訊可能已經有所發展或是發生改變。用過strings.NewReplacer與replacer.Replace():將參數成對(old, new)傳入後,它能依優先順序替換,並能處理中文字串參數.

覺得功能強大,特別好用.對它的尋找和優先順序怎麼處理有點興趣,花時間研究了下源碼,在這記錄一下個人理解.

  

// Package main is a study copy of the trie-based generic replacer from the Go
// standard library's strings package. It is instrumented with fmt.Println
// calls that show how the trie is built and how lookup picks a match by
// priority.
//
// author: xcl, 2014-1-20 (notes)
package main

import (
	"fmt"
	"strings"
)

// main runs the real strings.Replacer over a sample string, then builds the
// instrumented copy from the same patterns so both outputs can be compared.
func main() {
	patterns := []string{
		"y", "25",
		"中", "國",
		"中工", "傢伙",
	}

	/*
		patterns := make([]string, 270*2)
		for i := 0; i < 270*2; i++ {
			patterns[i] = fmt.Sprintf("%d", i)
		}
	*/

	replacer := strings.NewReplacer(patterns...)
	format := "中(國)--中工(傢伙)"
	strfmt := replacer.Replace(format)

	NewReplacer(patterns...)

	fmt.Println("\nmain() replacer.Replace old=", format)
	fmt.Println("main() replacer.Replace new=", strfmt)
}

// NewReplacer builds the instrumented generic replacer from old/new pairs and
// prints the result of a few sample lookups. It returns nothing; it exists
// only to trace the trie. It panics if oldnew has an odd number of elements.
func NewReplacer(oldnew ...string) {
	r := makeGenericReplacer(oldnew)

	val, keylen, found := r.lookup("中", true)
	fmt.Println("\nNewReplacer() 中   val:", val, " keylen:", keylen, " found:", found)

	val, keylen, found = r.lookup("中工", true)
	fmt.Println("NewReplacer() 中工 val:", val, " keylen:", keylen, " found:", found)

	val, keylen, found = r.lookup("y", false)
	fmt.Println("NewReplacer() y    val:", val, " keylen:", keylen, " found:", found)

	/*
		val, keylen, found := r.lookup("2", true)
		fmt.Println("\nNewReplacer() 2   val:", val, " keylen:", keylen, " found:", found)

		val, keylen, found = r.lookup("3", true)
		fmt.Println("\nNewReplacer() 3   val:", val, " keylen:", keylen, " found:", found)
	*/
}

// genericReplacer holds the trie of keys together with the byte-to-index
// mapping used by every trieNode lookup table.
type genericReplacer struct {
	// root is the root of the trie.
	root trieNode
	// tableSize is the size of a trie node's lookup table. It is the number
	// of unique key bytes.
	tableSize int
	// mapping maps from key bytes to a dense index for trieNode.table.
	mapping [256]byte
}

// makeGenericReplacer builds a genericReplacer from a flat list of old/new
// pairs: oldnew[0] is replaced by oldnew[1], oldnew[2] by oldnew[3], and so
// on. Earlier pairs get a higher priority than later ones.
//
// It panics if oldnew has an odd number of elements, since the last key would
// have no replacement value.
func makeGenericReplacer(oldnew []string) *genericReplacer {
	// Fail early with a clear message instead of an index-out-of-range panic
	// halfway through building the trie.
	if len(oldnew)%2 == 1 {
		panic("makeGenericReplacer: odd argument count")
	}

	r := new(genericReplacer)

	// Find each byte used, then assign them each an index.
	// The step is 2 because the first element of each pair is the key.
	for i := 0; i < len(oldnew); i += 2 {
		key := oldnew[i]
		fmt.Println("\nmakeGenericReplacer() for key=", key)
		// key[j] is a single byte of the key's UTF-8 encoding (for example 228,
		// the first byte of 中). Mark it as used: r.mapping[228] = 1.
		for j := 0; j < len(key); j++ {
			r.mapping[key[j]] = 1
			fmt.Println("makeGenericReplacer() key[", j, "]=", key[j])
		}
	}

	// Every used byte contributed a 1, so the sum is the number of distinct
	// key bytes.
	for _, b := range r.mapping {
		r.tableSize += int(b)
	}
	fmt.Println("makeGenericReplacer()  r.tableSize=", r.tableSize)

	var index byte
	for i, b := range r.mapping {
		if b == 0 {
			// Unused bytes map to tableSize, which lookup treats as "not in
			// any key".
			r.mapping[i] = byte(r.tableSize)
		} else {
			// Used bytes get consecutive indexes in ascending byte order.
			r.mapping[i] = index
			fmt.Println("makeGenericReplacer()  r.mapping[", i, "] =", r.mapping[i])
			index++
		}
	}

	// Ensure root node uses a lookup table (for performance).
	r.root.table = make([]*trieNode, r.tableSize)

	// Insert every key/value pair into the trie. The priority is
	// len(oldnew)-i, so pairs nearer the front of the list get larger values
	// and therefore win in lookup.
	for i := 0; i < len(oldnew); i += 2 {
		r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
	}
	return r
}

// trieNode is a node in the key trie. A node either fans out through table
// (indexed via genericReplacer.mapping) or continues through a single
// prefix/next chain; add never leaves both set on the same node.
type trieNode struct {
	// value is the replacement for the key that ends at this node.
	value string
	// priority is the priority of that key; 0 means no key ends here.
	priority int
	// prefix is the run of key bytes shared by everything below this node.
	prefix string
	// next is the node reached after consuming prefix.
	next *trieNode
	// table holds one child per distinct key byte.
	table []*trieNode
}

// add inserts key with replacement val and the given priority below t. The
// replacer r supplies the byte mapping and table size for new lookup tables.
// If a key already ends at the target node, the existing entry is kept.
func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
	fmt.Println("trieNode->add() val=", val, " key=", key)

	if key == "" {
		// The whole key has been consumed, so the key ends at this node.
		// Only the first key to arrive here is recorded.
		if t.priority == 0 {
			t.value = val
			t.priority = priority
			fmt.Println("trieNode->add() t.priority==", priority)
		}
		return
	}

	if t.prefix != "" {
		// This node already carries a prefix.
		// Need to split the prefix among multiple nodes.
		var n int // length of the longest common prefix
		for ; n < len(t.prefix) && n < len(key); n++ {
			if t.prefix[n] != key[n] {
				break
			}
		}
		if n == len(t.prefix) {
			// The whole prefix matches; carry on below it with the rest of
			// the key.
			t.next.add(key[n:], val, priority, r)
		} else if n == 0 {
			// First byte differs, start a new lookup table here. Looking up
			// what is currently t.prefix[0] will lead to prefixNode, and
			// looking up key[0] will lead to keyNode.
			var prefixNode *trieNode
			if len(t.prefix) == 1 {
				// A one-byte prefix is fully represented by the table slot,
				// so the slot points straight at the old next node.
				prefixNode = t.next
			} else {
				// Otherwise the remaining prefix bytes move to a new node.
				prefixNode = &trieNode{
					prefix: t.prefix[1:],
					next:   t.next,
				}
			}
			keyNode := new(trieNode)
			// lookup tests node.table != nil to decide how to descend.
			t.table = make([]*trieNode, r.tableSize)
			t.table[r.mapping[t.prefix[0]]] = prefixNode
			t.table[r.mapping[key[0]]] = keyNode
			t.prefix = ""
			t.next = nil
			keyNode.add(key[1:], val, priority, r)
		} else {
			// Insert new node after the common section of the prefix.
			next := &trieNode{
				prefix: t.prefix[n:],
				next:   t.next,
			}
			t.prefix = t.prefix[:n]
			t.next = next
			next.add(key[n:], val, priority, r)
		}
	} else if t.table != nil {
		// Insert into existing table.
		m := r.mapping[key[0]]
		if t.table[m] == nil {
			t.table[m] = new(trieNode)
		}
		t.table[m].add(key[1:], val, priority, r)
	} else {
		// Empty node: store the whole remaining key as the prefix and put
		// the value on a fresh node below it.
		t.prefix = key
		t.next = new(trieNode)
		t.next.add("", val, priority, r)
	}
}

// lookup walks the trie along s and returns the value and length of the
// matching key with the highest priority. found reports whether any key
// matched. If ignoreRoot is true, a value stored on the root node (the empty
// key) is not considered.
func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
	// Iterate down the trie to the end, and grab the value and keylen with
	// the highest priority.
	bestPriority := 0
	node := &r.root
	n := 0
	for node != nil {
		if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
			bestPriority = node.priority
			val = node.value
			keylen = n
			found = true
		}

		if s == "" {
			break
		}
		if node.table != nil {
			index := r.mapping[s[0]]
			if int(index) == r.tableSize {
				// The next byte does not occur in any key; stop searching.
				break
			}
			node = node.table[index]
			s = s[1:]
			n++
		} else if node.prefix != "" && HasPrefix(s, node.prefix) {
			// Nodes below a table keep the rest of the key in prefix, so the
			// walk continues by matching the prefix as a whole.
			n += len(node.prefix)
			s = s[len(node.prefix):]
			node = node.next
		} else {
			break
		}
	}
	return
}

// HasPrefix tests whether the string s begins with prefix.
func HasPrefix(s, prefix string) bool {
	return len(s) >= len(prefix) && s[0:len(prefix)] == prefix
}
記錄:

 ascii範圍內的只佔一個位元組,如y(121) 
utf8中每個漢字佔三個位元組.如中(228,184,173)

構建樹:
如果是新的第一個單詞或片語
先進  } else if t.table != nil {
然後再進 else,這中間會把 t.prefix = key,把key值存放在prefix,將""傳給下一個node
最後執行 if key == "" && t.priority == 0 { ,將 t.value = val
即從key的字元編碼(第一個位元組)對應的root.table位置開始,依次指向其餘的字元編碼node,中間node的prefix存下key餘下的位元組.
最末一個node,存下對應的val及priority.


如果是後傳入的單詞或片語,先從key字元編碼首個位元組對應的root.table位置開始,依次尋找,
 } else if t.table != nil {
如果已有首碼的,進行比較 if t.prefix != "" {
   1. 如目前prefix整個都是key的開頭(n == len(t.prefix)),則把key餘下的部份交給t.next繼續構建子節點
   2. 如prefix與key的第一個位元組就不同(n == 0),則另起爐灶,在此節點建立新的table分支
      prefixNode 承上,keyNode 啟下
      至於為什麼t.table = make([]*trieNode, r.tableSize),是為了預留映射空間.
      所以它是這麼弄的,而不是t.table[0],t.table[1].
        t.table[r.mapping[t.prefix[0]]] = prefixNode
        t.table[r.mapping[key[0]]] = keyNode 
   3.有部份相同, 直接跳到t.prefix[n:],然後從key[n:]開始繼續構建樹子節點


priority:
 在這的定義是數字越大,優先順序別越高

 if key == "" { //key的所有位元組都已放入樹中,目前node即為該key的終點
    if t.priority == 0 { //如果有定義過priority的就略過,新加的,把現有的層級加上

//對應{中,中工}這種,雖然後面有"中工",但"中",的priority要高,所以"中工"對應的值雖找到但不會返回.
if node.priority > bestPriority { bestPriority = node.priority}

例如:中工(priority=4),中(priority=2)
patterns:
    "中工","傢伙",
    "中","國",
則:
lookup() bestPriority: 0  node.priority: 0  value:   prefix:
lookup() bestPriority: 0  node.priority: 0  value:   prefix: \xb8\xad
lookup() bestPriority: 0  node.priority: 2  value: 國  prefix: 工
NewReplacer() 中 val: 國  keylen: 3  found: true
lookup() bestPriority: 0  node.priority: 0  value:   prefix:
lookup() bestPriority: 0  node.priority: 0  value:   prefix: \xb8\xad
lookup() bestPriority: 0  node.priority: 2  value: 國  prefix: 工
lookup() bestPriority: 2  node.priority: 4  value: 傢伙  prefix:
NewReplacer() 中工 val: 傢伙  keylen: 6  found: true
main() replacer.Replace old= 中(國)--中工(傢伙)
main() replacer.Replace new= 國(國)--傢伙(傢伙)


如果調整下順序,把中->國提前,則會發現,下面的結果:
patterns:
    "中","國",
    "中工","傢伙",   
則:
lookup() bestPriority: 0  node.priority: 0  value:   prefix:
lookup() bestPriority: 0  node.priority: 0  value:   prefix: \xb8\xad
lookup() bestPriority: 0  node.priority: 4  value: 國  prefix: 工
NewReplacer() 中 val: 國  keylen: 3  found: true
lookup() bestPriority: 0  node.priority: 0  value:   prefix:
lookup() bestPriority: 0  node.priority: 0  value:   prefix: \xb8\xad
lookup() bestPriority: 0  node.priority: 4  value: 國  prefix: 工
lookup() bestPriority: 4  node.priority: 2  value: 傢伙  prefix:
NewReplacer() 中工 val: 國  keylen: 3  found: true
main() replacer.Replace old= 中(國)--中工(傢伙)
main() replacer.Replace new= 國(國)--國工(傢伙)

       還有,剛發現 lookup(s string, ignoreRoot bool) (val string, keylen int,found bool) {}中
定義在傳回值中的變數,原來可以直接在函數中使用,

至於返回,直接return就行了,都不用寫全傳回值的,好省事.


MAIL: xcl_168@aliyun.com

BLOG:http://blog.csdn.net/xcl168




相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.