這是一篇較早建立的文章,其中的資訊可能已經有所發展或是發生改變。用過 strings.NewReplacer 和 replacer.Replace(),它接受成對傳入的參數(舊字串、新字串),能依優先順序替換,並能處理中文字串參數.
覺得功能強大,特別好用.對它的尋找和優先順序怎麼處理有點興趣,花時間研究了下源碼,在這記錄一下個人理解.
package main // author: xcl; notes recorded 2014-1-20

import (
	"fmt"
	"strings"
)

// This program is a study aid: it rebuilds a byte-level trie replacer of the
// kind used behind strings.NewReplacer and prints debug output while the trie
// is built, so that key lookup and priority handling can be observed.

// main runs the standard-library replacer on a sample string, then builds the
// instrumented trie from the same patterns so both outputs can be compared.
func main() {
	patterns := []string{
		"y", "25",
		"中", "國",
		"中工", "傢伙",
	}
	/*
		patterns := make([]string, 270*2)
		for i := 0; i < 270*2; i++ {
			patterns[i] = fmt.Sprintf("%d", i)
		}
	*/
	replacer := strings.NewReplacer(patterns...)

	format := "中(國)--中工(傢伙)"
	strfmt := replacer.Replace(format)
	NewReplacer(patterns...)

	fmt.Println("\nmain() replacer.Replace old=", format)
	fmt.Println("main() replacer.Replace new=", strfmt)
}

// NewReplacer builds the instrumented trie from oldnew (old, new string
// pairs) and prints the result of a few sample lookups. It returns nothing;
// it exists only for its debug output. It panics if oldnew has an odd number
// of elements.
func NewReplacer(oldnew ...string) {
	r := makeGenericReplacer(oldnew)

	val, keylen, found := r.lookup("中", true)
	fmt.Println("\nNewReplacer() 中 val:", val, " keylen:", keylen, " found:", found)

	val, keylen, found = r.lookup("中工", true)
	fmt.Println("NewReplacer() 中工 val:", val, " keylen:", keylen, " found:", found)

	val, keylen, found = r.lookup("y", false)
	fmt.Println("NewReplacer() y val:", val, " keylen:", keylen, " found:", found)
	/*
		val, keylen, found := r.lookup("2", true)
		fmt.Println("\nNewReplacer() 2 val:", val, " keylen:", keylen, " found:", found)

		val, keylen, found = r.lookup("3", true)
		fmt.Println("\nNewReplacer() 3 val:", val, " keylen:", keylen, " found:", found)
	*/
}

// genericReplacer holds a trie of keys plus the byte remapping that keeps
// every trie node's lookup table dense.
type genericReplacer struct {
	// root is the root node of the trie holding every key.
	root trieNode
	// tableSize is the size of a trie node's lookup table. It is the number
	// of unique key bytes.
	tableSize int
	// mapping maps from key bytes to a dense index for trieNode.table.
	mapping [256]byte
}

// makeGenericReplacer builds a genericReplacer from oldnew, a flat list of
// (key, value) pairs. Pairs that appear earlier receive a higher priority.
// It prints debug output while building and panics if oldnew has an odd
// number of elements.
func makeGenericReplacer(oldnew []string) *genericReplacer {
	// Fail up front with a clear message instead of an index-out-of-range
	// panic on oldnew[i+1] half-way through the build.
	if len(oldnew)%2 == 1 {
		panic("makeGenericReplacer: odd argument count")
	}

	r := new(genericReplacer)
	// Find each byte used, then assign them each an index.
	for i := 0; i < len(oldnew); i += 2 { // step by 2: the first string of each pair is the pattern
		key := oldnew[i]
		fmt.Println("\nmakeGenericReplacer() for key=", key)
		// key[j] is a single byte of the key. A Chinese character stored as
		// UTF-8 contributes three such bytes (e.g. 228), and each byte seen
		// is flagged, i.e. r.mapping[228] = 1.
		for j := 0; j < len(key); j++ {
			r.mapping[key[j]] = 1
			fmt.Println("makeGenericReplacer() key[", j, "]=", key[j])
		}
	}

	for _, b := range r.mapping {
		r.tableSize += int(b)
	}
	fmt.Println("makeGenericReplacer() r.tableSize=", r.tableSize)

	var index byte
	for i, b := range r.mapping {
		if b == 0 {
			// Unused bytes all map to tableSize, which lookup treats as
			// "not present in any key".
			r.mapping[i] = byte(r.tableSize)
		} else {
			// Used bytes get consecutive indexes in byte-value order.
			r.mapping[i] = index
			fmt.Println("makeGenericReplacer() r.mapping[", i, "] =", r.mapping[i])
			index++
		}
	}
	// Ensure root node uses a lookup table (for performance).
	r.root.table = make([]*trieNode, r.tableSize)

	// Insert every key/value pair into the trie. The priority is
	// len(oldnew)-i, so the earlier a pair appears in the argument list the
	// larger (higher) its priority.
	for i := 0; i < len(oldnew); i += 2 {
		r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
	}
	return r
}

// trieNode is a node of the key trie. A node continues either through a
// lookup table or through a prefix/next pair.
type trieNode struct {
	// value is the replacement stored at this node; add sets it when a key
	// ends here.
	value string
	// priority is the priority stored together with value. Zero means no key
	// ends at this node; lookup prefers larger values.
	priority int
	// prefix holds key bytes that must match before following next.
	prefix string
	// next is the node reached after prefix has been matched.
	next *trieNode
	// table is indexed by the next key byte after remapping it through
	// genericReplacer.mapping.
	table []*trieNode
}

// add inserts key with value val and the given priority into the trie rooted
// at t, splitting existing prefixes where necessary. r supplies the byte
// mapping and table size. If a key already ends at the target node, the
// existing value and priority are kept.
func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
	fmt.Println("trieNode->add() val=", val, " key=", key)
	if key == "" {
		if t.priority == 0 {
			t.value = val
			t.priority = priority
			fmt.Println("trieNode->add() t.priority==", priority)
		}
		return
	}

	if t.prefix != "" { // this node already carries a prefix
		// Need to split the prefix among multiple nodes.
		var n int // length of the longest common prefix
		for ; n < len(t.prefix) && n < len(key); n++ { // compare prefix and key byte by byte
			if t.prefix[n] != key[n] {
				break
			}
		}
		if n == len(t.prefix) { // the whole prefix matches: continue below it
			t.next.add(key[n:], val, priority, r)
		} else if n == 0 { // not even the first byte matches
			// First byte differs, start a new lookup table here. Looking up
			// what is currently t.prefix[0] will lead to prefixNode, and
			// looking up key[0] will lead to keyNode.
			var prefixNode *trieNode
			if len(t.prefix) == 1 {
				// A one-byte prefix is consumed entirely by the table
				// lookup, so the table entry points straight at t.next.
				prefixNode = t.next
			} else {
				// Otherwise the rest of the prefix moves into a new node.
				prefixNode = &trieNode{
					prefix: t.prefix[1:],
					next:   t.next,
				}
			}
			keyNode := new(trieNode)
			// A non-nil table is what lookup tests with node.table != nil.
			t.table = make([]*trieNode, r.tableSize)
			t.table[r.mapping[t.prefix[0]]] = prefixNode
			t.table[r.mapping[key[0]]] = keyNode
			t.prefix = ""
			t.next = nil
			keyNode.add(key[1:], val, priority, r)
		} else {
			// Insert new node after the common section of the prefix.
			next := &trieNode{
				prefix: t.prefix[n:],
				next:   t.next,
			}
			t.prefix = t.prefix[:n]
			t.next = next
			next.add(key[n:], val, priority, r)
		}
	} else if t.table != nil {
		// Insert into existing table.
		m := r.mapping[key[0]]
		if t.table[m] == nil {
			t.table[m] = new(trieNode)
		}
		t.table[m].add(key[1:], val, priority, r) // keep building the tree
	} else {
		// Empty node: store the whole remaining key as the prefix and put
		// the value on a fresh child node.
		t.prefix = key
		t.next = new(trieNode)
		t.next.add("", val, priority, r)
	}
}

// lookup walks the trie along s and returns the value of the matching key
// with the highest priority, the number of bytes of s that key covers, and
// whether any key matched. If ignoreRoot is true, a value stored on the root
// node is not considered.
func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
	// Iterate down the trie to the end, and grab the value and keylen with
	// the highest priority.
	bestPriority := 0
	node := &r.root
	n := 0
	for node != nil {
		if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
			bestPriority = node.priority
			val = node.value
			keylen = n
			found = true
		}

		if s == "" {
			break
		}
		if node.table != nil {
			index := r.mapping[s[0]]
			if int(index) == r.tableSize {
				// This byte does not occur in any key: stop searching.
				break
			}
			node = node.table[index]
			s = s[1:]
			n++
		} else if node.prefix != "" && HasPrefix(s, node.prefix) {
			// Nodes below a table keep the following key bytes in prefix,
			// so consume the prefix and continue with what comes after it.
			n += len(node.prefix)
			s = s[len(node.prefix):]
			node = node.next
		} else {
			break
		}
	}
	// The named results are returned as they stand.
	return
}

// HasPrefix tests whether the string s begins with prefix.
func HasPrefix(s, prefix string) bool {
	return strings.HasPrefix(s, prefix)
}
記錄:
ascii範圍內的只佔一個位元組,如y(121)
utf8中每個漢字佔三個位元組.如中(228,184,173)
構建樹:
如果是新的第一個單詞或片語
先進 } else if t.table != nil {
然後再進 else,這中間會把 t.prefix = key,把key值存放在prefix,將""傳給下一個node
最後執行 if key == "" { if t.priority == 0 { ,將 t.value = val、t.priority = priority
即從key的字元編碼(第一個位元組)對應的root.table位置開始,依次指向另外的字元編碼node,中間node的prefix存下key值.
最末一個node,存下對應的val及priority.
如果是後傳入的單詞或片語,先從key字元編碼首個位元組對應的root.table位置開始,依次尋找,
} else if t.table != nil {
如果已有首碼的,進行比較 if t.prefix != "" {
1, 如目前prefix與key完全一致,則繼續構建樹子節點
2. 如prefix與key從第一個位元組起就不同(n == 0),則另起爐灶,構建一條新的tree
prefixNode 承上,keyNode 啟下
至於為什麼t.table = make([]*trieNode, r.tableSize),是為了預留映射空間.
所以它是這麼弄的,而不是t.table[0],t.table[1].
t.table[r.mapping[t.prefix[0]]] = prefixNode
t.table[r.mapping[key[0]]] = keyNode
3.有部份相同, 直接跳到t.prefix[n:],然後從key[n:]開始繼續構建樹子節點
priority:
在這的定義是數字越大,優先級別越高
if key == "" { //字元編碼中間的位元組
if t.priority == 0 { //如果有定義過priority的就略過,新加的,把現有的層級加上
//對應{中,中工}這種,雖然後面有"中工",但"中"的priority要高,所以"中工"對應的值雖找到但不會返回.
if node.priority > bestPriority { bestPriority = node.priority}
例如:中工(priority=4),中(priority=2)
patterns:
"中工","傢伙",
"中","國",
則:
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ��
lookup() bestPriority: 0 node.priority: 2 value: 國 prefix: 工
NewReplacer() 中 val: 國 keylen: 3 found: true
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ��
lookup() bestPriority: 0 node.priority: 2 value: 國 prefix: 工
lookup() bestPriority: 2 node.priority: 4 value: 傢伙 prefix:
NewReplacer() 中工 val: 傢伙 keylen: 6 found: true
main() replacer.Replace old= 中(國)--中工(傢伙)
main() replacer.Replace new= 國(國)--傢伙(傢伙)
如果調整下順序,把中->國提前,則會發現,下面的結果:
patterns:
"中","國",
"中工","傢伙",
則:
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ��
lookup() bestPriority: 0 node.priority: 4 value: 國 prefix: 工
NewReplacer() 中 val: 國 keylen: 3 found: true
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ��
lookup() bestPriority: 0 node.priority: 4 value: 國 prefix: 工
lookup() bestPriority: 4 node.priority: 2 value: 傢伙 prefix:
NewReplacer() 中工 val: 國 keylen: 3 found: true
main() replacer.Replace old= 中(國)--中工(傢伙)
main() replacer.Replace new= 國(國)--國工(傢伙)
還有,剛發現 lookup(s string, ignoreRoot bool) (val string, keylen int,found bool) {}中
定義在傳回值中的變數,原來可以直接在函數中使用,
至於返回,直接return就行了,都不用寫全傳回值的,好省事.
MAIL: xcl_168@aliyun.com
BLOG:http://blog.csdn.net/xcl168