PHP similar_text(), levenshtein(), and LCS() with support for Chinese characters
PHP's native similar_text() and levenshtein() functions do not support Chinese characters, so I wrote the multibyte-aware versions below.
similar_text(): Chinese-character version
<?php
/**
 * Splits a UTF-8 string into a list of single characters.
 *
 * @param string $str UTF-8 encoded input.
 * @return array List of one-character strings; empty when $str is empty or
 *               cannot be matched (for example, invalid UTF-8).
 */
function split_str($str)
{
    // The "u" modifier makes "." consume a whole UTF-8 character instead of a
    // single byte. As in the original pattern, "." does not match newlines.
    // preg_match_all() returns 0 for no matches and false on error; both
    // cases yield an empty list.
    if (!preg_match_all('/./u', $str, $arr)) {
        return array();
    }

    return $arr[0];
}

/**
 * Similarity score for multibyte strings: the number of distinct characters
 * of $str2 that also occur somewhere in $str1.
 *
 * Note that this is a set-overlap count. Character order and repetition are
 * ignored, so it is not the same metric as the native similar_text().
 *
 * @param string $str1 UTF-8 encoded reference string.
 * @param string $str2 UTF-8 encoded string whose characters are looked up in $str1.
 * @return int Count of distinct shared characters.
 */
function similar_text_cn($str1, $str2)
{
    $arr_1 = array_unique(split_str($str1));
    $arr_2 = array_unique(split_str($str2));

    // Entries of $arr_2 that are present in $arr_1. This equals
    // count($arr_2) - count(array_diff($arr_2, $arr_1)).
    return count(array_intersect($arr_2, $arr_1));
}
levenshtein(): Chinese-character version
<?php
/**
 * Splits a multibyte string into a list of single characters.
 *
 * @param string $string   Input string.
 * @param string $encoding Character encoding passed to the mb_* functions.
 * @return array List of one-character strings, in order; empty for an empty string.
 */
function mbStringToArray($string, $encoding = 'utf-8')
{
    $arrayResult = array();
    $length = mb_strlen($string, $encoding);

    // Read each character by index rather than repeatedly chopping the head
    // off the string, which would re-copy the remainder on every iteration.
    for ($i = 0; $i < $length; $i++) {
        $arrayResult[] = mb_substr($string, $i, 1, $encoding);
    }

    return $arrayResult;
}

/**
 * Edit (Levenshtein) distance between two multibyte strings, measured in
 * characters rather than bytes.
 *
 * Insertions and deletions cost 1 each; a substitution costs $costReplace.
 *
 * @param string $str1        Source string.
 * @param string $str2        Target string.
 * @param int    $costReplace Cost of substituting one character for another.
 * @param string $encoding    Character encoding of both strings.
 * @return int Minimum total cost of turning $str1 into $str2.
 */
function levenshtein_cn($str1, $str2, $costReplace = 1, $encoding = 'utf-8')
{
    $mb_str1 = mbStringToArray($str1, $encoding);
    $mb_str2 = mbStringToArray($str2, $encoding);

    // Take the lengths from the split arrays so they can never disagree.
    $mb_len1 = count($mb_str1);
    $mb_len2 = count($mb_str2);

    // $d[i][j] is the distance between the first i characters of $str1 and
    // the first j characters of $str2.
    $d = array();

    // Turning a prefix of $str1 into the empty string takes i deletions.
    for ($i1 = 0; $i1 <= $mb_len1; $i1++) {
        $d[$i1] = array($i1);
    }

    // Turning the empty string into a prefix of $str2 takes j insertions.
    for ($i2 = 0; $i2 <= $mb_len2; $i2++) {
        $d[0][$i2] = $i2;
    }

    for ($i1 = 1; $i1 <= $mb_len1; $i1++) {
        for ($i2 = 1; $i2 <= $mb_len2; $i2++) {
            $cost = ($mb_str1[$i1 - 1] === $mb_str2[$i2 - 1]) ? 0 : $costReplace;

            $d[$i1][$i2] = min(
                $d[$i1 - 1][$i2] + 1,         // delete a character of $str1
                $d[$i1][$i2 - 1] + 1,         // insert a character of $str2
                $d[$i1 - 1][$i2 - 1] + $cost  // keep (cost 0) or replace
            );
        }
    }

    return $d[$mb_len1][$mb_len2];
}
Longest common subsequence: LCS()
<?php
/**
 * Length of the longest common subsequence of two lists.
 *
 * Shared dynamic-programming core for LCS_en() and LCS_cn(). Only two rows
 * of the table are kept, since each row depends solely on the previous one.
 *
 * @param array $seq1 First sequence; any array.
 * @param array $seq2 Second sequence; must be a 0-indexed list.
 * @return int Length of the longest common subsequence.
 */
function lcs_length_of_sequences(array $seq1, array $seq2)
{
    $len2 = count($seq2);

    // Row 0: the LCS of an empty prefix with anything is 0.
    $previous = array_fill(0, $len2 + 1, 0);

    foreach ($seq1 as $item1) {
        $current = array(0);

        for ($j = 1; $j <= $len2; $j++) {
            if ($item1 === $seq2[$j - 1]) {
                // Matching elements extend the LCS of both shorter prefixes.
                $current[$j] = $previous[$j - 1] + 1;
            } else {
                // Otherwise take the better result of dropping one element
                // from either side.
                $current[$j] = max($previous[$j], $current[$j - 1]);
            }
        }

        $previous = $current;
    }

    return $previous[$len2];
}

/**
 * Longest common subsequence length, single-byte ("English") version.
 *
 * Strings are compared byte by byte, so multibyte text gives byte-level
 * results; use LCS_cn() for such input.
 *
 * @param string $str_1 First string.
 * @param string $str_2 Second string.
 * @return int Length of the longest common subsequence, in bytes.
 */
function LCS_en($str_1, $str_2)
{
    $str_1 = (string) $str_1;
    $str_2 = (string) $str_2;

    // str_split('') returns array('') before PHP 8.2, so handle the empty
    // string explicitly to get an empty list on every version.
    $bytes_1 = $str_1 === '' ? array() : str_split($str_1);
    $bytes_2 = $str_2 === '' ? array() : str_split($str_2);

    return lcs_length_of_sequences($bytes_1, $bytes_2);
}

/**
 * Splits a multibyte string into a list of single characters.
 *
 * This helper also ships with the Levenshtein snippet; define it only once
 * if both snippets are combined into a single file.
 *
 * @param string $string   Input string.
 * @param string $encoding Character encoding passed to the mb_* functions.
 * @return array List of one-character strings, in order; empty for an empty string.
 */
function mbStringToArray($string, $encoding = 'utf-8')
{
    $arrayResult = array();
    $length = mb_strlen($string, $encoding);

    for ($i = 0; $i < $length; $i++) {
        $arrayResult[] = mb_substr($string, $i, 1, $encoding);
    }

    return $arrayResult;
}

/**
 * Longest common subsequence length, multibyte ("Chinese") version.
 *
 * Both strings are split into characters according to $encoding, and those
 * characters are compared with strict equality.
 *
 * @param string $str1     First string.
 * @param string $str2     Second string.
 * @param string $encoding Character encoding of both strings.
 * @return int Length of the longest common subsequence, in characters.
 */
function LCS_cn($str1, $str2, $encoding = 'utf-8')
{
    return lcs_length_of_sequences(
        mbStringToArray($str1, $encoding),
        mbStringToArray($str2, $encoding)
    );
}