similar_text(): a version that works with Chinese (multibyte) characters
The code is as follows:
<?php
/**
 * Split a UTF-8 string into an array of single characters.
 *
 * The pattern uses the `u` modifier, so every match is one UTF-8 character
 * rather than one byte. Because the `s` modifier is not set, `.` does not
 * match a line feed, so newline characters are left out of the result.
 *
 * @param string $str UTF-8 encoded input.
 * @return array List of one-character strings, in input order; empty for ''.
 */
function Split_str($str) {
    $arr = array();
    preg_match_all("/./u", $str, $arr);

    // Guard against a missing first match set (e.g. if matching fails on
    // malformed input) so that callers always receive an array.
    return isset($arr[0]) ? $arr[0] : array();
}
/**
 * Similarity score for two UTF-8 strings, based on shared characters.
 *
 * Both strings are split into characters with Split_str(); the score is the
 * number of distinct characters of $str2 that also occur in $str1. Character
 * order and repetition do not affect the result.
 *
 * @param string $str1 First UTF-8 string.
 * @param string $str2 Second UTF-8 string.
 * @return int Number of distinct characters common to both strings.
 */
function Similar_text_cn($str1, $str2) {
    $chars1 = Split_str($str1);
    $chars2 = array_unique(Split_str($str2));

    // $chars2 holds no duplicates, so the size of the intersection equals
    // count($chars2) minus the characters of $str2 that are absent from $str1.
    return count(array_intersect($chars2, $chars1));
}
levenshtein(): a version that works with Chinese (multibyte) characters
The code is as follows:
<?php
/**
 * Split a multibyte string into an array of single characters.
 *
 * @param string $string   Input string.
 * @param string $encoding Character encoding name passed to the mb_* functions.
 * @return array List of one-character strings, in input order; empty for ''.
 */
function Mbstringtoarray($string, $encoding = 'UTF-8') {
    $arrayResult = array();

    // Measure the string once and pick each character by offset, instead of
    // re-measuring and re-copying the remaining tail on every iteration.
    $iLen = mb_strlen($string, $encoding);
    for ($i = 0; $i < $iLen; $i++) {
        $arrayResult[] = mb_substr($string, $i, 1, $encoding);
    }

    return $arrayResult;
}
/**
 * Levenshtein (edit) distance between two multibyte strings.
 *
 * The strings are compared character by character (not byte by byte) after
 * being split with Mbstringtoarray(). Insertions and deletions cost 1 each;
 * replacing one character with a different one costs $costReplace.
 *
 * @param string    $str1        First string.
 * @param string    $str2        Second string.
 * @param int|float $costReplace Cost of substituting one character for another.
 * @param string    $encoding    Character encoding name passed to Mbstringtoarray().
 * @return int|float Minimal total cost of turning $str1 into $str2.
 */
function Levenshtein_cn($str1, $str2, $costReplace = 1, $encoding = 'UTF-8') {
    $chars1 = Mbstringtoarray($str1, $encoding);
    $chars2 = Mbstringtoarray($str2, $encoding);

    // Take the lengths from the split arrays so that the loop bounds can
    // never run past the characters that are actually indexed below.
    $len1 = count($chars1);
    $len2 = count($chars2);

    // $d[$i1][$i2] is the distance between the first $i1 characters of $str1
    // and the first $i2 characters of $str2.
    $d = array();
    for ($i1 = 0; $i1 <= $len1; $i1++) {
        $d[$i1] = array(0 => $i1); // delete all $i1 characters
    }
    for ($i2 = 0; $i2 <= $len2; $i2++) {
        $d[0][$i2] = $i2; // insert all $i2 characters
    }

    for ($i1 = 1; $i1 <= $len1; $i1++) {
        for ($i2 = 1; $i2 <= $len2; $i2++) {
            $cost = ($chars1[$i1 - 1] === $chars2[$i2 - 1]) ? 0 : $costReplace;

            $d[$i1][$i2] = min(
                $d[$i1 - 1][$i2] + 1,         // delete a character of $str1
                $d[$i1][$i2 - 1] + 1,         // insert a character of $str2
                $d[$i1 - 1][$i2 - 1] + $cost  // keep or replace
            );
        }
    }

    return $d[$len1][$len2];
}
Longest common subsequence (LCS)
The code is as follows:
<?php
/**
 * Length of the longest common subsequence of two single-byte strings.
 *
 * The strings are compared byte by byte (strlen / byte offsets), so this
 * variant is only meaningful for single-byte text such as ASCII.
 *
 * @param string $str_1 First string.
 * @param string $str_2 Second string.
 * @return int Length of the longest common subsequence; 0 if either is empty.
 */
function Lcs_en($str_1, $str_2) {
    // Byte offsets are only defined on strings; normalise scalar input.
    $str_1 = (string) $str_1;
    $str_2 = (string) $str_2;

    $len_1 = strlen($str_1);
    $len_2 = strlen($str_2);

    // $dp[$i][$j] is the LCS length of the first $i bytes of $str_1 and the
    // first $j bytes of $str_2; row 0 and column 0 are the empty-prefix cases.
    $dp = array();
    for ($i = 0; $i <= $len_1; $i++) {
        $dp[$i] = array(0 => 0);
    }
    for ($j = 0; $j <= $len_2; $j++) {
        $dp[0][$j] = 0;
    }

    for ($i = 1; $i <= $len_1; $i++) {
        for ($j = 1; $j <= $len_2; $j++) {
            if ($str_1[$i - 1] === $str_2[$j - 1]) {
                // Matching bytes extend the common subsequence by one.
                $dp[$i][$j] = $dp[$i - 1][$j - 1] + 1;
            } else {
                // Otherwise carry over the better of dropping one byte
                // from either string.
                $dp[$i][$j] = max($dp[$i - 1][$j], $dp[$i][$j - 1]);
            }
        }
    }

    return $dp[$len_1][$len_2];
}
/**
 * Split a multibyte string into an array of single characters.
 *
 * @param string $string   Input string.
 * @param string $encoding Character encoding name passed to the mb_* functions.
 * @return array List of one-character strings, in input order; empty for ''.
 */
function Mbstringtoarray($string, $encoding = 'UTF-8') {
    $arrayResult = array();

    // Measure the string once and pick each character by offset, instead of
    // re-measuring and re-copying the remaining tail on every iteration.
    $iLen = mb_strlen($string, $encoding);
    for ($i = 0; $i < $iLen; $i++) {
        $arrayResult[] = mb_substr($string, $i, 1, $encoding);
    }

    return $arrayResult;
}
/**
 * Length of the longest common subsequence of two multibyte strings.
 *
 * The strings are split into characters with Mbstringtoarray() and compared
 * character by character, so multibyte text (e.g. Chinese) is handled per
 * character rather than per byte.
 *
 * @param string $str1     First string.
 * @param string $str2     Second string.
 * @param string $encoding Character encoding name passed to Mbstringtoarray().
 * @return int Length of the longest common subsequence, in characters.
 */
function Lcs_cn($str1, $str2, $encoding = 'UTF-8') {
    $chars1 = Mbstringtoarray($str1, $encoding);
    $chars2 = Mbstringtoarray($str2, $encoding);

    // Take the lengths from the split arrays so that the loop bounds always
    // agree with the characters that are indexed below.
    $len1 = count($chars1);
    $len2 = count($chars2);

    // $dp[$i][$j] is the LCS length of the first $i characters of $str1 and
    // the first $j characters of $str2; row 0 / column 0 are empty prefixes.
    $dp = array();
    for ($i = 0; $i <= $len1; $i++) {
        $dp[$i] = array(0 => 0);
    }
    for ($j = 0; $j <= $len2; $j++) {
        $dp[0][$j] = 0;
    }

    for ($i = 1; $i <= $len1; $i++) {
        for ($j = 1; $j <= $len2; $j++) {
            if ($chars1[$i - 1] === $chars2[$j - 1]) {
                // Matching characters extend the common subsequence by one.
                $dp[$i][$j] = $dp[$i - 1][$j - 1] + 1;
            } else {
                // Otherwise carry over the better of dropping one character
                // from either string.
                $dp[$i][$j] = max($dp[$i - 1][$j], $dp[$i][$j - 1]);
            }
        }
    }

    return $dp[$len1][$len2];
}