Encoding of GBK and UTF8 in PHP
I. Encoding range
1. GBK (GB2312/GB18030)
\x00-\xff  GBK double-byte encoding range
\x20-\x7f  ASCII
\xa1-\xff  Chinese
\x80-\xff  Chinese
2. UTF-8 (Unicode)
\u4e00-\u9fa5 (Chinese)
\x3130-\x318F (Korean)
\xAC00-\xD7A3 (Korean)
\u0800-\u4e00 (Japanese)
P.S.: Korean characters are those greater than [\u9fa5].
Regular Expression example:
preg_replace("/([\x80-\xff])/", "", $str);
preg_replace("/([\x{4e00}-\x{9fa5}])/u", "", $str);
II. Code examples
// Determine whether the content contains Chinese characters - GBK (PHP)
//
// $s: byte string, expected to be GBK-encoded.
// Returns what preg_match() returns: 1 when a byte in 0x80-0xff is followed
// by at least one more (non-newline) byte, 0 when no such pair exists.
function check_is_chinese($s) {
    // Single-quoted so the \x escapes are interpreted by PCRE, not by PHP.
    return preg_match('/[\x80-\xff]./', $s);
}
// Obtain the string length - GBK (PHP)
//
// $str: byte string, expected to be GBK-encoded.
// Returns the number of characters, where a byte in 0x80-0xff together with
// the byte that follows it is counted as one double-byte character and any
// other byte is counted as one single-byte character.
function gb_strlen($str) {
    $count = 0;
    $byte_len = strlen($str); // hoisted: the byte length never changes
    for ($i = 0; $i < $byte_len; $i++) {
        // A high byte is the lead byte of a double-byte character:
        // skip its trail byte so the pair is counted once.
        if (ord($str[$i]) >= 0x80) {
            ++$i;
        }
        ++$count;
    }
    return $count;
}
// Truncate the string - GBK (PHP)
//
// $str: byte string, expected to be GBK-encoded.
// $len: maximum number of characters to keep (a double-byte character
//       counts as one). Zero or negative values yield an empty string.
// Returns the leading part of $str holding at most $len characters, never
// cutting a double-byte character in half.
function gb_substr($str, $len) {
    $count = 0;
    $byte_len = strlen($str);
    for ($i = 0; $i < $byte_len; $i++) {
        // Comparison, not assignment: stop once enough characters are taken.
        if ($count >= $len) {
            break;
        }
        // Lead byte of a double-byte character: consume the trail byte too.
        if (ord($str[$i]) >= 0x80) {
            ++$i;
        }
        ++$count;
    }
    // $i is the byte offset just past the last character kept.
    return substr($str, 0, $i);
}
// Count the length of the string - UTF8 (PHP)
//
// $str: byte string, expected to be UTF-8 encoded.
// Returns a width-style length: every ASCII byte (0-127) counts as 1 and
// every multi-byte character (2, 3 or 4 bytes) counts as 2.
// Terminates the script via die() when a byte above 127 is not a valid
// 2-, 3- or 4-byte lead byte (i.e. it is in 128-191 or above 247).
function utf8_strlen($str) {
    $count = 0;
    $byte_len = strlen($str);
    for ($i = 0; $i < $byte_len; $i++) {
        $value = ord($str[$i]);
        if ($value > 127) {
            // Extra increment: a multi-byte character counts as 2 in total.
            $count++;
            // Skip the continuation bytes that belong to this lead byte.
            if ($value >= 192 && $value <= 223) {
                $i++;
            } elseif ($value >= 224 && $value <= 239) {
                $i = $i + 2;
            } elseif ($value >= 240 && $value <= 247) {
                $i = $i + 3;
            } else {
                die('not a UTF-8 compatible string');
            }
        }
        $count++;
    }
    return $count;
}
// Truncate the string - UTF8 (PHP)
//
// $str:      byte string, expected to be UTF-8 encoded.
// $position: start offset, measured in width units (an ASCII byte is 1,
//            a multi-byte character is 2).
// $length:   number of width units to take, measured the same way.
// Returns the slice of $str that starts at the first character boundary at
// or after $position and ends at the first boundary where at least $length
// units have been taken; the slice never splits a multi-byte character, so
// it may be one unit wider than $length. Returns "" when $position lies
// beyond the end of the string.
// Terminates the script via die() when a byte above 127 is not a valid
// 2-, 3- or 4-byte lead byte.
function utf8_substr($str, $position, $length) {
    $byte_len = strlen($str);
    $start_position = $byte_len; // byte offset where the slice starts
    $start_count = 0;            // width counter value at the slice start
    $end_position = $byte_len;   // byte offset just past the slice
    $started = false;
    $count = 0;
    for ($i = 0; $i < $byte_len; $i++) {
        if (!$started && $count >= $position) {
            $start_position = $i;
            $start_count = $count;
            $started = true;
        }
        // Only test for the end once the start is known; testing earlier
        // would stop the scan before $position is ever reached.
        if ($started && ($count - $start_count) >= $length) {
            $end_position = $i;
            break;
        }
        $value = ord($str[$i]);
        if ($value > 127) {
            // Extra increment: a multi-byte character counts as 2 in total.
            $count++;
            // Skip the continuation bytes that belong to this lead byte.
            if ($value >= 192 && $value <= 223) {
                $i++;
            } elseif ($value >= 224 && $value <= 239) {
                $i = $i + 2;
            } elseif ($value >= 240 && $value <= 247) {
                $i = $i + 3;
            } else {
                die('not a UTF-8 compatible string');
            }
        }
        $count++;
    }
    if (!$started) {
        return "";
    }
    return substr($str, $start_position, $end_position - $start_position);
}
# String length statistics - UTF8 (Ruby)
#
# str: a string that may be URL-encoded; it is passed through CGI.unescape
#      before counting.
# Returns a width-style length: every ASCII byte counts as 1 and every
# multi-byte UTF-8 character (2, 3 or 4 bytes) counts as 2.
require 'cgi'

def utf8_string_length(str)
  temp = CGI.unescape(str)
  width = 0
  # Iterate over bytes explicitly so the result does not depend on how
  # String#[] behaves in the running Ruby version.
  temp.each_byte do |byte|
    if byte < 0x80
      width += 1        # ASCII
    elsif byte >= 0xC0
      width += 2        # lead byte of a multi-byte character
    end
    # Bytes 0x80-0xBF are continuation bytes and add nothing.
  end
  width
end
// Determine whether the string contains a Korean character - UTF-8 (JavaScript)
//
// str: the string to inspect.
// Returns true when any UTF-16 code unit lies strictly between 0x3130 and
// 0x318F or within 0xAC00-0xD7A3 inclusive; otherwise false.
function checkKoreaChar(str) {
    // Declared locally so the loop counter does not leak into global scope.
    for (var i = 0; i < str.length; i++) {
        var code = str.charCodeAt(i);
        if ((code > 0x3130 && code < 0x318F) || (code >= 0xAC00 && code <= 0xD7A3)) {
            return true;
        }
    }
    return false;
}
// Determine whether the string contains a Chinese (double-byte) character - GBK (JavaScript)
//
// s: the string to inspect.
// Returns true when s contains at least one character outside \x00-\xff.
function check_chinese_char(s) {
    // Every character outside \x00-\xff is replaced by two characters, so
    // the length changes exactly when such a character is present.
    return s.length !== s.replace(/[^\x00-\xff]/g, "**").length;
}
III. References
In addition:
// Character-aware substring for multi-byte strings.
//
// $str:     the input byte string.
// $start:   index of the first character to keep.
// $length:  number of characters to keep.
// $charset: 'utf-8', 'gb2312', 'gbk' or 'big5' (case-insensitive).
// $suffix:  when true, "..." is appended to the result of the regex path.
//
// When mb_substr() is available its result is returned directly and
// $suffix is NOT applied. Otherwise the string is split into characters
// with a per-charset pattern and the requested slice is joined back.
// NOTE(review): an unrecognised $charset falls back to the UTF-8 pattern,
// since UTF-8 is the declared default; confirm this suits the callers.
public function csubstr($str, $start = 0, $length, $charset = "UTF-8", $suffix = true)
{
    if (function_exists("mb_substr"))
        return mb_substr($str, $start, $length, $charset);

    // Single-quoted so the \x escapes are interpreted by PCRE, not by PHP.
    $re = array();
    $re['utf-8']  = '/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xff][\x80-\xbf]{3}/';
    $re['gb2312'] = '/[\x01-\x7f]|[\xb0-\xf7][\xa0-\xfe]/';
    $re['gbk']    = '/[\x01-\x7f]|[\x81-\xfe][\x40-\xfe]/';
    $re['big5']   = '/[\x01-\x7f]|[\x81-\xfe]([\x40-\x7e]|[\xa1-\xfe])/';

    // The table keys are lower-case; normalise so the default "UTF-8" matches.
    $key = strtolower($charset);
    if (!isset($re[$key])) {
        $key = 'utf-8';
    }

    preg_match_all($re[$key], $str, $match);
    $slice = join("", array_slice($match[0], $start, $length));
    if ($suffix) return $slice . "...";
    return $slice;
}