Summary: I. Encoding ranges. 1. GBK (GB2312/GB18030): \x00-\xff GBK double-byte encoding range; \x20-\x7f ASCII; \xa1-\xff Chinese; \x80-\xff Chinese. 2. UTF-8 (Unicode): \u4e00-\u9fa5 (Chinese); \u3130-\u318F (Korean); \uAC00-\uD7A3 (Korean); \u0800-\u4e00 (Japanese).
I. Encoding ranges
1. GBK (GB2312/GB18030)
\x00-\xff  GBK double-byte encoding range
\x20-\x7f  ASCII
\xa1-\xff  Chinese
\x80-\xff  Chinese
2. UTF-8 (Unicode)
\u4e00-\u9fa5 (Chinese)
\u3130-\u318F (Korean)
\uAC00-\uD7A3 (Korean)
\u0800-\u4e00 (Japanese)
P.S.: Korean (Hangul syllable) characters have code points greater than [\u9fa5].
Regular expression example:
// GBK: strip every byte in the range 0x80-0xFF (i.e. both bytes of each
// double-byte character). The result is returned, not written back to $str.
preg_replace('/([\x80-\xff])/', '', $str);
// UTF-8: strip code points U+4E00-U+9FA5. PCRE has no \u escape, so the range
// is written with \x{...} and the /u modifier enables UTF-8 matching.
preg_replace('/([\x{4e00}-\x{9fa5}])/u', '', $str);
II. Code examples
/**
 * Determine whether the content contains Chinese characters - GBK (PHP).
 *
 * Looks for a byte in the range 0x80-0xFF that is followed by at least one
 * more byte (other than a newline), which is how a double-byte character
 * appears in a GBK-encoded byte string.
 *
 * @param string $s Byte string to inspect (assumed to be GBK-encoded).
 * @return int|false 1 if such a byte pair is found, 0 if not, false on a
 *                   regex error (the raw preg_match() result).
 */
function check_is_chinese($s) {
    // Single quotes: the \x80 / \xff escapes are interpreted by PCRE itself.
    return preg_match('/[\x80-\xff]./', $s);
}
/**
 * Obtain the string length in characters - GBK (PHP).
 *
 * Walks the byte string once; a byte in 0x80-0xFF is treated as the first
 * byte of a double-byte character, so the byte after it is skipped.
 *
 * @param string $str Byte string (assumed to be GBK-encoded).
 * @return int Number of characters (single-byte and double-byte each count 1).
 */
function gb_strlen($str) {
    $byteLen = strlen($str); // hoisted: the byte length never changes in the loop
    $count = 0;
    for ($i = 0; $i < $byteLen; $i++) {
        // High bit set => lead byte of a double-byte character: skip its trail byte.
        if (ord($str[$i]) >= 0x80) {
            ++$i;
        }
        ++$count;
    }
    return $count;
}
/**
 * Truncate the string to its first $len characters - GBK (PHP).
 *
 * A byte in 0x80-0xFF is treated as the first byte of a double-byte
 * character, so a double-byte character is never cut in half.
 *
 * @param string $str Byte string (assumed to be GBK-encoded).
 * @param int    $len Maximum number of characters to keep; a value <= 0
 *                    yields an empty string.
 * @return string The leading part of $str, at most $len characters long.
 */
function gb_substr($str, $len) {
    $byteLen = strlen($str);
    $count = 0;
    for ($i = 0; $i < $byteLen; $i++) {
        // Compare, do not assign: stop once enough characters were collected.
        if ($count >= $len) {
            break;
        }
        // Lead byte of a double-byte character: consume its trail byte too.
        if (ord($str[$i]) >= 0x80) {
            ++$i;
        }
        ++$count;
    }
    // $i is the byte offset just past the last character kept.
    return substr($str, 0, $i);
}
/**
 * Count the length of the string - UTF-8 (PHP).
 *
 * Each single-byte (ASCII) character adds 1 to the result and each
 * multi-byte character adds 2.
 * NOTE(review): the 2-per-multi-byte-character weighting looks like a
 * display-width measure, not a character count - confirm against callers.
 *
 * Terminates the script via die() if a byte above 127 is not a valid
 * 2-, 3- or 4-byte lead byte.
 *
 * @param string $str UTF-8 encoded byte string.
 * @return int Weighted length as described above.
 */
function utf8_strlen($str) {
    $byteLen = strlen($str); // hoisted: invariant across the loop
    $count = 0;
    for ($i = 0; $i < $byteLen; $i++) {
        $value = ord($str[$i]);
        if ($value > 127) {
            // Extra unit for a multi-byte character (total of 2 with the ++ below).
            $count++;
            // Skip the continuation bytes that belong to this lead byte.
            if ($value >= 192 && $value <= 223) {
                $i++;
            } elseif ($value >= 224 && $value <= 239) {
                $i = $i + 2;
            } elseif ($value >= 240 && $value <= 247) {
                $i = $i + 3;
            } else {
                die('not a UTF-8 compatible string ');
            }
        }
        $count++;
    }
    return $count;
}
/**
 * Truncate the string - UTF-8 (PHP).
 *
 * $position and $length are measured in weighted units: a single-byte
 * (ASCII) character is 1 unit and a multi-byte character is 2 units.
 * Multi-byte characters are never split.
 *
 * Terminates the script via die() if a byte above 127 is not a valid
 * 2-, 3- or 4-byte lead byte.
 *
 * @param string $str      UTF-8 encoded byte string.
 * @param int    $position Offset, in weighted units, at which the result starts.
 * @param int    $length   Maximum size of the result, in weighted units.
 * @return string The selected part of $str.
 */
function utf8_substr($str, $position, $length) {
    $byteLen = strlen($str);
    $start_position = $byteLen; // byte offset where the result starts
    $start_count = 0;           // weighted count at the start offset
    $end_position = $byteLen;   // byte offset just past the result
    $started = false;
    $count = 0;
    for ($i = 0; $i < $byteLen; $i++) {
        if (!$started && $count >= $position) {
            $started = true;
            $start_position = $i;
            $start_count = $count;
        }
        // Only measure the length once the start has been found; otherwise
        // any $position >= $length would end the scan before the start.
        if ($started && ($count - $start_count) >= $length) {
            $end_position = $i;
            break;
        }
        $value = ord($str[$i]);
        if ($value > 127) {
            // Extra unit for a multi-byte character (total of 2 with the ++ below).
            $count++;
            // Skip the continuation bytes that belong to this lead byte.
            if ($value >= 192 && $value <= 223) {
                $i++;
            } elseif ($value >= 224 && $value <= 239) {
                $i = $i + 2;
            } elseif ($value >= 240 && $value <= 247) {
                $i = $i + 3;
            } else {
                die('not a UTF-8 compatible string ');
            }
        }
        $count++;
    }
    return substr($str, $start_position, $end_position - $start_position);
}
# String length statistics - UTF-8 (Ruby).
#
# URL-unescapes +str+ with CGI.unescape, then returns a weighted length:
# each byte below 0x80 counts as 1 and each multi-byte character counts as 2.
#
# The bytes are read with each_byte because indexing a String yields a
# one-character String (not an Integer) on Ruby 1.9 and later. Only lead
# bytes (>= 0xC0) are counted for multi-byte characters; continuation bytes
# (0x80-0xBF) are skipped, so 2-, 3- and 4-byte characters all count as 2.
require 'cgi'

def utf8_string_length(str)
  temp = CGI.unescape(str)
  width = 0
  temp.each_byte do |byte|
    if byte < 0x80
      width += 1
    elsif byte >= 0xC0
      # Lead byte of a multi-byte character.
      width += 2
    end
  end
  width
end
/**
 * Determine whether the string contains Korean characters (JavaScript).
 *
 * A UTF-16 code unit counts as Korean when it lies strictly between
 * 0x3130 and 0x318F, or within 0xAC00-0xD7A3 inclusive.
 *
 * @param {string} str Text to inspect.
 * @returns {boolean} true if at least one such code unit is present.
 */
function checkKoreaChar(str) {
    // Declare the index locally so it does not leak as an implicit global.
    for (var i = 0; i < str.length; i++) {
        var code = str.charCodeAt(i);
        if ((code > 0x3130 && code < 0x318F) || (code >= 0xAC00 && code <= 0xD7A3)) {
            return true;
        }
    }
    return false;
}
/**
 * Determine whether a Chinese (double-byte) character exists (JavaScript).
 *
 * Reports whether the string holds any UTF-16 code unit outside
 * 0x00-0xFF. This gives the same result as comparing the length before
 * and after replacing each such unit with two characters.
 *
 * @param {string} s Text to inspect.
 * @returns {boolean} true if a code unit above 0xFF is present.
 */
function check_chinese_char(s) {
    return /[^\x00-\xff]/.test(s);
}