PHP: getting the first letter of the Pinyin of Chinese characters that can be encoded in GBK, and jumping
'Gbk', 'out' => 'utf-8'); // exception handling // when there is no Pinyin initial letter other = false, returns the original string. Otherwise, it is set to this value protected $ other = '! '; // Other = false; public function _ construct ($ flag = true) {$ this-> flag = $ flag; // load Resources $ this-> source ['gk221'] = file_get_contents (DATA_PATH. 'Word/gk2-2-1.txt '); $ this-> source ['gk31'] = file_get_contents (DATA_PATH. 'Word/gk3-1.txt '); $ this-> source ['gk41'] = file_get_contents (DATA_PATH. 'Word/gk4-1.txt '); $ this-> pos = json_decode (file_get_contents (DATA_PATH. 'Word/pos.txt '), true);}/*** get the first letter of pinyin * @ param string/arr $ dat A data, it can be string and array * @ param string $ in data encoding * @ param string $ out output encoding * @ return arr return array */public function getInitial ($ data, $ in = 'gbk', $ out = 'utf-8') {if (is_string ($ data) {return self: getInitialByStr ($ data, $ in, $ out);} elseif (is_array ($ data) {return self: getInitialByArr ($ data, $ in, $ out );}} /*** get the first letter of pinyin * @ param string $ data string data * @ param string $ in data encoding * @ param string $ out output encoding * @ return arr returns an array */ Public function getInitialByStr ($ str, $ in = 'gbk', $ out = 'utf-8 ') {$ this-> charset ['in'] = strtolower ($ in); $ this-> charset ['out'] = strtolower ($ out ); if ($ this-> flag! = True) {$ this-> temp ['fws '] = array ();} switch ($ this-> charset ['in']) {case 'gbk ': return self: _ getInitialInGBK ($ str); break; case 'utf-8': return self: _ getInitialInUTF8 ($ str); break; default: # code... break;} // historical data if ($ this-> flag! 
= True) {unset ($ this-> temp ['fws ']);} /*** get the first letter of pinyin * @ param array $ data array data * @ param string $ in data encoding * @ param string $ out output encoding * @ return arr returns an array * /public function getInitialByArr ($ arr, $ in = 'gbk', $ out = 'utf-8') {$ this-> charset ['in'] = strtolower ($ in ); $ this-> charset ['out'] = strtolower ($ out); if ($ this-> flag! = True) {$ this-> temp ['fws '] = array ();} switch ($ this-> charset ['in']) {case 'gbk ': return self: _ getInitialInGBKArr ($ arr); break; case 'utf-8': return self: _ getInitialInUTF8Arr ($ arr); break; default: # code... break;} // historical data if ($ this-> flag! = True) {unset ($ this-> temp ['fws ']);} /*** process the first letter of the gbk encoded string * @ param string $ str string * @ return array */protected function _ getInitialInGBK ($ str) {// stores the string pinyin $ w = array (); $ I = 0; $ str_length = strlen ($ str ); // string bytes while ($ I = 0x81) {// gbk region $ nstr = substr ($ str, $ I, 2 ); $ I = $ I + 2;} else {$ nstr = substr ($ str, $ I, 1); $ I = $ I + 1 ;} $ this-> word = iconv ('gbk', 'utf-8', $ nstr); if (isset ($ this-> temp ['fws '] [$ n Str]) {$ w [] = $ this-> temp ['fws '] [$ nstr];} else {$ w [] = self :: _ preGetInitial ($ nstr) ;}return $ w ;} /*** process the first letter of the gbk encoded array * @ param array $ arr string single-word array * @ return array */protected function _ getInitialInGBKArr ($ arr) {// stores the string pinyin $ w = array (); foreach ($ arr as $ key => $ word) {$ this-> word = iconv ('gbk ', 'utf-8', $ word); if (isset ($ this-> temp ['fws '] [$ word]) {$ w [] = $ this-> temp ['fws '] [$ word];} else {$ w [] = self: _ pr EGetInitial ($ word) ;}} return $ w ;} /*** process the first letter of UTF-8 encoded string * @ param string $ str string * @ return array */protected function _ getInitialInUTF8 ($ str) {// stores the string pinyin $ w = array (); $ nstr = ''; $ I = 0; $ str_length = 
strlen ($ str ); // The number of bytes of the string while ($ I = 252) {// if the ASCII bit is high and 252 $ nstr = substr ($ str, $ I, 6 ); // according to the UTF-8 encoding specification, count 6 consecutive characters as a single character $ I = $ I + 6; // The actual Byte is counted as 6} elseif ($ ascnum> = 248) {// if the ASCII bit height is 248 $ nstr = substr ($ s Tr, $ I, 5); // according to the UTF-8 encoding specification, count 5 consecutive characters as a single character $ I = $ I + 5; // The actual Byte count is 5} elseif ($ ascnum> = 240) {// if the ASCII bit height is 240 $ nstr = substr ($ str, $ I, 4 ); // according to the UTF-8 encoding specification, count 4 consecutive characters as a single character $ I = $ I + 4; // The actual Byte is counted as 4} elseif ($ ascnum> = 224) {// if the ASCII bits are high with 224 $ nstr = substr ($ str, $ I, 3); // according to the UTF-8 encoding specification, count three consecutive characters as a single character $ I = $ I + 3; // calculate the actual Byte as 3} elseif ($ ascnum> = 192) {// if the ASCII bit height is 192 $ nstr = substr ($ str, $ I, 2); // depending on the UTF-8 Encoding specification. 
two consecutive characters are counted as a single character $ I = $ I + 2; // The actual Byte is counted as 2} else {// In other cases, including uppercase letters, lowercase letters and halfwidth punctuation marks, %, &, @, m, w, etc $ nstr = substr ($ str, $ I, 1); $ I = $ I + 1; // The actual number of bytes is 1} $ this-> word = $ nstr; // Encode and convert to GBK $ nstr = iconv ('utf-8', 'gbk ', $ nstr); if (isset ($ this-> temp ['fws '] [$ nstr]) {$ w [] = $ this-> temp ['fws '] [$ nstr];} else {$ w [] = self :: _ preGetInitial ($ nstr) ;}return $ w ;}/ *** process the first letter of the UTF-8 encoded array * @ param array $ arr string single-word array * @ return Array */protected function _ getInitialInUTF8Arr ($ arr) {// stores the string pinyin $ w = array (); foreach ($ arr as $ key => $ word) {$ this-> word = $ word; $ nword = iconv ('utf-8', 'gbk', $ word ); if (isset ($ this-> temp ['fws '] [$ nword]) {$ w [] = $ this-> temp ['fws '] [$ nword];} else {$ w [] = self :: _ preGetInitial ($ nword);} return $ w;}/*** single-character preprocessing * @ param string $ word, gbk encoding * @ return string first letter, encoding depends on $ this-> charset ['out'] */protecte D function _ preGetInitial ($ word) {$ fw = self: _ getInitial ($ word); // The first letter of the returned UTF-8 encoded data, if ($ fw! 
= False) {$ nstr = $ this-> temp ['fws '] [$ word] = iconv ('utf-8 ', $ this-> charset ['out'], $ fw );} else {$ nstr = $ this-> temp ['fws '] [$ word] = iconv ('gbk', $ this-> charset ['out'], $ word);} return $ nstr;}/*** obtain the core function * @ param string $ word, gbk encoding * @ return string, UTF-8 encoding */protected function _ getInitial ($ word) {$ high = ord ($ word {0}); $ low = ord ($ word {1 }); // extract the first letter of 20902 Chinese characters $ hexc = $ high * 256 + $ low; // GBK/2: gb2312 Chinese character table (pinyin sorting), low a0 start if ($ hexc> = 0xB0A1 and $ hexc = 0xA0) {// A total of 3755 words return self :: _ getInGBK21 ($ hexc);} // GBK/2: gb2312 Chinese character table. if ($ hexc> = 0xD8A1 and $ hexc = 0xA0) {// A total of 3008 words return self: _ getInGBK ('gk221');} // GBK/3: expand the Chinese character table (the size of the UCS code) if ($ hexc> = 0x8140 and $ hexc = 0xAA40 and $ hexc other;}/*** obtain the first letter * GBK/2: gb2312 Chinese character table (pinyin sequence) * A total of 3755 characters * @ param int $ hexc GBK encoding value * @ return string first letter, UTF-8 encoding */protected function _ GetInGBK21 ($ hexc) {// pinyin starting with no I, u, v $ char = array ("", // fill position "A", "B ", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M ", "N", "O", "P", "Q", "R", "S", "T", "W", "X", "Y ", "Z"); $ hcs = array (0xB0A1, 0xb2c1, 0xb4ee, 0xb6ea, 0xb7a2, primary, primary, 0xbbf7, 0xbfa6, 0xc0ac, 0xc2e8, 0xc4c3, 0xc5b6, 0xc5be, 0xc6da, 0xc8bb, 0xc8f6, 0 xcbfa, 0 xcdda, 0xcef4, 0xd1b9, 0xd4d1); if ($ key = array_search ($ hexc, $ hcs )) {return $ char [$ key];} else {$ hcs [] = $ hexc; sort ($ Hcs); return $ char [array_search ($ hexc, $ hcs)] ;}} /*** obtain the first letter ** @ param string $ type the type of the GBK region to which the single word belongs * @ return string first letter, UTF-8 encoding */protected function _ getInGBK ($ type) {// pinyin $ char = array ("", // fill position 
"A", "B", "C", "D ", "E", "F", "G", "H", "J", "K", "L", "M", "N", "O ", "P", "Q", "R", "S", "T", "W", "X", "Y", "Z "); $ str = str_replace ("\ r \ n", '', $ this-> source [$ type]); $ p = stripos ($ str, $ this-> word) + 3; // right-side // stripos ($ str, $ word ), Left $ str = ''; if ($ key = array_search ($ p, $ this-> pos [$ type]) {return $ char [$ key];} else {$ pos = $ this-> pos [$ type]; $ pos [] = $ p; sort ($ pos); return $ char [array_search ($ p, $ pos)] ;}}}?>