PHP: getting the Pinyin initial letter of Chinese characters that can be encoded in GBK (PHP source code)

Source: Internet
Author: User
A PHP class that returns the Pinyin initial letter of each Chinese character that can be encoded in GBK.

'Gbk', 'out' => 'utf-8'); // exception handling // when there is no Pinyin initial letter other = false, returns the original string. Otherwise, it is set to this value protected $ other = '! '; // Other = false; public function _ construct ($ flag = true) {$ this-> flag = $ flag; // load Resources $ this-> source ['gk221'] = file_get_contents (DATA_PATH. 'Word/gk2-2-1.txt '); $ this-> source ['gk31'] = file_get_contents (DATA_PATH. 'Word/gk3-1.txt '); $ this-> source ['gk41'] = file_get_contents (DATA_PATH. 'Word/gk4-1.txt '); $ this-> pos = json_decode (file_get_contents (DATA_PATH. 'Word/pos.txt '), true);}/*** get the first letter of pinyin * @ param string/arr $ dat A data, it can be string and array * @ param string $ in data encoding * @ param string $ out output encoding * @ return arr return array */public function getInitial ($ data, $ in = 'gbk', $ out = 'utf-8') {if (is_string ($ data) {return self: getInitialByStr ($ data, $ in, $ out);} elseif (is_array ($ data) {return self: getInitialByArr ($ data, $ in, $ out );}} /*** get the first letter of pinyin * @ param string $ data string data * @ param string $ in data encoding * @ param string $ out output encoding * @ return arr returns an array */ Public function getInitialByStr ($ str, $ in = 'gbk', $ out = 'utf-8 ') {$ this-> charset ['in'] = strtolower ($ in); $ this-> charset ['out'] = strtolower ($ out ); if ($ this-> flag! = True) {$ this-> temp ['fws '] = array ();} switch ($ this-> charset ['in']) {case 'gbk ': return self: _ getInitialInGBK ($ str); break; case 'utf-8': return self: _ getInitialInUTF8 ($ str); break; default: # code... break;} // historical data if ($ this-> flag! 
= True) {unset ($ this-> temp ['fws ']);} /*** get the first letter of pinyin * @ param array $ data array data * @ param string $ in data encoding * @ param string $ out output encoding * @ return arr returns an array * /public function getInitialByArr ($ arr, $ in = 'gbk', $ out = 'utf-8') {$ this-> charset ['in'] = strtolower ($ in ); $ this-> charset ['out'] = strtolower ($ out); if ($ this-> flag! = True) {$ this-> temp ['fws '] = array ();} switch ($ this-> charset ['in']) {case 'gbk ': return self: _ getInitialInGBKArr ($ arr); break; case 'utf-8': return self: _ getInitialInUTF8Arr ($ arr); break; default: # code... break;} // historical data if ($ this-> flag! = True) {unset ($ this-> temp ['fws ']);} /*** process the first letter of the gbk encoded string * @ param string $ str string * @ return array */protected function _ getInitialInGBK ($ str) {// stores the string pinyin $ w = array (); $ I = 0; $ str_length = strlen ($ str ); // string bytes while ($ I = 0x81) {// gbk region $ nstr = substr ($ str, $ I, 2 ); $ I = $ I + 2;} else {$ nstr = substr ($ str, $ I, 1); $ I = $ I + 1 ;} $ this-> word = iconv ('gbk', 'utf-8', $ nstr); if (isset ($ this-> temp ['fws '] [$ n Str]) {$ w [] = $ this-> temp ['fws '] [$ nstr];} else {$ w [] = self :: _ preGetInitial ($ nstr) ;}return $ w ;} /*** process the first letter of the gbk encoded array * @ param array $ arr string single-word array * @ return array */protected function _ getInitialInGBKArr ($ arr) {// stores the string pinyin $ w = array (); foreach ($ arr as $ key => $ word) {$ this-> word = iconv ('gbk ', 'utf-8', $ word); if (isset ($ this-> temp ['fws '] [$ word]) {$ w [] = $ this-> temp ['fws '] [$ word];} else {$ w [] = self: _ pr EGetInitial ($ word) ;}} return $ w ;} /*** process the first letter of UTF-8 encoded string * @ param string $ str string * @ return array */protected function _ getInitialInUTF8 ($ str) {// stores the string pinyin $ w = array (); $ nstr = ''; $ I = 0; $ str_length = 
strlen ($ str ); // The number of bytes of the string while ($ I = 252) {// if the ASCII bit is high and 252 $ nstr = substr ($ str, $ I, 6 ); // according to the UTF-8 encoding specification, count 6 consecutive characters as a single character $ I = $ I + 6; // The actual Byte is counted as 6} elseif ($ ascnum> = 248) {// if the ASCII bit height is 248 $ nstr = substr ($ s Tr, $ I, 5); // according to the UTF-8 encoding specification, count 5 consecutive characters as a single character $ I = $ I + 5; // The actual Byte count is 5} elseif ($ ascnum> = 240) {// if the ASCII bit height is 240 $ nstr = substr ($ str, $ I, 4 ); // according to the UTF-8 encoding specification, count 4 consecutive characters as a single character $ I = $ I + 4; // The actual Byte is counted as 4} elseif ($ ascnum> = 224) {// if the ASCII bits are high with 224 $ nstr = substr ($ str, $ I, 3); // according to the UTF-8 encoding specification, count three consecutive characters as a single character $ I = $ I + 3; // calculate the actual Byte as 3} elseif ($ ascnum> = 192) {// if the ASCII bit height is 192 $ nstr = substr ($ str, $ I, 2); // depending on the UTF-8 Encoding specification. 
two consecutive characters are counted as a single character $ I = $ I + 2; // The actual Byte is counted as 2} else {// In other cases, including uppercase letters, lowercase letters and halfwidth punctuation marks, %, &, @, m, w, etc $ nstr = substr ($ str, $ I, 1); $ I = $ I + 1; // The actual number of bytes is 1} $ this-> word = $ nstr; // Encode and convert to GBK $ nstr = iconv ('utf-8', 'gbk ', $ nstr); if (isset ($ this-> temp ['fws '] [$ nstr]) {$ w [] = $ this-> temp ['fws '] [$ nstr];} else {$ w [] = self :: _ preGetInitial ($ nstr) ;}return $ w ;}/ *** process the first letter of the UTF-8 encoded array * @ param array $ arr string single-word array * @ return Array */protected function _ getInitialInUTF8Arr ($ arr) {// stores the string pinyin $ w = array (); foreach ($ arr as $ key => $ word) {$ this-> word = $ word; $ nword = iconv ('utf-8', 'gbk', $ word ); if (isset ($ this-> temp ['fws '] [$ nword]) {$ w [] = $ this-> temp ['fws '] [$ nword];} else {$ w [] = self :: _ preGetInitial ($ nword);} return $ w;}/*** single-character preprocessing * @ param string $ word, gbk encoding * @ return string first letter, encoding depends on $ this-> charset ['out'] */protecte D function _ preGetInitial ($ word) {$ fw = self: _ getInitial ($ word); // The first letter of the returned UTF-8 encoded data, if ($ fw! 
= False) {$ nstr = $ this-> temp ['fws '] [$ word] = iconv ('utf-8 ', $ this-> charset ['out'], $ fw );} else {$ nstr = $ this-> temp ['fws '] [$ word] = iconv ('gbk', $ this-> charset ['out'], $ word);} return $ nstr;}/*** obtain the core function * @ param string $ word, gbk encoding * @ return string, UTF-8 encoding */protected function _ getInitial ($ word) {$ high = ord ($ word {0}); $ low = ord ($ word {1 }); // extract the first letter of 20902 Chinese characters $ hexc = $ high * 256 + $ low; // GBK/2: gb2312 Chinese character table (pinyin sorting), low a0 start if ($ hexc> = 0xB0A1 and $ hexc = 0xA0) {// A total of 3755 words return self :: _ getInGBK21 ($ hexc);} // GBK/2: gb2312 Chinese character table. if ($ hexc> = 0xD8A1 and $ hexc = 0xA0) {// A total of 3008 words return self: _ getInGBK ('gk221');} // GBK/3: expand the Chinese character table (the size of the UCS code) if ($ hexc> = 0x8140 and $ hexc = 0xAA40 and $ hexc other;}/*** obtain the first letter * GBK/2: gb2312 Chinese character table (pinyin sequence) * A total of 3755 characters * @ param int $ hexc GBK encoding value * @ return string first letter, UTF-8 encoding */protected function _ GetInGBK21 ($ hexc) {// pinyin starting with no I, u, v $ char = array ("", // fill position "A", "B ", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M ", "N", "O", "P", "Q", "R", "S", "T", "W", "X", "Y ", "Z"); $ hcs = array (0xB0A1, 0xb2c1, 0xb4ee, 0xb6ea, 0xb7a2, primary, primary, 0xbbf7, 0xbfa6, 0xc0ac, 0xc2e8, 0xc4c3, 0xc5b6, 0xc5be, 0xc6da, 0xc8bb, 0xc8f6, 0 xcbfa, 0 xcdda, 0xcef4, 0xd1b9, 0xd4d1); if ($ key = array_search ($ hexc, $ hcs )) {return $ char [$ key];} else {$ hcs [] = $ hexc; sort ($ Hcs); return $ char [array_search ($ hexc, $ hcs)] ;}} /*** obtain the first letter ** @ param string $ type the type of the GBK region to which the single word belongs * @ return string first letter, UTF-8 encoding */protected function _ getInGBK ($ type) {// pinyin $ char = array ("", // fill position 
"A", "B", "C", "D ", "E", "F", "G", "H", "J", "K", "L", "M", "N", "O ", "P", "Q", "R", "S", "T", "W", "X", "Y", "Z "); $ str = str_replace ("\ r \ n", '', $ this-> source [$ type]); $ p = stripos ($ str, $ this-> word) + 3; // right-side // stripos ($ str, $ word ), Left $ str = ''; if ($ key = array_search ($ p, $ this-> pos [$ type]) {return $ char [$ key];} else {$ pos = $ this-> pos [$ type]; $ pos [] = $ p; sort ($ pos); return $ char [array_search ($ p, $ pos)] ;}}}?>

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.