完善的漢字轉拼音php轉換類

來源:互聯網
上載者:User

辦法是利用矩陣:每個漢字由兩個位元組組成,一個高位碼,一個低位碼,取值範圍分別是 129-254 與 64-254
每個漢字拼音最長為8個字元,由此組成二維矩陣進行查詢,弊端是無法解決多音字問題

/**
 * Converts double-byte Chinese text to pinyin using a fixed-width lookup file.
 *
 * Every character is addressed by its (lead byte, trail byte) pair. The lookup
 * file is laid out as one row per lead byte (starting at 129); each row holds
 * 191 records of 8 bytes (trail bytes 64..254) followed by 2 extra bytes.
 * Each record is the space-padded pinyin of one character.
 *
 * NOTE(review): the byte ranges match GBK, so input is presumably expected to
 * be GBK-encoded rather than UTF-8 - confirm against the data file.
 *
 * Limitation: a character maps to exactly one reading, so polyphonic
 * characters cannot be disambiguated.
 */
class pinyin{

    /**
     * When truthy, the whole lookup file is read into memory once and reused;
     * otherwise each record is read from disk through a file handle.
     */
    public $ismemorycache = 1;

    /**
     * When truthy, only the first byte of each pinyin is emitted (initials).
     */
    public $isfrist = 1;

    /**
     * Path of the pinyin lookup file.
     */
    public $path = "py.qdb";

    /**
     * In-memory copy of the lookup file (memory-cache mode only).
     */
    public $memorycache;

    /**
     * Open handle on the lookup file (file mode only).
     */
    public $handle;

    /**
     * Collected error messages; stays unset until the first error is recorded.
     */
    public $errormsgbox;

    /**
     * Result of the most recent conversion.
     */
    public $result;

    /**
     * Per-instance cache of already resolved records: [lead byte][trail byte] => pinyin.
     */
    public $array = array();

    /**
     * Tone-marked vowels mapped to their plain ASCII form ("ü" becomes "v").
     *
     * The fifth key of the first row was a literal "?" in the original, which
     * rewrote every question mark in the output to "a". It is presumably an
     * encoding casualty of "ɑ" (U+0251) - TODO confirm against the data file.
     */
    public $n_t = array("ā" => "a","á" => "a","ǎ" => "a","à" => "a","ɑ" => "a",
        "ō" => "o","ó" => "o","ǒ" => "o","ò" => "o",
        "ē" => "e","é" => "e","ě" => "e","è" => "e","ê" => "e",
        "ī" => "i","í" => "i","ǐ" => "i","ì" => "i",
        "ū" => "u","ú" => "u","ǔ" => "u","ù" => "u",
        "ǖ" => "v","ǘ" => "v","ǚ" => "v","ǜ" => "v","ü" => "v"
    );

    /**
     * Conversion entry point.
     *
     * @param string|array $str        Text to convert, or an array of texts.
     * @param int|bool     $istonemark Truthy keeps tone marks; falsy strips them via $n_t.
     * @param string       $suffix     Appended after every converted character; defaults to "".
     * @return string|array The conversion result (also available through get()).
     */
    function chinesetopinyin($str,$istonemark = 0,$suffix = ""){
        $this->py($str,$istonemark,$suffix);
        return $this->result;
    }

    /**
     * Returns the result of the most recent conversion.
     */
    function get(){
        return $this->result;
    }

    /**
     * Performs the conversion and stores it in $this->result.
     *
     * @param string|array $str Text to convert, or an array of texts (converted element-wise, keys kept).
     * @param int|bool     $n   Truthy keeps tone marks; falsy strips them via $n_t.
     * @param string       $s   Suffix appended after every converted character.
     * @return string|array|false Converted text, "" for empty input, or false when
     *                            the lookup file cannot be used (an error is recorded).
     */
    function py($str,$n = 0,$s = ""){
        // Reset first so an early return can never expose a previous call's result.
        $this->result = "";

        // Arrays must be handled before any string function touches $str.
        if(is_array($str)){
            $converted = array();
            foreach($str as $key => $val){
                $item = $this->py($val,$n,$s);
                if($item === false){
                    $this->result = "";
                    return false;
                }
                $converted[$key] = $item;
            }
            $this->result = $converted;
            return $converted;
        }

        $str = (string)$str;
        $strlength = strlen($str);
        if($strlength == 0){ return ""; }

        if(!$this->loadsource()){ return false; }

        $take = $this->isfrist ? 1 : 8;
        $out = "";
        for($i = 0 ; $i < $strlength ; $i++){
            $ord1 = ord($str[$i]);
            if($ord1 > 128 && $i + 1 < $strlength){
                $ord2 = ord($str[$i + 1]);
                // Only a trail byte inside the table's column range forms a valid pair.
                if($ord2 >= 64 && $ord2 <= 254){
                    $i++;
                    $out .= substr($this->lookup($ord1,$ord2),0,$take).$s;
                    continue;
                }
            }
            // Single-byte characters and malformed pairs pass through unchanged.
            $out .= $str[$i];
        }

        if(!$n){ $out = strtr($out,$this->n_t); }

        $this->result = $out;
        return $out;
    }

    /**
     * Makes sure the lookup data for the current mode is available.
     * Runs on every call so that repeated conversions keep working.
     *
     * @return bool False when the file is missing or unreadable (an error is recorded).
     */
    private function loadsource(){
        if($this->ismemorycache){
            if(is_string($this->memorycache) && $this->memorycache !== ""){ return true; }
        }elseif(is_resource($this->handle)){
            return true;
        }

        if(!file_exists($this->path)){
            $this->addoneerrormsg(1,"拼音檔案路徑不存在");
            return false;
        }

        if($this->ismemorycache){
            $data = file_get_contents($this->path);
            if($data === false){
                $this->addoneerrormsg(2,"unable to read pinyin file");
                return false;
            }
            $this->memorycache = $data;
            return true;
        }

        $handle = fopen($this->path,"rb");
        if($handle === false){
            $this->addoneerrormsg(2,"unable to open pinyin file");
            return false;
        }
        $this->handle = $handle;
        return true;
    }

    /**
     * Resolves one (lead, trail) byte pair to its trimmed pinyin record,
     * caching the answer in $this->array.
     *
     * @param int $ord1 Lead byte value.
     * @param int $ord2 Trail byte value (64..254).
     * @return string Pinyin, or "" when the record lies outside the file.
     */
    private function lookup($ord1,$ord2){
        if(!isset($this->array[$ord1][$ord2])){
            // Row width: 191 records of 8 bytes plus 2 extra bytes per lead byte.
            $leng = ($ord1 - 129) * ((254 - 63) * 8 + 2) + ($ord2 - 64) * 8;
            if($this->ismemorycache){
                $raw = substr($this->memorycache,$leng,8);
            }else{
                // fread returns the full 8 bytes; fgets(…, 8) would stop after 7 or at a newline.
                $raw = (fseek($this->handle,$leng) === 0) ? fread($this->handle,8) : "";
            }
            $this->array[$ord1][$ord2] = trim((string)$raw);
        }
        return $this->array[$ord1][$ord2];
    }

    /**
     * Records one error message.
     *
     * @param int    $no     Error number.
     * @param string $reason Human-readable reason.
     */
    function addoneerrormsg($no,$reason){
        $this->errormsgbox[] = "<b>error:</b>" . $no . "," . $reason;
    }

    /**
     * Echoes every recorded error message; does nothing when none were recorded.
     */
    function showerrormsg(){
        if(!is_array($this->errormsgbox)){ return; }
        foreach($this->errormsgbox as $val){
            // The original literal "rnrn" had evidently lost its backslashes.
            echo $val."\r\n\r\n</br></br>";
        }
    }

    /**
     * Prints any recorded errors and releases the file handle.
     */
    function __destruct(){
        if(is_array($this->errormsgbox)){
            $this->showerrormsg();
        }
        if(is_resource($this->handle)){
            fclose($this->handle);
        }
    }

}

之前遇見過這個難題,發現流傳的代碼都不怎麼完善。漢字庫總共有20k+的漢字,而大多數代碼只拿幾百個常用漢字就打算糊弄過去,在火星文流傳的今天,這是不行的。
還有種讀取詞典然後轉換的,每行一個漢字|拼音,這種弊端非常大,速度慢,耗費巨大記憶體,僅僅explode一下讀入數組,再迴圈一次,就能耗費上百m的記憶體,如果一個單頁面耗費上百m,負載稍微大點只能淚奔了。

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.