PHP製作百度詞典查詞採集器_php執行個體

來源:互聯網
上載者:User

百度dict 採集樣本

這是我寫的採集器,用來抓取百度 dict 詞典翻譯後的所有結果資料,並附帶了 13.5 萬條單詞庫和一個簡單的採集範例。這裡把主要的類 dict.class.php 放出來,項目地址 http://github.com/widuu/baidu_dict,有需要的直接 fork 就可以了~麼麼噠,這東西用的人很少,所以覺得有用的兄弟就拿走吧~

<?php
/**
 * dict.class.php - scrapes the translation data shown on a Baidu Dict page.
 *
 * @copyright      (C) 2014 widuu
 * @license        http://www.widuu.com
 * @lastmodify     2014-2-15
 */

header("content-type:text/html;charset=utf8");

/**
 * Fetches the Baidu Dict result page for a word and extracts its sections
 * with regular expressions.
 *
 * Usage:
 *   $dict = new Dict();
 *   $data = $dict->content("express");
 */
class Dict
{
    /** Word currently being looked up (set by content()). */
    private $word;

    /** HTML of the last successfully fetched page, or null if none yet. */
    private $page = null;

    /** Word that $page belongs to. */
    private $pageWord = null;

    /** Maximum number of phrase entries returned by getPhrase(). */
    private static $num = 10;

    public function __construct()
    {
    }

    /**
     * Public entry point: collect every extracted section for one word.
     *
     * @param string $word English word to look up
     * @return array array(
     *     "symbol"  => phonetic symbols, array('en' => ..., 'us' => ...)
     *     "pro"     => pronunciation URLs, array('en' => ..., 'us' => ...)
     *     "example" => example sentences
     *     "explain" => concise definitions
     *     "synonym" => synonyms / antonyms
     *     "phrase"  => phrases
     * )
     */
    public function content($word)
    {
        $this->word = $word;

        // Same call order as before; each parser reads the (cached) page.
        $symbol  = $this->Pronounced();
        $pro     = $this->getSay();
        $example = $this->getExample();
        $explain = $this->getExplain();
        $synonym = $this->getSynonym();
        $phrase  = $this->getPhrase();

        return array(
            "symbol"  => $symbol,
            "pro"     => $pro,
            "example" => $example,
            "explain" => $explain,
            "synonym" => $synonym,
            "phrase"  => $phrase,
        );
    }

    /**
     * Download the Baidu Dict page for the current word via cURL.
     *
     * A successful download is kept for the current word, so the six parsers
     * invoked by content() share a single HTTP request instead of issuing one
     * each. Failed downloads are not cached.
     *
     * @return string page HTML, or '' when the transfer failed
     */
    private function getContent()
    {
        if ($this->page !== null && $this->pageWord === $this->word) {
            return $this->page;
        }

        $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
        // Encode the word so spaces, '&', '#', ... cannot corrupt the query string.
        $url = "http://dict.baidu.com/s?wd=" . urlencode($this->word);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_HTTPGET, 1);
        curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $result = curl_exec($ch);

        // The original inspected an undefined $curl here, so transport errors
        // were never reported; the handle is $ch.
        if (curl_errno($ch)) {
            echo 'Errno' . curl_error($ch);
        }
        curl_close($ch);

        if (!is_string($result)) {
            return '';
        }

        $this->page = $result;
        $this->pageWord = $this->word;

        return $result;
    }

    /**
     * First captured group of the first match from a preg_match_all() result.
     *
     * @param array $matches
     * @return string '' when the pattern did not match
     */
    private function firstCapture($matches)
    {
        return isset($matches[1][0]) ? $matches[1][0] : '';
    }

    /**
     * Build the ('en', 'us') pair from the first two captures of a
     * preg_match_all() result.
     *
     * @param array $matches
     * @return array array('en' => string|null, 'us' => string|null)
     */
    private function enUsPair($matches)
    {
        $hits = isset($matches[1]) ? $matches[1] : array();

        return array(
            'en' => isset($hits[0]) ? $hits[0] : null,
            'us' => isset($hits[1]) ? $hits[1] : null,
        );
    }

    /**
     * Extract the phonetic symbols.
     *
     * @return array array('en' => first match, 'us' => second match); null where missing
     */
    private function Pronounced()
    {
        preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui", $this->getContent(), $pronounced);

        return $this->enUsPair($pronounced);
    }

    /**
     * Extract the values of the url="..." attributes (pronunciation audio).
     *
     * @return array array('en' => first match, 'us' => second match); null where missing
     */
    private function getSay()
    {
        preg_match_all("/url=\"(.*)\"/Ui", $this->getContent(), $pronounced);

        return $this->enUsPair($pronounced);
    }

    /**
     * Extract the example sentences from the page's `var example_data = ...;`
     * JavaScript literal.
     *
     * The literal is split on "[[[" and then "[[". The leading quoted strings
     * of each piece are joined with spaces into one sentence, and the sentences
     * are grouped two by two.
     *
     * @return array list of sentence groups (two sentences per group);
     *               array(array("")) when the page has no example data
     */
    private function getExample()
    {
        preg_match_all("/var example_data = (.*)\]\;/Us", $this->getContent(), $example);

        $literal = "[[[" . ltrim($this->firstCapture($example), "[");
        $joined = "";

        foreach (explode("[[[", $literal) as $group) {
            foreach (explode("[[", "[[" . $group) as $sentence) {
                preg_match_all("/\[\"(.*)\",/Us", "[" . $sentence, $match);
                if (!empty($match[1])) {
                    // Separator first: the reversed legacy order was removed in PHP 8.
                    $joined .= implode(" ", $match[1]) . "@";
                }
            }
        }

        return array_chunk(explode("@", trim($joined, "@")), 2);
    }

    /**
     * Extract the concise definitions block (id="en-simple-means").
     *
     * @return array array(
     *     "x" => array(<strong> text => <span> text),
     *     "b" => array(tag => linked word)
     * )
     */
    private function getExplain()
    {
        preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\<div(\s+)class\=\"source\"\>/Us", $this->getContent(), $explain);
        $section = $this->firstCapture($explain);

        preg_match_all("/\<p\>\<strong\>(?P<adj>.*)\<\/strong\>\<span\>(?P<name>.*)\<\/span\>\<\/p\>/Us", $section, $senses);
        preg_match_all("/\<span\>(?P<tag>[^\>]+)\:\<a(\s+)href\=\"(.*)\"\>(?P<word>.*)\<\/a\>\<\/span\>/Us", $section, $forms);

        $meanings = array();
        foreach ($senses["adj"] as $i => $partOfSpeech) {
            $meanings[$partOfSpeech] = $senses["name"][$i];
        }

        $related = array();
        foreach ($forms["tag"] as $i => $tag) {
            $related[$tag] = strip_tags($forms["word"][$i]);
        }

        return array("x" => $meanings, "b" => $related);
    }

    /**
     * Extract the synonym / antonym block (id="en-syn-ant").
     *
     * The block is split on "</dl>"; within each part every matched
     * <strong> heading / <ul> list pair contributes its <li> items.
     *
     * @return array array(part index => array(heading => array(title => words)));
     *               empty array when the block is absent
     */
    private function getSynonym()
    {
        preg_match_all("/id=\"en\-syn\-ant\"\>(.*)<div(\s+)class\=\"source\">/Us", $this->getContent(), $synonym);
        $parts = explode("</dl>", $this->firstCapture($synonym));

        $result = array();
        foreach ($parts as $partIndex => $part) {
            // NOTE(review): "\ \;" is reproduced exactly as published; it may
            // originally have been "&nbsp;" before the article was rendered
            // to text - confirm against the upstream repository.
            preg_match_all("/\<strong\>(?P<adj>.*)\ \;\<\/strong\>\<\/div\>\<div(\s+)class\=\"syn\-ant\-list\"\>\<ul\>(?<content>.*)\<\/ul\>/Us", $part, $lists);

            foreach ($lists["content"] as $listIndex => $listHtml) {
                if (empty($listHtml)) {
                    continue;
                }

                preg_match_all("/\<li\>\<p\>(?P<title>.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $listHtml, $items);
                foreach ($items['title'] as $itemIndex => $title) {
                    // Turn each closing </a> into a space so adjacent links
                    // stay separated once the tags are stripped.
                    $words = strip_tags(str_replace("</a>", " ", $items["value"][$itemIndex]));
                    $result[$partIndex][$lists["adj"][$listIndex]][$title] = $words;
                }
            }
        }

        return $result;
    }

    /**
     * Extract the phrase block (id="en-phrase"), at most self::$num entries.
     *
     * Each "</dd>"-terminated entry is split on "</p>". An entry with up to
     * three pieces maps term => meaning. A longer entry additionally nests its
     * remaining pieces, two at a time, as array(previous value, pair).
     * Fragments containing no "</p>" at all (e.g. the markup trailing the last
     * entry) are skipped; they are not phrase entries.
     *
     * @return array array(term => meaning | nested array)
     */
    private function getPhrase()
    {
        preg_match_all("/id=\"en\-phrase\"\>(.*)\<div class\=\"source\"\>/Us", $this->getContent(), $phrase);
        $entries = array_slice(explode("</dd>", $this->firstCapture($phrase)), 0, self::$num);

        $result = array();
        foreach ($entries as $entry) {
            $pieces = explode("</p>", $entry);
            $count = count($pieces);

            if ($count < 2) {
                continue;
            }

            if ($count <= 3) {
                $result[str_replace(" ", "", strip_tags($pieces[0]))] = strip_tags($pieces[1]);
                continue;
            }

            // Everything between the term/meaning head and the trailing
            // fragment is extra material, grouped in pairs.
            $head = array_slice($pieces, 0, 2);
            $extra = array_diff(array_slice($pieces, 0, $count - 1), $head);
            $term = trim(str_replace(" ", "", strip_tags($head[0])));

            $result[$term] = strip_tags($head[1]);
            foreach (array_chunk($extra, 2) as $pair) {
                foreach ($pair as $i => $html) {
                    $pair[$i] = strip_tags($html);
                }
                $result[$term] = array($result[$term], $pair);
            }
        }

        return $result;
    }

    /**
     * Convert an array to its var_export() string form, with slashes added.
     *
     * @param  array  $data       array to convert
     * @param  bool   $isformdata when 0, new_stripslashes is not applied first; optional, default 1
     * @return string exported string; '' when $data loosely equals ''
     */
    private function array2string($data, $isformdata = 1)
    {
        if ($data == '') {
            return '';
        }
        if ($isformdata) {
            $data = $this->new_stripslashes($data);
        }

        return addslashes(var_export($data, TRUE));
    }

    /**
     * Apply stripslashes() to a string, or recursively to every array element.
     *
     * @param mixed $string string or array to process
     * @return mixed
     */
    private function new_stripslashes($string)
    {
        if (!is_array($string)) {
            return stripslashes($string);
        }
        foreach ($string as $key => $val) {
            $string[$key] = $this->new_stripslashes($val);
        }

        return $string;
    }
}

以上就是本文的全部內容了,非常實用的功能,希望小夥伴們能夠喜歡。

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關。如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件後我們將在 5 個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.