標籤:操作 四種 .json continue function over xmla xml檔案 public
/**
 * Imports every article listed in DATA_DIR . 'infomation.json'.
 *
 * For each feed entry the method downloads the linked page, extracts the
 * source label, the second-level category and the article body (the
 * extraction strategy is chosen from the length of the last URL segment),
 * rewrites relative src/href attributes to absolute URLs, stores the article
 * through ArticleModel (insert first, update on failure when the checksum
 * changed) and finally renders a static HTML copy under
 * PROJECT_ROOT . '/html/article/'.
 *
 * Output: echoes the running update counter after each successful insert and
 * prints the total number of updated rows at the end.
 */
public function insertAction()
{
    ini_set('max_execution_time', '0');

    // Refresh the category directory before inserting anything.
    $getHomeList = $this->getXmlAction();

    $arr_code = array(
        1 => '插入成功',
        -1 => '插入失敗!請檢查再試!',
        -2 => '擷取xml檔案失敗!請檢查再試!',
    );
    // NOTE(review): showApiCode() is defined elsewhere; presumably it registers
    // or prints the API status codes - confirm against its definition.
    showApiCode($arr_code);

    // Turn the directory rows into a "category name => category id" map.
    $getHomeList = array_column($getHomeList, 'id', 'name');

    // Colour assigned to an article, picked by category id modulo 3.
    $color = array(
        0 => '#a56d57',
        1 => '#4c889c',
        2 => '#658965',
    );

    $ArticleModel = new ArticleModel();
    $dom = new DOMDocument();
    $Utils_CaptureWebContent = new Utils_CaptureWebContent('');

    // Load the feed; a missing or malformed file is treated as an empty feed.
    $xml = json_decode(file_get_contents(DATA_DIR . 'infomation.json'), true);
    if (!is_array($xml)) {
        $xml = array();
    }

    // Number of updated rows. Kept outside the loop so the final report
    // covers the whole run instead of only the last entry.
    $i = 0;

    foreach ($xml as $value) {
        // Reset per-article results so a page that yields nothing cannot
        // inherit the previous article's body or source label.
        $from_site = '';
        $content = '';

        $title = $value['title'];
        $summary = $value['description'];
        $category_name = $value['category'];
        $send_time = strtotime($value['pubDate']);
        $utime = $ctime = time();

        // First-level category id (null when the name is not in the directory).
        $article_category = isset($getHomeList[$category_name]) ? $getHomeList[$category_name] : null;
        $category_color = $color[$article_category % 3];

        $content_url = $value['link'];
        // The article id is the unsigned CRC32 of its URL.
        $id = sprintf('%u', crc32($content_url));

        // Force a UTF-8 <head> so DOMDocument decodes the page correctly.
        $out = $this->getDataAction($content_url);
        $out = preg_replace(
            array('/<head>([\s\S]+?)<\/head>/i'),
            array('<head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head>'),
            $out['output']
        );
        // Real-world markup is rarely valid; parser warnings are suppressed on purpose.
        @$dom->loadHTML($out);
        $xpath = new DOMXPath($dom);

        // Length of the last URL segment (including the slash) selects the page type.
        $str = strlen(strrchr($content_url, '/'));

        $html = $Utils_CaptureWebContent->captureGet($content_url);
        $html = $Utils_CaptureWebContent->formatHtml($html);

        // Type 1 ($str < 2, non-standard link such as
        // http://kjs.mep.gov.cn/hjbhbz/bzwb/dqhjbh/jcgfffbz/): nothing is scraped.
        if ($str >= 2 && $str < 10) {
            // Type 2 (standard link): http://www.gdczepb.gov.cn/detail/24441
            $site = $xpath->query("//div[@class='cdaylist']/ul/li");
            $siteNode = $site->item(0);
            $from_site = $siteNode !== null ? $siteNode->nodeValue : '';
            if (strlen(trim($from_site)) < 10) {
                $from_site = '來源:資訊';
            }

            // The second-level category is the text after the last tag inside div.cnav.
            $cate_html = $Utils_CaptureWebContent->matchHtmlElement('div', 'class', 'cnav', $html);
            // NOTE(review): this pattern strips plain spaces; it may originally
            // have targeted "&nbsp;" - confirm against the upstream source.
            $cate_html = preg_replace('/ /', '', $cate_html);
            $category_name = substr($cate_html, strripos($cate_html, '>') + 1);
            if (!$category_name) {
                // Otherwise the category is the text of the last <a> in div.cnav.
                $cate_name = $xpath->query("//div[@class='cnav']/a");
                $lastLink = $cate_name->item($cate_name->length - 1);
                $category_name = $lastLink !== null ? $lastLink->nodeValue : null;
            }

            $content = $Utils_CaptureWebContent->matchHtmlElement('div', 'class', 'contents', $html);
        } elseif ($str >= 10) {
            // Type 3 (semi-standard link):
            // http://kjs.mep.gov.cn/hjbhbz/bzwb/stzl/201109/t20110919_217415.htm
            // These pages carry no source information, so it is fixed.
            $from_site = '來源:科學技術司';

            $tables = $Utils_CaptureWebContent->matchAllHtmlElement('table', 'class', 'txtnormal', $html);
            $content = (isset($tables[0]) && is_array($tables[0])) ? implode('', $tables[0]) : '';

            $links = $Utils_CaptureWebContent->matchAllHtmlElement('a', 'class', 'dtdir12 CurrChnlCls', $html);
            $category_name = isset($links[1][3]) ? $links[1][3] : null;
        }

        // Image/link targets come in several relative forms ("upload/..",
        // "/upload/..", "./upload/..", bare file names, "./file").
        $src_pat = '/src="(\.?\/?upload.+?)"/';
        $href_pat = '/href="(\.?\/?upload.+?)"/';

        $urlParts = parse_url($content_url);
        $host = 'http://' . (isset($urlParts['host']) ? $urlParts['host'] : '') . '/';
        $host_name = dirname($content_url) . '/';

        // Strip inline styles, width attributes, <style> and <script> blocks.
        $content = preg_replace(
            array(
                '/style=".+?"/i',
                '/width=".+?"/i',
                '/<style([\s\S]+?)<\/style>/i',
                '/<script([\s\S]+?)<\/script>/i',
            ),
            '',
            $content
        );

        // Pass 1: upload paths are resolved against the site root.
        $content = preg_replace(
            array($src_pat, $href_pat),
            array('src="' . $host . '$1"', 'href="' . $host . '$1"'),
            $content
        );

        // Pass 2: every remaining target that does not start with "http" is
        // resolved against the article's directory. A negative lookahead is
        // required here; a character class such as [^http] would only test
        // the first character.
        $src_pat2 = '/src="(?!http)(.+?)"/is';
        $href_pat2 = '/href="(?!http)(.+?)"/is';
        $content = preg_replace(
            array($src_pat2, $href_pat2),
            array('src="' . $host_name . '$1"', 'href="' . $host_name . '$1"'),
            $content
        );

        // The first image in the body becomes the cover (null when there is none).
        $src_one = '/<img[^>]*src="([^>"]*)"/is';
        preg_match($src_one, (string) $content, $cover_url);
        $cover = isset($cover_url[1]) ? $cover_url[1] : null;

        // Fall back to the stored body when nothing could be scraped.
        $old_data = $ArticleModel->getItem($id);
        $trimmed = trim((string) $content);
        if (empty($trimmed)) {
            $content = isset($old_data['content']) ? $old_data['content'] : '';
        } else {
            $content = htmlspecialchars($content);
        }

        // Nothing scraped and nothing stored: skip this entry.
        if (empty($content)) {
            continue;
        }

        // Checksum over title + body, used to detect changed articles.
        $data_check = sprintf('%u', crc32(join('', array($title, $content))));

        $params = array(
            'id' => $id,
            'link' => $content_url,
            'article_category' => $article_category,
            'title' => $title,
            'summary' => $summary,
            'content' => $content,
            'send_time' => $send_time,
            'from_site' => $from_site,
            'ctime' => $ctime,
            'utime' => $utime,
            'category_name' => $category_name,
            'category_color' => $category_color,
            'cover_url' => $cover,
            'data_check' => $data_check,
        );

        try {
            $ArticleModel->add($params);
            echo $i;
        } catch (Exception $e) {
            // A failed insert is treated as "row already exists": update it
            // only when the checksum differs. Loose comparison is deliberate,
            // the stored checksum may come back as an int or a numeric string.
            $old_check = isset($old_data['data_check']) ? $old_data['data_check'] : null;
            if ($old_check != $data_check) {
                $ArticleModel->update($params, " id = {$id} ");
                $i++;
            }
        }

        // Render the article through the template and cache it as a static page.
        $need = array(
            'title' => $title,
            'content' => htmlspecialchars_decode($content),
            'from_site' => $from_site,
            'send_time' => $send_time,
        );
        $data_test = array(
            'info' => $need,
        );
        ob_start();
        $this->display("/article/infoContent.phtml", $data_test);
        $id_html = ob_get_clean();
        file_put_contents(PROJECT_ROOT . '/html/article/a' . $id . '.html', $id_html);

        // Pause 0.7 s between articles before fetching the next page.
        usleep(700000);
    }

    printf("本次更新了 %s 條資料", $i);
}
php原生態產生靜態快取,配合crontab定時重新整理快取,不需要第三方模板
php原生態產生靜態快取頁面,定時更新