標籤:php 多線程 爬蟲類
代碼:
<?php
/**
 * Concurrent crawler built on PHP's curl_multi API.
 *
 * Despite the "multi-thread" label, no threads are involved: all URLs passed
 * to the constructor are fetched in parallel through one curl multi handle.
 *
 * @author [Lee] <[<[email protected]>]>
 *
 * Public properties:
 *  1. calltrigger  callback that starts the next crawl round: fn(array $urls, int $depth)
 *  2. calltodo     callback holding the business logic: fn($content), e.g. store the page in a database
 *  3. timeout      connect timeout in seconds, default 5
 *  4. depth        maximum number of HTTP redirects to follow, default 3
 *  5. name         form field name used by upload(), default "file"
 *  6. cookie       local cookie file used by login()/cookie(); the request
 *                  index is inserted before the extension (cookie_0.txt, ...)
 *
 * Public methods (all fluent except run()):
 *  1.  ssl       force https (true: yes, false: no)
 *  2.  auth      HTTP authentication (user, pass)
 *  3.  login     simulate a login and store the received cookies
 *  4.  cookie    send previously stored cookies
 *  5.  header    add request headers
 *  6.  proxy     set the proxy server (url, port)
 *  7.  agent     set the User-Agent string
 *  8.  get       prepare a GET request
 *  9.  post      prepare a POST request
 *  10. json      prepare a JSON POST request
 *  11. upload    prepare a multipart file upload (array|string of paths)
 *  12. download  save each response to a file in the given directory
 *  13. run       execute the prepared requests
 */
class crawl
{
    public $calltrigger = 'trigger';   // callback that starts the next crawl round
    public $calltodo = 'todo';         // callback that processes a fetched page
    public $timeout = 5;               // connect timeout in seconds
    public $depth = 3;                 // maximum redirects to follow
    public $name = 'file';             // upload form field name
    public $cookie = 'cookie.txt';     // cookie file template

    private $schemes = array();
    private $hosts = array();          // authority part: [user[:pass]@]host[:port]
    private $paths = array();
    private $querys = array();
    private $options = array();
    private $chs = array();
    private $fps = array();
    private $handle;
    private $urls = array();

    /**
     * Read one component from a parse_url() result without triggering
     * "undefined index" notices.
     *
     * @param array|false $info    result of parse_url()
     * @param string      $key     component name
     * @param mixed       $default value used when the component is absent
     * @return mixed
     */
    private function part($info, $key, $default = '')
    {
        return (is_array($info) && isset($info[$key])) ? $info[$key] : $default;
    }

    /**
     * Build the authority part ([user[:pass]@]host[:port]) of a parsed URL.
     *
     * @param array|false $info result of parse_url()
     * @return string
     */
    private function authority($info)
    {
        $user = (string) $this->part($info, 'user');
        $pass = (string) $this->part($info, 'pass');
        $port = (string) $this->part($info, 'port');

        $authority = '';
        if ($user !== '') {
            $authority .= $user . ($pass !== '' ? ':' . $pass : '') . '@';
        }
        $authority .= $this->part($info, 'host');
        if ($port !== '') {
            $authority .= ':' . $port;
        }

        return $authority;
    }

    /**
     * Rebuild "scheme://authority/path" for the request with index $k.
     *
     * @param int|string $k request index
     * @return string
     */
    private function baseurl($k)
    {
        return $this->schemes[$k] . '://' . $this->hosts[$k] . $this->paths[$k];
    }

    /**
     * Extract the href targets of all anchor tags in a page.
     *
     * @param string|null $content page content
     * @return array unique href values (keys are the match positions)
     */
    private function geturl($content)
    {
        // "<a" followed by a word boundary so <abbr>, <area>, ... are not
        // mistaken for anchors; case-insensitive so <A HREF=...> is found too.
        $pattern = '/<a\b[^>]*?\shref\s*=\s*["\']?([^>"\'\s]*)[^>]*>/i';

        if (!preg_match_all($pattern, (string) $content, $found)) {
            return array();
        }

        return array_unique($found[1]);
    }

    /**
     * Normalise a possibly incomplete URL to "scheme://authority/path".
     * A missing scheme defaults to http; query and fragment are dropped.
     *
     * @param string $url original URL
     * @return string repaired URL
     */
    private function reviseurl($url)
    {
        $info = parse_url($url);

        return $this->part($info, 'scheme', 'http') . '://'
            . $this->authority($info)
            . $this->part($info, 'path');
    }

    /**
     * Turn a link found on a page into an absolute http(s) URL.
     *
     * @param string $base absolute URL of the page the link was found on
     * @param string $link raw href value
     * @return string|null absolute URL, or null when the link is not crawlable
     */
    private function resolveurl($base, $link)
    {
        $link = trim($link);
        if ($link === '' || $link[0] === '#') {
            return null;                       // empty href or in-page anchor
        }
        if (preg_match('/^https?:\/\//i', $link)) {
            return $link;                      // already absolute
        }
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link)) {
            return null;                       // mailto:, javascript:, tel:, ...
        }

        $info = parse_url($base);
        $scheme = $this->part($info, 'scheme', 'http');
        $origin = $scheme . '://' . $this->authority($info);
        $path = (string) $this->part($info, 'path', '/');

        if (substr($link, 0, 2) === '//') {
            return $scheme . ':' . $link;      // protocol-relative
        }
        if ($link[0] === '/') {
            return $origin . $link;            // host-relative
        }
        if ($link[0] === '?') {
            return $origin . $path . $link;    // same document, new query
        }

        // Document-relative: resolve against the directory of the base path.
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);

        return $origin . $dir . $link;
    }

    /**
     * Hand fetched content to the business-logic callback ($calltodo).
     *
     * @param string|null $content fetched page content
     */
    private function todo($content)
    {
        call_user_func($this->calltodo, $content);
    }

    /**
     * Hand newly discovered URLs to the crawl callback ($calltrigger).
     *
     * @param array $urls  URLs to crawl next
     * @param int   $depth remaining crawl depth
     */
    private function trigger($urls, $depth)
    {
        call_user_func($this->calltrigger, $urls, $depth);
    }

    /**
     * Set the request URL of every handle for a GET request.
     *
     * @param string $data URL-encoded query data appended to the URL's own query
     * @return $this
     */
    private function setget($data)
    {
        $data = (string) $data;

        foreach (array_keys($this->chs) as $k) {
            $query = (string) $this->querys[$k];
            if ($query !== '' && $data !== '') {
                $query .= '&';                 // keep both parameter sets separable
            }
            $query .= $data;

            $target = $this->baseurl($k);
            if ($query !== '') {
                $target .= '?' . $query;
            }
            $this->options[$k][CURLOPT_URL] = $target;
        }

        return $this;
    }

    /**
     * Set the request URL and body of every handle for a POST request.
     *
     * @param array|string $data POST fields or raw body
     * @return $this
     */
    private function setpost($data)
    {
        foreach (array_keys($this->chs) as $k) {
            $query = (string) $this->querys[$k];

            $target = $this->baseurl($k);
            if ($query !== '') {
                $target .= '?' . $query;
            }

            $this->options[$k][CURLOPT_URL] = $target;
            $this->options[$k][CURLOPT_POST] = 1;
            $this->options[$k][CURLOPT_POSTFIELDS] = $data;
        }

        return $this;
    }

    /**
     * Apply the collected options to the curl handles.
     *
     * timeout and depth are (re)applied here so that values assigned to the
     * public properties after construction take effect. Both keys already
     * exist from the constructor, so the option order is unchanged.
     *
     * @return $this
     */
    private function setopt()
    {
        foreach (array_keys($this->options) as $k) {
            $this->options[$k][CURLOPT_CONNECTTIMEOUT] = $this->timeout;
            $this->options[$k][CURLOPT_MAXREDIRS] = $this->depth;
            curl_setopt_array($this->chs[$k], $this->options[$k]);
        }

        return $this;
    }

    /**
     * Build the per-request cookie file name: the request index is inserted
     * before the extension of $this->cookie (cookie.txt -> cookie_0.txt).
     *
     * @param int|string $k request index
     * @return string
     */
    private function cookiefile($k)
    {
        $info = pathinfo($this->cookie);

        $dir = '';
        if (isset($info['dirname']) && $info['dirname'] !== '.') {
            $dir = rtrim($info['dirname'], '/') . '/';
        }
        $ext = (isset($info['extension']) && $info['extension'] !== '')
            ? '.' . $info['extension']
            : '';

        return $dir . $info['filename'] . '_' . $k . $ext;
    }

    /**
     * Create one curl handle per URL and set the default request options.
     *
     * @param array $urls URLs to request; a missing scheme defaults to http
     */
    public function __construct($urls)
    {
        $this->urls = $urls;
        $this->handle = curl_multi_init();

        foreach ($urls as $k => $url) {
            $info = parse_url($url);

            $this->schemes[$k] = $this->part($info, 'scheme', 'http');
            $this->hosts[$k] = $this->authority($info);
            $this->paths[$k] = $this->part($info, 'path');
            $this->querys[$k] = $this->part($info, 'query');

            $this->chs[$k] = curl_init();
            $this->options[$k] = array(
                CURLOPT_CONNECTTIMEOUT => $this->timeout,
                CURLOPT_RETURNTRANSFER => 1,
                CURLOPT_FOLLOWLOCATION => 1,
                CURLINFO_HEADER_OUT    => true,
                CURLOPT_ENCODING       => 'gzip',
                CURLOPT_MAXREDIRS      => $this->depth,
            );

            curl_multi_add_handle($this->handle, $this->chs[$k]);
        }
    }

    /**
     * Switch all requests to https. Call before get()/post()/json()/upload(),
     * because those methods build the request URL from the current scheme.
     *
     * @param bool $bool true: use https, false: leave the scheme untouched
     * @return $this
     */
    public function ssl($bool = false)
    {
        if ($bool) {
            foreach (array_keys($this->chs) as $k) {
                $this->schemes[$k] = 'https';
                // libcurl no longer accepts 1 here; 2 is the value it maps to.
                $this->options[$k][CURLOPT_SSL_VERIFYHOST] = 2;
                // NOTE: certificate verification stays disabled, as in the
                // original code. This is insecure against MITM attacks.
                $this->options[$k][CURLOPT_SSL_VERIFYPEER] = false;
            }
        }

        return $this;
    }

    /**
     * Set HTTP authentication credentials for all requests.
     *
     * @param string $user user name
     * @param string $pass password
     * @return $this
     */
    public function auth($user, $pass)
    {
        foreach (array_keys($this->chs) as $k) {
            $this->options[$k][CURLOPT_USERPWD] = $user . ':' . $pass;
        }

        return $this;
    }

    /**
     * Simulate a login: cookies received by each request are written to its
     * own cookie file. RETURNTRANSFER is switched off, so the response body
     * is sent to the output instead of being returned to run().
     *
     * @return $this
     */
    public function login()
    {
        foreach (array_keys($this->chs) as $k) {
            $this->options[$k][CURLOPT_COOKIEJAR] = $this->cookiefile($k);
            $this->options[$k][CURLOPT_RETURNTRANSFER] = 0;
        }

        return $this;
    }

    /**
     * Send the cookies stored by a previous login() with each request.
     *
     * @return $this
     */
    public function cookie()
    {
        foreach (array_keys($this->chs) as $k) {
            $this->options[$k][CURLOPT_COOKIEFILE] = $this->cookiefile($k);
        }

        return $this;
    }

    /**
     * Add request headers; repeated calls accumulate.
     *
     * @param array $data header lines such as "Accept: text/html"
     * @return $this
     */
    public function header($data)
    {
        foreach (array_keys($this->chs) as $k) {
            $current = isset($this->options[$k][CURLOPT_HTTPHEADER])
                ? $this->options[$k][CURLOPT_HTTPHEADER]
                : array();
            $this->options[$k][CURLOPT_HTTPHEADER] = array_merge($current, $data);
        }

        return $this;
    }

    /**
     * Route all requests through a proxy server.
     *
     * @param string $url  proxy URL; a missing scheme defaults to http
     * @param int    $port proxy port
     * @return $this
     */
    public function proxy($url, $port)
    {
        $info = parse_url($url);
        $purl = $this->part($info, 'scheme', 'http') . '://'
            . $this->part($info, 'host')
            . $this->part($info, 'path')
            . ':' . $port;

        foreach (array_keys($this->chs) as $k) {
            $this->options[$k][CURLOPT_PROXY] = $purl;
        }

        return $this;
    }

    /**
     * Set the User-Agent header.
     *
     * @param string $browse user agent string
     * @return $this
     */
    public function agent($browse = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)')
    {
        foreach (array_keys($this->chs) as $k) {
            $this->options[$k][CURLOPT_USERAGENT] = $browse;
        }

        return $this;
    }

    /**
     * Prepare a GET request.
     *
     * @param array $data query parameters
     * @return $this
     */
    public function get($data = array())
    {
        return $this->setget(http_build_query($data));
    }

    /**
     * Prepare a POST request.
     *
     * @param array|string $data POST fields
     * @return $this
     */
    public function post($data = array())
    {
        return $this->setpost($data);
    }

    /**
     * Prepare a POST request with a JSON body.
     *
     * @param mixed $data value to encode
     * @return $this
     * @throws InvalidArgumentException when $data cannot be JSON-encoded
     */
    public function json($data = array())
    {
        $body = json_encode($data);
        if ($body === false) {
            throw new InvalidArgumentException('Cannot JSON-encode request data: ' . json_last_error_msg());
        }

        $this->header(array(
            'Content-Type: application/json',
            'Content-Length:' . strlen($body),
        ));

        return $this->setpost($body);
    }

    /**
     * Prepare a multipart form upload.
     *
     * @param array|string $files one file path, or an array of file paths
     * @return $this
     */
    public function upload($files)
    {
        $field = $this->name;
        $data = array();

        if (is_array($files)) {
            foreach ($files as $k => $file) {
                $data["{$field}[{$k}]"] = new CURLFile($file);
            }
        } else {
            $data["{$field}"] = new CURLFile($files);
        }

        return $this->setpost($data);
    }

    /**
     * Save each response to a file named after the last segment of its URL
     * path. The file handles are closed by run().
     *
     * @param string $dir target directory (e.g. "a/b"); created when missing,
     *                    empty means the current directory
     * @return $this
     * @throws RuntimeException when a target file cannot be opened
     */
    public function download($dir = '')
    {
        if ($dir && !is_dir($dir)) {
            mkdir($dir, 0755, true);
        }
        $prefix = $dir ? rtrim($dir, '/') . '/' : './';

        foreach ($this->paths as $k => $path) {
            $file = basename((string) $path);
            if ($file === '') {
                $file = 'download_' . $k;      // URL without a usable file name
            }

            $fp = fopen($prefix . $file, 'w');
            if ($fp === false) {
                throw new RuntimeException('Cannot open download target: ' . $prefix . $file);
            }

            $this->fps[$k] = $fp;
            $this->options[$k][CURLOPT_FILE] = $fp;
        }

        return $this->setget('');
    }

    /**
     * Execute all prepared requests in parallel, pass every successful
     * response to $calltodo, and pass the links found on it to $calltrigger
     * together with the decremented depth.
     *
     * @param int $depth remaining crawl depth; nothing is fetched when <= 0
     */
    public function run($depth = 2)
    {
        $this->setopt();

        if ($depth <= 0) {
            return;
        }
        $depth--;

        $multi = $this->handle;
        $active = null;

        do {
            $mrc = curl_multi_exec($multi, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        while ($active && $mrc == CURLM_OK) {
            // select() returns -1 when it cannot wait; back off briefly
            // instead of spinning at full speed.
            if (curl_multi_select($multi) == -1) {
                usleep(100);
            }
            do {
                $mrc = curl_multi_exec($multi, $active);
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);
        }

        foreach ($this->chs as $k => $ch) {
            if (curl_error($ch) == '') {
                $content = curl_multi_getcontent($ch);
                $this->todo($content);

                $base = $this->reviseurl($this->urls[$k]);

                // Collected per page so links never leak into the next page.
                $next = array();
                foreach ($this->geturl($content) as $link) {
                    $absolute = $this->resolveurl($base, $link);
                    if ($absolute !== null) {
                        $next[] = $absolute;
                    }
                }

                if (!empty($next)) {
                    $this->trigger(array_values(array_unique($next)), $depth);
                }
            }

            curl_multi_remove_handle($multi, $ch);
            curl_close($ch);
        }

        // Release the files opened by download().
        foreach ($this->fps as $fp) {
            if (is_resource($fp)) {
                fclose($fp);
            }
        }
        $this->fps = array();

        curl_multi_close($multi);
    }
}
測試:
/**
 * Business-logic callback: invoked once for every page fetched successfully.
 *
 * @param string|null $content fetched page content (unused here)
 */
function todo($content)
{
    echo 'ok' . PHP_EOL;
}

/**
 * Crawl callback: fetches the given URLs and lets the crawler call this
 * function again for the links it finds, until $depth is used up.
 *
 * @param array $urls  URLs to crawl
 * @param int   $depth remaining crawl depth
 */
function trigger($urls = array(), $depth = 2)
{
    $crawl = new crawl($urls);
    $crawl->get()->run($depth);
}

$urls = array('www.baidu.com', 'www.taobao.com');

trigger($urls);
輸出:okokokokokokokokokokokokokok
php多線程爬蟲類