php多線程爬蟲類

來源:互聯網
上載者:User

標籤:php   多線程   爬蟲類   

  • 代碼:
    <?php

    /**
     * Concurrent crawler built on PHP's curl_multi API.
     *
     * One curl easy handle is created per URL passed to the constructor. The
     * fluent configuration methods record options for every handle; run() applies
     * them, performs all transfers concurrently, passes each page body to the
     * `calltodo` callback and passes the links found on each page to the
     * `calltrigger` callback together with the remaining depth.
     *
     * @author Lee
     *
     * Public properties:
     *  - calltrigger  callback that starts a crawl of newly found links
     *  - calltodo     callback that handles a fetched page body (e.g. stores it)
     *  - timeout      connect timeout in seconds, default 5
     *  - depth        maximum number of HTTP redirects to follow, default 3
     *  - name         form field name used by upload(), default "file"
     *  - cookie       cookie file name template, default "cookie.txt"
     *
     * Public methods (all but run() return $this for chaining):
     *  ssl, auth, login, cookie, header, proxy, agent, get, post, json, upload,
     *  download, run.
     */
    class crawl
    {
        /** @var callable Called as ($urls, $depth) with the links found on a page. */
        public $calltrigger = 'trigger';

        /** @var callable Called as ($content) with the body of each fetched page. */
        public $calltodo = 'todo';

        /** @var int Connect timeout in seconds (CURLOPT_CONNECTTIMEOUT). */
        public $timeout = 5;

        /** @var int Maximum number of redirects to follow (CURLOPT_MAXREDIRS). */
        public $depth = 3;

        /** @var string Form field name used by upload(). */
        public $name = 'file';

        /** @var string Cookie file template; the handle key is inserted before the extension. */
        public $cookie = 'cookie.txt';

        private $schemes = array();
        private $hosts = array();
        private $paths = array();
        private $querys = array();
        private $options = array();
        private $chs = array();
        private $fps = array();
        private $handle;
        private $urls = array();

        /**
         * Read one component of a parse_url() result without touching missing keys.
         *
         * @param array  $info    Result of parse_url().
         * @param string $key     Component name, e.g. "host".
         * @param mixed  $default Value used when the component is absent or empty.
         * @return mixed
         */
        private static function part(array $info, $key, $default = '')
        {
            return (isset($info[$key]) && $info[$key] !== '') ? $info[$key] : $default;
        }

        /**
         * Extract the href targets of the anchor tags in a page.
         *
         * @param string $content Page body.
         * @return array De-duplicated list of href values, in document order.
         */
        private function geturl($content)
        {
            // "<a" must be followed by a word boundary so <abbr>, <area>, ... are
            // not mistaken for anchors; [^>] keeps the match inside a single tag.
            $preg = '/<a\b[^>]*?\shref\s*=\s*[\'"]?([^>\'"\s]*)/i';
            if (!preg_match_all($preg, (string) $content, $res)) {
                return array();
            }
            return array_values(array_unique($res[1]));
        }

        /**
         * Rebuild a possibly incomplete URL as scheme://[user:pass@]host[:port]path.
         *
         * A URL without a scheme (e.g. "www.example.com") is parsed by parse_url()
         * as a bare path, so it comes back as "http://www.example.com".
         * Query string and fragment are dropped.
         *
         * @param string $url Original URL.
         * @return string Repaired URL.
         */
        private function reviseurl($url)
        {
            $info = parse_url($url);
            if (!is_array($info)) {
                $info = array();
            }
            $user = self::part($info, 'user');
            $pass = self::part($info, 'pass');
            $port = self::part($info, 'port');

            $fixed = self::part($info, 'scheme', 'http') . '://';
            if ($user !== '' && $pass !== '') {
                $fixed .= $user . ':' . $pass . '@';
            }
            $fixed .= self::part($info, 'host');
            if ($port !== '') {
                $fixed .= ':' . $port;
            }
            return $fixed . self::part($info, 'path');
        }

        /**
         * Turn a link found on a page into an absolute URL.
         *
         * This is a simple join, not a full RFC 3986 resolver: document-relative
         * links are appended to the base URL as-is.
         *
         * @param string $base Absolute URL of the page (as built by reviseurl()).
         * @param string $link Raw href value.
         * @return string|null Absolute URL, or null when the link is not crawlable.
         */
        private function absolutize($base, $link)
        {
            $link = trim($link);
            if ($link === '' || $link[0] === '#'
                || preg_match('/^(javascript|mailto|tel|data):/i', $link)) {
                return null;
            }
            if (preg_match('/^https?:\/\//i', $link)) {
                return $link;
            }
            $info = parse_url($base);
            if (!is_array($info)) {
                $info = array();
            }
            $scheme = self::part($info, 'scheme', 'http');
            if (strpos($link, '//') === 0) {
                // Protocol-relative link: reuse the scheme of the page.
                return $scheme . ':' . $link;
            }
            if ($link[0] === '/') {
                // Root-relative link: resolve against scheme://host[:port].
                $port = self::part($info, 'port');
                $origin = $scheme . '://' . self::part($info, 'host');
                if ($port !== '') {
                    $origin .= ':' . $port;
                }
                return $origin . $link;
            }
            return rtrim($base, '/') . '/' . $link;
        }

        /**
         * Hand a fetched page body to the user's processing callback.
         *
         * @param mixed $content Value returned by curl_multi_getcontent().
         */
        private function todo($content)
        {
            call_user_func($this->calltodo, $content);
        }

        /**
         * Hand newly discovered URLs to the user's crawl callback.
         *
         * @param array $urls  URLs to process next.
         * @param int   $depth Remaining depth.
         */
        private function trigger($urls, $depth)
        {
            call_user_func($this->calltrigger, $urls, $depth);
        }

        /**
         * Build the request URL for one handle.
         *
         * @param int|string $k     Handle key.
         * @param string     $extra Extra, already encoded query string to append.
         * @return string
         */
        private function buildurl($k, $extra = '')
        {
            $url = $this->schemes[$k] . '://' . $this->hosts[$k] . $this->paths[$k];
            $query = (string) $this->querys[$k];
            $extra = (string) $extra;
            if ($query !== '' && $extra !== '') {
                $query .= '&' . $extra;
            } else {
                $query .= $extra;
            }
            return $query === '' ? $url : $url . '?' . $query;
        }

        /**
         * Build the cookie file name for one handle from the $cookie template,
         * e.g. "cookie.txt" and key 0 give "cookie_0.txt".
         *
         * @param int|string $k Handle key.
         * @return string
         */
        private function cookiepath($k)
        {
            $info = pathinfo($this->cookie);
            $dir = (isset($info['dirname']) && $info['dirname'] !== '.') ? $info['dirname'] . '/' : '';
            $ext = (isset($info['extension']) && $info['extension'] !== '') ? '.' . $info['extension'] : '';
            return $dir . $info['filename'] . '_' . $k . $ext;
        }

        /**
         * Configure every handle for a GET request.
         *
         * @param string $data Encoded query string to append to each URL.
         * @return $this
         */
        private function setget($data)
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_URL] = $this->buildurl($k, $data);
            }
            return $this;
        }

        /**
         * Configure every handle for a POST request.
         *
         * @param array|string $data Value for CURLOPT_POSTFIELDS.
         * @return $this
         */
        private function setpost($data)
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_URL] = $this->buildurl($k);
                $this->options[$k][CURLOPT_POST] = 1;
                $this->options[$k][CURLOPT_POSTFIELDS] = $data;
            }
            return $this;
        }

        /**
         * Apply the collected options to the curl handles.
         *
         * @return $this
         */
        private function setopt()
        {
            foreach ($this->options as $k => $v) {
                // Refreshed here so that changes made to the public $timeout and
                // $depth properties after construction actually take effect.
                $v[CURLOPT_CONNECTTIMEOUT] = $this->timeout;
                $v[CURLOPT_MAXREDIRS] = $this->depth;
                curl_setopt_array($this->chs[$k], $v);
            }
            return $this;
        }

        /**
         * Create one curl handle per URL and register it with a multi handle.
         *
         * @param array $urls URLs to request; the array keys identify the handles.
         */
        public function __construct($urls)
        {
            $this->urls = $urls;
            $this->handle = curl_multi_init();
            foreach ($urls as $k => $v) {
                $info = parse_url($v);
                if (!is_array($info)) {
                    $info = array();
                }
                $this->schemes[$k] = self::part($info, 'scheme', 'http');
                $this->hosts[$k] = self::part($info, 'host');
                $this->paths[$k] = self::part($info, 'path');
                $this->querys[$k] = self::part($info, 'query');
                $this->chs[$k] = curl_init();
                $this->options[$k][CURLOPT_CONNECTTIMEOUT] = $this->timeout;
                $this->options[$k][CURLOPT_RETURNTRANSFER] = 1;
                $this->options[$k][CURLOPT_FOLLOWLOCATION] = 1;
                $this->options[$k][CURLINFO_HEADER_OUT] = true;
                $this->options[$k][CURLOPT_ENCODING] = 'gzip';
                $this->options[$k][CURLOPT_MAXREDIRS] = $this->depth;
                curl_multi_add_handle($this->handle, $this->chs[$k]);
            }
        }

        /**
         * Switch every request to https. Call before get()/post()/json()/upload(),
         * because those are the methods that build the request URL.
         *
         * @param bool $bool true: use https, false: leave the scheme unchanged.
         * @return $this
         */
        public function ssl($bool = false)
        {
            if ($bool) {
                foreach ($this->chs as $k => $v) {
                    $this->schemes[$k] = 'https';
                    // libcurl no longer accepts 1 for this option; 2 is the value
                    // it falls back to.
                    $this->options[$k][CURLOPT_SSL_VERIFYHOST] = 2;
                    // NOTE(review): certificate verification is disabled, as in the
                    // original code. This is insecure on untrusted networks.
                    $this->options[$k][CURLOPT_SSL_VERIFYPEER] = false;
                }
            }
            return $this;
        }

        /**
         * Send credentials with every request (CURLOPT_USERPWD).
         *
         * @param string $user User name.
         * @param string $pass Password.
         * @return $this
         */
        public function auth($user, $pass)
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_USERPWD] = $user . ':' . $pass;
            }
            return $this;
        }

        /**
         * Simulate a login: store received cookies in a per-handle cookie file.
         * Also turns CURLOPT_RETURNTRANSFER off for every handle.
         *
         * @return $this
         */
        public function login()
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_COOKIEJAR] = $this->cookiepath($k);
                $this->options[$k][CURLOPT_RETURNTRANSFER] = 0;
            }
            return $this;
        }

        /**
         * Send the cookies previously stored by login().
         *
         * @return $this
         */
        public function cookie()
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_COOKIEFILE] = $this->cookiepath($k);
            }
            return $this;
        }

        /**
         * Add request headers; repeated calls accumulate.
         *
         * @param array $data Header lines such as "Accept: text/html".
         * @return $this
         */
        public function header($data)
        {
            foreach ($this->chs as $k => $v) {
                $current = isset($this->options[$k][CURLOPT_HTTPHEADER])
                    ? $this->options[$k][CURLOPT_HTTPHEADER]
                    : array();
                $this->options[$k][CURLOPT_HTTPHEADER] = array_merge($current, $data);
            }
            return $this;
        }

        /**
         * Route every request through a proxy server.
         *
         * @param string     $url  Proxy host or URL; the scheme defaults to http.
         * @param int|string $port Proxy port.
         * @return $this
         */
        public function proxy($url, $port)
        {
            $info = parse_url($url);
            if (!is_array($info)) {
                $info = array();
            }
            // parse_url() reports a scheme-less "proxy.example.com" as a path.
            $host = self::part($info, 'host', trim(self::part($info, 'path'), '/'));
            $purl = self::part($info, 'scheme', 'http') . '://' . $host . ':' . $port;
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_PROXY] = $purl;
            }
            return $this;
        }

        /**
         * Set the User-Agent header.
         *
         * @param string $browse User-Agent string.
         * @return $this
         */
        public function agent($browse = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)')
        {
            foreach ($this->chs as $k => $v) {
                $this->options[$k][CURLOPT_USERAGENT] = $browse;
            }
            return $this;
        }

        /**
         * Prepare a GET request.
         *
         * @param array $data Query parameters appended to every URL.
         * @return $this
         */
        public function get($data = array())
        {
            return $this->setget(http_build_query($data));
        }

        /**
         * Prepare a POST request.
         *
         * @param array|string $data Request body (CURLOPT_POSTFIELDS).
         * @return $this
         */
        public function post($data = array())
        {
            return $this->setpost($data);
        }

        /**
         * Prepare a POST request with a JSON body.
         *
         * @param mixed $data Value to encode as JSON.
         * @return $this
         */
        public function json($data = array())
        {
            $data = json_encode($data);
            $this->header(array(
                'Content-Type: application/json',
                'Content-Length:' . strlen($data),
            ));
            return $this->setpost($data);
        }

        /**
         * Prepare a multipart form upload.
         *
         * @param array|string $files Path of one file, or an array of paths.
         * @return $this
         */
        public function upload($files)
        {
            $data = array();
            $name = $this->name;
            if (is_array($files)) {
                foreach ($files as $k => $v) {
                    $data["{$name}[{$k}]"] = new CURLFile($v);
                }
            } else {
                $data["{$name}"] = new CURLFile($files);
            }
            return $this->setpost($data);
        }

        /**
         * Prepare a download: each response is written to a file named after the
         * last segment of the URL path.
         *
         * @param string $dir Target directory (created if missing); empty means
         *                    the current directory.
         * @return $this
         * @throws RuntimeException When the directory or a target file cannot be created.
         */
        public function download($dir = '')
        {
            if ($dir && !is_dir($dir)) {
                if (!mkdir($dir, 0755, true) && !is_dir($dir)) {
                    throw new RuntimeException('Cannot create directory: ' . $dir);
                }
            }
            $prefix = $dir ? rtrim($dir, '/') . '/' : './';
            foreach ($this->paths as $k => $v) {
                $name = basename((string) $v);
                if ($name === '') {
                    // The URL has no file name in its path (e.g. "http://host/").
                    $name = 'download_' . $k;
                }
                $fp = fopen($prefix . $name, 'w');
                if ($fp === false) {
                    throw new RuntimeException('Cannot open file for writing: ' . $prefix . $name);
                }
                $this->fps[$k] = $fp;
                $this->options[$k][CURLOPT_FILE] = $fp;
            }
            return $this->setget('');
        }

        /**
         * Execute all prepared requests concurrently and dispatch the results.
         *
         * For every transfer that finished without a curl error, the body goes to
         * the `calltodo` callback and the absolute links found in it go to the
         * `calltrigger` callback with the depth reduced by one. Nothing is
         * requested when $depth is not positive.
         *
         * @param int $depth Remaining crawl depth, default 2.
         */
        public function run($depth = 2)
        {
            $this->setopt();
            if ($depth <= 0) {
                return;
            }
            $depth--;
            $handle = $this->handle;
            // curl_close()/curl_multi_close() do nothing on PHP 8+, where handles
            // are objects that are freed automatically.
            $mustclose = PHP_VERSION_ID < 80000;

            $active = null;
            do {
                $mrc = curl_multi_exec($handle, $active);
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);

            while ($active && $mrc == CURLM_OK) {
                // curl_multi_select() blocks until there is activity; -1 means it
                // could not wait, so pause briefly instead of spinning.
                if (curl_multi_select($handle) == -1) {
                    usleep(100);
                }
                do {
                    $mrc = curl_multi_exec($handle, $active);
                } while ($mrc == CURLM_CALL_MULTI_PERFORM);
            }

            foreach ($this->chs as $k => $ch) {
                if (curl_error($ch) == '') {
                    $content = curl_multi_getcontent($ch);
                    $this->todo($content);

                    $base = $this->reviseurl($this->urls[$k]);
                    // Collected per page so links of one page never leak into
                    // the list passed on for the next page.
                    $found = array();
                    foreach ($this->geturl($content) as $link) {
                        $absolute = $this->absolutize($base, $link);
                        if ($absolute !== null) {
                            $found[] = $absolute;
                        }
                    }
                    if (!empty($found)) {
                        $this->trigger(array_values(array_unique($found)), $depth);
                    }
                }
                curl_multi_remove_handle($handle, $ch);
                if ($mustclose) {
                    curl_close($ch);
                }
                if (isset($this->fps[$k]) && is_resource($this->fps[$k])) {
                    fclose($this->fps[$k]);
                }
            }
            $this->fps = array();
            if ($mustclose) {
                curl_multi_close($handle);
            }
        }
    }
  • 測試:
    // Usage example. Assumes the crawl class is already loaded.

    /**
     * Page callback used by crawl (its default $calltodo): prints "ok" for
     * every page that was fetched.
     *
     * @param mixed $content Body of the fetched page (unused here).
     */
    function todo($content)
    {
        echo 'ok' . PHP_EOL;
    }

    /**
     * Crawl callback used by crawl (its default $calltrigger): fetches the
     * given URLs with GET requests and lets run() pass any links it finds back
     * to this function with a reduced depth.
     *
     * @param array $urls  URLs to fetch.
     * @param int   $depth Remaining crawl depth.
     */
    function trigger($urls = array(), $depth = 2)
    {
        $crawl = new crawl($urls);
        $crawl->get()->run($depth);
    }

    $urls = array('www.baidu.com', 'www.taobao.com');
    trigger($urls);
  • 輸出:
    okokokokokokokokokokokokokok
  • php多線程爬蟲類

    聯繫我們

    該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

    如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.