***PHP多線程pthreads 實現QQ號碼爬蟲

來源:互聯網
上載者:User

標籤:

通過空間歷史瀏覽,爬出查看你空間的人(一般限制20人,除非開通黃鑽),然後再爬出這20人的瀏覽記錄,依次向下爬,你可以控制爬行深度。
這裡僅僅給出爬蟲程式碼片段,你可以進一步最佳化,將QQ分類儲存。通過QQ相互瀏覽關係,可以通過繪圖工具繪製好友網路,等等。

代碼涉及pthreads 如果不清楚請閱讀:《PHP 進階編程之多線程》
http://netkiller.github.io/journal/thread.php.html

 

<?php
/*
Homepage: http://netkiller.github.io
Author: Neo <[email protected]>
*/

// Worker, Stackable and Pool all come from the pthreads extension.
if (!extension_loaded('pthreads')) {
    die('Please install pthreads');
}

include_once('Snoopy.class.php');

/**
 * Worker used by the pool; owns the static database handle that tasks can
 * ask for through getInstance().
 */
class CrawlerWorker extends Worker
{
    protected static $dbh;

    public function __construct()
    {
    }

    /**
     * Worker entry point. The PDO connection setup is left commented out,
     * so self::$dbh is never assigned unless this block is enabled.
     */
    public function run()
    {
        /*
        $dbhost = 'db.example.com';         // database server
        $dbuser = 'example.com';            // database user name
        $dbpw = 'password';                 // database password
        $dbname = 'example';                // database name

        self::$dbh = new PDO("mysql:host=$dbhost;port=3306;dbname=$dbname", $dbuser, $dbpw, array(
            PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES \'UTF8\'',
            PDO::MYSQL_ATTR_COMPRESS => true,
            PDO::ATTR_PERSISTENT => true
            )
        );
        */
    }

    /**
     * Returns the static database handle (unset while the setup in run()
     * stays commented out).
     *
     * NOTE(review): this is declared protected yet called from Crawler::run()
     * through $this->worker; that relies on how the installed pthreads
     * version treats protected methods of Threaded objects -- confirm.
     */
    protected function getInstance()
    {
        return self::$dbh;
    }
}

/**
 * One crawl task: starting from a single QQ number, fetch the visitors of
 * its Qzone page and follow them down to $depth levels.
 */
class Crawler extends Stackable
{
    /** Level at which recursion() stops descending. */
    public $depth = 3;

    /** QQ number this task starts from. */
    public $qq;

    /**
     * @param string|int $qq QQ number to start crawling from.
     */
    public function __construct($qq)
    {
        $this->qq = $qq;
    }

    /**
     * Task entry point: crawl from the start number and log PDO failures.
     */
    public function run()
    {
        try {
            // Not used yet; kept so storage code can be added once the
            // connection setup in CrawlerWorker::run() is enabled.
            $dbh = $this->worker->getInstance();
            $this->recursion([$this->qq]);
        } catch (PDOException $e) {
            // The original logged the undefined $mobile/$id; record the
            // start number and the actual error message instead.
            $error = sprintf("%s,%s\n", $this->qq, $e->getMessage());
            file_put_contents("mobile_error.log", $error, FILE_APPEND);
        }
        //printf("runtime: %s, %s\n", date('Y-m-d H:i:s'), $this->worker->getThreadId());
    }

    /**
     * Visit every number in $qqs, print its visitor list and descend into it.
     *
     * The level is passed explicitly rather than kept in a static counter:
     * the counter was never decremented when a call returned, so it reported
     * wrong levels and stopped any root after the first from being crawled.
     *
     * @param array $qqs   QQ numbers to visit at this level.
     * @param int   $level Current level, 1 for the initial call.
     * @return void
     */
    public function recursion($qqs, $level = 1)
    {
        printf("Level: %s\n", $level);

        // Random pause between 10 ms and 1 s before fetching.
        usleep(mt_rand(10000, 1000000));

        if ($level >= $this->depth) {
            return;
        }

        foreach ($qqs as $uin) {
            $lst = $this->qzone($uin);
            print_r($lst);
            $this->recursion($lst, $level + 1);
        }
    }

    /**
     * Fetch the visitor list of one Qzone page.
     *
     * @param string|int $qq QQ number whose visitors are requested.
     * @return array The `uin` values found under data->list in the JSON
     *               response; empty when the fetch fails or the response
     *               does not have that shape.
     */
    public function qzone($qq)
    {
        // g_tk, t and sid are hard-coded values copied from the original request.
        $url = 'http://m.qzone.com/mqz_get_visitor?g_tk=1191852101&res_mode=0&res_uin=' . urlencode($qq) . '&offset=0&count=100&page=1&format=json&t=1401762986882&sid=dODKVcYv6azjN87cxXQ5mao1xgakYjHg18c8aa5e0201%3D%3D';
        $snoopy = new Snoopy;

        // need a proxy?
        //$snoopy->proxy_host = "my.proxy.host";
        //$snoopy->proxy_port = "8080";

        // set browser and referer:
        $snoopy->agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
        $snoopy->referer = "http://m.qzone.com/";

        // set some cookies:
        //$snoopy->cookies["SessionID"] = '238472834723489';
        //$snoopy->cookies["favoriteColor"] = "blue";

        // set a raw header:
        $snoopy->rawheaders["Pragma"] = "no-cache";

        // set some internal variables:
        $snoopy->maxredirs = 2;
        $snoopy->offsiteok = false;
        $snoopy->expandlinks = false;

        // set username and password (optional)
        //$snoopy->user = "joe";
        //$snoopy->pass = "bloe";

        // fetch the text of the visitor endpoint:
        if (!$snoopy->fetchtext($url)) {
            print "Snoopy: error while fetching document: " . $snoopy->error . "\n";
            // Return an empty list so recursion() can still iterate over the result.
            return [];
        }

        // other methods: fetch, fetchform, fetchlinks, submittext and submitlinks
        // response code:
        //print "response code: ".$snoopy->response_code."<br/>\n";

        $results = [];
        $tmp = json_decode($snoopy->results);

        // Only walk data->list when the decoded JSON really has that shape.
        if (is_object($tmp) && isset($tmp->data->list)
            && (is_array($tmp->data->list) || is_object($tmp->data->list))) {
            foreach ($tmp->data->list as $lst) {
                if (isset($lst->uin)) {
                    $results[] = $lst->uin;
                }
            }
        }

        return $results;
    }
}

$pool = new Pool(100, \CrawlerWorker::class, []);

#foreach (range(1000, 100000) as $number) {
#    $pool->submit(new Crawler($number));
#}

$pool->submit(new Crawler('13721218'));
$pool->submit(new Crawler('291379'));
//$pool->submit(new Crawler('xxx'));
//$pool->submit(new Crawler('xxx'));
//$pool->submit(new Crawler('xxx'));
// and so on
//$pool->submit(new Crawler('nnn'));

$pool->shutdown();
?>

 

***PHP多線程pthreads 實現QQ號碼爬蟲

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.