PHP extension (pthreads): https://github.com/krakjoe/pthreads
PHP Manual: http://php.net/manual/zh/book.pthreads.php
Once the extension is installed, PHP can run multiple threads. The following code uses multiple threads to crawl Baidu online-storage (pan.baidu.com) pages found through search results:
The code is as follows:
Copy code |
<?php
include 'include/CurlLoad.class.php'; // project HTTP helper providing CurlLoad::HtmlGet() / CurlLoad::Get_Headers()

/**
 * Fetch a search-result page and scrape every linked online-storage page in parallel.
 *
 * One baidu_thread_run worker is started per result link; each worker calls
 * PanInfoGet() on its link and exposes the result through its $data property.
 *
 * @param string $url Search-result page URL (handed to BaiduSRLinksGet()).
 * @return array|null On success an array with the keys
 *                    'res'    => list of PanInfoGet() results (key absent when no worker produced one),
 *                    'page'   => pager map from BaiduSRLinksGet() (null when none was found),
 *                    'status' => "1";
 *                    null when the result list could not be obtained.
 */
function vget($url)
{
    $ret = BaiduSRLinksGet($url, 1); // format 1: resolve redirect targets
    if ($ret === null || !array_key_exists('links', $ret)) {
        return null;
    }

    // Initialised up front so an empty link list never leaves the variable undefined.
    $threads = array();
    foreach ($ret['links'] as $link) {
        $thread = new baidu_thread_run($link);
        $thread->start();
        $threads[] = $thread;
    }

    $infos = array();
    foreach ($threads as $thread) {
        // join() itself waits for the worker to finish, so no isRunning()/usleep() polling is needed.
        if ($thread->join()) {
            $data = $thread->data;
            if ($data !== null) {
                $infos['res'][] = $data;
            }
        }
    }

    $infos['page'] = isset($ret['page']) ? $ret['page'] : null;
    $infos['status'] = "1";

    return $infos;
}

/**
 * Extract the result links and pager links from a search-result page.
 *
 * @param string $url    Search-result page URL.
 * @param int    $format 0 (default): return the links exactly as found in the page;
 *                       1: replace each link by the 'Location' header reported by
 *                       CurlLoad::Get_Headers() when there is one.
 * @return array|null array('links' => list of URLs, 'page' => map of page index => base64-encoded
 *                    pager URL; the 'page' key is absent when no pager link matched),
 *                    or null when the page could not be loaded, contains no result link,
 *                    or an Exception was raised (which is logged through WriteLog()).
 */
function BaiduSRLinksGet($url, $format = 0)
{
    $html = CurlLoad::HtmlGet($url);
    if ($html === null || $html === false || $html === '') {
        return null;
    }

    try {
        // Result links are embedded in the page as  "url":"..."}
        $found = preg_match_all('/"url":"(?<links>.*)"}/', $html, $rets);
        if (!$found || empty($rets['links'])) {
            // The named group key always exists after preg_match_all(), so test for actual matches.
            return null;
        }

        $ret = array();
        if ($format == 1) {
            foreach ($rets['links'] as $i => $link) {
                $headers = CurlLoad::Get_Headers($link, 1);
                if (is_array($headers) && array_key_exists('Location', $headers)) {
                    $ret['links'][$i] = $headers['Location'];
                } else {
                    // Fall back for this entry only; already-resolved entries must be kept.
                    $ret['links'][$i] = $link;
                }
            }
        } else {
            $ret['links'] = $rets['links'];
        }

        // Pager links. Case-insensitive because the letter casing of the listing this
        // was recovered from is unreliable (e.g. "ie=UTF-8" vs "ie=utf-8").
        preg_match_all('/href="?\/s\?wd=site%3Apan.baidu.com%20(?<url>.+?)&ie=utf-8">/i', $html, $out);
        $pageUrls = isset($out['url']) ? $out['url'] : array();

        // NOTE(review): the first and the last match are skipped, exactly as the original
        // loop did (unset index 0, then iterate 1 .. count-2). Presumably these are
        // previous/next links - confirm before changing the bounds.
        $last = count($pageUrls) - 1;
        for ($i = 1; $i < $last; $i++) {
            // Capture digits only so the page offset is a clean number.
            if (preg_match('/&pn=(\d+)/', $pageUrls[$i], $m)) {
                $ret['page'][(int) ($m[1] / 10)] = base64_encode($pageUrls[$i]);
            }
        }

        return $ret;
    } catch (Exception $e) {
        WriteLog($e);
        return null;
    }
}

/**
 * Scrape the resource details from one online-storage page.
 *
 * @param string $url URL of the online-storage page.
 * @return array|null array with the keys 'name', 'size', 'user', 'date', 'number'
 *                    (first regex match of each) and 'link' (the given $url), or null
 *                    when the page could not be loaded, the pattern did not match,
 *                    or an Exception was raised (which is logged through WriteLog()).
 */
function PanInfoGet($url)
{
    $html = CurlLoad::HtmlGet($url);
    if ($html === null || $html === false || $html === '') {
        return null;
    }

    try {
        // NOTE(review): the field labels in this pattern look machine-translated in the
        // listing this code was recovered from; they must be replaced by the labels that
        // actually appear on the scraped page, otherwise the pattern will not match.
        $pattern = '/file name:(?<name>.*) file size:(?<size>.*) Sharer:(?<user>.*)'
            . ' sharing Time:(?<date>.*) number of downloads:(?<number>[0-9]+)/';

        // preg_match_all() yields 0 for "no match" and false on error; both mean failure here.
        if (!preg_match_all($pattern, $html, $ret)) {
            return null;
        }

        $rets = array();
        foreach (array('name', 'size', 'user', 'date', 'number') as $field) {
            $rets[$field] = $ret[$field][0];
        }
        $rets['link'] = $url;

        return $rets;
    } catch (Exception $e) {
        WriteLog($e);
        return null;
    }
}

/**
 * Append one timestamped warning line to ../error.log.
 *
 * @param mixed $str Message to log; anything that can be converted to a string
 *                   (the callers in this file pass Exception objects).
 * @return void
 */
function WriteLog($str)
{
    $line = "Warning:" . date("Y/m/d H:i:s") . ":" . $str . "\r\n";
    // Single append call with an exclusive lock: no file handle to leak, and lines
    // written by concurrently running workers are not interleaved.
    file_put_contents("../error.log", $line, FILE_APPEND | LOCK_EX);
}

/**
 * Worker thread that scrapes a single online-storage page.
 *
 * @author MuXi
 */
class baidu_thread_run extends Thread
{
    /** URL of the page this worker scrapes. */
    public $url;

    /** Result of PanInfoGet() for $url; stays unset/null when nothing was scraped. */
    public $data;

    /**
     * @param string $url URL of the online-storage page to scrape.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Thread body: scrape the page when a non-empty URL was supplied.
     */
    public function run()
    {
        $url = $this->url;
        if ($url) {
            $this->data = PanInfoGet($url);
        }
    }
}
?>