PHP multi-threaded crawling with curl: this article shows how to combine PHP with curl's multi interface (the curl_multi_* functions) to fetch several URLs in parallel. The main function takes an array of URLs to request in parallel (@param array $array) and a timeout.
Using curl in PHP for multi-threaded crawling
<?php
/*
 * curl "multi-thread" crawling: fetch several URLs in parallel through the
 * curl_multi API.
 */

/**
 * Fetch a list of URLs concurrently.
 *
 * @param array $array   URLs to request in parallel. The array keys are
 *                       reused as the keys of the result arrays.
 * @param int   $timeout Maximum time, in seconds, each transfer may take
 *                       (passed to CURLOPT_TIMEOUT).
 * @return array         array(
 *                           'diff_time' => elapsed wall-clock seconds (float),
 *                           'return'    => response bodies keyed like $array,
 *                           'header'    => curl_getinfo() results keyed like $array,
 *                       )
 */
function Curl_http($array, $timeout)
{
    $res    = array();
    $header = array(); // initialised so an empty $array still returns an array
    $conn   = array();

    $mh = curl_multi_init(); // one multi handle drives all easy handles
    $startime = getmicrotime();

    foreach ($array as $k => $url) {
        $conn[$k] = curl_init($url);
        curl_setopt($conn[$k], CURLOPT_TIMEOUT, $timeout); // per-transfer timeout
        curl_setopt($conn[$k], CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');
        curl_setopt($conn[$k], CURLOPT_MAXREDIRS, 7);      // follow at most 7 redirects
        curl_setopt($conn[$k], CURLOPT_HEADER, 0);         // body only; skipping headers is cheaper
        curl_setopt($conn[$k], CURLOPT_FOLLOWLOCATION, 1); // follow 30x redirects
        curl_setopt($conn[$k], CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
        curl_multi_add_handle($mh, $conn[$k]);
    }

    // Kick off the transfers. The comparisons must be "==": a plain "="
    // would assign the constant and make the condition depend on its value.
    $active = 0;
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

    // Drive the transfers until none is active. curl_multi_select() blocks
    // until there is socket activity, so the loop does not burn CPU.
    while ($active && $mrc == CURLM_OK) {
        if (curl_multi_select($mh) == -1) {
            // select() can fail immediately on some platforms; back off briefly
            // and still call curl_multi_exec(), otherwise the loop never ends.
            usleep(1000);
        }
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }

    foreach ($array as $k => $url) {
        $res[$k]    = curl_multi_getcontent($conn[$k]); // response body
        $header[$k] = curl_getinfo($conn[$k]);          // transfer information
        // Detach from the multi handle first, then free the easy handle.
        curl_multi_remove_handle($mh, $conn[$k]);
        curl_close($conn[$k]);
    }
    curl_multi_close($mh);

    $endtime   = getmicrotime();
    $diff_time = $endtime - $startime;

    return array(
        'diff_time' => $diff_time,
        'return'    => $res,
        'header'    => $header,
    );
}

/**
 * Current Unix timestamp with microsecond precision.
 *
 * @return float Seconds since the Unix epoch.
 */
function getmicrotime()
{
    return microtime(true);
}

// Test: fetch three sites in parallel and dump the result.
$array = array(
    "http://www.weibo.com/",
    "http://www.renren.com/",
    "http://www.qq.com/",
);
$data = Curl_http($array, 10); // call
var_dump($data);               // output

// Note: when the POST data is larger than 1024 bytes, curl does not send the
// body right away; it first sends an "Expect: 100-continue" header and waits
// for the server. To disable that, send an empty Expect header on the handle
// that performs the POST ($ch below stands for that handle):
//
//     curl_setopt($ch, CURLOPT_HTTPHEADER, array("Expect:"));
?>
Let's take a look at several examples.
(1) The following code fetches multiple URLs in parallel and writes the source of each fetched page to a specified file.
// Example 1: fetch several pages in parallel and let curl write each
// response straight into one shared file.

// URLs of the pages to crawl.
$urls = array(
    'http://www.bkjia.com/',
    'http://www.google.com/',
    'http://www.example.com/',
);

// File the fetched page source is appended to.
$save_to = '/test.txt';
$st = fopen($save_to, "a");
if ($st === false) {
    die("Unable to open {$save_to} for appending\n");
}

$mh   = curl_multi_init();
$conn = array();

// Initialise one easy handle per URL and register it with the multi handle.
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    // Write the response body to the file. All handles share $st, so output
    // from different pages may be interleaved in the file.
    curl_setopt($conn[$i], CURLOPT_FILE, $st);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Execute: run all transfers until none is active. Waiting in
// curl_multi_select() keeps the loop from spinning at 100% CPU.
$active = 0;
do {
    curl_multi_exec($mh, $active);
    if ($active && curl_multi_select($mh) == -1) {
        usleep(1000); // select() failed immediately; back off briefly
    }
} while ($active);

// Clean up.
foreach ($urls as $i => $url) {
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);
}
curl_multi_close($mh);
fclose($st);
(2) The following code is similar to the one above, except that the fetched page source is first stored in a variable and then written to the specified file.
// Example 2: fetch several pages in parallel, keep each response in a
// string, then write the strings to a file.

// NOTE(review): the original URL list was lost when the article was
// extracted; the URLs from example (1) are assumed here - confirm.
$urls = array(
    'http://www.bkjia.com/',
    'http://www.google.com/',
    'http://www.example.com/',
);

// File the fetched page source is appended to.
$save_to = '/test.txt';
$st = fopen($save_to, "a");
if ($st === false) {
    die("Unable to open {$save_to} for appending\n");
}

$mh   = curl_multi_init();
$conn = array();

foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    // Return the fetched page as a string instead of sending it to the browser.
    curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, true);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Run all transfers until none is active. curl_multi_exec() needs $active
// by reference; waiting in curl_multi_select() avoids a busy loop.
$active = 0;
do {
    curl_multi_exec($mh, $active);
    if ($active && curl_multi_select($mh) == -1) {
        usleep(1000); // select() failed immediately; back off briefly
    }
} while ($active);

// Collect each response into a variable and write it to the file.
foreach ($urls as $i => $url) {
    $data = curl_multi_getcontent($conn[$i]); // fetched page as a string
    fwrite($st, $data);                       // write the string to the file
}

// Clean up.
foreach ($urls as $i => $url) {
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);
}
curl_multi_close($mh);
fclose($st);
(3) The following code uses PHP's cURL functions to download several files concurrently.
// Example 3: download several files in parallel, each into its own file
// under $save_to. A URL whose target file already exists is skipped.

$urls = array(
    'http://www.bkjia.com/5w.zip',
    'http://www.bkjia.com/5w.zip',
    'http://www.bkjia.com/5w.zip',
);
$save_to = './home/'; // target directory; must already exist

$mh   = curl_multi_init();
$conn = array(); // easy handles, keyed by URL index
$fp   = array(); // output file handles, same keys as $conn

foreach ($urls as $i => $url) {
    $g = $save_to . basename($url);
    if (is_file($g)) {
        continue; // already downloaded (or claimed by an earlier URL)
    }

    $out = fopen($g, "w");
    if ($out === false) {
        continue; // cannot create the target file; skip this URL
    }

    $fp[$i]   = $out;
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0(compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_FILE, $fp[$i]); // stream the body into the file
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Run all transfers until none is active; curl_multi_select() waits for
// socket activity so the loop does not spin.
$active = 0;
do {
    $n = curl_multi_exec($mh, $active);
    if ($active && curl_multi_select($mh) == -1) {
        usleep(1000); // select() failed immediately; back off briefly
    }
} while ($active);

// Clean up only the handles that were actually created: iterating $urls
// here would touch undefined $conn[$i] / $fp[$i] for every skipped URL.
foreach ($conn as $i => $ch) {
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
    fclose($fp[$i]);
}
curl_multi_close($mh);
The above is all the content of this article. I hope you will like it.
Using PHP and curl to achieve multi-threaded crawling: a curl multi-thread fetching function that takes an array of URLs to request in parallel (@param array $array)...