Using PHP with cURL to crawl several URLs concurrently ("multi-threaded" crawling)
<?php
/* Curl multi-threaded crawl */

/**
 * Fetch several URLs concurrently through one curl multi handle.
 *
 * @param array      $array   URLs to fetch; each key is reused as the key of the result arrays
 * @param int|string $timeout per-transfer timeout in seconds (passed to CURLOPT_TIMEOUT)
 *
 * @return array 'diff_time' => elapsed seconds (float),
 *               'return'    => response bodies indexed like $array,
 *               'header'    => curl_getinfo() arrays indexed like $array,
 *               'error'     => error messages indexed like $array, only for transfers that failed
 */
function Curl_http($array, $timeout)
{
    $res = [];
    $header = []; // initialised so an empty URL list still returns an array
    $errors = [];
    $conn = [];

    $mh = curl_multi_init(); // one multi handle drives every easy handle
    $startime = microtime(true);

    foreach ($array as $k => $url) {
        $conn[$k] = curl_init($url);
        curl_setopt($conn[$k], CURLOPT_TIMEOUT, (int) $timeout); // set the timeout
        curl_setopt($conn[$k], CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');
        curl_setopt($conn[$k], CURLOPT_MAXREDIRS, 7); // maximum number of HTTP redirects
        curl_setopt($conn[$k], CURLOPT_HEADER, 0); // no response headers in the body, for efficiency
        curl_setopt($conn[$k], CURLOPT_FOLLOWLOCATION, 1); // follow 302 redirects
        curl_setopt($conn[$k], CURLOPT_RETURNTRANSFER, 1);
        curl_multi_add_handle($mh, $conn[$k]);
    }

    // Start the transfers.
    $active = 0;
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc === CURLM_CALL_MULTI_PERFORM);

    // Drive the transfers until none is active. curl_multi_select() blocks until
    // there is activity, which keeps this loop from burning CPU.
    while ($active && $mrc === CURLM_OK) {
        if (curl_multi_select($mh) === -1) {
            // select failed: back off briefly, then still call curl_multi_exec()
            // so the loop makes progress instead of spinning forever.
            usleep(1000);
        }
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc === CURLM_CALL_MULTI_PERFORM);
    }

    // Collect the result code of every finished transfer.
    while (($info = curl_multi_info_read($mh)) !== false) {
        if ($info['result'] !== CURLE_OK && isset($info['handle'])) {
            $key = array_search($info['handle'], $conn, true);
            if ($key !== false) {
                $errors[$key] = curl_strerror($info['result']);
            }
        }
    }

    foreach ($conn as $k => $handle) {
        $res[$k] = curl_multi_getcontent($handle); // response body
        $header[$k] = curl_getinfo($handle); // transfer information
        curl_multi_remove_handle($mh, $handle); // detach from the multi handle first...
        curl_close($handle); // ...then close the easy handle
    }

    curl_multi_close($mh);
    $endtime = microtime(true);
    $diff_time = $endtime - $startime;

    return [
        'diff_time' => $diff_time,
        'return' => $res,
        'header' => $header,
        'error' => $errors,
    ];
}
/**
 * Calculate the current time.
 *
 * @return float current Unix timestamp in seconds, including the microsecond fraction
 */
function getmicrotime()
{
    // microtime(true) returns the float directly; no need to split the
    // "usec sec" string and add the two parts by hand.
    return microtime(true);
}
// Test: fetch three URLs concurrently.
$array = [
    'http://www.weibo.com/',
    'http://www.renren.com/',
    'http://www.qq.com/',
];

$data = Curl_http($array, 10); // call, with a 10-second timeout
var_dump($data); // output

// Note: if the POST data is larger than 1024 bytes, curl does not send the
// POST request directly. To avoid that, send the request with an empty
// Expect header on the easy handle ($ch stands for that handle):
//
//     curl_setopt($ch, CURLOPT_HTTPHEADER, ['Expect:']);
?>
Let's look at a few more examples.
(1) The following code crawls multiple URLs and writes the page source of each crawled URL to the specified file.
// Crawl several pages concurrently and let curl write each response
// straight into one output file.
$urls = [
    'http://www.jb51.net/',
    'http://www.google.com/',
    'http://www.example.com/',
]; // page URLs to crawl
$save_to = '/test.txt'; // file the crawled code is written to

$st = fopen($save_to, 'a');
if ($st === false) {
    throw new RuntimeException("Cannot open {$save_to} for appending");
}

$mh = curl_multi_init();
$conn = [];
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)');
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    // Write the crawled code to the file. Every transfer shares $st, so data
    // from different pages may interleave in the output.
    curl_setopt($conn[$i], CURLOPT_FILE, $st);
    curl_multi_add_handle($mh, $conn[$i]);
} // initialize

$active = 0;
do {
    $status = curl_multi_exec($mh, $active);
    // Wait for socket activity rather than spinning on curl_multi_exec().
    if ($active && curl_multi_select($mh) === -1) {
        usleep(1000);
    }
} while ($active && $status === CURLM_OK); // execute

foreach ($conn as $handle) {
    curl_multi_remove_handle($mh, $handle);
    curl_close($handle);
} // clean up

curl_multi_close($mh);
fclose($st);
(2) The following code does the same thing, except that it first stores the fetched page source in a variable and then writes that content to the specified file.
// Crawl several pages concurrently, keep each response in memory, then
// append the responses to one output file.
$urls = [
    'http://www.jb51.net/',
    'http://www.google.com/',
    'http://www.example.com/',
];
$save_to = '/test.txt'; // file the crawled code is written to

$st = fopen($save_to, 'a');
if ($st === false) {
    throw new RuntimeException("Cannot open {$save_to} for appending");
}

$mh = curl_multi_init();
$conn = [];
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)');
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    // Return the crawled code as a string instead of printing it to the browser.
    curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, true);
    curl_multi_add_handle($mh, $conn[$i]);
}

$active = 0;
do {
    $status = curl_multi_exec($mh, $active);
    // Wait for socket activity rather than spinning on curl_multi_exec().
    if ($active && curl_multi_select($mh) === -1) {
        usleep(1000);
    }
} while ($active && $status === CURLM_OK);

foreach ($conn as $handle) {
    $data = curl_multi_getcontent($handle); // crawled code as a string
    fwrite($st, (string) $data); // write the string to the file
} // fetch each response into a variable and write it to the file

foreach ($conn as $handle) {
    curl_multi_remove_handle($mh, $handle);
    curl_close($handle);
}

curl_multi_close($mh);
fclose($st);
(3) The following code downloads several files concurrently using PHP's cURL multi functions.
// Download several files concurrently, each into its own local file under
// $save_to. A URL whose target file already exists is skipped.
$urls = [
    'http://www.jb51.net/5w.zip',
    'http://www.jb51.net/5w.zip',
    'http://www.jb51.net/5w.zip',
];
$save_to = './home/';

$mh = curl_multi_init();
$conn = [];
$fp = [];
foreach ($urls as $i => $url) {
    $g = $save_to . basename($url);
    if (!is_file($g)) {
        $file = fopen($g, 'w');
        if ($file === false) {
            continue; // target cannot be created: skip this download
        }
        $fp[$i] = $file;
        $conn[$i] = curl_init($url);
        curl_setopt($conn[$i], CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)');
        curl_setopt($conn[$i], CURLOPT_FILE, $fp[$i]);
        curl_setopt($conn[$i], CURLOPT_HEADER, 0);
        curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
        curl_multi_add_handle($mh, $conn[$i]);
    }
}

$active = 0;
do {
    $n = curl_multi_exec($mh, $active);
    // Wait for socket activity rather than spinning on curl_multi_exec().
    if ($active && curl_multi_select($mh) === -1) {
        usleep(1000);
    }
} while ($active && $n === CURLM_OK);

// Clean up only the transfers that were actually started: skipped URLs have
// no entry in $conn or $fp.
foreach ($conn as $i => $handle) {
    curl_multi_remove_handle($mh, $handle);
    curl_close($handle);
    fclose($fp[$i]);
}

curl_multi_close($mh);
That is all for this article; I hope you find it useful.