PHP itself has no built-in multithreading, which makes crawler programs slow when they fetch pages one at a time. With the curl_multi API, however, a single script can fetch many URLs concurrently. Below are two examples of downloading pages with curl_multi.
Example 1: stream the fetched content directly into a file.
<?php
// Example 1: fetch several URLs concurrently with curl_multi and stream
// each response body straight into a single output file via CURLOPT_FILE.
$urls = array(
    'http://www.111cn.net/',
    'http://www.baidu.com/',
); // URLs of the pages to crawl

$save_to = 'test.txt'; // file that receives the fetched markup
$st = fopen($save_to, 'a');
if ($st === false) {
    die('Unable to open ' . $save_to . ' for appending');
}

$mh = curl_multi_init();
$conn = array();
foreach ($urls as $i => $url) {
    // One easy handle per URL, all attached to the same multi handle.
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)');
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);          // body only, no response headers
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60); // give up connecting after 60s
    curl_setopt($conn[$i], CURLOPT_FILE, $st);          // write fetched bytes into the open file handle
    curl_multi_add_handle($mh, $conn[$i]);
}

// Drive all transfers. curl_multi_select() blocks until there is socket
// activity, so the loop does not spin the CPU while waiting.
$active = null;
do {
    $status = curl_multi_exec($mh, $active);
    if ($active) {
        curl_multi_select($mh);
    }
} while ($active && $status === CURLM_OK);

// Detach and free every easy handle, then the multi handle and the file.
foreach ($urls as $i => $url) {
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);
}
curl_multi_close($mh);
fclose($st);
?>
Example 2: capture the fetched content into a variable first, then write it to a file.
<?php
// Example 2: fetch several URLs concurrently; CURLOPT_RETURNTRANSFER keeps
// each response in memory so it can be read back with curl_multi_getcontent()
// and then written to a file (or a database, etc.).
$urls = array(
    'http://m.111cn.net/',
    'http://www.111cn.net/',
    'http://www.163.com/',
);

$save_to = '/test.txt'; // file that receives the fetched markup
$st = fopen($save_to, 'a');
if ($st === false) {
    die('Unable to open ' . $save_to . ' for appending');
}

$mh = curl_multi_init();
$conn = array();
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)');
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);          // body only, no response headers
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60); // give up connecting after 60s
    // Return the response as a string instead of printing it to output.
    curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, true);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Drive all transfers; block in curl_multi_select() instead of busy-waiting.
$active = null;
do {
    $status = curl_multi_exec($mh, $active);
    if ($active) {
        curl_multi_select($mh);
    }
} while ($active && $status === CURLM_OK);

// Collect each completed response and append it to the file.
foreach ($urls as $i => $url) {
    $data = curl_multi_getcontent($conn[$i]);
    fwrite($st, $data);
}

// Detach and free every easy handle, then the multi handle and the file.
foreach ($urls as $i => $url) {
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);
}
curl_multi_close($mh);
fclose($st);
?>
How to avoid garbled Chinese text when crawling pages with cURL (the key is decoding gzip-compressed responses):
/**
 * Simulate a POST submission with cURL and return the response body.
 *
 * Decompressing gzip/deflate responses via CURLOPT_ENCODING is what prevents
 * "garbled" (still-compressed) Chinese page content.
 *
 * @param string $url  Address to POST to.
 * @param mixed  $data POST payload (array or urlencoded string).
 * @param string $ref  Referer header to send.
 * @return string|false Response body, or false on cURL failure.
 */
function Ppost($url, $data, $ref) {
    $curl = curl_init(); // start a cURL session
    curl_setopt($curl, CURLOPT_URL, $url);
    // NOTE(review): peer verification is disabled here, which is insecure;
    // acceptable only for scraping experiments, never for production.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0);
    // Verify the certificate's host name. The value must be 0 or 2 —
    // 1 is invalid/removed in modern libcurl.
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);
    // Impersonate the visiting client's browser (HTTP_USER_AGENT may be
    // absent when run from CLI — TODO confirm the expected context).
    curl_setopt($curl, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1); // follow redirects automatically
    curl_setopt($curl, CURLOPT_REFERER, $ref);
    curl_setopt($curl, CURLOPT_POST, 1);           // send a regular POST request
    curl_setopt($curl, CURLOPT_POSTFIELDS, $data); // the POST payload
    // Read and persist cookies from/to the shared cookie file.
    curl_setopt($curl, CURLOPT_COOKIEFILE, $GLOBALS['cookie_file']);
    curl_setopt($curl, CURLOPT_COOKIEJAR, $GLOBALS['cookie_file']);
    // CURLOPT_ENCODING both sends the Accept-Encoding header and transparently
    // decompresses the response, so no manual header is needed. This is the
    // fix for garbled (compressed) page content.
    curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
    curl_setopt($curl, CURLOPT_TIMEOUT, 30);       // overall timeout to prevent hangs
    curl_setopt($curl, CURLOPT_HEADER, 0);         // do not include response headers in output
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // return the body as a string
    $tmpInfo = curl_exec($curl);
    if (curl_errno($curl)) {
        echo 'errno' . curl_error($curl);
    }
    curl_close($curl); // close the cURL session
    return $tmpInfo;
}