<?php
/*
 * cURL helper library: configuration and thin convenience wrappers.
 *
 * Everything here delegates the actual HTTP work to curl_func().
 */

// ANSI escape sequences used to colour console output.
$colors['red']     = "\33[31m";
$colors['green']   = "\33[32m";
$colors['yellow']  = "\33[33m";
$colors['end']     = "\33[0m";
$colors['reverse'] = "\33[7m";
$colors['purple']  = "\33[35m";

/* Default parameter settings (per-process overrides are read by curl_config_get()). */
$curl_default_config['ua']           = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)';
$curl_default_config['referer']      = '';
$curl_default_config['retry']        = 5;   // attempts per request
$curl_default_config['conntimeout']  = 30;  // seconds, connection phase
$curl_default_config['fetchtimeout'] = 30;  // seconds, page / header fetch
$curl_default_config['downtimeout']  = 60;  // seconds, file download

/*
 * Referer to send for specific domains (usually needed for hot-link protected
 * images). Takes precedence over $curl_default_config; an empty referer is
 * used by default, which is generally not a problem.
 * Eg:
 * $referer_config = array(
 *     'img_domain'           => 'web_domain',
 *     'e.hiphotos.baidu.com' => 'http://hi.baidu.com/',
 * );
 */
$referer_config = array(
    'img1.51cto.com' => 'blog.51cto.com',
    '360doc.com'     => 'www.360doc.com',
);

/*
 * User-Agent to send for specific domains. Takes precedence over
 * $curl_default_config. The default is the Baidu spider UA; very few sites
 * refuse it.
 * Eg:
 * $useragent_config = array(
 *     'web_domain'  => 'user agent',
 *     'www.xxx.com' => 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
 * );
 */
$useragent_config = array(
    'hiphotos.baidu.com' => 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
);

/*
 * If the machine has more than one IP address, the outgoing IP can be changed
 * here; each request randomly selects one entry. This is deliberately not
 * auto-populated with every local IP, because some may need to be excluded.
 * Eg: $local_ip_config = array('11.11.11.11', '22.22.22.22');
 */
$local_ip_config = array();

// Directory for cookie files and download temp files: prefer /dev/shm when it is writable.
if (@file_exists('/dev/shm/') && @is_writable('/dev/shm/')) {
    $cookie_dir = $tmpfile_dir = '/dev/shm/';
} else {
    $cookie_dir = $tmpfile_dir = '/tmp/';
}

// Under the CLI, clear expired cookie files and download temp files at load time.
// The function_exists() guard keeps this file loadable when the cleanup routine is absent.
if (php_sapi_name() == 'cli' && function_exists('clear_curl_file')) {
    clear_curl_file();
}

/**
 * Fetch a web page with GET.
 *
 * @param string      $url    Page URL
 * @param string|null $encode Encoding the returned page is converted to; default GBK, empty = no conversion
 * @return string|null|false  Whatever curl_func() returns for a page fetch
 */
function curl_get($url, $encode = 'GBK')
{
    return curl_func($url, 'GET', null, null, null, $encode);
}

/**
 * Request a web page with POST.
 *
 * @param string       $url    Request URL
 * @param array|string $data   POST data, handed to CURLOPT_POSTFIELDS
 * @param string|null  $encode Encoding the returned page is converted to; default GBK, empty = no conversion
 * @return string|null|false   Whatever curl_func() returns for a page fetch
 */
function curl_post($url, $data, $encode = 'GBK')
{
    return curl_func($url, 'POST', $data, null, null, $encode);
}

/**
 * Get the response headers of a URL as an array.
 *
 * The status line is not a "Name: Value" pair, so its three-digit code is
 * stored under 'http_code' (0 when it cannot be found). Every other header
 * name is lower-cased and has '-' replaced by '_' (e.g. 'content_length').
 *
 * @param string $url    URL
 * @param bool   $follow true: parse the last header block of a redirect chain,
 *                       false: parse the first one
 * @return array|false   Header array, or false when no header text was obtained
 */
function curl_header($url, $follow = true)
{
    $header_text = curl_func($url, 'HEADER');
    if (!$header_text) {
        // Failed to get the HTTP header
        return false;
    }

    // Each hop of a redirect chain contributes one block, separated by a blank line.
    $blocks = explode("\r\n\r\n", trim($header_text));
    $block  = $follow ? array_pop($blocks) : array_shift($blocks);
    $lines  = explode("\n", trim($block));

    // Status line
    $header = array('http_code' => 0);
    $status_line = trim(array_shift($lines));
    if (preg_match('/^HTTP\/[\d.]+\s+(\d{3})/i', $status_line, $m)) {
        $header['http_code'] = $m[1];
    }

    foreach ($lines as $line) {
        // Skip malformed lines instead of failing on a missing ':'.
        if (strpos($line, ':') === false) {
            continue;
        }
        list($key, $val) = explode(':', $line, 2);
        $key = str_replace('-', '_', strtolower(trim($key)));
        $header[$key] = trim($val);
    }

    return $header;
}

/**
 * Download a file.
 *
 * @param string            $url   File URL
 * @param string            $path  Local path to save to
 * @param array|string|null $data  When non-empty the request is a POST carrying this data
 * @param string|null       $proxy Optional proxy address
 * @return bool|null               Whatever curl_func() returns for a download
 */
function curl_down($url, $path, $data = null, $proxy = null)
{
    $method = empty($data) ? 'GET' : 'POST';
    return curl_func($url, $method, $data, $path, $proxy);
}

/**
 * Fetch a web page with GET through a proxy.
 *
 * @param string      $url    Request URL
 * @param string      $proxy  Proxy address
 * @param string|null $encode Encoding the returned page is converted to
 * @return string|null|false  Whatever curl_func() returns for a page fetch
 */
function curl_get_by_proxy($url, $proxy, $encode = 'GBK')
{
    return curl_func($url, 'GET', null, null, $proxy, $encode);
}

/**
 * Request a web page with POST through a proxy.
 *
 * @param string       $url    Request URL
 * @param array|string $data   POST data
 * @param string       $proxy  Proxy address
 * @param string|null  $encode Encoding the returned page is converted to
 * @return string|null|false   Whatever curl_func() returns for a page fetch
 */
function curl_post_by_proxy($url, $data, $proxy, $encode = 'GBK')
{
    return curl_func($url, 'POST', $data, null, $proxy, $encode);
}

/**
 * Download an image and store it under a path whose extension is derived from
 * the image's detected type.
 *
 * @param string $url      Image URL
 * @param string $path_pre Destination path without extension; ".{ext}" is appended
 * @return string|null|false Final path on success; the (empty) curl_down() result when
 *                           the download failed; null when the image type is not recognised
 */
function img_down($url, $path_pre)
{
    $img_tmp = '/tmp/curl_imgtmp_pid_' . getmypid();

    $res = curl_down($url, $img_tmp);
    if (empty($res)) {
        return $res;
    }

    $ext = get_img_ext($img_tmp);
    if (empty($ext)) {
        // Not a supported image: do not leave the temp file behind.
        @unlink($img_tmp);
        return null;
    }

    $path = "{$path_pre}.{$ext}";
    @mkdir(dirname($path), 0777, true);
    // Move the temp file to its final location
    rename($img_tmp, $path);

    return $path;
}

/**
 * Detect the extension of an image file from its content.
 *
 * @param string $path Local file path
 * @return string|false 'gif', 'jpg', 'png' or 'bmp'; false for anything else
 */
function get_img_ext($path)
{
    // Keys are the IMAGETYPE_* values reported by getimagesize() at index 2.
    $types = array(
        1 => 'gif',
        2 => 'jpg',
        3 => 'png',
        6 => 'bmp',
    );

    $info = @getimagesize($path);
    // getimagesize() returns false for unreadable / non-image files.
    if (!is_array($info) || !isset($info[2]) || !isset($types[$info[2]])) {
        return false;
    }

    return $types[$info[2]];
}

/**
 * Get file type.
 *
 * NOTE: the body is empty in this file, so the function currently returns null.
 * The original documentation describes the intended result as array($type, $ext).
 *
 * @param string $filepath File path
 * @return null
 */
function get_file_type($filepath)
{
}

/**
 * Return the size of a remote file as announced by its Content-Length header.
 * Used to check that a downloaded file matches the remote size
 * (curl_getinfo()'s size_download is not necessarily the true file size).
 *
 * @param string $url URL
 * @return string|false Content-Length value, or false when it is missing/empty
 */
function get_file_size($url)
{
    $header = curl_header($url);
    if (!empty($header['content_length'])) {
        return $header['content_length'];
    }
    return false;
}

/**
 * Get the HTTP status code of a URL.
 *
 * @param string $url    URL
 * @param bool   $follow Passed to curl_header(): true = status of the last hop
 * @return string|false  Status code, or false when it could not be determined
 */
function get_http_code($url, $follow = true)
{
    $header = curl_header($url, $follow);
    if (!empty($header['http_code'])) {
        return $header['http_code'];
    }
    return false;
}

/**
 * Get the media type of a URL from its Content-Type header.
 *
 * @param string $url URL
 * @return array array($type, $subtype), e.g. array('image', 'png');
 *               array('', '') when the header is missing or malformed
 */
function curl_get_ext($url)
{
    $header = curl_header($url);
    if (empty($header['content_type'])) {
        return array('', '');
    }

    // Drop parameters such as "; charset=utf-8" before splitting type/subtype.
    $pieces = explode(';', $header['content_type'], 2);
    $parts  = explode('/', trim($pieces[0]));

    if (count($parts) >= 2 && !empty($parts[0]) && !empty($parts[1])) {
        return array($parts[0], $parts[1]);
    }
    return array('', '');
}

/**
 * Encapsulates the cURL operation.
 *
 * @param string            $url           Request URL
 * @param string            $method        Request method: 'GET', 'POST' or 'HEADER'
 * @param array|string|null $data          POST data (used when $method is 'POST')
 * @param string|null       $savepath      When non-empty, the response is downloaded to this path
 * @param string|null       $proxy         Proxy address
 * @param string|null       $return_encode Encoding the returned page is converted to; empty = no conversion
 * @return mixed Raw header text for 'HEADER'; page content for a fetch; true for a
 *               successful download; null for 4xx-class errors and blank pages;
 *               false when every retry failed.
 */
// TODO: download to a temporary file, move it into place after success
//       (overwriting an existing file), and delete it when the download fails.
// TODO: parameter form is curl_func($url, $method, $data = null, $savepath = null,
//       $proxy = null, $return_encode = 'GBK').
function curl_func($url, $method, $data = null, $savepath = null, $proxy = null, $return_encode = null)
{
    global $cookie_dir, $tmpfile_dir, $referer_config, $useragent_config, $local_ip_config, $curl_config;

    // Collapse "/./" and "/../" in the URL (cURL does not do it), then undo HTML entity encoding.
    $url = get_absolute_path($url);
    $url = htmlspecialchars_decode($url);

    $is_download = !empty($savepath);
    $has_counter = function_exists('mp_counter');

    // Statistics: one attempt of the matching kind
    if ($has_counter) {
        if ($is_download) {
            mp_counter('down_total');      // number of downloads
        } elseif ($method == 'HEADER') {
            mp_counter('header_total');    // number of header fetches
        } else {
            mp_counter('fetch_total');     // number of page fetches
        }
    }

    // Referer: the per-domain file configuration has the highest priority,
    // then the value set through curl_set_referer().
    $referer = '';
    foreach ((array) $referer_config as $domain => $ref) {
        if (stripos($url, $domain) !== false) {
            $referer = $ref;
            break;
        }
    }
    if (empty($referer) && !empty($curl_config[getmypid()]['referer'])) {
        $referer = $curl_config[getmypid()]['referer'];
    }

    // User-Agent: per-domain file configuration first, then curl_set_ua() / the default.
    $useragent = '';
    foreach ((array) $useragent_config as $domain => $ua) {
        if (stripos($url, $domain) !== false) {
            $useragent = $ua;
            break;
        }
    }
    if (empty($useragent)) {
        $useragent = curl_config_get('ua');
    }

    // Defined up front so the failure report below works even if the loop never runs.
    $error_msg = '';
    $error_no  = 0;
    $tmpfile   = null;
    $retry     = curl_config_get('retry');

    for ($i = 0; $i < $retry; $i++) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);

        // Timeouts: connection, then whole transfer (page/header vs. file download)
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, curl_config_get('conntimeout'));
        curl_setopt($ch, CURLOPT_TIMEOUT, curl_config_get($is_download ? 'downtimeout' : 'fetchtimeout'));

        // Return the response instead of printing it
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

        // SSL verification is disabled, as in the original code.
        // NOTE(review): this accepts any certificate; confirm that is acceptable for callers.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);

        if (!empty($referer)) {
            curl_setopt($ch, CURLOPT_REFERER, $referer);
        }
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);

        // Outgoing IP: pick one configured address at random
        if (!empty($local_ip_config)) {
            curl_setopt($ch, CURLOPT_INTERFACE, $local_ip_config[array_rand($local_ip_config)]);
        }

        // Proxy (SOCKS5)
        if (!empty($proxy)) {
            curl_setopt($ch, CURLOPT_PROXY, $proxy);
            curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5);
        }

        // Accept compressed responses and let cURL decode them. Not used for header
        // requests, so the reported file size stays the real one (used to judge downloads).
        if ($method != 'HEADER') {
            curl_setopt($ch, CURLOPT_HTTPHEADER, array('Accept-Encoding: gzip, deflate'));
            curl_setopt($ch, CURLOPT_ENCODING, "");
        }

        // Follow 301/302 automatically; this option is rejected when open_basedir is set
        // (web usage), hence the error suppression. Cap the hops to avoid redirect loops.
        @curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 5);

        // Cookies, one jar per (parent) process
        $cookie_path = $cookie_dir . 'curl_cookie_pid_' . get_ppid();
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_path);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_path);

        // POST body
        if ($method == 'POST') {
            curl_setopt($ch, CURLOPT_HEADER, 0);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
        }

        // Download: stream the body into a per-process temp file.
        // CURLOPT_FILE is set after CURLOPT_RETURNTRANSFER on purpose, so it takes effect.
        $fp = null;
        if ($is_download) {
            $tmpfile = rtrim((string) $tmpfile_dir, '/') . '/curl_tmpfile_pid_' . getmypid();
            if (file_exists($tmpfile)) {
                unlink($tmpfile);
            }
            $fp = fopen($tmpfile, 'w');
            if ($fp === false) {
                // Retrying cannot help when the temp file cannot be created.
                $error_msg = "cannot open temp file {$tmpfile}";
                curl_close($ch);
                break;
            }
            curl_setopt($ch, CURLOPT_FILE, $fp);
        }

        // Header only
        if ($method == 'HEADER') {
            curl_setopt($ch, CURLOPT_NOBODY, true);
            curl_setopt($ch, CURLOPT_HEADER, true);
        }

        $curl_res  = curl_exec($ch);
        $info      = curl_getinfo($ch);
        $error_msg = curl_error($ch);
        $error_no  = curl_errno($ch);
        curl_close($ch);

        // Flush and release the temp file before its size is checked or it is moved.
        if ($fp) {
            fclose($fp);
            clearstatcache();
        }

        // Any cURL error counts as a failed attempt: report and retry
        if (!empty($error_no) || !empty($error_msg)) {
            $error_msg = "{$error_msg} ($error_no)";
            curl_msg($error_msg, $method, $url, 'yellow');
            continue;
        }

        // Statistics: traffic
        if ($has_counter && !empty($info['size_download']) && $info['size_download'] > 0) {
            mp_counter('download_total', $info['size_download']);
        }

        if ($method == 'HEADER') {
            // Raw header text
            return $curl_res;
        }

        $status_code = $info['http_code'];

        if (in_array($status_code, array_merge(range(400, 417), array(500, 444)))) {
            // Not a transient server fault: give up immediately and return null
            $error_msg = $status_code;
            if ($is_download) {
                $method = "{$method}|DOWN";
                @unlink($tmpfile);
            }
            curl_msg($error_msg, $method, $url, 'red');
            return null;
        }

        if ($status_code != 200) {
            // Guards against temporary errors such as 502: anything else that is not 200
            // is retried. 301/302 cannot show up here because cURL follows redirects,
            // unless the hop count exceeded CURLOPT_MAXREDIRS.
            $error_msg = $status_code;
            curl_msg($error_msg, $method, $url, 'yellow');
            continue;
        }

        if (!$is_download) {
            // Page fetch
            if (empty($curl_res)) {
                // Blank page: return null and let the caller decide
                return null;
            }

            // Detect the charset declared in a <meta> tag.
            // NOTE(review): pattern reconstructed from a garbled source — confirm it matches the pages in use.
            $page_charset = '';
            if (preg_match('/<meta[^>]*?charset\s*=\s*["\']?\s*([\w\-]+)/i', $curl_res, $m)) {
                $page_charset = $m[1];
            }

            // Convert only when: 1) a charset was found, 2) a target encoding was requested,
            // 3) the two differ (ignoring case and '-').
            if ($page_charset !== '' && !empty($return_encode)
                && str_replace('-', '', strtolower($page_charset))
                    != str_replace('-', '', strtolower($return_encode))) {
                $curl_res = @iconv($page_charset, "{$return_encode}//IGNORE", $curl_res);
                // Rewrite the charset named in the page to the new encoding
                $curl_res = str_ireplace($page_charset, $return_encode, $curl_res);
            }

            // iconv() yields an empty result when it fails
            if (empty($curl_res)) {
                return null;
            }

            // Turn relative links into absolute ones
            return relative_to_absolute($curl_res, $url);
        }

        // File download
        if (@filesize($tmpfile) == 0) {
            $error_msg = 'emtpy Content';
            continue;
        }

        // Statistics: downloaded volume
        if ($has_counter) {
            mp_counter('download_size', filesize($tmpfile));
        }

        // Create the target directory and move the temp file into place
        @mkdir(dirname($savepath), 0777, true);
        rename($tmpfile, $savepath);
        return true;
    }

    // All attempts failed: remove a leftover temp file.
    if ($tmpfile !== null && file_exists($tmpfile)) {
        @unlink($tmpfile);
    }

    // For downloads and header fetches, error 6 (host cannot be resolved) is not printed:
    // too many pages reference invalid images.
    // TODO: illegal host names cannot be reported either; validate the URL up front.
    if (!(($method == 'HEADER' || $is_download) && !empty($error_no) && $error_no == 6)) {
        $label = $is_download ? "{$method}|DOWN" : $method;
        curl_msg($error_msg, $label, $url, 'red');
    }

    // Statistics: failures
    if ($has_counter) {
        if ($is_download) {
            mp_counter('down_failed');
        } elseif ($method == 'HEADER') {
            mp_counter('header_failed');
        } else {
            mp_counter('fetch_failed');
        }
    }

    return false;
}

/**
 * Print a coloured error message to the console (CLI only).
 *
 * @param string $msg    Error message
 * @param string $method Request method
 * @param string $url    URL
 * @param string $color  'yellow' or 'red' (case-insensitive); any other value prints nothing
 * @return void
 */
function curl_msg($msg, $method, $url, $color)
{
    global $colors;

    // It is recommended to turn off the yellow output under heavy concurrency
    // by removing it from this list.
    $available_msg = array('yellow', 'red');

    if (php_sapi_name() != 'cli') {
        return;
    }

    $color = strtolower(trim((string) $color));
    if (!in_array($color, $available_msg, true)) {
        return;
    }

    $reverse = isset($colors['reverse']) ? $colors['reverse'] : '';
    $paint   = isset($colors[$color]) ? $colors[$color] : '';
    $end     = isset($colors['end']) ? $colors['end'] : '';

    echo "{$reverse}{$paint}({$method}) [CURL ERROR: {$msg}] {$url}{$end}\n";
}

/**
 * Convert a URL to its absolute form.
 *
 * A URL may contain "/./" or "/../" segments that make part of it relative;
 * cURL does not resolve them automatically.
 *
 *     echo get_absolute_path("http://www.a.com/a/../b/../c/../././index.php");
 *     // http://www.a.com/index.php
 *
 * @param string $path The URL to process
 * @return string The URL with "." and ".." segments resolved
 */
function get_absolute_path($path)
{
    $path = (string) $path;

    // Keep the query string / fragment verbatim: "/", "." and ".." inside them are data.
    $suffix = '';
    $cut = strcspn($path, '?#');
    if ($cut < strlen($path)) {
        $suffix = substr($path, $cut);
        $path   = substr($path, 0, $cut);
    }

    // Keep "scheme://host" out of the way so ".." can never remove it.
    $prefix = '';
    if (preg_match('#^[a-z][a-z0-9+.\-]*://[^/]*#i', $path, $m)) {
        $prefix = $m[0];
        $path   = (string) substr($path, strlen($prefix));
    }

    $leading  = ($path !== '' && $path[0] === '/');
    $trailing = ($path !== '' && substr($path, -1) === '/');

    $segments = array();
    foreach (explode('/', $path) as $part) {
        if ($part === '' || $part === '.') {
            continue;                 // empty ("//") and "." segments are dropped
        }
        if ($part === '..') {
            array_pop($segments);     // no-op once the root is reached
            continue;
        }
        $segments[] = $part;
    }

    $result = $prefix;
    if (!empty($segments)) {
        if ($prefix !== '' || $leading) {
            $result .= '/';
        }
        $result .= implode('/', $segments);
        // A trailing slash is significant in a URL, so it is preserved.
        if ($trailing) {
            $result .= '/';
        }
    } elseif ($leading) {
        $result .= '/';
    }

    return $result . $suffix;
}

/**
 * Build a nested directory path from the MD5 of a string (typically an image URL).
 *
 * With depth 3 the matching rewrite rule is:
 *     rewrite ^/(.)(.)(.)(.*)$ /$1/$2/$3/$4 break;
 * Sizing note from the original author: at about one image per article,
 * 30 million articles give 30 million images; 3 levels end in 4096 leaf
 * directories, i.e. about 7,324 images per directory.
 *
 * @param string $str    Original string (image URL)
 * @param int    $deep   Directory depth: number of leading hash characters that
 *                       each become their own directory level
 * @param int    $length Number of MD5 hex characters used.
 *                       NOTE(review): this value was lost in the source this file was
 *                       recovered from; 16 is an assumption — confirm against existing
 *                       stored paths before relying on it.
 * @return string Path such as "a/b/c/defg..."
 */
function md5_path($str, $deep = 3, $length = 16)
{
    $hash = substr(md5($str), 0, $length);
    $len  = strlen($hash);

    $res = '';
    for ($i = 0; $i < $len; $i++) {
        $res .= $hash[$i];
        if ($i < $deep) {
            $res .= '/';
        }
    }

    return $res;
}

/**
 * Rewrite double-quoted href/src attribute values in an HTML document so
 * that they are absolute, using $url as the address of the document.
 *
 * - "/path"   is resolved against scheme://host
 * - "//host"  gets the document's scheme
 * - "?query"  is resolved against the document URL without its query
 * - "path"    is resolved against the document's directory
 * - values that are empty, start with "#", or already carry a scheme
 *   (http:, https:, file:, data:, javascript:, mailto:, ...) are left untouched
 *
 * Whitespace around "=" and after the opening quote is removed, as before.
 * When $url is not an http/https/ftp URL nothing is resolved.
 *
 * @param string $content HTML content
 * @param string $url     URL the content was fetched from
 * @return string Content with absolute links
 */
function relative_to_absolute($content, $url)
{
    // Document URL without query string / fragment (they may contain "/").
    $page = substr($url, 0, strcspn($url, '?#'));

    $base = '';     // scheme://host, e.g. http://www.yundaiwei.com
    $scheme = '';
    $dir = '';      // directory of the document, always ending in "/"
    if (preg_match('#^(https?|ftp)://[^/]+#i', $page, $m)) {
        $base   = $m[0];
        $scheme = $m[1];
        $slash  = strrpos($page, '/');
        // A slash located inside "scheme://" means the URL has no path at all.
        $dir = ($slash !== false && $slash >= strlen($base))
            ? substr($page, 0, $slash + 1)
            : $base . '/';
    }

    // A callback is used so that "$" or "\" in the URL is never read as a backreference.
    $result = preg_replace_callback(
        '/(href|src)\s*=\s*"\s*([^"]*)"/i',
        function ($m) use ($base, $scheme, $dir, $page) {
            $link = $m[2];

            if ($base === '' || $link === '' || $link[0] === '#'
                || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $link)) {
                $target = $link;                   // nothing to resolve
            } elseif (substr($link, 0, 2) === '//') {
                $target = $scheme . ':' . $link;   // protocol-relative
            } elseif ($link[0] === '/') {
                $target = $base . $link;           // relative to the site root
            } elseif ($link[0] === '?') {
                $target = $page . $link;           // same document, other query
            } else {
                $target = $dir . $link;            // relative to the directory
            }

            return $m[1] . '="' . $target . '"';
        },
        $content
    );

    // preg_replace_callback() returns null on a PCRE error; keep the input in that case.
    return $result === null ? $content : $result;
}

/**
 * Remove cookie files and download temp files whose owning process is gone.
 *
 * A file is kept only while /proc/{pid}/exe of the pid embedded in its name
 * points to a PHP binary that is not php-fpm.
 *
 * @return void
 */
function clear_curl_file()
{
    global $cookie_dir;

    // glob() returns false on error; both patterns may cover the same directory.
    $cookie_files = glob("{$cookie_dir}curl_*_pid_*");
    $tmp_files    = glob("/tmp/curl_*_pid_*");
    $files = array_unique(array_merge(
        is_array($cookie_files) ? $cookie_files : array(),
        is_array($tmp_files) ? $tmp_files : array()
    ));

    foreach ($files as $file) {
        if (!preg_match('/pid_(\d+)/', $file, $preg)) {
            continue;
        }
        $pid = $preg[1];

        // No readable exe link means the process is gone (or is not ours to inspect).
        $exe = @readlink("/proc/{$pid}/exe");
        $owned_by_php = $exe !== false
            && stripos($exe, 'php') !== false
            && stripos($exe, 'php-fpm') === false;
        if ($owned_by_php) {
            continue;
        }

        // Release a semaphore keyed on the file, if the sysvsem functions are available.
        if (function_exists('sem_get') && function_exists('sem_remove')) {
            $key = @ftok($file, 'a');
            if ($key !== -1) {
                $sem = @sem_get($key);
                if ($sem) {
                    @sem_remove($sem);
                }
            }
        }
        @unlink($file);
    }
}

/**
 * Return the parent process PID when running inside a forked child,
 * otherwise the PID of the current process.
 *
 * @return int
 */
if (!function_exists('get_ppid')) {
    function get_ppid()
    {
        // Web-mode call (Apache, PHP-FPM, ...): use the PHP process itself.
        // The same fallback applies when the posix functions are unavailable.
        if (php_sapi_name() != 'cli' || !function_exists('posix_getppid')) {
            return getmypid();
        }

        // Command line: parent and forked children must agree on one PID so that the
        // files keyed on it stay consistent. Heuristic: if the parent's executable name
        // contains "php", this process is assumed to be a forked child.
        $ppid = posix_getppid();
        $parent_exe = @readlink("/proc/{$ppid}/exe");
        if ($parent_exe === false || strpos($parent_exe, 'php') === false) {
            return getmypid();
        }

        return $ppid;
    }
}

// UTF-8 to GBK
if (!function_exists('u2g')) {
    function u2g($string)
    {
        return @iconv("UTF-8", "GBK//IGNORE", $string);
    }
}

// GBK to UTF-8
if (!function_exists('g2u')) {
    function g2u($string)
    {
        return @iconv("GBK", "UTF-8//IGNORE", $string);
    }
}

/**
 * Build a randomised desktop (MSIE) User-Agent string.
 *
 * @return string
 */
function curl_rand_ua_pc()
{
    return 'Mozilla/5.0 (compatible; MSIE ' . rand(7, 9) . '.0; Windows NT 6.1; WOW64; Trident/' . rand(4, 5) . '.0)';
}

/**
 * Build a randomised mobile (Android / MIUI browser) User-Agent string.
 *
 * @return string
 */
function curl_rand_ua_mobile()
{
    $op = 'Mozilla/5.0 (Linux; U; Android ' . rand(4, 5) . '.' . rand(1, 5) . '.' . rand(1, 5)
        . '; zh-cn; MI ' . rand(3, 5) . ') ';
    $browser = 'AppleWebKit/' . rand(500, 700) . '.' . rand(1, 100) . '.' . rand(1, 100)
        . ' (KHTML, like Gecko) Version/' . rand(5, 10) . '.0 Mobile Safari/537.36'
        . ' XiaoMi/MiuiBrowser/' . rand(1, 5) . '.' . rand(1, 5) . '.' . rand(1, 5);

    return $op . $browser;
}

/**
 * Read a configuration value: the per-process override if it is non-empty,
 * otherwise the default from $curl_default_config.
 *
 * Keys are case-insensitive. An unknown key prints a message and terminates
 * the script with exit code 9.
 *
 * @param string $key Configuration key ('ua', 'referer', 'retry', 'conntimeout', ...)
 * @return mixed
 */
function curl_config_get($key)
{
    global $curl_config, $curl_default_config;

    $key = strtolower(trim((string) $key));
    $pid = getmypid();

    if (!empty($curl_config[$pid][$key])) {
        return $curl_config[$pid][$key];
    }
    // array_key_exists(): a default may legitimately be empty (e.g. 'referer' => '').
    if (is_array($curl_default_config) && array_key_exists($key, $curl_default_config)) {
        return $curl_default_config[$key];
    }

    echo "\$curl_default_config[{$key}] not found!\n";
    exit(9);
}

/**
 * Set a per-process configuration override.
 *
 * @param string $key Configuration key (case-insensitive)
 * @param mixed  $val Value; an empty value falls back to the default on read
 * @return void
 */
function curl_config_set($key, $val)
{
    global $curl_config;
    $curl_config[getmypid()][strtolower(trim((string) $key))] = $val;
}

/** Override the User-Agent for this process. */
function curl_set_ua($ua)
{
    curl_config_set('ua', $ua);
}

/** Override the Referer for this process. */
function curl_set_referer($referer)
{
    curl_config_set('referer', $referer);
}

/** Override the number of attempts per request for this process. */
function curl_set_retry($retry)
{
    curl_config_set('retry', $retry);
}

/** Override the connection timeout (seconds) for this process. */
function curl_set_conntimeout($conntimeout)
{
    curl_config_set('conntimeout', $conntimeout);
}

/** Override the page/header fetch timeout (seconds) for this process. */
function curl_set_fetchtimeout($fetchtimeout)
{
    curl_config_set('fetchtimeout', $fetchtimeout);
}

/** Override the file download timeout (seconds) for this process. */
function curl_set_downtimeout($downtimeout)
{
    curl_config_set('downtimeout', $downtimeout);
}