This is a flexible wrapper for issuing many cURL requests concurrently. It differs from the sample provided in the PHP manual (http://us2.php.net/manual/zh/function.curl-multi-select.php), and its execution efficiency is much higher. There are two files in this section. The first is the muti_curl class file (muti_curl_class.php), which contains two classes. The second is a usage example; here it checks in batches whether proxy IPs are usable.
-
/**
 * Plain value object describing one HTTP request to be queued on muti_curl.
 *
 * All fields are public and are read directly by the code that builds the
 * cURL option array for the request.
 */
class request_setting
{
    /** @var string|false Target URL; false until the constructor assigns it. */
    public $url = false;

    /** @var string HTTP method name as given by the caller. */
    public $method = 'GET';

    /** @var mixed Request body; a non-empty value turns the request into a POST. */
    public $post_data = null;

    /** @var array|null Extra header lines for this request. */
    public $headers = null;

    /** @var array|null Per-request CURLOPT_* overrides keyed by option constant. */
    public $options = null;

    /**
     * @param string     $url       Target URL.
     * @param string     $method    HTTP method name.
     * @param mixed      $post_data Request body, or null for none.
     * @param array|null $headers   Extra header lines, or null.
     * @param array|null $options   CURLOPT_* overrides, or null.
     */
    public function __construct($url, $method = "GET", $post_data = null, $headers = null, $options = null)
    {
        $this->url       = $url;
        $this->method    = $method;
        $this->post_data = $post_data;
        $this->headers   = $headers;
        $this->options   = $options;
    }

    /**
     * Drops the stored request data when the object is destroyed.
     */
    public function __destruct()
    {
        unset($this->url, $this->method, $this->post_data, $this->headers, $this->options);
    }
}
/**
 * Batch operation class.
 *
 * Collects request_setting objects and runs them through cURL. A single
 * queued request is executed with a plain easy handle; two or more are run
 * through a curl_multi queue that keeps at most $thread_size transfers in
 * flight and starts the next queued request each time one finishes.
 *
 * Inaccessible properties are exposed through __get()/__set(), so callers
 * can write e.g. `$rc->timeout = 10;` or `$rc->options = array(...)`.
 */
class muti_curl
{
    /** @var int Maximum number of transfers kept in flight at once. */
    protected $thread_size = 100;

    /** @var int Seconds curl_multi_select() may block waiting for activity. */
    protected $timeout = 30;

    /** @var callable|null Called as callback($output, $curl_info, $request) per finished transfer. */
    private $callback;

    /**
     * Default cURL options applied to every request; per-request options win.
     *
     * @var array
     */
    protected $options = array(
        // NOTE(review): TLS peer verification is switched off by default.
        // Override it through the options property when certificates matter.
        CURLOPT_SSL_VERIFYPEER => false,
        // Hand the response body back as a string instead of printing it.
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_CONNECTTIMEOUT => 15,
        CURLOPT_TIMEOUT        => 30,
    );

    /** @var array Header lines sent with every request. */
    private $headers = array();

    /** @var array Queued request objects, in insertion order. */
    private $requests = array();

    /** @var array Maps an in-flight handle key to its index in $requests. */
    private $requestMap = array();

    /**
     * @param callable|null $callback Receives ($output, $curl_info, $request)
     *                                for every finished transfer.
     */
    public function __construct($callback = null)
    {
        $this->callback = $callback;
    }

    /**
     * Read access to inaccessible properties.
     *
     * @param string $name Property name.
     * @return mixed The property value, or null when it is unset.
     */
    public function __get($name)
    {
        return isset($this->{$name}) ? $this->{$name} : null;
    }

    /**
     * Write access to inaccessible properties.
     *
     * "options" and "headers" are merged into the existing value instead of
     * replacing it: options by key (the new value wins), headers by appending,
     * because header lists are numerically indexed and a key union would
     * silently drop entries.
     *
     * @param string $name  Property name.
     * @param mixed  $value New value.
     * @return bool Always true.
     */
    public function __set($name, $value)
    {
        if ($name == "options") {
            $this->options = (array) $value + $this->options;
        } elseif ($name == "headers") {
            $this->headers = array_merge($this->headers, (array) $value);
        } else {
            $this->{$name} = $value;
        }
        return true;
    }

    /**
     * Queues an already-built request object.
     *
     * @param object $request Object exposing url, post_data, headers and options.
     * @return bool Always true.
     */
    public function add($request)
    {
        $this->requests[] = $request;
        return true;
    }

    /**
     * Builds a request_setting from the arguments and queues it.
     *
     * @return bool Always true.
     */
    public function request($url, $method = "GET", $post_data = null, $headers = null, $options = null)
    {
        $this->requests[] = new request_setting($url, $method, $post_data, $headers, $options);
        return true;
    }

    /**
     * Queues a GET request.
     *
     * @return bool Always true.
     */
    public function get($url, $headers = null, $options = null)
    {
        return $this->request($url, "GET", null, $headers, $options);
    }

    /**
     * Queues a POST request.
     *
     * @return bool Always true.
     */
    public function post($url, $post_data = null, $headers = null, $options = null)
    {
        return $this->request($url, "POST", $post_data, $headers, $options);
    }

    /**
     * Runs the queued requests.
     *
     * @param int|null $thread_size Optional concurrency override for the
     *                              multi-handle path.
     * @return mixed With exactly one queued request and no callback, the
     *               curl_exec() result; otherwise true.
     * @throws Exception When the multi-handle path ends up with fewer than
     *                   two concurrent slots (this includes an empty queue).
     */
    public function execute($thread_size = null)
    {
        // One request does not need a multi handle.
        if (count($this->requests) == 1) {
            return $this->single_curl();
        }
        return $this->rolling_curl($thread_size);
    }

    /**
     * Executes the first queued request synchronously and removes it from
     * the queue.
     *
     * @return mixed The curl_exec() result when no callback is configured,
     *               otherwise true.
     */
    private function single_curl()
    {
        $request = array_shift($this->requests);

        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($request));
        $output    = curl_exec($ch);
        $curl_info = curl_getinfo($ch);
        $this->close_handle($ch);

        if (!$this->callback) {
            return $output;
        }
        if (is_callable($this->callback)) {
            call_user_func($this->callback, $output, $curl_info, $request);
        }
        return true;
    }

    /**
     * Runs all queued requests through a curl_multi handle, keeping at most
     * $thread_size transfers active and starting the next queued request each
     * time one completes. Queued requests are left in place afterwards.
     *
     * @param int|null $thread_size Optional concurrency override.
     * @return bool Always true.
     * @throws Exception When fewer than two concurrent slots would be used.
     */
    private function rolling_curl($thread_size = null)
    {
        if ($thread_size) {
            $this->thread_size = $thread_size;
        }
        // Never open more slots than there are requests.
        $total = count($this->requests);
        if ($total < $this->thread_size) {
            $this->thread_size = $total;
        }
        if ($this->thread_size < 2) {
            $errorinfo = 'the thread size must be greater than 1 !!!! ';
            throw new Exception($errorinfo);
        }

        $queue = curl_multi_init();

        // Fill the initial window.
        $next = 0;
        while ($next < $this->thread_size) {
            $this->enqueue($queue, $next);
            $next++;
        }

        do {
            do {
                $status = curl_multi_exec($queue, $active);
            } while ($status == CURLM_CALL_MULTI_PERFORM);
            if ($status != CURLM_OK) {
                break;
            }

            // Drain every transfer that has finished so far.
            while ($done = curl_multi_info_read($queue)) {
                $handle    = $done['handle'];
                $curl_info = curl_getinfo($handle);
                $output    = curl_multi_getcontent($handle);

                $key     = $this->handle_key($handle);
                $request = $this->requests[$this->requestMap[$key]];
                unset($this->requestMap[$key]);

                if (is_callable($this->callback)) {
                    call_user_func($this->callback, $output, $curl_info, $request);
                }

                // Refill the freed slot with the next queued request.
                if (isset($this->requests[$next])) {
                    $this->enqueue($queue, $next);
                    $next++;
                }

                curl_multi_remove_handle($queue, $handle);
                $this->close_handle($handle);
            }

            if ($active > 0) {
                // curl_multi_select() can return -1 immediately; back off
                // briefly so the loop does not spin at full speed.
                if (curl_multi_select($queue, $this->timeout) === -1) {
                    usleep(1000);
                }
            }
            // $active can be 0 while freshly added handles are still waiting
            // for their first curl_multi_exec(); keep going until the map of
            // in-flight handles is empty as well.
        } while ($active > 0 || count($this->requestMap) > 0);

        curl_multi_close($queue);
        return true;
    }

    /**
     * Creates an easy handle for $this->requests[$index], adds it to the
     * multi handle and records which request it belongs to.
     *
     * @param mixed $queue curl_multi handle.
     * @param int   $index Index into $this->requests.
     * @return void
     */
    private function enqueue($queue, $index)
    {
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($this->requests[$index]));
        curl_multi_add_handle($queue, $ch);
        $this->requestMap[$this->handle_key($ch)] = $index;
    }

    /**
     * Returns a map key that identifies a cURL handle. Handles are resources
     * before PHP 8.0 and CurlHandle objects afterwards, and objects cannot be
     * cast to string.
     *
     * @param mixed $ch cURL easy handle.
     * @return string
     */
    private function handle_key($ch)
    {
        return is_object($ch) ? 'o' . spl_object_id($ch) : 'r' . (int) $ch;
    }

    /**
     * Releases an easy handle on PHP versions where handles are resources.
     * From PHP 8.0 on the object is freed when its last reference goes away.
     *
     * @param mixed $ch cURL easy handle.
     * @return void
     */
    private function close_handle($ch)
    {
        if (is_resource($ch)) {
            curl_close($ch);
        }
    }

    /**
     * Builds the CURLOPT_* array for one request: instance defaults,
     * overridden by the request's own options, plus URL, body and headers.
     *
     * @param object $request Request object (see add()).
     * @return array Option array suitable for curl_setopt_array().
     */
    private function get_options($request)
    {
        $options = $this->options;
        if ($request->options) {
            // Array union: the request's entries take precedence.
            $options = $request->options + $options;
        }
        $options[CURLOPT_URL] = $request->url;

        if ($request->post_data) {
            $options[CURLOPT_POST]       = 1;
            $options[CURLOPT_POSTFIELDS] = $request->post_data;
        }

        // Instance-wide headers first, then the request's own header lines.
        $headers = $this->headers;
        if (!empty($request->headers)) {
            $headers = array_merge($headers, (array) $request->headers);
        }
        if ($headers) {
            $options[CURLOPT_HEADER]     = 0;
            $options[CURLOPT_HTTPHEADER] = $headers;
        }
        return $options;
    }

    /**
     * Drops the stored state when the object is destroyed.
     */
    public function __destruct()
    {
        unset($this->thread_size, $this->callback, $this->options, $this->headers, $this->requests);
    }
}
- ?>
// Entry script of the batch proxy checker: declare the response type, load
// the muti_curl classes and initialise the counters that request_callback()
// updates through `global`.
header("content-type: text/html; charset=utf-8");
require("muti_curl_class.php");
// Checking many proxies can take a long time; lift the execution time limit.
set_time_limit(0);
$sucesesnum = 0;          // number of finished requests
$good_proxy = array();    // proxies whose response contained the expected marker
/**
 * Completion callback for one proxy check.
 *
 * A proxy counts as working when the response body contains the marker line
 * "User-agent: Baiduspider" (matched case-insensitively). Working proxies are
 * appended to the global $good_proxy; the global $sucesesnum is incremented
 * for every finished request, successful or not.
 *
 * @param string|false $response Response body, or false when the transfer failed.
 * @param array        $info     curl_getinfo() data; 'total_time' is printed.
 * @param object       $request  Request whose options[CURLOPT_PROXY] names the proxy.
 * @return void
 */
function request_callback($response, $info, $request)
{
    global $sucesesnum, $good_proxy;

    // NOTE(review): the bare newlines echoed here were presumably HTML line
    // breaks before the source was extracted — confirm against the original.
    echo "\n";

    if ($response !== false && stripos((string) $response, 'User-agent: Baiduspider') !== false) {
        $proxy = $request->options[CURLOPT_PROXY];
        echo "true\n";
        // The proxy text comes from user input; escape it for the HTML response.
        echo htmlspecialchars($proxy, ENT_NOQUOTES, 'UTF-8');
        $good_proxy[] = $proxy;
    }

    echo "\nThe --> " . $sucesesnum . ' <--- use: ' . $info['total_time'];
    $sucesesnum++;
}
// ---------------------------------------------------------------------------
// Read the request parameters: the proxy list plus timeout / concurrency.
// ---------------------------------------------------------------------------
$params   = array_merge($_GET, $_POST);
$result   = $proxy_ip = isset($params['IP']) ? trim($params['IP']) : '';
$timeout  = isset($params['timeout']) ? intval(trim($params['timeout'])) : 0;
$timeout  = max(3, min(300, $timeout));          // clamp to 3..300 seconds
$thread_size = isset($params['thread_size']) ? intval(trim($params['thread_size'])) : 0;
$thread_size = max(5, min(300, $thread_size));   // clamp to 5..300 slots

if ($proxy_ip == '') {
    echo 'Enter the IP address !! ';
    return;
}

// Strip decoration that tends to surround pasted proxy lists.
// NOTE(review): these labels look machine-translated in the extracted source;
// confirm them against the original script before relying on them.
$replace_arr1 = array('qq proxy: ', 'dn28. com ', 'qqip', 'qq proxy', 'qq proxy IP', 'proxy ip: ', 'Ip:', 'proxy IP ', '"', "'", '\\', '/');
$result = str_replace($replace_arr1, '', $result);
$result = str_replace(",", "\n", $result);

// One candidate per line; anything from '@' onwards (type / location) is cut.
$newRes = array();
foreach (explode("\n", $result) as $v) {
    $posProxy = getPos($v, '@');
    $proxyip_and_port = trim($posProxy === false ? $v : substr($v, 0, $posProxy));
    // Skip blank lines instead of re-submitting the previous entry.
    if ($proxyip_and_port !== '') {
        $newRes[] = $proxyip_and_port;
    }
}
// The list is user input: escape it before writing it into the HTML response.
echo htmlspecialchars(print_r($newRes, true), ENT_NOQUOTES, 'UTF-8');

if (count($newRes) === 0) {
    // Nothing usable was left after clean-up.
    echo 'Enter the IP address !! ';
    return;
}

$option_setting = array(
    CURLOPT_SSL_VERIFYPEER => 0,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_CONNECTTIMEOUT => 5,
    CURLOPT_TIMEOUT        => 30,
    CURLOPT_HEADER         => false,
    CURLOPT_PROXY          => '',   // filled in per request below
);
$url   = 'http://www.baidu.com/robots.txt';
$btime = time();

$rc = new muti_curl("request_callback");
$rc->timeout     = $timeout;
$rc->thread_size = $thread_size;
foreach ($newRes as $v) {
    $option_setting[CURLOPT_PROXY] = $v;
    $rc->add(new request_setting($url, "GET", null, null, $option_setting));
}
$rc->execute();

$etime    = time();
$usedtime = $etime - $btime;
echo 'all' . $sucesesnum . 'use' . $usedtime;

// Print the working proxies as a quoted, comma-separated list and as a
// var_export() dump.
$good_proxy = array_unique($good_proxy);
$str = '';
foreach ($good_proxy as $v) {
    $str .= "'" . trim($v) . "',";
}
$str = preg_replace('/\s+/', '', $str);
echo htmlspecialchars($str, ENT_NOQUOTES, 'UTF-8') . "\n";
echo htmlspecialchars(var_export($good_proxy, true), ENT_NOQUOTES, 'UTF-8');
// ---------------------------------------------------------------------------
// Helper functions. Of the functions below, the batch script above calls only getPos().
// ---------------------------------------------------------------------------
/**
 * Splits a proxy description of the form "ip:port@TYPE;location" into its
 * parts, e.g. "192.115.207.25:80@HTTP;Chengdu Sichuan Normal University".
 *
 * Missing parts yield empty strings, and every part is trimmed.
 *
 * @param string $proxyStr Proxy description.
 * @return array Array with the keys 'IP', 'port', 'type' and 'location'.
 */
function parseProxyInfo($proxyStr)
{
    $proxyStr = trim((string) $proxyStr);

    // "address@meta": address is "ip:port", meta is "TYPE;location".
    $halves  = explode('@', $proxyStr, 2);
    $address = $halves[0];
    $meta    = isset($halves[1]) ? $halves[1] : '';

    $hostPort = array_pad(explode(':', $address, 2), 2, '');
    $typeLoc  = array_pad(explode(';', $meta, 2), 2, '');

    return array(
        'IP'       => trim($hostPort[0]),
        'port'     => trim($hostPort[1]),
        'type'     => trim($typeLoc[0]),
        'location' => trim($typeLoc[1]),
    );
}
/**
 * Thin wrapper around strpos().
 *
 * @param string $haystack String to search in.
 * @param string $needle   Substring to look for.
 * @return int|false Zero-based offset of the first match, or false when absent.
 */
function getPos($haystack, $needle)
{
    return strpos($haystack, $needle);
}
/**
 * Verifies one proxy with the project's `proxy` class.
 *
 * Two modes are supported:
 *  - single:  IP and port are read from the global $params; the function
 *             returns a human-readable success / failure message.
 *  - collect: IP and port are read from $proxy_info_arr ('IP' and 'port');
 *             the function returns the raw result of proxy::check_proxy().
 *
 * Timeout and retry count come from the global $params in both modes, and
 * the verification URL from the global $config.
 *
 * @param string $model           Mode keyword; compared case-insensitively
 *                                after trimming.
 * @param array  $proxy_info_arr  Parsed proxy entry (collect mode only).
 * @return mixed Message string (single), check_proxy() result (collect), or
 *               null for an unknown mode.
 */
function check_proxy_is_useful($model, $proxy_info_arr = array())
{
    global $params, $config;

    // NOTE(review): the mode literals were garbled in the extracted source
    // ('Singles' here vs. 'Single ' at the call site); both spellings are
    // accepted until the canonical one is confirmed.
    $mode = strtolower(trim((string) $model));

    // NOTE(review): key assumed to be 'verify_url' (source shows 'verify _ url').
    $check_proxy_url = isset($config['verify_url']) ? $config['verify_url'] : null;
    $proxy_time_out  = isset($params['timeout']) ? intval(trim($params['timeout'])) : 0;
    $retry           = isset($params['retry']) ? intval(trim($params['retry'])) : 0;

    if ($mode === 'single' || $mode === 'singles') {
        $proxy_port = isset($params['port']) ? intval(trim($params['port'])) : 0;
        $proxy_ip   = isset($params['IP']) ? trim($params['IP']) : '';

        $proxy = new proxy($proxy_ip, $proxy_port, $check_proxy_url, $proxy_time_out, $retry);
        // check_proxy() returns a success string, or boolean false on failure.
        $result = $proxy->check_proxy();

        $proxy_str_success = '' . $proxy_ip . ':' . $proxy_port . '@' . 'http proxy verification successful! ';
        $proxy_str_failed  = '' . $proxy_ip . ':' . $proxy_port . '@' . 'http proxy verification failed! ';
        return $result != false ? $proxy_str_success : $proxy_str_failed;
    }

    if ($mode === 'collect') {
        $proxy_port = isset($proxy_info_arr['port']) ? intval(trim($proxy_info_arr['port'])) : 0;
        $proxy_ip   = isset($proxy_info_arr['IP']) ? trim($proxy_info_arr['IP']) : '';

        $proxy = new proxy($proxy_ip, $proxy_port, $check_proxy_url, $proxy_time_out, $retry);
        // check_proxy() returns a success string, or boolean false on failure.
        return $proxy->check_proxy();
    }

    return null;
}
/**
 * Checks the single proxy named by the global $params['IP'] and prints the
 * message returned by check_proxy_is_useful().
 *
 * @return void
 */
function get_single()
{
    global $params, $config;

    $proxy_ip = isset($params['IP']) ? trim($params['IP']) : '';
    if ($proxy_ip == '') {
        echo 'Enter the IP address !! ';
        return;
    }
    // NOTE(review): the mode keyword's spelling is garbled in the extracted
    // source — confirm it matches what check_proxy_is_useful() compares against.
    echo check_proxy_is_useful('single');
}
/**
 * Downloads the page named by the global $params['URL'], extracts the proxy
 * list that follows the page's table, verifies every entry and prints one
 * result line per proxy. Verified entries are stored in
 * $_SESSION['success_arr'] and handed to save_success_proxy().
 *
 * @return void
 */
function get_proxy_by_collect()
{
    global $params, $config;

    $params['URL'] = isset($params['URL']) ? trim($params['URL']) : '';
    // The URL is user input: accept only http(s) so that file_get_contents()
    // cannot be pointed at local files or other stream wrappers.
    if ($params['URL'] == '' || !preg_match('#\Ahttps?://#i', $params['URL'])) {
        echo 'Enter the url! ';
        return;
    }

    // The source page is GBK-encoded; convert it to UTF-8 before parsing.
    $raw = file_get_contents($params['URL']);
    $con = ($raw === false) ? false : iconv('gbk', 'utf-8', $raw);

    // NOTE(review): HTML tags inside the string literals were stripped when
    // this source was extracted. The pattern below captures everything after
    // the closing table, and <br> is assumed to be the entry separator —
    // confirm both against the original script.
    $result = '';
    if ($con !== false && preg_match('/<\/TBODY>\s*<\/TABLE>(.*)/s', $con, $arr)) {
        $result = strip_tags($arr[1], '<br>');
    }

    // NOTE(review): these labels look machine-translated; confirm them.
    $replace_arr1 = array('qq proxy: ', 'dn28. com ', 'qqip', 'qq proxy', 'qq proxy IP', 'proxy ip: ', 'Ip:', 'proxy IP ');
    $result = str_replace($replace_arr1, '', $result);
    $resArr = preg_split('/<br\s*\/?>|\r?\n/i', $result);

    echo 'proxy starts batch verification, and the entire process takes several minutes. ';
    unset($_SESSION['success_arr']);

    $success_arr = array();
    foreach ($resArr as $v) {
        if (trim($v) === '') {
            continue;   // nothing to verify on a blank entry
        }
        $info   = parseProxyInfo($v);
        $result = check_proxy_is_useful('collect', $info);

        $proxy_str_success = '' . $info['IP'] . ':' . $info['port'] . '@' . $info['type'] . 'Proxy verification successful! IP address: ' . $info['location'] . '';
        $proxy_str_failed  = '' . $info['IP'] . ':' . $info['port'] . '@' . $info['type'] . 'Proxy verification failed! IP address: ' . $info['location'] . '';

        if ($result != false) {
            echo $proxy_str_success;
            $_SESSION['success_arr'][] = $success_arr[] = $info;
        } else {
            echo $proxy_str_failed;
        }
        echo "\n";
    }

    if (count($success_arr) > 0) {
        save_success_proxy($success_arr);
        echo "\n[Save the verified proxy to your local computer] [I want to see historical data] ";
    } else {
        echo "\n[I want to see historical data] ";
    }
}
/**
 * Verifies the proxies pasted into the global $params['IP'] (one
 * "ip:port@TYPE;location" entry per line) and prints one result line per
 * proxy. Verified entries are stored in $_SESSION['success_arr'] and handed
 * to save_success_proxy().
 *
 * @return void
 */
function get_proxy_by_rule()
{
    global $params, $config;

    $result = $proxy_ip = isset($params['IP']) ? trim($params['IP']) : '';
    if ($proxy_ip == '') {
        echo 'Enter the IP address !! ';
        return;
    }

    // NOTE(review): these labels look machine-translated; confirm them.
    $replace_arr1 = array('qq proxy: ', 'dn28. com ', 'qqip', 'qq proxy', 'qq proxy IP', 'proxy ip: ', 'Ip:', 'proxy IP ');
    $result = str_replace($replace_arr1, '', $result);
    $resArr = explode("\n", $result);

    echo 'proxy starts batch verification, and the entire process takes several minutes. ';
    unset($_SESSION['success_arr']);

    $success_arr = array();
    foreach ($resArr as $v) {
        if (trim($v) === '') {
            continue;   // nothing to verify on a blank line
        }
        $info   = parseProxyInfo($v);
        $result = check_proxy_is_useful('collect', $info);

        // The entries are user input; escape them for the HTML response.
        $label = htmlspecialchars($info['IP'] . ':' . $info['port'] . '@' . $info['type'], ENT_NOQUOTES, 'UTF-8');
        $where = htmlspecialchars($info['location'], ENT_NOQUOTES, 'UTF-8');
        $proxy_str_success = '' . $label . 'Proxy verification successful! IP address: ' . $where . '';
        $proxy_str_failed  = '' . $label . 'Proxy verification failed! IP address: ' . $where . '';

        if ($result != false) {
            echo $proxy_str_success;
            $_SESSION['success_arr'][] = $success_arr[] = $info;
        } else {
            echo $proxy_str_failed;
        }
        // NOTE(review): presumably an HTML line break before extraction.
        echo "\n";
    }

    if (count($success_arr) > 0) {
        save_success_proxy($success_arr);
        echo "\n[Save to a php file] [Save the verified proxy to a local computer] [I want to see historical data] ";
    } else {
        echo "\n[I want to see historical data] ";
    }
}
/**
 * Appends the verified proxies to a log file below
 * $config['root_path'] . '/success_proxy/', one "ip:port@type;location"
 * line per proxy. The file name is derived from the current date and time
 * in the PRC timezone.
 *
 * @param array $success_arr List of arrays with the keys 'IP', 'port',
 *                           'type' and 'location'.
 * @return bool True when the data was written, false otherwise.
 */
function save_success_proxy($success_arr)
{
    global $config;

    // Note: this changes the default timezone for the rest of the request.
    date_default_timezone_set('PRC');

    $str = '';
    foreach ($success_arr as $v) {
        $str .= $v['IP'] . ':' . $v['port'] . '@' . $v['type'] . ';' . $v['location'] . "\n";
    }

    // NOTE(review): key assumed to be 'root_path' (source shows 'root _ path').
    $dir = $config['root_path'] . '/success_proxy';
    if (!is_dir($dir) && !mkdir($dir, 0777, true)) {
        trigger_error('save_success_proxy: cannot create ' . $dir, E_USER_WARNING);
        return false;
    }

    // NOTE(review): the date format is kept as it appears in the source;
    // 'h' is the 12-hour clock, so confirm that 'ymdhi' is intended.
    $file = $dir . '/' . date('ymdhi') . '.log';
    if (file_put_contents($file, $str, FILE_APPEND | LOCK_EX) === false) {
        trigger_error('save_success_proxy: cannot write ' . $file, E_USER_WARNING);
        return false;
    }
    return true;
}
- ?>
|