The following is an example of a PHP crawler program that imitates the Baidu spider. I won't analyze whether the code is well written; if you need something like it, feel free to use it as a reference. The basic functionality is implemented, so if you are interested, give the script a try.
Known shortcomings of the script: 1. static pages are not de-duplicated; 2. content produced by JavaScript on the page is not processed. A possible content-hash workaround for the first point is sketched just below.
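For the first shortcoming, one possible workaround (not part of the original script) is to de-duplicate static pages by a hash of their content instead of by URL. This is only a sketch; page_already_seen() and /tmp/page_hash.txt are hypothetical names, and the page body is assumed to come from the script's curl_get():

<?php
// Hypothetical helper, not in the original script: returns true when a
// page body has been seen before, keyed by the md5 of its content.
function page_already_seen($spider_page_result) {
    $seen_file = "/tmp/page_hash.txt"; // assumed location, alongside /tmp/url.txt
    $hash = md5($spider_page_result);
    $seen = file_exists($seen_file) ? file($seen_file, FILE_IGNORE_NEW_LINES) : array();
    if (in_array($hash, $seen)) {
        return true; // identical static content was crawled before
    }
    file_put_contents($seen_file, $hash . "\r\n", FILE_APPEND);
    return false;
}
?>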
The full example code follows:
<?php
// The opening of the code (the curl_get() helper and the start of
// get_page_urls()) was truncated in the original post; it is reconstructed
// minimally here so the script runs.

// fetch a page over http(s), pretending to be the Baidu spider
function curl_get($url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
    $result = curl_exec($ch);
    curl_close($ch);
    return $result;
}

// collect every <a href="..."> target on the page
function get_page_urls($spider_page_result, $base_url) {
    $get_url_result = preg_match_all("/<[a|A].*?href=[\'\"]{0,1}([^>\'\"\ ]*).*?>/", $spider_page_result, $out);
    if ($get_url_result) {
        return $out[1];
    } else {
        return;
    }
}

// convert relative paths to absolute urls
function xdtojd($base_url, $url_list) {
    if (is_array($url_list)) {
        foreach ($url_list as $url_item) {
            if (preg_match("/^(http:\/\/|https:\/\/|javascript:)/", $url_item)) {
                $result_url_list[] = $url_item;
            } else {
                if (preg_match("/^\//", $url_item)) {
                    $real_url = $base_url . $url_item;
                } else {
                    $real_url = $base_url . "/" . $url_item;
                }
                // $real_url = 'http://www.sumpay.cn/' . $url_item;
                $result_url_list[] = $real_url;
            }
        }
        return $result_url_list;
    } else {
        return;
    }
}

// drop urls that point at other sites
function other_site_url_del($jd_url_list, $url_base) {
    if (is_array($jd_url_list)) {
        foreach ($jd_url_list as $all_url) {
            echo $all_url;
            if (strpos($all_url, $url_base) === 0) {
                $all_url_list[] = $all_url;
            }
        }
        return $all_url_list;
    } else {
        return;
    }
}

// drop urls that are already recorded in /tmp/url.txt
function url_same_del($array_url) {
    if (is_array($array_url)) {
        $insert_url = array();
        $pizza = file_get_contents("/tmp/url.txt");
        if ($pizza) {
            $pizza = explode("\r\n", $pizza);
            foreach ($array_url as $array_value_url) {
                if (!in_array($array_value_url, $pizza)) {
                    $insert_url[] = $array_value_url;
                }
            }
            if ($insert_url) {
                foreach ($insert_url as $key => $insert_url_value) {
                    // urls that differ only in parameter values count as the same:
                    // replace every value with one placeholder before comparing
                    $update_insert_url = preg_replace('/=[^&]*/', '=leesec', $insert_url_value);
                    foreach ($pizza as $pizza_value) {
                        $update_pizza_value = preg_replace('/=[^&]*/', '=leesec', $pizza_value);
                        if ($update_insert_url == $update_pizza_value) {
                            unset($insert_url[$key]);
                            continue;
                        }
                    }
                }
            }
        } else {
            // first run: nothing recorded yet, dedup only within this batch
            $insert_url = array();
            $insert_new_url = array();
            $insert_url = $array_url;
            foreach ($insert_url as $insert_url_value) {
                $update_insert_url = preg_replace('/=[^&]*/', '=leesec', $insert_url_value);
                $insert_new_url[] = $update_insert_url;
            }
            $insert_new_url = array_unique($insert_new_url);
            foreach ($insert_new_url as $key => $insert_new_url_val) {
                $insert_url_bf[] = $insert_url[$key];
            }
            $insert_url = $insert_url_bf;
        }
        return $insert_url;
    } else {
        return;
    }
}

$current_url = $argv[1];
$fp_puts = fopen("/tmp/url.txt", "ab"); // appends newly found urls to the list
$fp_gets = fopen("/tmp/url.txt", "r");  // reads back urls still to be crawled
$url_base_url = parse_url($current_url);
if (empty($url_base_url['scheme'])) {
    $url_base = "http://" . $url_base_url['host'];
} else {
    $url_base = $url_base_url['scheme'] . "://" . $url_base_url['host'];
}
do {
    $spider_page_result = curl_get($current_url);
    // var_dump($spider_page_result);
    $url_list = get_page_urls($spider_page_result, $url_base);
    // var_dump($url_list);
    if (!$url_list) {
        continue;
    }
    $jd_url_list = xdtojd($url_base, $url_list);
    // var_dump($jd_url_list);
    $result_url_arr = other_site_url_del($jd_url_list, $url_base);
    var_dump($result_url_arr);
    $result_url_arr = url_same_del($result_url_arr);
    // var_dump($result_url_arr);
    if (is_array($result_url_arr)) {
        $result_url_arr = array_unique($result_url_arr);
        foreach ($result_url_arr as $new_url) {
            fputs($fp_puts, $new_url . "\r\n");
        }
    }
} while ($current_url = trim(fgets($fp_gets, 1024), "\r\n")); // keep pulling urls until the list is exhausted
preg_match_all("/<a[^>]+href=[\"']([^\"']+)[\"'][^>]+>/", $spider_page_result, $out); // echo a href
// var_dump($out[1]);
?>
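To try it, save the code (say as spider.php, an arbitrary name) and pass the starting URL as the first command-line argument, since the script reads it from $argv[1]:

    php spider.php http://www.example.com/

Discovered URLs accumulate in /tmp/url.txt: the script appends new finds through $fp_puts and reads the next URL to crawl through $fp_gets, so the do-while loop keeps running until every recorded URL has been visited.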