A simple web crawler implemented in PHP
\ '\ "\] *). *?> /'; $ Result = preg_match_all ($ reg_tag_a, $ web_content, $ match_result); if ($ result) {return $ match_result [1];}
// NOTE(review): the line above is the tail of a definition that begins before
// this point. It is reproduced unchanged because its beginning is not visible
// here; confirm its closing brace and variable names against the full source.

/**
 * Convert the URLs collected from a page into absolute URLs.
 *
 * Entries that already start with http:// or https:// are passed through
 * unchanged. Protocol-relative entries ("//host/x") inherit the scheme of
 * $base_url, root-relative entries ("/x") are attached to the scheme/host
 * part of $base_url, and every other entry is resolved against the
 * directory of $base_url's path.
 *
 * @param string $base_url URL of the page the entries were taken from
 * @param array  $url_list URL strings to normalise
 * @return array|null list of absolute URLs (possibly empty); null when
 *                    $url_list is not an array or $base_url has no
 *                    scheme/host to resolve against
 */
function _reviseUrl($base_url, $url_list)
{
    if (!is_array($url_list)) {
        return null;
    }

    $url_info = parse_url($base_url);
    if ($url_info === false || !isset($url_info['scheme'], $url_info['host'])) {
        // Without a scheme and host there is nothing to resolve against.
        return null;
    }

    // Rebuild "scheme://[user:pass@]host[:port]". The optional components are
    // only present in parse_url()'s result when the URL actually has them.
    $origin = $url_info['scheme'] . '://';
    if (isset($url_info['user'], $url_info['pass'])) {
        $origin .= $url_info['user'] . ':' . $url_info['pass'] . '@';
    }
    $origin .= $url_info['host'];
    if (isset($url_info['port'])) {
        $origin .= ':' . $url_info['port'];
    }

    // Directory of the base page: the path up to and including its last "/".
    // Always ends with "/" so that relative entries can be appended directly.
    $path = isset($url_info['path']) ? $url_info['path'] : '/';
    $last_slash = strrpos($path, '/');
    $base_dir = $origin . ($last_slash === false ? '/' : substr($path, 0, $last_slash + 1));

    $result = array();
    foreach ($url_list as $url_item) {
        if (preg_match('#^https?://#i', $url_item)) {
            // Already a complete URL.
            $result[] = $url_item;
        } elseif (strncmp($url_item, '//', 2) === 0) {
            // Protocol-relative: reuse the base page's scheme.
            $result[] = $url_info['scheme'] . ':' . $url_item;
        } elseif (strncmp($url_item, '/', 1) === 0) {
            // Root-relative: attach to scheme/host, ignoring the base path.
            $result[] = $origin . $url_item;
        } else {
            // Relative to the directory of the base page.
            $result[] = $base_dir . $url_item;
        }
    }

    return $result;
}

/**
 * Crawl a single page and return the absolute URLs found on it.
 *
 * Relies on _getUrlContent() and _filterUrl(), which are defined outside
 * this block; _getUrlContent() is presumably expected to return a falsy
 * value when the page cannot be fetched - confirm against its definition.
 *
 * @param string $url URL of the page to crawl
 * @return array|null non-empty list of absolute URLs, or null when the page
 *                    has no content or yields no URLs
 */
function crawler($url)
{
    $content = _getUrlContent($url);
    if (!$content) {
        return null;
    }

    $url_list = _reviseUrl($url, _filterUrl($content));

    return $url_list ? $url_list : null;
}

/**
 * Main program for testing.
 *
 * Uses a single text file as the crawl queue: URLs discovered on each page
 * are appended to the file through one handle while a second handle reads
 * the same file line by line to obtain the next URL to crawl. The loop ends
 * when the reader reaches the end of the file.
 *
 * @param string $start_url initial URL to crawl
 * @param string $list_file path of the file used to record the URL list
 * @return void
 */
function main($start_url = 'http://hao123.com/', $list_file = 'url.txt')
{
    $fp_puts = fopen($list_file, 'ab'); // appends discovered URLs
    if ($fp_puts === false) {
        return;
    }
    $fp_gets = fopen($list_file, 'r'); // reads back the next URL to crawl
    if ($fp_gets === false) {
        fclose($fp_puts);
        return;
    }

    $visited = array(); // URLs already crawled in this run
    $queued = array();  // URLs already written to the list in this run
    $current_url = $start_url;

    do {
        // Lines read back from the list still carry their "\r\n" terminator.
        $current_url = trim($current_url);
        if ($current_url === '' || isset($visited[$current_url])) {
            continue;
        }
        $visited[$current_url] = true;

        $result_url_arr = crawler($current_url);
        if ($result_url_arr) {
            foreach ($result_url_arr as $url) {
                if (isset($queued[$url])) {
                    // Do not record the same URL twice; otherwise pages that
                    // link to each other would grow the list without bound.
                    continue;
                }
                $queued[$url] = true;
                fputs($fp_puts, $url . "\r\n");
            }
            // Make the new lines visible to the reading handle right away.
            fflush($fp_puts);
        }
    } while (($current_url = fgets($fp_gets)) !== false); // continuously obtain the next URL

    fclose($fp_gets);
    fclose($fp_puts);
}

main();
?>