<?php
# Minimal same-site link spider (command-line script).
#
# Usage: php <this-script> <start-url>
#
# Starting from <start-url>, every fetched page is scanned for <a href> links.
# Links are made absolute, restricted to the start URL's site, de-duplicated
# against the URL list file, and appended to that file. The same file is read
# back line by line as the crawl queue, so the crawl ends when no unread line
# remains.

/**
 * Load a page.
 *
 * The response headers are included in the returned text (CURLOPT_HEADER).
 *
 * @param string $url URL to fetch.
 * @return string|null Raw response (headers + body), or null when the
 *                     request failed, returned nothing, or answered 404.
 */
function curl_get($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return null;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 1);
    // Bound the request so one unresponsive host cannot stall the whole crawl.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $result = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    // Release the handle on every path, before returning.
    curl_close($ch);

    if ((int) $code !== 404 && $result) {
        return $result;
    }
    return null;
}

/**
 * Extract the href targets of the anchor tags in a page.
 *
 * @param string|null $spider_page_result Page text as returned by curl_get().
 * @param string      $base_url           Unused; kept for call compatibility.
 * @return array|null List of raw href values in document order, or null when
 *                    the page is empty or contains no links.
 */
function get_page_urls($spider_page_result, $base_url)
{
    if (!is_string($spider_page_result) || $spider_page_result === '') {
        return null;
    }
    // <a ... href=VALUE>: VALUE may be double-quoted, single-quoted or bare.
    // "\b" after "a" rejects <abbr>/<area>; "\s" before "href" rejects
    // attributes such as data-href; "[^>]" keeps the match inside one tag.
    $pattern = '/<a\b[^>]*?\shref\s*=\s*["\']?([^"\'\s>]*)/i';
    $found = preg_match_all($pattern, $spider_page_result, $out);
    if ($found) {
        return $out[1];
    }
    return null;
}

/**
 * Convert relative paths to absolute URLs.
 *
 * Relative links are resolved against the site root ($base_url), not against
 * the directory of the page they were found on.
 *
 * @param string     $base_url Site base, e.g. "http://example.com" (no trailing slash).
 * @param array|null $url_list Raw href values.
 * @return array|null List of URLs, or null when $url_list is not an array.
 */
function xdtojd($base_url, $url_list)
{
    if (!is_array($url_list)) {
        return null;
    }

    $base_scheme = parse_url($base_url, PHP_URL_SCHEME);
    if (!$base_scheme) {
        $base_scheme = 'http';
    }

    $result_url_list = [];
    foreach ($url_list as $url_item) {
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url_item)) {
            // Already carries a scheme (http:, https:, javascript:, mailto:, ...):
            // pass through unchanged.
            $result_url_list[] = $url_item;
        } elseif (strncmp($url_item, '//', 2) === 0) {
            // Protocol-relative link: it names its own host, so only the scheme is borrowed.
            $result_url_list[] = $base_scheme . ':' . $url_item;
        } elseif (strncmp($url_item, '/', 1) === 0) {
            $result_url_list[] = $base_url . $url_item;
        } else {
            $result_url_list[] = $base_url . '/' . $url_item;
        }
    }
    return $result_url_list;
}

/**
 * Remove URLs that belong to other sites.
 *
 * Prints each candidate URL to standard output as it is examined.
 *
 * @param array|null $jd_url_list Absolute URLs.
 * @param string     $url_base    Site base, e.g. "http://example.com".
 * @return array|null URLs located on $url_base, or null when $jd_url_list is
 *                    not an array.
 */
function other_site_url_del($jd_url_list, $url_base)
{
    if (!is_array($jd_url_list)) {
        return null;
    }

    $all_url_list = [];
    $base_len = strlen($url_base);
    foreach ($jd_url_list as $all_url) {
        echo $all_url;
        if ($base_len === 0 || strncmp($all_url, $url_base, $base_len) !== 0) {
            continue;
        }
        // The base must end at a URL boundary; a bare prefix test would also
        // accept "http://example.com.evil.test/" for "http://example.com".
        $next = (string) substr($all_url, $base_len, 1);
        if ($next === '' || $next === '/' || $next === '?' || $next === '#') {
            $all_url_list[] = $all_url;
        }
    }
    return $all_url_list;
}

/**
 * Remove duplicate URLs.
 *
 * Two URLs count as the same when they differ only in query parameter values,
 * i.e. only "same parameters" de-duplication is performed. Candidates are
 * compared against the URLs already recorded in $seen_file and against
 * earlier candidates of the same batch; the first occurrence wins.
 *
 * @param array|null $array_url Candidate URLs.
 * @param string     $seen_file Path of the recorded-URL list, one URL per line.
 * @return array|null Candidates not seen before, or null when $array_url is
 *                    not an array.
 */
function url_same_del($array_url, $seen_file = '/tmp/url.txt')
{
    if (!is_array($array_url)) {
        return null;
    }

    // Set of normalised URLs already known (keys only, for O(1) lookup).
    $seen = [];
    if (is_readable($seen_file)) {
        $recorded = file($seen_file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($recorded !== false) {
            foreach ($recorded as $line) {
                $line = rtrim($line, "\r\n");
                if ($line !== '') {
                    $seen[preg_replace('/=[^&]*/', '=leesec', $line)] = true;
                }
            }
        }
    }

    $insert_url = [];
    foreach ($array_url as $candidate) {
        // Blank out every parameter value so only the parameter names matter.
        $normalised = preg_replace('/=[^&]*/', '=leesec', $candidate);
        if (isset($seen[$normalised])) {
            continue;
        }
        $seen[$normalised] = true;
        $insert_url[] = $candidate;
    }
    return $insert_url;
}

/**
 * Derive "scheme://host[:port]" from a URL.
 *
 * @param string $url Any URL; the scheme defaults to http when absent.
 * @return string|null The site base, or null when no host can be determined.
 */
function site_base_url($url)
{
    $parts = parse_url($url);
    if ($parts === false || !isset($parts['host'])) {
        // "example.com/path" parses as a bare path; retry with a scheme.
        $parts = parse_url('http://' . $url);
    }
    if ($parts === false || !isset($parts['host'])) {
        return null;
    }

    $scheme = (isset($parts['scheme']) && $parts['scheme'] !== '') ? $parts['scheme'] : 'http';
    $base = $scheme . '://' . $parts['host'];
    if (isset($parts['port'])) {
        $base .= ':' . $parts['port'];
    }
    return $base;
}

/**
 * Crawl a site starting at $start_url.
 *
 * $url_file is both the record of discovered URLs and the crawl queue: new
 * URLs are appended through one handle while a second handle reads the file
 * from its beginning, one URL per iteration.
 *
 * @param string $start_url First page to fetch.
 * @param string $url_file  Path of the URL list file.
 * @return bool False when the site base or the URL file is unusable, true otherwise.
 */
function crawl_site($start_url, $url_file = '/tmp/url.txt')
{
    $url_base = site_base_url($start_url);
    if ($url_base === null) {
        file_put_contents('php://stderr', "Cannot determine host of: {$start_url}\n");
        return false;
    }

    $fp_puts = fopen($url_file, 'ab'); // records the URL list
    $fp_gets = fopen($url_file, 'rb'); // reads the URL list back as the queue
    if ($fp_puts === false || $fp_gets === false) {
        if ($fp_puts !== false) {
            fclose($fp_puts);
        }
        if ($fp_gets !== false) {
            fclose($fp_gets);
        }
        file_put_contents('php://stderr', "Cannot open URL list file: {$url_file}\n");
        return false;
    }

    $current_url = $start_url;
    while ($current_url !== false) {
        $spider_page_result = curl_get($current_url);
        $url_list = get_page_urls($spider_page_result, $url_base);
        if ($url_list) {
            $jd_url_list = xdtojd($url_base, $url_list);
            $result_url_arr = other_site_url_del($jd_url_list, $url_base);
            var_dump($result_url_arr);
            $result_url_arr = url_same_del($result_url_arr, $url_file);
            if (is_array($result_url_arr)) {
                foreach ($result_url_arr as $new_url) {
                    fwrite($fp_puts, $new_url . "\r\n");
                }
                // Push the batch out so the read handle can pick it up.
                fflush($fp_puts);
            }
        }

        // Keep getting URLs: take the next non-empty line, without its line
        // ending (a URL with a trailing CRLF cannot be fetched).
        do {
            $line = fgets($fp_gets);
        } while ($line !== false && trim($line) === '');
        $current_url = ($line === false) ? false : trim($line);
    }

    fclose($fp_gets);
    fclose($fp_puts);
    return true;
}

if (isset($argv[1]) && $argv[1] !== '') {
    crawl_site($argv[1]);
} else {
    $script_name = isset($argv[0]) ? $argv[0] : 'spider.php';
    file_put_contents('php://stderr', "Usage: php {$script_name} <start-url>\n");
}