0x01
The spider fetches the target page with file_get_contents()/fopen() and extracts links by regular-expression matching (seemingly the simplest approach; noted here so it can be extended later). A file_get_contents() variant is sketched after the code below.
0x02
Usage: php spider.php www.baidu.com (the extracted URLs are saved to a file named www.baidu.com).
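For example, a run looks like this (the links shown are placeholders; the actual output depends on the page that is fetched):

php spider.php www.baidu.com
Get URLS success...

and the file www.baidu.com then contains one link per line in the format produced by the script:

URL1:http://www.baidu.com/some/path
URL2:http://example.com/another/link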
Code:
-----------------
<?php
// 2015-4-16 @developd
// Grab the URLs in a web page and save them to a file
// usage: php spider.php URL
if ($argc !== 2) {
    echo "No target...\r\n";
    echo "usage: php spider.php URL\r\n";
    echo "Eg: php spider.php www.baidu.com\r\n";
    exit();
} else {
    $url = $argv[1];
}
if (empty($url)) {
    echo "URL Error\r\n";
    exit();
}

// The output file is named after the target
$filename = "$url";
$url = "http://" . $url;
if (!file_exists($filename)) {
    $file = fopen($filename, "a+");
    fclose($file);
}

// Site root, i.e. everything before the first '/' after "http://"
$pos  = strpos($url, '/', 8);
$site = ($pos === false) ? $url : substr($url, 0, $pos);
$base = substr($url, 0, strpos($url, '/') + 1);   // scheme prefix (unused below)

// Fetch the page (requires allow_url_fopen)
$fp = fopen($url, 'r');
$contents = '';
while (!feof($fp)) {
    $contents .= fread($fp, 1024);
    // var_dump($contents);
}

// Match href="..." / href='...' and capture the link
$pattern = "|href=['\"]?([^'\"]+)['\"]|U";
preg_match_all($pattern, $contents, $regarr, PREG_SET_ORDER);

for ($i = 0; $i < count($regarr); $i++) {
    // Prefix site-relative links (starting with '/') with the site root
    if (substr($regarr[$i][1], 0, 1) == "/") {
        $data = "URL" . ($i + 1) . ":" . $site . $regarr[$i][1] . PHP_EOL;
    } else {
        $data = "URL" . ($i + 1) . ":" . $regarr[$i][1] . PHP_EOL;
    }
    $res = file_put_contents($filename, $data, FILE_APPEND);
}

if (empty($res)) {
    echo "No urls...\r\n";
} else {
    echo "Get URLS success...\r\n";
}
fclose($fp);
?>
-----------------
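As mentioned at the top, file_get_contents() can replace the fopen()/fread() loop for fetching the page. Below is a minimal sketch of that variant, not part of the original script; the failure check is an assumption about how fetch errors might be handled, and the regex and output format are copied from the code above.
-----------------
<?php
// Sketch: fetch the page with file_get_contents() instead of fopen()/fread()
$url = "http://" . $argv[1];
$contents = file_get_contents($url);   // returns false on failure (requires allow_url_fopen)
if ($contents === false) {
    echo "Fetch failed...\r\n";
    exit();
}
// Same href extraction as the main script
$pattern = "|href=['\"]?([^'\"]+)['\"]|U";
preg_match_all($pattern, $contents, $regarr, PREG_SET_ORDER);
foreach ($regarr as $i => $m) {
    echo "URL" . ($i + 1) . ":" . $m[1] . PHP_EOL;
}
?>
-----------------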
A simple PHP spider implementation for grabbing URLs