Example 1: this script crawls only a single page and lists the http:// URLs found in it.
<?php
/**
 * Fetch a single page and print every distinct http:// URL found in it.
 *
 * Two patterns are applied to the downloaded markup: the first captures a
 * full URL (host, port, path and query string), the second captures only the
 * scheme-and-host prefix. Both result sets are merged, de-duplicated and
 * echoed one per line, each followed by " <br/> ".
 *
 * Only the page at $url is inspected; the links that are found are not
 * followed.
 *
 * @param string $url Address of the page to scan.
 * @return void The URLs are written with echo. Nothing is printed when the
 *              page cannot be read or is empty.
 */
function Get_urls($url)
{
    // One download is enough: both patterns run against the same markup.
    $content = file_get_contents($url);
    if ($content === false || $content === '') {
        return;
    }

    // Quote characters are deliberately not part of the class, so the closing
    // quote of an href="..." attribute is never captured as part of the URL.
    $fullUrlPattern = '/http:\/\/[a-zA-Z0-9.?\/\-=&:+_]+/';
    $hostPattern = '/http:\/\/[a-zA-Z0-9.]+/';

    $found = array();
    foreach (array($fullUrlPattern, $hostPattern) as $pattern) {
        if (preg_match_all($pattern, $content, $matches)) {
            $found = array_merge($found, $matches[0]);
        }
    }

    // array_unique() keeps the original keys, which leaves gaps in the index
    // sequence; foreach visits every remaining entry, whereas an index-based
    // for/count() loop would hit undefined offsets and stop short.
    foreach (array_unique($found) as $link) {
        // The text comes from a remote page and is written into HTML, so it
        // is escaped ("&" in a query string becomes "&amp;").
        echo htmlspecialchars($link, ENT_QUOTES, 'UTF-8') . " <br/> ";
    }
}

Get_urls("http://www.yinghy.com");
?>
<?php
$string = Gethtmlcode("http://www.yinghy.com");
echo $string;

/**
 * Download a page with cURL and return its body.
 *
 * @param string $url Address of the page to fetch.
 * @return string|false The response body, or false when the cURL handle
 *                      cannot be created or the transfer fails.
 */
function Gethtmlcode($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);            // page to crawl
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);    // return the body instead of printing it
    // NOTE(review): CURLOPT_CONNECTTIMEOUT is expressed in seconds; 1000 is
    // probably far longer than intended - confirm before lowering it.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 1000);

    $HtmlCode = curl_exec($ch);

    return $HtmlCode;
}

/**
 * Reduce a page to its anchor tags and split it into link fragments.
 *
 * Line breaks, square brackets, "<!-- ... //-->" script wrappers and every
 * tag other than <a ...> / </a> are removed. The remainder is cut at each
 * "</a>" and each "<a", so a fragment that came from a link looks like
 * ' href="...">link text' (without the leading "<a" and without "</a>"),
 * which is the form Getusercarenews() consumes. Text found between links is
 * returned as fragments of its own.
 *
 * @param string $string HTML source of a page.
 * @return array List of fragments in document order; an empty array when
 *               $string is not a string (for example false from a failed
 *               download).
 */
function Getalllink($string)
{
    if (!is_string($string)) {
        return array();
    }

    $string = str_replace(array("\r", "\n"), "", $string);
    // Remove [] from the page.
    $string = str_replace(array("[", "]"), "", $string);

    $cleanups = array(
        // JavaScript wrapped in HTML comments. Non-greedy, so two separate
        // script blocks do not swallow the markup between them.
        '~<!--.*?//-->~s',
        // Every tag that is not <a ...> or </a>. The look-ahead runs before
        // the optional slash is consumed, which keeps "</a>" intact and still
        // removes tags that merely start with "a" such as <abbr>.
        '~<(?!/?a[\s>])[^<>]*>~i',
    );
    foreach ($cleanups as $pattern) {
        $cleaned = preg_replace($pattern, "", $string);
        if ($cleaned !== null) {    // null signals a PCRE failure; keep the previous text
            $string = $cleaned;
        }
    }

    // Collect the fragments of every chunk, not only those of the last one.
    $pieces = array();
    foreach (preg_split('~</a\s*>~i', $string) as $chunk) {
        foreach (preg_split('~<a(?=[\s>])~i', $chunk) as $piece) {
            $pieces[] = $piece;
        }
    }

    return $pieces;
}

/**
 * Build the HTML list of links that mention at least one keyword.
 *
 * Each fragment is converted from GBK to UTF-8, kept only when it carries a
 * usable href and contains one of the keywords, and rebuilt into a complete
 * "<a ...>...</a>" element followed by "<BR>". Keywords are highlighted in
 * the link text only, never inside the tag. A relative href is prefixed with
 * the site root $url. Identical links are emitted once.
 *
 * @param array  $test     Link fragments as returned by Getalllink().
 * @param string $keywords Keywords separated by ";".
 * @param string $url      Site root; "http://" is prepended when no http(s)
 *                         scheme is present.
 * @return string Concatenated HTML, or "" when nothing matches.
 */
function Getusercarenews($test, $keywords, $url)
{
    // An empty keyword would match every fragment, so drop those up front.
    $keys = array();
    foreach (explode(";", $keywords) as $keyword) {
        if ($keyword !== '') {
            $keys[] = $keyword;
        }
    }
    if (!is_array($test) || count($keys) === 0) {
        return "";
    }

    // Make the site root absolute so the links stay valid inside an e-mail.
    if (!preg_match('~^https?://~i', $url)) {
        $url = "http://" . $url;
    }
    $root = rtrim($url, "/");

    // strtr() with a map replaces in one pass, so a keyword can never match
    // inside highlight markup that was inserted for another keyword.
    $highlight = array();
    foreach ($keys as $keyword) {
        $highlight[$keyword] = "<font color=red>" . $keyword . "</font>";
    }

    $links = array();
    foreach ($test as $piece) {
        if (!is_string($piece)) {
            continue;
        }

        // Convert directly instead of eval()-ing scraped content.
        // NOTE(review): assumes the crawled page is GBK encoded - confirm.
        $converted = iconv('GBK', 'UTF-8', $piece);
        if ($converted !== false) {
            $piece = $converted;
        }

        // Locate the href value, whether double-quoted, single-quoted or bare.
        $hrefPattern = '~href\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))~i';
        if (!preg_match($hrefPattern, $piece, $m, PREG_OFFSET_CAPTURE)) {
            continue;
        }
        $href = $m[1][0];
        $hrefOffset = $m[1][1];
        if ($href === '' || $href === '#') {
            continue;    // placeholder link
        }

        // Support multiple keywords: one match is enough to select the link.
        $matched = false;
        foreach ($keys as $keyword) {
            if (strpos($piece, $keyword) !== false) {
                $matched = true;
                break;
            }
        }
        if (!$matched) {
            continue;
        }

        // Prefix a relative href with the site root; anything that already
        // has a scheme ("http:", "mailto:", ...) or starts with "//" is kept.
        if (!preg_match('~^(?:[a-z][a-z0-9+.\-]*:|//)~i', $href)) {
            $absolute = $root . "/" . ltrim($href, "/");
            // Offsets from preg_match() are byte offsets, hence strlen().
            $piece = substr_replace($piece, $absolute, $hrefOffset, strlen($href));
            $href = $absolute;
        }

        // Split at the end of the opening tag so that highlighting touches
        // the visible text only and cannot corrupt the URL.
        $tagEnd = strpos($piece, ">", $hrefOffset + strlen($href));
        if ($tagEnd === false) {
            continue;    // the opening tag never closes; not a usable link
        }
        $head = substr($piece, 0, $tagEnd + 1);
        $text = (string) substr($piece, $tagEnd + 1);

        $links[] = "<a" . $head . strtr($text, $highlight) . "</a>";
    }

    // array_unique() leaves gaps in the keys, so iterate with foreach.
    $messTxt = "";
    foreach (array_unique($links) as $link) {
        $messTxt .= $link . "<BR>";
    }

    return $messTxt;
}

/**
 * Send the collected links as an HTML e-mail through PHPMailer over SMTP.
 *
 * The outcome is reported with echo. The account name, password and sender
 * address below are placeholders and must be replaced with real values.
 *
 * author: luofei
 *
 * @param string $to      Recipient address.
 * @param string $content HTML fragment appended to the message body.
 * @return void
 */
function SendEmail($to, $content)
{
    // NOTE(review): this changes the error-reporting level for the rest of
    // the request, not only for this function. Kept because callers may rely
    // on it.
    error_reporting(E_STRICT);
    date_default_timezone_set("Asia/Shanghai");

    require_once("class.phpmailer.php");
    require_once("class.smtp.php");

    // PHPMailer property names are case-sensitive; a misspelt name silently
    // creates an unrelated property and the setting is ignored.
    $mail = new PHPMailer();
    $mail->CharSet = "UTF-8";               // keeps non-ASCII text from being garbled
    $mail->IsSMTP();                        // deliver through an SMTP server
    $mail->SMTPDebug = 1;                   // 1 = errors and messages, 2 = messages only
    $mail->SMTPSecure = "tls";              // security protocol (PHPMailer compares in lower case)
    $mail->Host = "smtp.googlemail.com";    // SMTP server
    $mail->SMTPAuth = true;                 // enable SMTP authentication
    $mail->Username = "[email protected]";   // SMTP user name (placeholder)
    $mail->Password = "******";             // SMTP password (placeholder)
    $mail->From = "[email protected]";       // sender address (placeholder)
    $mail->FromName = "Spider Service";     // sender name shown on the message
    $mail->AddAddress($to);                 // recipient
    $mail->WordWrap = 50;                   // characters per line of the body
    $mail->IsHTML(true);                    // the body is HTML
    $mail->Subject = "mail from spider.html";
    $mail->Body = "<p> Hello! <BR> <p> This is what you're interested in </p> <BR> " . $content . " ";

    if (!$mail->Send()) {
        echo "Send mail error!";
    } else {
        echo "message sent successfully! ";
    }
}
?>
PHP web crawler