PHP web crawler

Source: Internet
Author: User
Tags: ereg

This example can crawl only a single page.

<?php
/**
 * Print every distinct http(s) URL found in the document at $url.
 *
 * Two patterns are applied to the downloaded text: one that captures the
 * full URL (path and query string included) and one that captures only the
 * scheme + host part. The two result lists are merged, de-duplicated and
 * echoed one per line, each followed by "<br/>".
 *
 * Only this single document is scanned; the links that are found are not
 * followed.
 *
 * @param string $url Location of the document to scan (anything
 *                    file_get_contents() can open).
 * @return void       Nothing is returned; the URLs are written to output.
 *                    Nothing is printed when the document cannot be read
 *                    or is empty.
 */
function Get_urls($url)
{
    // The document is fetched once and scanned twice; a second download
    // would only return the same bytes again.
    $html = file_get_contents($url);
    if ($html === false || $html === '') {
        return;
    }

    // Full URL. Quote characters are deliberately not part of the class so
    // the quote that closes an href="..." attribute is not captured.
    $fullUrlPattern = '/https?:\/\/[a-zA-Z0-9.?\/\-=&:+_]+/';
    // Scheme and host only.
    $hostPattern = '/https?:\/\/[a-zA-Z0-9.]+/';

    preg_match_all($hostPattern, $html, $hostMatches);
    preg_match_all($fullUrlPattern, $html, $fullMatches);

    // array_unique() keeps the original keys, so the result can have gaps;
    // it is walked with foreach instead of a 0..count()-1 index loop, which
    // would hit missing offsets and skip the last entries.
    $found = array_unique(array_merge($fullMatches[0], $hostMatches[0]));

    // The character classes above cannot match <, > or quotes, so the
    // matched text cannot introduce markup of its own into the output.
    foreach ($found as $link) {
        echo $link . "<br/>";
    }
}

Get_urls("http://www.yinghy.com");
?>

<?php
$string = Gethtmlcode("http://www.yinghy.com");
echo $string;

/**
 * Download a page with cURL and return its body.
 *
 * @param string $url Address of the page to fetch.
 * @return string|bool The response body, or false when curl_exec() fails.
 */
function Gethtmlcode($url)
{
    $ch = curl_init();                              // create the cURL handle
    curl_setopt($ch, CURLOPT_URL, $url);            // page to fetch
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);    // return the body instead of printing it
    // NOTE(review): CURLOPT_CONNECTTIMEOUT is expressed in seconds, so this
    // allows 1000 s to connect - confirm that is really intended.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 1000);
    $HtmlCode = curl_exec($ch);                     // run the request
    return $HtmlCode;
}

/**
 * Split an HTML document into one string per <a> element.
 *
 * Line breaks, square brackets, "<!-- ... //-->" script comments and every
 * tag other than <a>/</a> are removed first. Each returned string is what
 * stood between "<a" and "</a>", i.e. the attribute list, the closing ">"
 * of the opening tag and the link text, for example
 * ' href="/news/1.html">Title'.
 *
 * @param string $string HTML source.
 * @return array         List of anchor fragments; empty when there are none.
 */
function Getalllink($string)
{
    $string = str_replace(array("\r", "\n"), "", $string);

    // Remove [ and ] from the page.
    $string = str_replace(array("[", "]"), "", $string);

    // Remove JavaScript wrapped in HTML comments. The match is non-greedy so
    // the markup between two separate script blocks is kept.
    $string = preg_replace('/<!--.*?\/\/-->/s', "", $string);

    // Remove every tag except <a ...> and </a>. The look-ahead matters: a
    // pattern that merely rejects "a" as the first character after "<"
    // also swallows "</a>", leaving nothing to split on below.
    $string = preg_replace('/<(?!\/?a[\s>])[^<>]*>/i', "", $string);

    // Cut at every </a>, then keep what follows each "<a" inside the chunk.
    // Results are accumulated across all chunks rather than overwritten.
    $links = array();
    foreach (preg_split('/<\/a\s*>/i', $string) as $chunk) {
        $pieces = preg_split('/<a(?=[\s>])/i', $chunk);
        // $pieces[0] is the plain text in front of the anchor.
        foreach (array_slice($pieces, 1) as $piece) {
            $links[] = $piece;
        }
    }
    return $links;
}

/**
 * Build an HTML list of the links whose fragment contains a keyword.
 *
 * Each fragment that contains "href" (but not href='#') is tested against
 * every keyword. For each keyword found, one entry is produced with the
 * keyword highlighted in red in the link text. Duplicated entries are
 * dropped, site-relative href values are prefixed with $url, and every
 * entry is rendered as "<a ...>...</a><BR>".
 *
 * @param array  $test     Anchor fragments as returned by Getalllink().
 *                         Presumably GBK-encoded (they are converted to
 *                         UTF-8 unconditionally) - verify against the site
 *                         being crawled.
 * @param string $keywords Keywords separated by ";". Empty items are ignored.
 * @param string $url      Site root used for relative links; "http://" is
 *                         prepended when the value does not contain "http".
 * @return string          Concatenated links, "" when nothing matched.
 */
function Getusercarenews($test, $keywords, $url)
{
    $messTxt = "";
    $mess = array();
    $key = explode(";", $keywords);

    // Automatically add the scheme to the site address (limited: it only
    // checks whether "http" occurs anywhere in the value).
    if (strpos($url, "http") === false) {
        $url = "http://" . $url;
    }
    $base = rtrim($url, "/");

    foreach ($test as $fragment) {
        // Convert the encoding directly. Running downloaded text through
        // eval() to do this would let a crafted page execute PHP code.
        $converted = iconv('GBK', 'UTF-8', $fragment);
        if ($converted !== false) {
            $fragment = $converted;
        }

        // Skip fragments without a usable link.
        if (strpos($fragment, "href") === false || strpos($fragment, "href='#'") !== false) {
            continue;
        }

        // Highlight inside the link text only (everything after the first
        // ">"), so a keyword that also occurs in the URL cannot break it.
        $cut = strpos($fragment, ">");
        $head = ($cut === false) ? "" : substr($fragment, 0, $cut + 1);
        $text = ($cut === false) ? $fragment : substr($fragment, $cut + 1);

        foreach ($key as $keyword) {            // multiple keywords supported
            if ($keyword === "" || strpos($fragment, $keyword) === false) {
                continue;
            }
            // Literal replacement: keywords are plain text, not patterns.
            $mess[] = $head . str_replace(
                $keyword,
                "<font color=red>" . $keyword . "</font>",
                $text
            );
        }
    }

    // array_unique() leaves gaps in the keys, hence foreach below.
    $mess = array_unique($mess);

    foreach ($mess as $link) {
        // Prefix the site root to href values that have no scheme and are
        // not protocol-relative; absolute links are left untouched.
        $link = preg_replace_callback(
            '/(?<![\w-])href=(["\']?)\s*+(?![a-z][a-z0-9+.\-]*:|\/\/)\/*/i',
            function ($m) use ($base) {
                return "href=" . $m[1] . $base . "/";
            },
            $link,
            1
        );
        $messTxt .= "<a" . $link . "</a>";
        $messTxt .= "<BR>";
    }
    return $messTxt;
}

/**
 * Send an HTML e-mail through SMTP using PHPMailer.
 *
 * Author: luofei
 *
 * Prints a short status message; nothing is returned.
 *
 * @param string $to      Recipient address.
 * @param string $content Message body content (HTML), appended to a fixed greeting.
 * @return void
 */
function SendEmail($to, $content)
{
    // The global error_reporting() level is intentionally left alone:
    // restricting it to E_STRICT here would hide every real error for the
    // remainder of the request.
    date_default_timezone_set("Asia/Shanghai");     // set time zone

    require_once("class.phpmailer.php");
    require_once("class.smtp.php");

    // PHP property names are case-sensitive; the spellings below are the
    // ones PHPMailer declares.
    $mail = new PHPMailer();
    $mail->CharSet = "UTF-8";                       // avoids garbled non-ASCII text
    $mail->IsSMTP();                                // send through an SMTP server
    $mail->SMTPDebug = 1;                           // 1 = errors and messages, 2 = messages only
    $mail->SMTPSecure = "tls";                      // security protocol
    $mail->Host = "smtp.googlemail.com";            // SMTP server
    $mail->SMTPAuth = true;                         // enable SMTP authentication
    // NOTE(review): placeholder credentials - load real ones from
    // configuration instead of hard-coding them here.
    $mail->Username = "[email protected]";          // SMTP user name
    $mail->Password = "******";                     // SMTP password
    $mail->From = "[email protected]";              // sender address
    $mail->FromName = "Spider Service";             // sender name shown on the message
    $mail->AddAddress($to);                         // recipient address
    $mail->WordWrap = 50;                           // characters per line of the body
    $mail->IsHTML(true);                            // body is HTML
    $mail->Subject = "mail from spider.html";       // message subject
    $mail->Body = "<p> Hello! <BR> <p> This is what you're interested in </p> <BR> " . $content;

    if (!$mail->Send()) {                           // send report
        echo "Send mail error!";
    } else {
        echo "message sent successfully!";
    }
}
?>

  

PHP web crawler

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support; 6 Free Tickets per Quarter; Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.