<?php
Class Spider {
// Presumably the MySQL connection settings (host, user name, password, database) - confirm against callers.
var $mysql_host;
var $mysql_name;
var $mysql_pwd;
var $mysql_db;
var $parentUrl; // URL to start the search from
var $searchNum; // Number of link layers to search
var $url;
var $db;
// Database connection function
/**
 * Open a connection to a MySQL server and return the link.
 *
 * NOTE: this relies on the legacy mysql extension (mysql_connect), which was
 * removed in PHP 7. It is kept here because Queryurl() in this class uses the
 * same extension (mysql_select_db / mysql_query); migrating one without the
 * other would break them.
 *
 * @param string $mysql_host Server host name (optionally with port).
 * @param string $mysql_name User name.
 * @param string $mysql_pwd  Password.
 * @return resource|false MySQL link identifier, or false if the connection failed.
 */
function connect_to_db($mysql_host, $mysql_name, $mysql_pwd) {
    return mysql_connect($mysql_host, $mysql_name, $mysql_pwd);
}
// Normalize a URL so that it conforms to the expected format
/**
 * Normalize a URL: make sure it carries a scheme and drop one trailing slash.
 *
 * @param string $url URL with or without a leading "http://" / "https://".
 * @return string The URL prefixed with "http://" when it had no scheme, and
 *                without a single trailing "/".
 */
function Dealurl($url) {
    // The scheme must be tested at the START of the string and must accept
    // https as well; a plain substring test would turn "https://host" into
    // "http://https://host" and would be fooled by "http://" inside a query.
    if (!preg_match('#^https?://#i', $url)) {
        $url = "http://" . $url;
    }

    // Strip exactly one trailing slash so "host/" and "host" compare equal.
    if (substr($url, -1) === '/') {
        $url = substr($url, 0, -1);
    }

    return $url;
}
// Collect all the links found on the page at the given URL
/**
 * Fetch a page and collect the links found in its href attributes.
 *
 * Every link found is echoed (HTML-escaped) followed by "<br>". A link is
 * added to the result when it contains one of the accepted extension/TLD
 * markers, does not contain "_bak", and has not been collected already.
 * Links without an http(s) scheme are prefixed with "$url/".
 *
 * @param string $url Location of the page to read (anything file() accepts).
 * @return string Comma-separated list whose first element is the sentinel
 *                "Succeed", followed by the collected links. Only "Succeed"
 *                is returned when the page cannot be read or has no links.
 */
function GetUrl($url) {
    $links = array();

    $lines = file($url);
    if ($lines === false) {
        // Unreadable page: report "no links" instead of iterating over false.
        return "Succeed";
    }

    // Substrings that mark a link as worth following.
    $markers = array(".php", ".asp", ".jsp", ".htm", ".com", ".cn", ".net", ".org");

    // href = optional quote, URL characters, then anything up to whitespace,
    // '>' or a closing quote. PCRE replaces the removed POSIX eregi().
    $pattern = '#href\s*=\s*["\']?([[:alnum:]:@/._-]+[^\s>"\']*)#i';

    foreach ($lines as $line) {
        if (!preg_match_all($pattern, $line, $matches)) {
            continue;
        }

        foreach ($matches[1] as $link) {
            // Relative links are resolved naively against the page URL.
            if (!preg_match('#^https?://#i', $link)) {
                $link = $url . "/" . $link;
            }

            // The link comes from a remote page: escape it before printing.
            echo htmlspecialchars($link, ENT_QUOTES) . "<br>";

            // Exact-match de-duplication; backup copies are never followed.
            if (in_array($link, $links, true) || strpos($link, "_bak") !== false) {
                continue;
            }

            foreach ($markers as $marker) {
                if (strpos($link, $marker) !== false) {
                    $links[] = $link;
                    break;
                }
            }
        }
    }

    return implode(",", array_merge(array("Succeed"), $links));
}
// Check whether the URL needs to be searched again
function Queryurl ($url, $contentDesc, $db) {
mysql_select_db ("SearchEngine");
$sql = "SELECT * from visited where Visitedurl= '". $url. "' and contentdesc= '". $contentDesc. "'";
$rs =mysql_query ($sql, $db);
if (Mysql_fetch_row ($rs)) {
return false;
}else{
return true;