In data acquisition and page analysis, you often need to crawl the content of a given URL, or the content of the pages it links to at the second or third level of depth.
Here is a test implementation, for reference only.
The code is as follows:
/**
 * Extract the target URL of every <a> tag found in an HTML document.
 *
 * Absolute http(s) links are returned unchanged; every other href is treated
 * as relative to the site root and is prefixed with $host.
 *
 * @param string $host     Scheme and host to prepend to relative links,
 *                         e.g. "http://example.com".
 * @param string $document HTML source to scan.
 * @return array List of link URLs in document order (empty when none found).
 */
function Match_links ($host, $document) {
    // Group 1: optional opening quote; group 2: href value when quoted;
    // group 3: href value when unquoted; group 4: anchor text (not used here).
    // The conditional (?(1)...|...) picks the quoted or unquoted branch.
    $pattern = '~<\s*a\s.*?href\s*=\s*(["\'])?(?(1)(.*?)\1|([^\s>]+))[^>]*>?(.*?)</a>~is';

    // preg_match_all() yields 0 for "no match" and false on error; both mean
    // there is nothing to return.
    if (!preg_match_all($pattern, (string) $document, $links)) {
        return array();
    }

    // Normalise the base so that joining never produces "hostpath" or "host//path".
    $base = rtrim((string) $host, '/');
    $result = array();

    foreach ($links[0] as $i => $whole) {
        // Exactly one of groups 2/3 is non-empty for a given match.
        $href = $links[2][$i] !== '' ? $links[2][$i] : $links[3][$i];
        $href = trim($href);
        if ($href === '') {
            continue;
        }

        // Anchor the scheme test: a relative path that merely contains
        // "http" somewhere must still be resolved against the host.
        if (preg_match('~^https?://~i', $href)) {
            $result[] = $href;
        } else {
            $result[] = $base . '/' . ltrim($href, '/');
        }
    }

    return $result;
}
/**
 * Fetch a page and return only the Chinese (CJK ideograph) text it contains.
 *
 * The page is read with file_get_contents(), converted from GBK to UTF-8
 * when it is not already valid UTF-8, stripped of HTML tags, and reduced to
 * the runs of characters in the U+4E00..U+9FFF range, joined with ", ".
 *
 * @param string $url Location of the page to read.
 * @return string|null The extracted text, or NULL when the page cannot be
 *                     read or contains no such characters.
 */
function Get_content_from_url ($url) {
    $str = @file_get_contents($url);
    if ($str === false || $str === '') {
        return NULL;
    }

    // Valid UTF-8 Chinese text frequently also validates as GBK, so only
    // convert when the bytes are NOT already UTF-8; otherwise the text
    // would be re-encoded into garbage.
    if (!mb_check_encoding($str, 'UTF-8') && mb_check_encoding($str, 'GBK')) {
        $converted = @iconv('GBK', 'UTF-8', $str);
        if ($converted !== false) {
            $str = $converted;
        }
    }

    $str = strip_tags($str); // Filter HTML tags

    // Alternative, regex-based tag filtering (disabled):
    // $str = preg_replace("@<script(.*?)</script>@is", "", $str);
    // $str = preg_replace("@<iframe(.*?)</iframe>@is", "", $str);
    // $str = preg_replace("@<style(.*?)</style>@is", "", $str);
    // $str = preg_replace("@<(.*?)>@is", "", $str);

    // Filter non-kanji characters. With /u the call returns false on
    // malformed UTF-8 and 0 when nothing matches; both give NULL.
    if (!preg_match_all('/[\x{4e00}-\x{9fff}]+/u', $str, $matches)) {
        return NULL;
    }

    $str = join(', ', $matches[0]);
    if (!$str) {
        return NULL;
    }
    return $str;
}
/**
 * Collect the text of a page and, recursively, of the pages it links to.
 *
 * With $depth == 1 only the page at $url is read. With a larger depth the
 * links found on the page are followed first (each with $depth - 1), and
 * the text of the page itself is appended last.
 *
 * @param string $url   Page to start from.
 * @param int    $depth Number of levels to crawl; must be >= 1.
 * @return string|false Concatenated text (possibly empty), or false when
 *                      the arguments are invalid or, for $depth > 1, the
 *                      page cannot be read.
 */
function Get_content ($url, $depth) {
    if (!$url || $depth < 1) {
        return false;
    }

    $content = '';

    // A single pass per call: each recursion level handles exactly one page,
    // so this is a conditional, not a loop.
    if ($depth > 1) {
        $str = @file_get_contents($url);
        if (!$str) {
            return false;
        }

        // Build "scheme://host[:port]" for resolving relative links; leave
        // it empty when the URL has no host component.
        $host = '';
        $parseurl = parse_url($url);
        if (is_array($parseurl) && !empty($parseurl['host'])) {
            $scheme = !empty($parseurl['scheme']) ? $parseurl['scheme'] : 'http';
            $host = $scheme . '://' . $parseurl['host'];
            if (!empty($parseurl['port'])) {
                $host .= ':' . $parseurl['port'];
            }
        }

        $arrlink = Match_links($host, $str);
        if (!is_array($arrlink)) {
            $arrlink = array();
        }
        $arr_url = array_unique($arrlink);

        // Use a separate loop variable so $url still names this page when
        // its own content is read below.
        foreach ($arr_url as $link) {
            $content .= Get_content($link, $depth - 1); // Recursive call
        }
    }

    $content .= Get_content_from_url($url);
    return $content;
}