In data acquisition and page analysis, it is often necessary to crawl the content of the page at a given URL, or the content of the pages two or three levels of links below it.
Here is a sample implementation, provided for reference only.
The code is as follows:
/*
Extract the hyperlinks (<a href=...>) found in an HTML document.

Parameters:
  $host     - scheme and host of the page the document came from
              (e.g. "http://example.com"); it is prepended to every
              link that is not already an absolute http(s) URL
  $document - HTML source to scan

Return: array of link URLs. Links taken from quoted href values come
        first, followed by links taken from unquoted href values.
        An empty array is returned when no link is found.
*/
function Match_links ($host, $document) {
    // Capture groups:
    //   1 = opening quote of the href value (if any)
    //   2 = href value when it is quoted
    //   3 = href value when it is not quoted
    //   4 = text between <a ...> and </a>
    $pattern = '~<\s*a\s.*?href\s*=\s*(["\'])?(?(1)(.*?)\\1|([^\s>]+))[^>]*>?(.*?)</a>~isx';

    $links = array();
    if (!preg_match_all($pattern, $document, $groups)) {
        // No match (0) or regex failure (false): nothing to return.
        return $links;
    }

    // Normalise the base so that joining never doubles or drops the slash.
    $base = rtrim($host, '/');

    // Quoted hrefs (group 2) first, then unquoted hrefs (group 3).
    foreach (array(2, 3) as $index) {
        foreach ($groups[$index] as $href) {
            if (empty($href)) {
                continue; // this alternative did not take part in the match
            }
            // Only a leading http:// or https:// marks an absolute URL;
            // "http" appearing later in a relative path must not count.
            if (preg_match('~^https?://~i', $href)) {
                $links[] = $href;
            }
            else {
                $links[] = $base . '/' . ltrim($href, '/');
            }
        }
    }
    return $links;
}
/*
Get the Chinese text content of the page at the given URL.

The page is fetched, converted from GBK to UTF-8 when necessary, stripped
of HTML tags, and reduced to its runs of CJK ideographs (U+4E00..U+9FFF),
which are joined with ", ".

Parameters:
  $url - URL (or any path readable by file_get_contents) of the page

Return: the extracted text, or NULL when the page cannot be read or
        contains no such characters
*/
function Get_content_from_url ($url) {
    // Errors are suppressed on purpose: an unreachable page simply
    // contributes no content.
    $str = @file_get_contents($url);
    if ($str === false || $str === '') {
        return NULL;
    }

    // Many valid UTF-8 byte sequences also validate as GBK, so convert
    // only when the data is not already valid UTF-8.
    if (!mb_check_encoding($str, "UTF-8") && mb_check_encoding($str, "GBK")) {
        $converted = iconv("GBK", "UTF-8", $str);
        if ($converted !== false) {
            $str = $converted;
        }
    }

    // Filter HTML tags
    $str = strip_tags($str);

    // Filter out everything except Chinese characters.
    // preg_match_all() returns false on invalid UTF-8 input and 0 when
    // nothing matches; both mean there is no text to return.
    if (!preg_match_all('/[\x{4e00}-\x{9fff}]+/u', $str, $matches)) {
        return NULL;
    }
    $str = join(', ', $matches[0]);
    if (!$str)
        return NULL;
    return $str;
}
/*
Collect the text content of a page and of the pages it links to.

Parameters:
  $url   - URL of the starting page
  $depth - number of levels to crawl; 1 means only the page itself,
           2 also includes the pages it links to, and so on

Return: the concatenated text (linked pages first, then the page itself),
        or false when $url is empty, $depth is below 1, or the page
        cannot be fetched while its links are being collected
*/
function Get_content ($url, $depth) {
    if (!$url || $depth < 1)
        return false;

    $content = '';
    if ($depth > 1) {
        $str = @file_get_contents($url);
        if (!$str)
            return false;

        // Build "scheme://host[:port]" so relative links can be resolved.
        $parseurl = parse_url($url);
        $host = '';
        if (!empty($parseurl['host'])) {
            $scheme = empty($parseurl['scheme']) ? 'http' : $parseurl['scheme'];
            $host = $scheme . "://" . $parseurl['host'];
            if (!empty($parseurl['port']))
                $host .= ':' . $parseurl['port'];
        }

        $arrlink = Match_links($host, $str);
        $arr_url = is_array($arrlink) ? array_unique($arrlink) : array();

        // A separate loop variable keeps $url pointing at this page, whose
        // own content is appended after that of the linked pages.
        foreach ($arr_url as $link) {
            $content .= Get_content($link, $depth - 1); // Recursive invocation
        }
    }
    $content .= Get_content_from_url($url);
    return $content;
}
Source: http://www.bkjia.com/PHPjc/372096.html (www.bkjia.com)