Web page
Every now and then I want to crawl a few web pages. I am no expert at it; this is just some code written for fun and my own entertainment.
The more useful functions are get_content_by_socket(), get_url(), get_content_url() and get_content_object(); they may give you some ideas.
<?php
// Get all content URLs and save them to a file
/**
 * Crawl the numbered index pages and append every content URL found on
 * them to a file.
 *
 * Pages are fetched as $prefix . $i . ".htm" for $i = 1 .. $count - 1.
 * Any existing $save_file is removed first, so each run starts clean.
 *
 * @param string $save_file Path of the file that receives the URLs (one per line).
 * @param string $prefix    URL prefix placed in front of the page number.
 * @param int    $count     Exclusive upper bound of the page number (default 68,
 *                          the value that used to be hard-coded).
 * @return void
 */
function Get_index ($save_file, $prefix = "Index_", $count = 68) {
    if (file_exists($save_file)) {
        @unlink($save_file);
    }
    $fp = fopen($save_file, "a+") or die("Open" . $save_file . "Failed");

    for ($i = 1; $i < $count; ++$i) {
        $url = $prefix . $i . ".htm";
        echo "Get" . $url . " ...";

        // Get_url() validates the URL and terminates the script on failure.
        $html = Get_url($url);

        // Get_content_url() needs the base URL that the extracted links are
        // prefixed with. NOTE(review): assumes the links are relative to the
        // directory of the index page - confirm against the target site.
        $base_url = substr($url, 0, strrpos($url, '/') + 1);
        $url_str  = Get_content_url($base_url, $html);

        echo "ok\n";
        fwrite($fp, $url_str);
    }

    fclose($fp);
}
// Get the target multimedia objects
/**
 * Fetch every URL listed in a file and save the multimedia link extracted
 * from each page.
 *
 * The URL file is read line by line, de-duplicated, and each page is passed
 * through get_content_object(); the resulting lines are appended to
 * $save_file, which is deleted first if it already exists.
 *
 * @param string $url_file  File containing one URL per line.
 * @param string $save_file Path of the output file.
 * @param string $split     Separator forwarded to get_content_object().
 * @return void
 */
function Get_object ($url_file, $save_file, $split = "| |:* *:--|") {
    if (!file_exists($url_file)) {
        die($url_file . "Not exist");
    }

    // Drop line terminators and blank lines so the URLs are usable as-is.
    $file_arr = file($url_file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if (!is_array($file_arr) || empty($file_arr)) {
        die($url_file . "Not content");
    }
    $url_arr = array_unique(array_map('trim', $file_arr));

    if (file_exists($save_file)) {
        @unlink($save_file);
    }
    $fp = fopen($save_file, "a+") or die("Open save File" . $save_file . "Failed");

    foreach ($url_arr as $url) {
        if (empty($url)) {
            continue;
        }
        echo "Get" . $url . " ...";
        $html_str = Get_url($url);
        // The debug dump and exit that used to sit here stopped the script
        // after the first page; they have been removed so every URL is handled.
        $obj_str = get_content_object($html_str, $split);
        echo "ok\n";
        fwrite($fp, $obj_str);
    }

    fclose($fp);
}
// Traverse a directory and process the contents of each file
/**
 * Run get_content_object() over every regular file in a directory and
 * append the results to a file.
 *
 * $save_file is deleted first if it already exists.
 *
 * @param string $save_file Path of the output file.
 * @param string $dir       Directory to scan (with or without a trailing slash).
 * @return void
 */
function Get_dir ($save_file, $dir) {
    $dp = opendir($dir) or die("Open" . $dir . "Failed");

    if (file_exists($save_file)) {
        @unlink($save_file);
    }
    $fp = fopen($save_file, "a+") or die("Open save File" . $save_file . "Failed");

    // Normalise the directory so both "dir" and "dir/" produce valid paths.
    $base = rtrim($dir, "/\\") . DIRECTORY_SEPARATOR;

    // Strict comparison: a file literally named "0" must not end the loop.
    while (($file = readdir($dp)) !== false) {
        if ($file == "." || $file == "..") {
            continue;
        }
        $path = $base . $file;
        if (!is_file($path)) {
            continue; // sub-directories cannot be read with file_get_contents()
        }
        echo "Read file" . $file . " ...";
        $file_content = file_get_contents($path);
        $obj_str = get_content_object($file_content);
        echo "ok\n";
        fwrite($fp, $obj_str);
    }

    closedir($dp);
    fclose($fp);
}
// Get the content of the specified URL
/**
 * Download the body of an http:// URL.
 *
 * The script is terminated with die() when the URL does not look like
 * "http://host...", when it cannot be opened, or when nothing is returned.
 *
 * @param string $url Absolute http:// URL.
 * @return string The downloaded content (never empty).
 */
function Get_url ($url) {
    $reg = '/^http:\/\/[^\/].+$/';
    if (!preg_match($reg, $url)) {
        die($url . "Invalid");
    }

    $fp = fopen($url, "r") or die("Open URL:" . $url . "Failed.");
    // Read to end-of-stream in one call; a chunk such as "0" can no longer
    // end the read early the way a truthiness-tested fread() loop did.
    $content = stream_get_contents($fp);
    fclose($fp);

    if ($content === false || $content === '') {
        die("Get URL:" . $url . "Content failed.");
    }
    return $content;
}
//Use socket to get the specified page
/**
 * Fetch a page over a raw socket with a hand-written HTTP request.
 *
 * @param string $url  Path of the page relative to the site root
 *                     (a leading "/" is tolerated).
 * @param string $host Host name to connect to on port 80.
 * @return string The raw response: status line, headers and body.
 *                NOTE(review): an HTTP/1.1 server may answer with chunked
 *                transfer encoding, which is not decoded here.
 */
function get_content_by_socket ($url, $host) {
    $fp = fsockopen($host, 80) or die("Open" . $url . "Failed");

    // The method token is case-sensitive and the request line must not carry
    // stray spaces, so it is built strictly as "GET /path HTTP/1.1".
    $header  = "GET /" . ltrim($url, "/") . " HTTP/1.1\r\n";
    $header .= "Host: " . $host . "\r\n";
    $header .= "Accept: */*\r\n";
    $header .= "Accept-Language: zh-cn\r\n";
    // No Accept-Encoding header: the response is returned verbatim, so a
    // gzip/deflate body would reach the caller still compressed.
    $header .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; InfoPath.1; .NET CLR 2.0.50727)\r\n";
    //$header .= "Cookie: cnzz02=2; rtime=1; ltime=1148456424859; cnzz_eid=56601755-\r\n";
    // Only "close" is sent (the conflicting keep-alive header was dropped) so
    // the server ends the connection and the feof() loop below terminates.
    $header .= "Connection: close\r\n\r\n";
    fwrite($fp, $header);

    $contents = "";
    while (!feof($fp)) {
        $line = fgets($fp, 8192);
        if ($line === false) {
            break; // read error or end of stream
        }
        $contents .= $line;
    }
    fclose($fp);

    return $contents;
}
// Get the URLs contained in the specified content
/**
 * Extract the download-page links from a piece of HTML.
 *
 * Every href value is collected, and those matching "down....html"
 * (case-insensitive, whole value) are returned, each prefixed with
 * $host_url and terminated by a newline.
 *
 * @param string $host_url      Prefix prepended to every matching link.
 * @param string $file_contents HTML to scan.
 * @return string Newline-terminated list of URLs; "" when nothing matches.
 */
function Get_content_url ($host_url, $file_contents) {
    // Captures the href value, quoted or not.
    $rex = '/href\s*=\s*[\'"]*([^>\'"\s]+)[\'">]*\s*/i';
    // Only this filter was ever in effect; two earlier patterns were
    // overwritten before use and have been removed.
    $reg = '/^(down.*?\.html)$/i';

    $result = "";
    if (!preg_match_all($rex, $file_contents, $matches)) {
        return $result;
    }

    // Group 1 holds the href values; the full match always starts with
    // "href" and so can never satisfy the anchored filter.
    foreach ($matches[1] as $link) {
        if (preg_match($reg, $link)) {
            $result .= $host_url . $link . "\n";
        }
    }
    return $result;
}
// Get the multimedia files referenced in the specified content
/**
 * Extract the first multimedia link and its bold caption from HTML.
 *
 * Looks for an href immediately followed by a <b>...</b> element, removes
 * the "<b> Multimedia:" label and the closing tag from the caption, and
 * returns "link" . $split . "caption" . "\n".
 *
 * @param string $str   HTML to scan.
 * @param string $split Separator placed between the link and its caption.
 * @return string One formatted line, or "" when no such link is present.
 */
function Get_content_object ($str, $split = "| |:* *:--|") {
    $regx = '/href\s*=\s*[\'"]*([^>\'"\s]+)[\'">]*\s*(<b>.*?<\/b>)/i';

    // preg_match_all() always fills all three groups, so the number of
    // matches - not count() of the result array - tells whether anything
    // was found.
    if (!preg_match_all($regx, $str, $found)) {
        return "";
    }

    $caption = str_replace("<b> Multimedia:", "", $found[2][0]);
    $caption = str_replace("</b>", "", $caption);

    return $found[1][0] . $split . $caption . "\n";
}
?>