Code is as follows:
/**
 * Fetch a web page with cURL and extract its basic metadata and text.
 *
 * The returned array always contains these keys:
 *   - content_type: Content-Type reported for the response ('' if none)
 *   - charset:      charset from the Content-Type header, otherwise from a <meta> tag
 *   - title:        contents of the <title> element
 *   - description:  content of <meta name="description">
 *   - keywords:     content of <meta name="keywords">
 *   - body:         text of <body>, cleaned by Replacehtmlandjs() and passed through addslashes()
 *   - httpcode:     HTTP status code, or 505 when the transfer itself failed
 *   - all:          text of the whole document, cleaned and escaped like "body"
 *
 * Only responses with status 200 and a text/html content type are parsed;
 * for anything else the array is returned with just httpcode and
 * content_type filled in.
 *
 * @param string $url Address of the page to fetch.
 * @return array Page information as described above.
 */
function Getpagecontent($url)
{
    $pageinfo = array(
        'content_type' => '',
        'charset'      => '',
        'title'        => '',
        'description'  => '',
        'keywords'     => '',
        'body'         => '',
        'httpcode'     => 200,
        'all'          => '',
    );

    $ch = curl_init();
    if ($ch === false) {
        $pageinfo['httpcode'] = 505;
        return $pageinfo;
    }

    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): certificate checks are switched off, as in the original
    // code, so HTTPS responses are not authenticated. Enable them if the
    // result is used for anything security sensitive.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 8);
    curl_setopt($ch, CURLOPT_FILETIME, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    // CURLOPT_HEADER is intentionally not set: the raw response headers would
    // be prepended to the document and leak into the extracted text. Status
    // and content type are read through curl_getinfo() instead.
    curl_setopt($ch, CURLOPT_URL, $url);

    $store       = curl_exec($ch);
    $error       = curl_error($ch);
    $httpCode    = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $contentType = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    curl_close($ch);

    if ($store === false || $error !== '') {
        // 505 is the value this function has always used for transfer
        // failures; it is kept so existing callers keep working.
        $pageinfo['httpcode'] = 505;
        echo 'Curl error: ' . $error . "\n";
        return $pageinfo;
    }

    $pageinfo['httpcode']     = $httpCode;
    $pageinfo['content_type'] = $contentType;

    if ((int) $httpCode !== 200 || stripos($contentType, 'text/html') === false) {
        return $pageinfo;
    }

    // Prefer the charset announced in the HTTP header.
    if (preg_match('/charset\s*=\s*["\']?([^\s;"\']+)/i', $contentType, $matches)) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // Drop markup that carries no readable text. Lazy quantifiers are used so
    // each pattern stops at the first closing tag.
    $cleanupPatterns = array(
        '@<script\b[^>]*>.*?</script>@si', // JavaScript
        '@<link\s+[^>]+>@si',              // <link> elements
        '@<!--.*?-->@s',                   // HTML comments
        '@<style\b[^>]*>.*?</style>@si',   // inline style sheets
    );
    foreach ($cleanupPatterns as $pattern) {
        $cleaned = preg_replace($pattern, '', $store);
        // preg_replace() returns null on a PCRE error; keep the previous text then.
        if ($cleaned !== null) {
            $store = $cleaned;
        }
    }

    // Remove ideographic (full-width) spaces, U+3000, as encoded in UTF-8.
    $store = str_replace("\xE3\x80\x80", '', $store);

    // Fall back to the charset declared in a <meta> tag.
    if ($pageinfo['charset'] === ''
        && preg_match('@<meta[^>]+charset\s*=\s*["\']?([\w-]+)@i', $store, $matches)
    ) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // description / keywords: take the value of the content attribute.
    foreach (array('description', 'keywords') as $metaName) {
        $pattern = '@<meta\s+name\s*=\s*["\']?' . $metaName
            . '["\']?\s+content\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>]+)@i';
        if (preg_match($pattern, $store, $matches)) {
            $value = $matches[1];
            $first = $value[0];
            // Strip the quote pair that wraps the attribute value, if any.
            if (($first === '"' || $first === "'") && strlen($value) >= 2) {
                $value = substr($value, 1, -1);
            }
            $pageinfo[$metaName] = trim((string) $value);
        }
    }

    if (preg_match('@<title\b[^>]*>(.*?)</title>@si', $store, $matches)) {
        $pageinfo['title'] = trim($matches[1]);
    }

    // Locate the body by offsets instead of one large regex so that long
    // documents cannot hit the PCRE backtrack limit. If </body> is missing,
    // everything after the opening tag is used.
    $bodyHtml = '';
    if (preg_match('@<body\b[^>]*>@i', $store, $matches, PREG_OFFSET_CAPTURE)) {
        $start = $matches[0][1] + strlen($matches[0][0]);
        $end   = stripos($store, '</body>', $start);
        $bodyHtml = ($end === false)
            ? (string) substr($store, $start)
            : (string) substr($store, $start, $end - $start);
    }

    $pageinfo['body'] = addslashes(Replacehtmlandjs($bodyHtml));
    $pageinfo['all']  = addslashes(Replacehtmlandjs($store));

    return $pageinfo;
}
/**
 * Reduce an HTML fragment to bare text.
 *
 * The steps run in this order: <script> elements are dropped together with
 * their contents, every remaining tag is dropped, all whitespace is removed
 * (words are NOT separated by spaces afterwards), and HTML entities, named
 * or numeric, are deleted rather than decoded.
 *
 * @param string $document HTML to clean.
 * @return string The cleaned text; '' for empty or whitespace-only input.
 */
function Replacehtmlandjs($document)
{
    $document = trim((string) $document);
    if ($document === '') {
        return $document;
    }

    $search = array(
        '@<script\b[^>]*>.*?</script>@si', // JavaScript blocks
        '@<[^<>]*>@s',                     // any remaining HTML tag
        '@\s+@',                           // whitespace (but not literal "+")
        '@&(?:#\d+|#x[0-9a-f]+|\w+);@i',   // HTML entities
    );

    foreach ($search as $pattern) {
        $result = preg_replace($pattern, '', $document);
        // preg_replace() returns null on a PCRE error; keep the text from the
        // previous step instead of silently returning null to the caller.
        if ($result !== null) {
            $document = $result;
        }
    }

    return $document;
}
Example of use
The code is as follows:
// Fetch the page and dump the extracted information.
// The URL must be a quoted string and include the scheme.
$a = Getpagecontent('http://www.ttphp.com');
print_r($a);
PHP functions for getting a page's title and content (without HTML tags)