PHP function to get a page's title and content
Last Update:2017-02-28
Source: Internet
Author: User
// Sometimes we need to fetch the title and content of a web page, for example
// in a collection (scraping) job. Here is a simple function for that, shared
// for anyone who needs it. The code is as follows:

/**
 * Fetch a URL with cURL and extract basic page information from the HTML.
 *
 * The returned array always contains these keys:
 *   - content_type : Content-Type header reported by cURL ('' if none)
 *   - charset      : charset from the Content-Type header, else from a <meta> tag
 *   - title        : trimmed contents of the <title> element
 *   - description  : value of <meta name="description" content="...">
 *   - keywords     : value of <meta name="keywords" content="...">
 *   - body         : tag-stripped <body> text, passed through addslashes()
 *   - httpcode     : HTTP status code; 505 is used when the transfer itself fails
 *   - all          : tag-stripped text of the whole document, passed through addslashes()
 *
 * When the transfer fails, or the response is not a 200 text/html response,
 * the array is returned early with the extracted fields left as ''.
 *
 * NOTE(review): body/all are addslashes()-escaped exactly as before, presumably
 * for building SQL strings. Prefer parameterised queries in new callers.
 *
 * @param string $url       Address of the page to fetch.
 * @param bool   $verifySsl Verify the TLS certificate and host name. Defaults to
 *                          false to keep the historical behaviour; pass true
 *                          whenever the target is expected to have a valid certificate.
 *
 * @return array Page information as described above.
 */
function getPageContent($url, $verifySsl = false)
{
    $pageinfo = [
        'content_type' => '',
        'charset'      => '',
        'title'        => '',
        'description'  => '',
        'keywords'     => '',
        'body'         => '',
        'httpcode'     => 200,
        'all'          => '',
    ];

    $response = _pageContentFetch($url, $verifySsl);
    if ($response['error'] !== '') {
        // 505 is kept (rather than 502) because existing callers may test for it.
        $pageinfo['httpcode'] = 505;
        // Report through the error log instead of echoing into the caller's output.
        error_log('Curl error: ' . $response['error']);

        return $pageinfo;
    }

    $pageinfo['httpcode']     = $response['httpcode'];
    $pageinfo['content_type'] = $response['content_type'];

    // Only successful HTML responses are parsed.
    if ($pageinfo['httpcode'] !== 200 || stripos($pageinfo['content_type'], 'text/html') === false) {
        return $pageinfo;
    }

    // Charset announced by the HTTP header, e.g. "text/html; charset=utf-8".
    if (preg_match('/charset=["\']?([^\s;"\']+)/i', $pageinfo['content_type'], $matches)) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // Strip scripts, <link> tags, HTML comments and style sheets before parsing.
    // No "u" modifier: the page encoding is not known to be UTF-8 at this point.
    $store   = $response['body'];
    $cleanup = [
        '/<script\b[^>]*>.*?<\/script>/si',
        '/<link\s+[^>]+>/si',
        '/<!--.*?-->/s',
        '/<style\b[^>]*>.*?<\/style>/si',
    ];
    foreach ($cleanup as $pattern) {
        $cleaned = preg_replace($pattern, '', $store);
        // preg_replace() returns null on a PCRE error; keep the previous text then.
        if ($cleaned !== null) {
            $store = $cleaned;
        }
    }

    // Remove the full-width (Chinese) space U+3000; this byte sequence only
    // matches in UTF-8 encoded pages.
    $store = str_replace("\xE3\x80\x80", '', $store);

    // Fall back to a <meta ... charset=...> declaration when the header had none.
    if ($pageinfo['charset'] === ''
        && preg_match('@<meta[^>]+charset\s*=\s*["\']?([\w\-]+)@i', $store, $matches)
    ) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    $pageinfo['description'] = _pageContentMeta($store, 'description');
    $pageinfo['keywords']    = _pageContentMeta($store, 'keywords');

    if (preg_match('/<title[^>]*>(.*?)<\/title>/si', $store, $matches)) {
        $pageinfo['title'] = trim($matches[1]);
    }

    $body = '';
    if (preg_match('/<body[^>]*>(.*?)<\/body>/si', $store, $matches)) {
        $body = $matches[1];
    }
    $pageinfo['body'] = addslashes(replaceHtmlAndJs($body));
    $pageinfo['all']  = addslashes(replaceHtmlAndJs($store));

    return $pageinfo;
}

/**
 * Perform the HTTP request for getPageContent() and always release the handle.
 *
 * @param string $url       Address to fetch.
 * @param bool   $verifySsl Whether to verify the TLS peer and host name.
 *
 * @return array Keys: body (string), httpcode (int), content_type (string),
 *               error (string, '' on success).
 */
function _pageContentFetch($url, $verifySsl)
{
    $result = ['body' => '', 'httpcode' => 0, 'content_type' => '', 'error' => ''];

    $ch = curl_init();
    if ($ch === false) {
        $result['error'] = 'unable to initialise cURL';

        return $result;
    }

    curl_setopt_array($ch, [
        CURLOPT_USERAGENT      => 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)',
        CURLOPT_RETURNTRANSFER => true,
        // SECURITY: verification is off unless the caller opts in via $verifySsl.
        CURLOPT_SSL_VERIFYHOST => $verifySsl ? 2 : 0,
        CURLOPT_SSL_VERIFYPEER => (bool) $verifySsl,
        CURLOPT_TIMEOUT        => 8,
        CURLOPT_FILETIME       => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_URL            => $url,
    ]);

    $body = curl_exec($ch);
    if ($body === false || curl_errno($ch) !== 0) {
        $error           = curl_error($ch);
        $result['error'] = $error !== '' ? $error : 'unknown cURL error';
    } else {
        $result['body']         = (string) $body;
        $result['httpcode']     = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $result['content_type'] = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    }

    // curl_close() is a no-op since PHP 8.0; only call it where it still frees the handle.
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }

    return $result;
}

/**
 * Read the content attribute of <meta name="$name" content="...">.
 *
 * Only the name-before-content attribute order is recognised. The value may be
 * double-quoted, single-quoted or unquoted.
 *
 * @param string $html Document to search.
 * @param string $name Value of the meta tag's name attribute.
 *
 * @return string Trimmed attribute value, or '' when the tag is absent.
 */
function _pageContentMeta($html, $name)
{
    $pattern = '@<meta\s+name\s*=\s*["\']?' . preg_quote($name, '@')
        . '["\']?\s+content\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))@i';

    if (!preg_match($pattern, $html, $matches)) {
        return '';
    }

    // Groups 1 and 2 hold quoted values, group 3 an unquoted one.
    if (isset($matches[3]) && $matches[3] !== '') {
        // An unquoted value can swallow the "/" of a self-closing "/>".
        return trim(rtrim($matches[3], '/'));
    }
    if (isset($matches[2]) && $matches[2] !== '') {
        return trim($matches[2]);
    }

    return isset($matches[1]) ? trim($matches[1]) : '';
}

/**
 * Remove all HTML tags and JavaScript from a document fragment.
 *
 * In order, this removes <script> elements, any remaining tags, every run of
 * whitespace (words are joined together, as before), and named HTML entities
 * such as &nbsp; (they are deleted, not decoded).
 *
 * @param string $document HTML fragment to clean.
 *
 * @return string Cleaned text; '' for empty input or when PCRE reports an error.
 */
function replaceHtmlAndJs($document)
{
    $document = trim((string) $document);
    if ($document === '') {
        return $document;
    }

    $search = [
        '@<script[^>]*?>.*?</script>@si', // remove JavaScript
        '@<[/!]*?[^<>]*?>@s',             // remove HTML tags
        '@\s+@',                          // remove whitespace (no longer eats "+")
        '@&\w+;@',                        // remove named HTML entities
    ];

    $result = preg_replace($search, '', $document);

    // preg_replace() returns null on a PCRE error; '' avoids returning markup.
    return $result === null ? '' : $result;
}

// Usage example:
//
//     $a = getPageContent('http://www.ttphp.com');
//     print_r($a);