The code is as follows:
/**
 * Download a web page with cURL and extract basic metadata from its HTML.
 *
 * Only responses with HTTP status 200 and a Content-Type containing
 * "text/html" are parsed. For any other response the array is returned with
 * just 'httpcode' and 'content_type' filled in. When the transfer itself
 * fails, 'httpcode' is set to 505 and the cURL error message is echoed.
 *
 * @param string $url Address of the page to download.
 *
 * @return array Associative array with the keys:
 *               - content_type: Content-Type value reported by cURL
 *               - charset:      charset from the Content-Type header, otherwise
 *                               from the first <meta ... charset=...> tag
 *               - title:        trimmed contents of <title>
 *               - description:  content of <meta name="description">, quotes removed
 *               - keywords:     content of <meta name="keywords">, quotes removed
 *               - body:         <body> contents run through replaceHtmlAndJs() and addslashes()
 *               - all:          whole cleaned document run through replaceHtmlAndJs() and addslashes()
 *               - httpcode:     HTTP status code, or 505 when the transfer failed
 *               - httpcodec:    legacy misspelled key, always 200
 */
function getPageContent($url)
{
    $pageinfo = array(
        'content_type' => '',
        'charset'      => '',
        'title'        => '',
        'description'  => '',
        'keywords'     => '',
        'body'         => '',
        'httpcodec'    => 200, // misspelled in the original; kept so callers reading it keep working
        'all'          => '',
        'httpcode'     => 200,
    );

    $ch = curl_init();
    if ($ch === false) {
        $pageinfo['httpcode'] = 505;
        return $pageinfo;
    }

    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): certificate and host verification are switched off, so HTTPS
    // responses are not authenticated. Left as-is to keep fetching sites with
    // broken certificates; enable both if the result is ever trusted.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 8);
    curl_setopt($ch, CURLOPT_FILETIME, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_URL, $url);

    $store = curl_exec($ch);
    if ($store === false || curl_errno($ch) !== 0) {
        // 505 is this function's marker for "the transfer itself failed".
        $pageinfo['httpcode'] = 505;
        echo 'curl error: ' . curl_error($ch) . "\n";
        curl_close($ch);
        return $pageinfo;
    }

    $pageinfo['httpcode'] = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $rawContentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    $pageinfo['content_type'] = $rawContentType;
    // Everything needed from the handle has been read; release it before any return.
    curl_close($ch);

    // cURL reports a non-string when the server sent no Content-Type header.
    $contentType = is_string($rawContentType) ? $rawContentType : '';

    if (intval($pageinfo['httpcode']) !== 200 || !preg_match('@text/html@', $contentType)) {
        return $pageinfo;
    }

    // Prefer the charset announced in the HTTP header.
    if (preg_match('/charset=([^\s]+)/i', $contentType, $matches) && trim($matches[1]) !== '') {
        $pageinfo['charset'] = trim($matches[1]);
    }

    $cleanupPatterns = array(
        '/<script.*>(.*)<\/script>/smUi', // JavaScript
        '/<link\s+[^>]+>/smUi',           // <link> tags
        '/<!--.*-->/smUi',                // HTML comments
        '/<style.*>(.*)<\/style>/smUi',   // <style> blocks
    );
    foreach ($cleanupPatterns as $pattern) {
        $cleaned = preg_replace($pattern, '', $store);
        // preg_replace() returns null on a PCRE runtime error (e.g. backtrack
        // limit on a huge page); keep the previous text instead of losing it.
        if ($cleaned !== null) {
            $store = $cleaned;
        }
    }

    // Remove full-width (ideographic, U+3000) spaces.
    // Assumes the page bytes are UTF-8 - TODO confirm for other encodings.
    $store = str_replace("\xE3\x80\x80", '', $store);

    // Fall back to the charset declared in a <meta> tag (quoted or unquoted value).
    if ($pageinfo['charset'] === ''
        && preg_match('@<meta.+charset=["\']?([\w\-]+)[^>]*>@i', $store, $matches)
    ) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // Description and keywords share the same <meta name=... content=...> shape.
    $metaTemplate = '@<meta\s+name="*%s"*\s+content\s*=\s*([^/>]+)/*>@i';
    foreach (array('description', 'keywords') as $field) {
        if (preg_match(sprintf($metaTemplate, $field), $store, $matches)) {
            $pageinfo[$field] = str_replace('"', '', trim($matches[1]));
        }
    }

    if (preg_match('/<title>(.*)<\/title>/smUi', $store, $matches)) {
        $pageinfo['title'] = trim($matches[1]);
    }

    $bodyHtml = preg_match('/<body.*>(.*)<\/body>/smUi', $store, $matches) ? $matches[1] : '';
    $pageinfo['body'] = addslashes((string) replaceHtmlAndJs($bodyHtml));
    $pageinfo['all'] = addslashes((string) replaceHtmlAndJs($store));

    return $pageinfo;
}
/**
 * Remove all HTML and JavaScript tags from a document.
 *
 * The input is trimmed, then in order: <script> blocks (including their
 * contents) are removed, every remaining tag is removed, all whitespace is
 * removed, and HTML entities (named such as &nbsp; or numeric such as &#160;)
 * are removed.
 *
 * @param string $document HTML fragment or full document.
 *
 * @return string The stripped text; '' for empty input or when PCRE fails.
 */
function replaceHtmlAndJs($document)
{
    $document = trim((string) $document);
    if ($document === '') {
        return $document;
    }

    $search = array(
        '~<script[^>]*?>.*?</script>~si', // remove javascript, including the code inside
        '~<[/!]*?[^<>]*?>~s',             // remove the remaining HTML tags
        '~\s+~',                          // remove white space (quantifier outside the class so "+" survives)
        '~&#?\w+;~',                      // remove HTML entities, named or numeric
    );

    $result = preg_replace($search, '', $document);

    // preg_replace() yields null on a PCRE runtime error; report that as empty text.
    return $result === null ? '' : $result;
}
Example
The code is as follows:
// Fetch the page and dump the extracted fields.
$a = getPageContent('http://www.ttphp.com');
print_r($a);