Detecting a website's character encoding in PHP with Snoopy (almost perfectly)
Last Update:2014-07-25
Source: Internet
Author: User
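This PHP class, built on Snoopy, detects the character encoding of a website almost perfectly, which is useful for PHP crawlers. Detection accuracy is about 99.9%; the encoding of a few sites still cannot be determined, so the code can be improved further. Before using it, download Snoopy.class.php from the Internet.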
Usage:
The calling code is as follows:
<?php
// Snoopy has to be loaded first: WebCrawl creates a Snoopy instance to fetch the page.
// Adjust both paths (and their letter case) to wherever the two files actually live.
require 'Lib/Snoopy.class.php';
require 'Lib/WebCrawl.class.php'; // contains the WebCrawl class shown below

$go = new WebCrawl('http://www.baidu.com');

// getCharset() returns the detected charset name in lowercase, or false when
// the page could not be fetched or no encoding could be determined.
echo $go->getCharset();
?>
The code of the WebCrawl class is as follows:
<?php
/**
 * Fetches a web page through Snoopy and reports its character encoding and
 * basic metadata (title, keywords, description, IP address).
 *
 * The page is downloaded once, on the first call that needs it, and the
 * Snoopy object is kept for later calls.
 */
class WebCrawl
{
    /** @var string URL passed to the constructor. */
    private $url;

    /** @var Snoopy|null Snoopy instance holding the fetched page; null until the first fetch. */
    private $request;

    /**
     * Charset names accepted when they are declared in a <meta> tag.
     *
     * Entries must be lowercase with no surrounding whitespace because the
     * declared name is lowercased before it is looked up here. Apart from the
     * first few common names, the list follows the Windows/.NET code page
     * identifier names.
     *
     * @var string[]
     */
    public $charset_arr = array(
        'gb2312',
        'utf-8',
        'big5',
        'gbk',
        'ascii',
        'cp936',
        'ibm037',
        'ibm437',
        'ibm500',
        'asmo-708',
        'dos-720',
        'ibm737',
        'ibm775',
        'ibm850',
        'ibm852',
        'ibm855',
        'ibm857',
        'ibm00858',
        'ibm861',
        'ibm860',
        'dos-862',
        'ibm863',
        'ibm864',
        'ibm865',
        'cp866',
        'ibm869',
        'ibm870',
        'windows-874',
        'cp875',
        'shift_jis',
        'ks_c_5601-1987',
        'ibm1026',
        'ibm01047',
        'ibm01140',
        'ibm01141',
        'ibm01142',
        'ibm01143',
        'ibm01144',
        'ibm01145',
        'ibm01146',
        'ibm01147',
        'ibm01148',
        'ibm01149',
        'utf-16',
        'unicodefffe',
        'windows-1250',
        'windows-1251',
        'windows-1252',
        'windows-1253',
        'windows-1254',
        'windows-1255',
        'windows-1256',
        'windows-1257',
        'windows-1258',
        'johab',
        'macintosh',
        'x-mac-japanese',
        'x-mac-chinesetrad',
        'x-mac-korean',
        'x-mac-arabic',
        'x-mac-hebrew',
        'x-mac-greek',
        'x-mac-cyrillic',
        'x-mac-chinesesimp',
        'x-mac-romanian',
        'x-mac-ukrainian',
        'x-mac-thai',
        'x-mac-ce',
        'x-mac-icelandic',
        'x-mac-turkish',
        'x-mac-croatian',
        'x-chinese-cns',
        'x-cp20001',
        'x-chinese-eten',
        'x-cp20003',
        'x-cp20004',
        'x-cp20005',
        'x-ia5',
        'x-ia5-german',
        'x-ia5-swedish',
        'x-ia5-norwegian',
        'us-ascii',
        'x-cp20261',
        'x-cp20269',
        'ibm273',
        'ibm277',
        'ibm278',
        'ibm280',
        'ibm284',
        'ibm285',
        'ibm290',
        'ibm420',
        'ibm423',
        'ibm424',
        'x-ebcdic-koreanextended',
        'ibm-thai',
        'koi8-r',
        'ibm871',
        'ibm880',
        'ibm905',
        'ibm00924',
        'x-cp20936',
        'x-cp20949',
        'cp1025',
        'koi8-u',
        'iso-8859-1',
        'iso-8859-2',
        'iso-8859-3',
        'iso-8859-4',
        'iso-8859-5',
        'iso-8859-6',
        'iso-8859-7',
        'iso-8859-8',
        'iso-8859-9',
        'iso-8859-13',
        'iso-8859-15',
        'x-europa',
        'iso-8859-8-i',
        'iso-2022-jp',
        'csiso2022jp',
        'iso-2022-kr',
        'x-cp50227',
        'euc-jp',
        'euc-cn',
        'euc-kr',
        'hz-gb-2312',
        'gb18030',
        'x-iscii-de',
        'x-iscii-be',
        'x-iscii-ta',
        'x-iscii-te',
        'x-iscii-as',
        'x-iscii-or',
        'x-iscii-ka',
        'x-iscii-ma',
        'x-iscii-gu',
        'x-iscii-pa',
        'utf-7',
        'utf-32',
        'utf-32be'
    );

    /**
     * @param string $url Address of the page to inspect.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetches the page (first call only) and normalizes the stored body.
     *
     * After a successful fetch the body is lowercased and, unless it is
     * already UTF-8, converted to UTF-8. Later calls only report whether the
     * stored response had HTTP status 200.
     *
     * @param string $url Address to fetch.
     * @return bool True when the response status is 200.
     */
    private function open($url)
    {
        if ($this->request !== null) {
            return $this->request->status == 200;
        }

        $this->request = new Snoopy();
        $this->request->fetch($url);
        if ($this->request->status != 200) {
            return false;
        }

        // The whole body is lowercased, so every value extracted later
        // (title, keywords, description) is lowercase as well.
        $this->request->results = strtolower($this->request->results);

        // getCharset() calls open() again; $this->request is already set at
        // this point, so that nested call returns immediately.
        $charset = $this->getCharset();
        if ($charset === false || $charset === '' || strcasecmp($charset, 'utf-8') === 0) {
            return true;
        }

        if ($charset == 'windows-1252') {
            // Such pages get their numeric character references decoded
            // instead of a byte-level conversion.
            $this->request->results = $this->uni_decode($this->request->results);
        } else {
            $this->request->results = $this->convertToUtf8($this->request->results, $charset);
        }

        return true;
    }

    /**
     * Converts text to UTF-8, returning it unchanged when mbstring does not
     * know the source charset.
     *
     * @param string $text    Text to convert.
     * @param string $charset Source charset name.
     * @return string
     */
    private function convertToUtf8($text, $charset)
    {
        try {
            // Older PHP versions warn and return false for an unknown charset;
            // PHP 8 throws ValueError instead.
            $converted = @mb_convert_encoding($text, 'UTF-8', $charset);
        } catch (\ValueError $e) {
            return $text;
        }

        return is_string($converted) ? $converted : $text;
    }

    /**
     * Obtains the website title, keywords, description and IP address.
     *
     * @return array Keys 'title', 'keyword', 'desc' and 'IP'; each value is
     *               an empty string when it could not be determined.
     */
    public function getWebinfo()
    {
        $info = array(
            'title'   => '',
            'keyword' => '',
            'desc'    => '',
            'IP'      => ''
        );
        if (!$this->open($this->url)) {
            return $info;
        }

        $html = $this->request->results;

        if (preg_match('/<title[^>]*>(.*?)<\/title>/si', $html, $titlematch)) {
            $info['title'] = strip_tags($titlematch[1]);
        }

        // Meta tags are written either name-first or content-first. The
        // content-first form is only tried when the name-first form yields
        // neither "keywords" nor "description".
        $nameFirst    = '/<[\s]*meta[\s]*name="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';
        $contentFirst = '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';

        $metaTags = $this->collectMetaTags($html, $nameFirst, 1, 2);
        if (!isset($metaTags['keywords']) && !isset($metaTags['description'])) {
            $metaTags = $this->collectMetaTags($html, $contentFirst, 2, 1);
        }

        if (isset($metaTags['keywords'])) {
            $info['keyword'] = $metaTags['keywords'];
        }
        if (isset($metaTags['description'])) {
            $info['desc'] = $metaTags['description'];
        }

        $host = $this->hostOfUrl($this->url);
        if ($host !== '') {
            // gethostbyname() returns its argument unchanged on failure, so
            // the result is only accepted when it is a real IPv4 address.
            $ip = @gethostbyname($host);
            if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false) {
                $info['IP'] = $ip;
            }
        }

        return $info;
    }

    /**
     * Collects meta tags matched by a two-group pattern into a name => content map.
     *
     * @param string $html       Page body.
     * @param string $pattern    Regex with one group for the name and one for the content.
     * @param int    $nameGroup  Index of the group capturing the name attribute.
     * @param int    $valueGroup Index of the group capturing the content attribute.
     * @return array Map of trimmed tag name to content; a later tag with the
     *               same name overwrites an earlier one.
     */
    private function collectMetaTags($html, $pattern, $nameGroup, $valueGroup)
    {
        $tags = array();
        if (!preg_match_all($pattern, $html, $match, PREG_SET_ORDER)) {
            return $tags;
        }

        foreach ($match as $set) {
            // An unquoted name can be captured with trailing whitespace.
            $tags[trim($set[$nameGroup])] = $set[$valueGroup];
        }

        return $tags;
    }

    /**
     * Extracts the host name from a URL, with or without a scheme.
     *
     * @param string $url
     * @return string Host name, or '' when none can be found.
     */
    private function hostOfUrl($url)
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (is_string($host) && $host !== '') {
            return $host;
        }

        // parse_url() reports no host for scheme-less input such as
        // "www.example.com/path"; strip a scheme prefix and cut at the first
        // path, query, fragment or port separator instead.
        $host = preg_replace('/^https?:\/\//i', '', (string) $url);
        $host = preg_replace('/[\/?#:].*$/s', '', $host);

        return is_string($host) ? $host : '';
    }

    /**
     * Guesses from the first multi-byte sequence whether a byte string is
     * UTF-8 or GB2312.
     *
     * This is a rough heuristic: a lead byte with its top three bits set that
     * is followed by two high-bit bytes counts as UTF-8; otherwise a lead
     * byte with its top two bits set that is followed by one high-bit byte
     * counts as GB2312.
     *
     * @param string $string Bytes to inspect.
     * @param mixed  $o      Unused; kept so existing callers keep working.
     * @return string 'utf-8', 'gb2312', or '' when nothing conclusive was found.
     */
    public function t($string, $o)
    {
        $encoding = '';
        $length   = strlen($string);

        // Reading past the end yields 0, which never passes a high-bit test.
        $byteAt = function ($index) use ($string, $length) {
            return $index < $length ? ord($string[$index]) : 0;
        };

        for ($i = 0; $i < $length; $i++) {
            if ($byteAt($i) < 128) {
                continue;
            }

            if (($byteAt($i) & 224) == 224) {
                // The first byte is determined
                if (($byteAt(++$i) & 128) == 128) {
                    // The second byte is passed
                    if (($byteAt(++$i) & 128) == 128) {
                        $encoding = 'UTF-8';
                        break;
                    }
                }
            }

            // $i may have advanced above; this test deliberately looks at the
            // byte the scan is now positioned on.
            if (($byteAt($i) & 192) == 192) {
                // The first byte is determined
                if (($byteAt(++$i) & 128) == 128) {
                    // The second byte is passed
                    $encoding = 'GB2312';
                    break;
                }
            }
        }

        return strtolower($encoding);
    }

    /**
     * Decodes five-digit decimal character references (&#NNNNN;) to characters.
     *
     * Only five-digit references are decoded; shorter ones such as &#60; are
     * left alone so that markup characters are not reintroduced.
     *
     * @param string $str  Text containing numeric character references.
     * @param string $code Target charset; anything other than 'utf-8' is
     *                     produced by converting from UTF-8 with iconv().
     * @return string|false Decoded text (iconv() may return false on failure).
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        $str = preg_replace_callback(
            '/&#(\d{5});/',
            function ($dec) {
                return html_entity_decode($dec[0], ENT_QUOTES, 'UTF-8');
            },
            $str
        );

        if ($code != 'utf-8') {
            $str = iconv('utf-8', $code, $str);
        }

        return $str;
    }

    /**
     * Obtains the website encoding.
     *
     * Sources are tried in order: the charset declared in a <meta> tag, the
     * charset in the response headers, then mb_detect_encoding() on the body.
     *
     * @return string|false Lowercase charset name, or false when the page
     *                      could not be fetched or nothing was detected.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        $html = $this->request->results;

        // First obtain the encoding from html
        if (preg_match('/<meta.+?charset=[^\w]?([-\w]+)/i', $html, $temp)) {
            $declared = strtolower($temp[1]);
            if (in_array($declared, $this->charset_arr, true)) {
                // A gb2312 declaration is only trusted when the byte
                // heuristic agrees with it.
                if ($declared != 'gb2312' || $this->t($html, $declared) == $declared) {
                    return $declared;
                }
            }
        }

        if (!empty($this->request->headers)) {
            // Obtain the encoding from the header
            $hstr = strtolower(implode('|', $this->request->headers));
            if (preg_match('/charset=[^\w]?([-\w]+)/is', $hstr, $lang) && $lang[1] != '') {
                return $lang[1];
            }
        }

        $encode_arr = array(
            'UTF-8', 'GB2312', 'GBK', 'BIG5', 'ASCII', 'EUC-JP',
            'Shift_JIS', 'CP936', 'ISO-8859-1', 'JIS', 'eucjp-win', 'sjis-win'
        );
        $encoded = mb_detect_encoding($html, $encode_arr);

        return $encoded ? strtolower($encoded) : false;
    }
}
?>