First, download Snoopy.class.php (the Snoopy HTTP client class) and place it in the lib/ directory.
Usage:
The calling code is as follows:
<?php
// Load the Snoopy HTTP client first: Webcrawl instantiates it when it fetches a page.
require 'lib/snoopy.class.php';
require 'lib/webcrawl.class.php'; // contains the Webcrawl class listed below

// Print the detected character set of the page (prints nothing if detection fails,
// because getcharset() then returns false).
$go = new Webcrawl('http://www.baidu.com');
echo $go->getcharset();
?>
The Webcrawl class (lib/webcrawl.class.php) is as follows:
<?php
/**
 * Fetches a web page through Snoopy and reports its character set, title,
 * meta keywords/description and the IPv4 address of its host.
 *
 * The page is fetched at most once per instance. After a successful fetch the
 * body is converted to UTF-8 where mbstring knows the source encoding and is
 * then lower-cased (ASCII letters only), so every extracted value is lower-case.
 *
 * Requires the Snoopy class to be loaded and the mbstring extension.
 */
class Webcrawl
{
    /** @var string URL handed to the constructor. */
    private $url;

    /** @var Snoopy|null Snoopy client holding the fetched response; null until the first fetch. */
    private $request;

    /** @var string|false|null Charset detected on the raw body; null until a page was fetched. */
    private $charset;

    /**
     * Charset names accepted from a <meta ... charset=...> declaration.
     *
     * Entries are lower-case because the declared name is lower-cased before the
     * lookup. The EBCDIC euro code pages are listed under their registered names
     * ibm01140 - ibm01149.
     *
     * @var string[]
     */
    public $charset_arr = array(
        'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936',
        'ibm037', 'ibm437', 'ibm500', 'asmo-708', 'dos-720',
        'ibm737', 'ibm775', 'ibm850', 'ibm852', 'ibm855', 'ibm857', 'ibm00858',
        'ibm861', 'ibm860', 'dos-862', 'ibm863', 'ibm864', 'ibm865',
        'cp866', 'ibm869', 'ibm870', 'windows-874', 'cp875',
        'shift_jis', 'ks_c_5601-1987', 'ibm1026', 'ibm01047',
        'ibm01140', 'ibm01141', 'ibm01142', 'ibm01143', 'ibm01144',
        'ibm01145', 'ibm01146', 'ibm01147', 'ibm01148', 'ibm01149',
        'utf-16', 'unicodefffe',
        'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253', 'windows-1254',
        'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258',
        'johab', 'macintosh',
        'x-mac-japanese', 'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic',
        'x-mac-hebrew', 'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp',
        'x-mac-romanian', 'x-mac-ukrainian', 'x-mac-thai', 'x-mac-ce',
        'x-mac-icelandic', 'x-mac-turkish', 'x-mac-croatian',
        'x-chinese-cns', 'x-cp20001', 'x-chinese-eten', 'x-cp20003', 'x-cp20004', 'x-cp20005',
        'x-ia5', 'x-ia5-german', 'x-ia5-swedish', 'x-ia5-norwegian',
        'us-ascii', 'x-cp20261', 'x-cp20269',
        'ibm273', 'ibm277', 'ibm278', 'ibm280', 'ibm284', 'ibm285', 'ibm290',
        'ibm420', 'ibm423', 'ibm424',
        'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r',
        'ibm871', 'ibm880', 'ibm905', 'ibm00924',
        'x-cp20936', 'x-cp20949', 'cp1025', 'koi8-u',
        'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
        'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9',
        'iso-8859-13', 'iso-8859-15', 'x-europa', 'iso-8859-8-i',
        'iso-2022-jp', 'csiso2022jp', 'iso-2022-kr', 'x-cp50227',
        'euc-jp', 'euc-cn', 'euc-kr', 'hz-gb-2312', 'gb18030',
        'x-iscii-de', 'x-iscii-be', 'x-iscii-ta', 'x-iscii-te', 'x-iscii-as',
        'x-iscii-or', 'x-iscii-ka', 'x-iscii-ma', 'x-iscii-gu', 'x-iscii-pa',
        'utf-7', 'utf-32', 'utf-32be',
    );

    /**
     * @param string $url URL of the page to inspect. Nothing is fetched here.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetch the page once and normalise its body.
     *
     * On the first call the page is fetched; on later calls the stored response
     * is reused and $url is ignored. After a successful fetch the charset is
     * detected on the untouched bytes, the body is converted to UTF-8 and only
     * then lower-cased: lower-casing first would corrupt multi-byte encodings
     * whose trail bytes fall into the A-Z range (e.g. GBK, Big5, Shift_JIS).
     *
     * @param string $url URL to fetch.
     *
     * @return bool True when the response status is 200.
     */
    private function open($url)
    {
        if ($this->request !== null) {
            return (int) $this->request->status === 200;
        }

        $this->request = new Snoopy();
        $this->request->fetch($url);
        if ((int) $this->request->status !== 200) {
            return false;
        }

        $this->charset = $this->detectCharset();
        $body = $this->request->results;

        if ($this->charset === 'windows-1252') {
            // Pages declared as windows-1252 only get their numeric entities
            // decoded; the bytes themselves are left as they are.
            $body = $this->uni_decode($body);
        } elseif (is_string($this->charset) && $this->charset !== '' && $this->charset !== 'utf-8') {
            try {
                $converted = mb_convert_encoding($body, 'UTF-8', $this->charset);
            } catch (\ValueError $e) {
                // PHP 8+: mbstring does not know this encoding; keep the raw bytes.
                $converted = false;
            }
            if (is_string($converted)) {
                $body = $converted;
            }
        }

        $this->request->results = $this->asciiLower($body);

        return true;
    }

    /**
     * Collect title, meta keywords, meta description and host IPv4 address.
     *
     * Every entry defaults to an empty string and stays empty when the page
     * cannot be fetched or the value is not present. Values come from the
     * lower-cased page body.
     *
     * @return array{title:string,keywords:string,desc:string,ip:string}
     */
    public function getwebinfo()
    {
        $info = array(
            'title' => '',
            'keywords' => '',
            'desc' => '',
            'ip' => '',
        );

        if (!$this->open($this->url)) {
            return $info;
        }

        $html = (string) $this->request->results;

        if (preg_match('/<title\b[^>]*>(.*?)<\/title>/si', $html, $titleMatch)) {
            $info['title'] = trim(strip_tags($titleMatch[1]));
        }

        $metaTags = $this->extractMetaTags($html);
        if (isset($metaTags['keywords'])) {
            $info['keywords'] = $metaTags['keywords'];
        }
        if (isset($metaTags['description'])) {
            $info['desc'] = $metaTags['description'];
        }

        $host = $this->hostOf($this->url);
        if ($host !== '') {
            // gethostbyname() returns its argument unchanged on failure, so the
            // result is only accepted when it really is an IPv4 address.
            $ip = gethostbyname($host);
            if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false) {
                $info['ip'] = $ip;
            }
        }

        return $info;
    }

    /**
     * Heuristically tell UTF-8 from GB2312 by looking at the first multi-byte
     * sequence that matches one of two byte patterns.
     *
     * A byte with its top three bits set, followed by two bytes with the high
     * bit set, yields "utf-8". Otherwise a byte with its top two bits set,
     * followed by one byte with the high bit set, yields "gb2312". Positions
     * past the end of the string count as 0.
     *
     * @param string $string Bytes to inspect.
     * @param mixed  $o      Unused; kept so existing callers keep working.
     *
     * @return string "utf-8", "gb2312", or "" when neither pattern is found.
     */
    public function t($string, $o)
    {
        $string = (string) $string;
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            if (ord($string[$i]) < 128) {
                continue;
            }

            if ((ord($string[$i]) & 224) === 224) {
                // Lead byte accepted: the next two bytes must have the high bit set.
                if (($this->byteAt($string, ++$i) & 128) === 128
                    && ($this->byteAt($string, ++$i) & 128) === 128
                ) {
                    return 'utf-8';
                }
            }

            // $i may have advanced above; the check deliberately uses the new position.
            if (($this->byteAt($string, $i) & 192) === 192) {
                if (($this->byteAt($string, ++$i) & 128) === 128) {
                    return 'gb2312';
                }
            }
        }

        return '';
    }

    /**
     * Replace five-digit decimal character references (&#10000; - &#99999;)
     * with the characters they stand for.
     *
     * References that do not denote a valid character are left untouched.
     *
     * @param string $str  Text containing numeric character references.
     * @param string $code Encoding of the returned string; UTF-8 by default.
     *
     * @return string|false Decoded text; false when iconv cannot convert to $code.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        $source = (string) $str;

        $decoded = preg_replace_callback(
            '/&#([1-9]\d{4});/',
            function ($entity) {
                return html_entity_decode($entity[0], ENT_QUOTES, 'UTF-8');
            },
            $source
        );
        if ($decoded === null) {
            // PCRE failure: fall back to the unmodified input.
            $decoded = $source;
        }

        if (strcasecmp((string) $code, 'utf-8') !== 0) {
            $decoded = iconv('UTF-8', $code, $decoded);
        }

        return $decoded;
    }

    /**
     * Report the character set of the page.
     *
     * The charset is determined once, on the raw bytes of the response, right
     * after the page was fetched; later calls return the same value.
     *
     * @return string|false Lower-case charset name, or false when the page could
     *                      not be fetched or no charset could be determined.
     */
    public function getcharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        return is_string($this->charset) && $this->charset !== '' ? $this->charset : false;
    }

    /**
     * Determine the charset of the freshly fetched response.
     *
     * Order of precedence: a whitelisted <meta> declaration (a declared gb2312
     * must additionally be confirmed by t()), then a charset found in the
     * response headers, then mbstring's content sniffing.
     *
     * @return string|false Lower-case charset name, or false when nothing matched.
     */
    private function detectCharset()
    {
        $body = (string) $this->request->results;

        if (preg_match('/<meta.+?charset=[^\w]?([-\w]+)/i', $body, $found)) {
            $declared = strtolower($found[1]);
            if (in_array($declared, $this->charset_arr, true)) {
                if ($declared !== 'gb2312' || $this->t($body, $declared) === $declared) {
                    return $declared;
                }
            }
        }

        if (!empty($this->request->headers)) {
            $headerText = strtolower(implode('|||', (array) $this->request->headers));
            if (preg_match('/charset=[^\w]?([-\w]+)/is', $headerText, $found)) {
                return $found[1];
            }
        }

        $candidates = array(
            'UTF-8', 'GB2312', 'GBK', 'BIG5', 'ASCII', 'EUC-JP',
            'Shift_JIS', 'CP936', 'ISO-8859-1', 'JIS', 'eucJP-win', 'SJIS-win',
        );
        $sniffed = mb_detect_encoding($body, $candidates);

        return $sniffed ? strtolower($sniffed) : false;
    }

    /**
     * Map every <meta> tag that has both a name and a content attribute to
     * name => content. Attribute order and quoting style do not matter; when a
     * name occurs more than once the last tag wins.
     *
     * @param string $html Page markup.
     *
     * @return array<string,string> Content values keyed by lower-cased name.
     */
    private function extractMetaTags($html)
    {
        $tags = array();
        if (!preg_match_all('/<\s*meta\b([^>]*)>/si', $html, $metaMatches, PREG_SET_ORDER)) {
            return $tags;
        }

        $attributePattern = '/([a-z][-\w:]*)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/i';
        foreach ($metaMatches as $meta) {
            // Drop the "/" of a self-closing tag so it cannot end up in an unquoted value.
            $attributeText = rtrim($meta[1], " \t\r\n/");
            if (!preg_match_all($attributePattern, $attributeText, $pairs, PREG_SET_ORDER)) {
                continue;
            }

            $attributes = array();
            foreach ($pairs as $pair) {
                // Exactly one of the groups 2 (double-quoted), 3 (single-quoted)
                // and 4 (unquoted) carries the value; the others are empty or absent.
                if (isset($pair[4]) && $pair[4] !== '') {
                    $value = $pair[4];
                } elseif (isset($pair[3]) && $pair[3] !== '') {
                    $value = $pair[3];
                } else {
                    $value = isset($pair[2]) ? $pair[2] : '';
                }
                $attributes[strtolower($pair[1])] = $value;
            }

            if (isset($attributes['name'], $attributes['content'])) {
                $tags[strtolower(trim($attributes['name']))] = $attributes['content'];
            }
        }

        return $tags;
    }

    /**
     * Extract the host name from a URL, with or without a scheme.
     *
     * @param string $url URL such as "https://example.com/path" or "example.com/path".
     *
     * @return string Host name, or "" when none can be determined.
     */
    private function hostOf($url)
    {
        $url = trim((string) $url);
        if ($url === '') {
            return '';
        }

        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // Without a scheme parse_url() treats the whole value as a path.
            $host = parse_url('http://' . $url, PHP_URL_HOST);
        }

        return is_string($host) ? $host : '';
    }

    /**
     * Lower-case the ASCII letters A-Z only, independent of the current locale,
     * so bytes of multi-byte characters are never touched.
     *
     * @param string $text Text to lower-case.
     *
     * @return string
     */
    private function asciiLower($text)
    {
        return strtr((string) $text, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz');
    }

    /**
     * Byte value at $index, or 0 when $index lies outside the string.
     *
     * @param string $string Source bytes.
     * @param int    $index  Zero-based position.
     *
     * @return int
     */
    private function byteAt($string, $index)
    {
        return isset($string[$index]) ? ord($string[$index]) : 0;
    }
}
?>