Near-perfect detection of a website's character encoding in PHP, based on Snoopy
Near-perfect detection of a website's character encoding in PHP, based on Snoopy. It is intended for PHP crawlers: the encoding is detected correctly about 99.9% of the time, and for a small number of sites it cannot be determined. Code source: www.siteyun.com. First, download Snoopy.class.php from the internet. Calling method: Echo $ go-> getCharset ();?> [Code] Url = $ url;} // open the website private function open ($ url) {if ($ this-> request! = Null) {if ($ this-> request-> status = 200) {return true;} else {return false ;}} else {$ this-> request = new Snoopy (); $ this-> request-> fetch ($ url); if ($ this-> request-> status = 200) {$ this-> request-> results = strtolower ($ this-> request-> results); $ charset = $ this-> getCharset (); if ($ charset! = "UTF-8") {if ($ charset = "windows-1252 ") {$ this-> request-> results = $ this-> uni_decode ($ this-> request-> results );} else {$ this-> request-> results = mb_convert_encoding ($ this-> request-> results, "UTF-8", $ charset) ;}} return true ;} else {return false ;}}// obtain the website title, keywords, descriptionpublic function getWebinfo () {$ info = array ('title' => '', 'keyword' => '', 'desc' =>'', 'IP' => ''); if (! $ This-> open ($ this-> url) {return $ info; exit;} // print_r ($ this-> request-> results); exit; preg_match ('/([^>] *) <\/Title>/Si', $ this-> request-> results, $ titlematch); if (isset ($ titlematch) & is_array ($ titlematch) & count ($ titlematch)> 0) {$ info ['title'] = strip_tags ($ titlematch [1]);} preg_match_all ('/<[\ s] * meta [\ s] * name = "? '.' ([^> "] *)"? [\ S] * '.' content = "? ([^> "] *)"? [\ S] * [\/]? [\ S] *>/Si', $ this-> request-> results, $ match); $ ft = 0; foreach ($ match [1] as $ mt) {if ($ mt = "keywords" | $ mt = "description") {$ ft = 1 ;}} if ($ ft = 0) {preg_match_all ('/<[\ s] * meta [\ s] * content = "? ([^> "] *)"? [\ S] * name = "? '.' ([^> "] *)"? [\ S] * [\/]? 
[\ S] *>/Si', $ this-> request-> results, $ match); if (isset ($ match) & is_array ($ match) & count ($ match) = 3) {$ originals = $ match [0]; $ names = $ match [2]; $ values = $ match [1]; if (count ($ originals) = count ($ names) & count ($ names) = count ($ values) {$ orders AGS = array (); for ($ I = 0, $ limiti = count ($ names); $ I <$ limiti; $ I ++) {$ cmdags [$ names [$ I] = array ('html' => htmlentities ($ originals [$ I]), 'value' => $ values [$ I]) ;}}} else {if (isset ($ match) & is_array ($ match) & count ($ match) = 3) {$ originals = $ match [0]; $ names = $ match [1]; $ values = $ match [2]; if (count ($ originals) = count ($ names) & count ($ names) = count ($ values) {$ orders AGS = array (); for ($ I = 0, $ limiti = count ($ names); $ I <$ limiti; $ I ++) {$ cmdags [$ names [$ I] = array ('html' => htmlentities ($ originals [$ I]), 'value' => $ values [$ I]) ;}}$ result = array ('tagags '=> $ tagags ); if (isset ($ result ['users' AGS] ['keyword'] ['value']) {$ info ['keyword'] = $ result ['tagags '] ['keyword'] ['value'];} else {$ info ['keyword'] = "";} if (isset ($ result ['tagags '] ['description'] ['value']) {$ info ['desc'] = $ result ['invalid AGS '] ['description'] ['value'];} else {$ info ['desc'] = "" ;}$ domain = preg_replace ('/http \: \/\ // Si ','', $ this-> url); $ ip = @ gethostbyname ($ domain); $ ip_arr = explode (". ", $ ip); if (count ($ ip_arr) = 4) {$ info ['IP'] = $ ip;} return $ info ;} public function t ($ string, $ o) {for ($ I = 0; $ I Open ($ this-> url) {return false; exit;} // first obtain the encoding preg_match ("/ Request-> results, $ temp )? Strtolower ($ temp [1]): ""; if ($ temp [1]! = "") {If (in_array ($ temp [1], $ this-> charset_arr) {if ($ temp [1] = "gb2312 ") {$ tmp_charset = $ this-> t ($ this-> request-> results, $ temp [1]); if ($ tmp_charset = $ temp [1]) {return $ temp [1] ;}} else {return $ temp [1] ;}} if (! 
Empty ($ this-> request-> headers) {// Get the encoding from the header $ hstr = strtolower (implode (" |