Page-fetching functionality is used frequently in projects, and the fetching process commonly runs into two problems:
1. Page encodings are inconsistent: the local encoding is UTF-8 while the crawled page is GBK, so the fetched content appears garbled.
2. Some websites compress their pages (e.g. with gzip), which also causes the raw fetched result to be unreadable.
After searching online for solutions and testing locally, I put together the functions below. The code is also on GitHub at: https://github.com/lock-upme/Spider
A brief explanation of the main program:
It first fetches the page with file_get_contents; if that fails, it falls back to Snoopy, and finally it converts the encoding.
See the procedure below:
/** * Crawl Page content * * $obj = new Spider () * $result = $obj->spider (' http://www.test.com/1.html '); * * @author lock */class Spider {/** * Crawl page Contents * * @param string $url * @return string */public function Spider ($url) {set_ Time_limit, $result = self::filegetcontents ($url), if (Empty ($result)) {$result = Self::snoopy ($url);} if (empty ($result)) {return false;} $result = Self::array_iconv ($result); if (empty ($result)) {return false;} $result = Str_replace ("\ n", "", $result); return $result;} /** * Get page Content * * @param string $url * @return string */public function filegetcontents ($url) {//read-only 2 bytes if it is (16 binary) 1f 8b (10 31 139 opens gzip; $file = @fopen ($url, ' RB '); $bin = @fread ($file, 2); @fclose ($file); $strInfo = @unpack (' C2chars ', $bin); $typeCode = Intval ($strInfo [' chars1 ']. $strInfo [' chars2 ']); $url = ($typeCode ==31139)? ' compress.zlib://'. $url: $url; Ternary expression return @file_get_contents ($url);} /** * Get page Content * * @param string $url * @return string */public function snoopy ($url) {require_once ' SNOOPY.CLASS.PHP '; $snoopy = new Snoopy; $snoopy->agent = ' mozilla/5.0 (Windows NT 5.1) applewebkit/537.36 (khtml, like Gecko) CHROME/3 3.0.1750.146 safari/537.36 '; $snoopy->_fp_timeout = ten; $urlSplit = Self::urlsimplify ($url); $snoopy->referer = $ urlsplit[' domain '; $result = $snoopy->fetch ($url); return $snoopy->results; }/** * Encode data (from the network) * * @param array/string $data array * @param string $output converted encoding * @return returns the encoded data */public function Array_iconv ($data, $output = ' utf-8 ') {$encodeArr = array (' UTF-8 ', ' ASCII ', ' GBK ', ' GB2312 ', ' BIG5 ', ' JIS ', ' EU Cjp-win ', ' Sjis-win ', ' euc-jp '), $encoded = Mb_detect_encoding ($data, $ENCODEARR), if (Empty ($encoded)) {$encoded = ' UTF-8 '; }if (!is_array ($data)) {return @mb_convert_encoding ($data, $output, $encoded);} else {foreach ($data as $key + = $val) {$ 
Key = Self::array_iconv ($key, $output), if (Is_array ($val)) {$data [$key] = Self::array_iconv ($val, $output);} else {$data [$key] = @mb_convert_encoding ($data, $output, $encoded);}} return $data;}}}