A simple web page collection tool compiled by programmers. Currently, it only supports collecting page resources that can be accessed without login or permission authentication, including articles and download resources; the project address is http://github.com/itziy/gather; the IP proxy function is not yet supported, so extend the tool yourself if you need it. Currently, only…
A simple web page collection tool compiled by programmers. Currently, it only supports collecting page resources that can be accessed without login or permission authentication, including articles and download resources; the project address is http://github.com/itziy/gather; the IP proxy function is not yet supported, and anyone who needs it can extend the tool themselves; currently does not support only…
A simple web page collection tool for programmers
Currently, only page resources that can be accessed without logon and permission authentication are supported, including articles and download resources.
; Project address http://github.com/itziy/gather
Currently, the IP proxy function is not supported. If you need it, you can expand it on your own.
Currently, the collection of temporary resources is not supported.
;
; Function Description
; 1. Supports downloading resources that sit behind multiple redirects
; 2. Supports character replacement in the collected content
; 3. Supports resuming an interrupted collection run: set the PAGE_START parameter to the page to continue from.
;
; Author: Rain
Contact QQ: 563268276
URL: www.94cto.com / www.itziy.com / www.verypan.com
; Time:
; Version: V2
; Copyright: You may freely modify, publish, and study this tool. It must not be used for illegal purposes; otherwise, you bear the consequences yourself.
;
; Usage
# The default configuration file is conf.ini in the current execution directory.
# php gather.php conf.ini
Note: the program assumes a UTF-8 command line on Linux and a GBK command line on Windows, and converts its output from UTF-8 to GBK on Windows.
$ Cv) {if (! Defined ($ ck) define ($ ck, $ cv);} // init global variableforeach ($ confArr ['variable'] as $ vk => $ vv) {$ vk = $ vv;} $ ga = new Gather (); $ ga-> run (); class Gather {public function _ construct () {$ this-> init_check ();} public function run () {global $ table_mapping, $ text_filter, $ preDownArr, $ downArr, $ need_host_check_field_name; for ($ page = PAGE_START; $ page <= PAGE_COUNT; $ page ++) {$ this-> write ('start collection list '. $ page. 'page Content... '); $ list_content = $ this-> get (sprintf (WEB_LIST_URL, $ page); if (empty ($ list_content )) {$ this-> write ('the content of the captured list page is empty, so filter out '); continue;} $ list_content = str_replace ("\ r ",'', $ list_content); $ list_content = str_replace ("\ n", '', $ list_content); // precisely locate the module content to be crawled if (! Preg_match (WEB_LIST_POSTION, $ list_content, $ list_search) {$ this-> write ('accurately matches the content on the list page, so filter out '); continue ;} if (isset ($ list_search [1]) $ list_content = $ list_search [1]; else $ list_content = $ list_search [0]; // endpreg_match_all (WEB_CONTENT_URL_REG, $ list_content, $ match); $ this-> write ('actual Number of captured content records '. count ($ match [0]); if (isset ($ match [1]) & is_array ($ match [1]) &! Empty ($ match [1]) $ match [0] = $ match [1]; if (is_array ($ match [0]) &! Empty ($ match [0]) {$ this-> write ('current list page, total matched :'. count ($ match [0]). 'content page'); foreach ($ match [0] as $ kval => $ val) {if (strpos ($ val, 'HTTP: ') = false) {if (substr ($ val, 0, 1) = '/') $ val = WEB_HOST. $ val; else $ val = WEB_HOST. '/'. $ val;} $ list_url = $ val; $ this-> write ('currently crawled name '. ($ kval + 1 ). 
'content record '); $ web_content = $ this-> get ($ val); $ content_url = $ val; if (empty ($ web_content )) {$ this-> write ('filtered out because the crawled content page is empty '); contin Ue;} $ web_content = str_replace ("\ r", '', $ web_content); $ web_content = str_replace (" \ n ", '[]', $ web_content ); $ SQL = "INSERT ". TABLE_NAME. "(". implode (',', array_keys ($ table_mapping )). ") VALUES ("; foreach ($ table_mapping as $ field => $ reg) $ SQL. = ':'. $ field. ','; $ SQL = substr ($ SQL, 0,-1); $ SQL. = ')'; if (IS_DEBUG) $ this-> write ('execute SQL '. $ SQL); $ dsn = 'mysql: dbname = '. DB_NAME. '; host = '. DB_HOST; try {$ dbh = New PDO ($ dsn, DB_USER, DB_PWD);} catch (PDOException $ e) {$ this-> write ('Connection failed :'. $ e-> getMessage (), true) ;}$ dbh-> query ("set names 'utf8'"); $…… = $ dbh-> prepare ($ SQL ); $ something-> closeCursor (); foreach ($ table_mapping as $ field => $ reg) {if (substr ($ reg, 0, 1 )! = '/') {$ Field = $ reg;} else {if (! Preg_match ($ reg, $ web_content, $ tmp_match) {if (defined ('item _ IMAGE_FIELD_NAME ') & ITEM_IMAGE_FIELD_NAME = $ field) $ field = ITEM_DEFAULT_IMG; else {$ this-> write ('Sorry, Matched Field :'. $ field. 'failed, filter this record '); continue 2 ;}} if (isset ($ tmp_match [1]) $ field = $ tmp_match [1]; $ field = $ this-> closetags ($ field); $ field = trim ($ field); // delete a javascript script $ field = preg_replace ('/
(.*?) <\/Script>/I ', '', $ field); // Delete the link $ field = preg_replace ('/(.*?) <\/A>/I ',' $ {2} ', $ field); // The image link address must be preg_match_all (' // I ', $ field, $ img_match); if (isset ($ img_match [2]) & is_array ($ img_match [2]) &! Empty ($ img_match [2]) {foreach ($ img_match [2] as $ img_val) {if (strpos ($ img_val, 'HTTP: ') = false) {$ new_val = $ img_val; if (substr ($ new_val, 0, 1 )! = '/') $ New_val = '/'. $ img_val; $ new_val = WEB_HOST. $ new_val; $ field = str_replace ($ img_val, $ new_val, $ field );}}} // end // perform special processing on the pre line feed in HTML $ field = preg_replace ('/
(.*?) <\/Pre>/I ','
${1}
', $ Field); preg_match_all ('/
(.*?) <\/Pre>/I ', $ field, $ pre_match); if (isset ($ pre_match [1]) & is_array ($ pre_match [1]) &! Empty ($ pre_match [1]) {foreach ($ pre_match [1] as $ pre_val) $ field = str_replace ($ pre_val, str_replace ("【]", "\ r \ n", $ pre_val), $ field);} // end} // before receiving data, returns all corresponding line breaks $ field = str_replace ('[]', "\ r \ n", $ field ); // if (is_array ($ text_filter )&&! Empty ($ text_filter) {foreach ($ text_filter as $ tk => $ TV) $ field = str_ireplace ($ tk, $ TV, $ field );} if (defined ('item _ IMAGE_FIELD_NAME ') & ITEM_IMAGE_FIELD_NAME ==$ field & (empty ($ field) | trim ($ field) = '#') $ field = ITEM_DEFAULT_IMG; if (defined ('item _ IMAGE_FIELD_NAME ') & ITEM_IMAGE_FIELD_NAME = $ field & stripos ($ field, 'http: ') = false) {if (substr ($ field, 0, 1) ='/') $ field = WEB _ HOST. trim ($ field); else $ field = WEB_HOST. '/'. trim ($ field);} if (is_array ($ need_host_check_field_name )&&! Empty ($ need_host_check_field_name) & in_array ($ field, $ need_host_check_field_name) & stripos ($ field, 'HTTP: ') = false) {if (substr ($ field, 0, 1) = '/') $ field = WEB_HOST.trim ($ field); else $ field = WEB_HOST. '/'. trim ($ field);} if (defined ('item _ DOWNLOAD_FIELD_NAME ') & ITEM_DOWNLOAD_FIELD_NAME = $ field) {if (! Empty ($ preDownArr) & is_array ($ preDownArr) {$ is_find_pre_down_url = false; foreach ($ preDownArr as $ pdk => $ pdv) {if (stripos ($ field, 'http: ') ===false) $ pdv. = $ field; $ pdv_content = $ this-> get ($ pdv, array ('referer :'. $ content_url), false); $ pdv_content = str_replace ("\ r", '', $ pdv_content); $ pdv_content = str_replace (" \ n ",'', $ pdv_content); if (! Preg_match ($ pdk, $ pdv_content, $ pdv_match) continue; $ field = trim (urldecode ($ pdv_match [1]); $ is_find_pre_down_url = true; break ;} if (! 
$ Is_find_pre_down_url) {$ this-> write ("failed to match the pre-download address, so filter this record"); continue 2 ;}}} if (defined ('item _ DOWNLOAD_FIELD_NAME ') & ITEM_DOWNLOAD_FIELD_NAME ==$ field & stripos ($ field, 'HTTP:') == false) {if (is_array ($ downArr )&&! Empty ($ downArr) {foreach ($ downArr as $ d_url) {if (substr ($ field, 0, 1 )! = '/') $ Field = '/'. $ field; $ d_url. = $ field; if (defined ('item _ DOWNLOAD_FIELD_NAME ') & ITEM_DOWNLOAD_FIELD_NAME = $ field & (stripos ($ d_url ,'&')! = False) {$ find_real_down = false; $ result_tmp = $ this-> get ($ d_url, array ('Referer :'. $ list_url), true); if (stripos ($ result_tmp, 'location :')! = False) {if (IS_DEBUG) $ this-> write ("Check the jump information \ n ************************* **************************************** * *********** \ n ". $ result_tmp. "\ n ************************************* **************************************** *"); preg_match ('/Location :(. *?) \ R/I ', $ result_tmp, $ url_tmp_match); if (isset ($ url_tmp_match [1]) {$ find_real_down = true; $ field = trim ($ url_tmp_match [1]); break;} elsecontinue;} if (! $ Find_real_down) {$ this-> write (':'. $ d_url. 'location does not exist, so filter this record '); continue 3 ;}} else {$ find_real_down = true; break ;}} else {if (substr ($ field, 0, 1) = '/') $ field = WEB_HOST.trim ($ field); else $ field = WEB_HOST. '/'. trim ($ field); if (defined ('item _ DOWNLOAD_FIELD_NAME ') & ITEM_DOWNLOAD_FIELD_NAME ==$ field & (stripos ($ field ,'&')! = False) {$ result_tmp = $ this-> get ($ field, array ('Referer :'. $ list_url), true); if (stripos ($ result_tmp, 'location :')! = False) {if (IS_DEBUG) $ this-> write ("Check the jump information \ n ************************* **************************************** * *********** \ n ". $ result_tmp. "\ n ************************************* **************************************** *"); preg_match ('/Location :(. *?) 
\ R/I ', $ result_tmp, $ url_tmp_match); if (isset ($ url_tmp_match [1]) $ field = trim ($ url_tmp_match [1]) ;}}}// for the download url, make the Last Judgment if (defined ('item _ DOWNLOAD_FIELD_NAME ') & amp; ITEM_DOWNLOAD_FIELD_NAME = $ field) {if (IS_DEBUG) $ this-> write ($ field); if (! ($ Url_ret = @ get_headers ($ field, 1) {$ this-> write (':'. $ field. 'invalid, so filter this record '); continue 2;} if (stripos ($ url_ret [0], '100 OK') = false) {$ this-> write (':'. $ field. 'error 404 returned, so this record is filtered '); continue 2;} if (stripos ($ field, 'pan .baidu.com') = false) {if ($ url_ret ['content-length'] <100) {$ this-> write (':'. $ field. 'size of returned content is '. $ url_ret ['content-length']. ', so filter this record'); continue 2 ;}}$ $ field = trim ($ field); if (IS_DEB UG) $ this-> write ('*'. 'Field :'. $ field. 'value :'. "\ n ************************************* * ************************** \ n ". $ field. "\ n ************************************* ***************************"); $ something-> bindValue (':'. $ field, trim ($ field);} if (INSERT_DB) $ something-> execute (); $ something-> closeCursor (); // $ dbh = null; $ this-> write ('resting, pausing '. SLEEP_TIME. 'Second to continue capturing... '); sleep (SLEEP_TIME) ;}} else {$ this-> write ('contents are not captured on the list page, so filter out') ;}}$ This-> write ('', true);} protected function closetags ($ html) {// No need to complete the tag $ arr_single_tags = array ('meta', 'img ', 'br', 'link', 'region '); // match the start label preg_match_all ('# <([a-z] + )(? :.*)? (?
# IU ', $ html, $ result); $ openedtags = $ result [1]; // tag preg_match_all ('#
# IU ', $ html, $ result); $ closedtags = $ result [1]; // calculates the number of disabled tags, if the same, the returned html data $ len_opened = count ($ openedtags); if (count ($ closedtags) = $ len_opened) return $ html; // sorts the array, put the last enabled tag at the beginning $ openedtags = array_reverse ($ openedtags); // traverse the Enable tag array for ($ I = 0; $ I <$ len_opened; $ I ++) {// if you need to complete the tag if (! In_array ($ openedtags [$ I], $ arr_single_tags) {// if this tag is not in the disabled Tag if (! In_array ($ openedtags [$ I], $ closedtags) {// directly complete the closed tag $ html. ='
';} Else {unset ($ closedtags [array_search ($ openedtags [$ I], $ closedtags)]) ;}} return $ html;} protected function init_check () {if (! $ This-> check_curl_support () $ this-> write ('Sorry, Please enable the support of the CURL Class Library first, otherwise it cannot be executed ', true ); $ this-> check_mysql_connect (); $ this-> write ('program initialization check passed, execute the subsequent process... ');} private function get ($ url, $ data = array (), $ showhead = false) {$ url = trim (html_entity_decode ($ url )); $ this-> write ('start to capture :'. $ url); $ ch = curl_init (); curl_setopt ($ ch, CURLOPT_URL, $ url); // curl_setopt ($ ch, CURLOPT_USERAGENT, "Baiduspider + (+ http://www.baidu.com/search/spider.htm ) "); Curl_setopt ($ ch, CURLOPT_USERAGENT," Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0) "); curl_setopt ($ ch, CURLOPT_HEADER, $ showhead ); curl_setopt ($ ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt ($ ch, CURLOPT_HTTPHEADER, $ data); $ ret = curl_exec ($ ch); $ error = curl_error ($ ch ); curl_close ($ ch); unset ($ ch); if (! Empty ($ error) {$ this-> write ('program capture URL :'. $ url. 'error occurred. error message :'. $ error); return false;} if (WEB_CHARSET! = 'Utf-8') $ ret = iconv (WEB_CHARSET, 'utf-8', $ ret); return $ ret;} // when check finish, mysql connect will auto closeprivate function check_mysql_connect () {$ con = mysql_connect (DB_HOST, DB_USER, DB_PWD); if (! Is_resource ($ con) $ this-> write ('The program cannot be successfully linked to the database, specific error message: '. mysql_error (), true); if (! Mysql_select_db (DB_NAME, $ con) $ this-> write ('The program cannot be linked to the database :'. DB_NAME. ', specific error message :'. mysql_error (), true); mysql_close ($ con);} private function check_curl_support () {if (! Extension_loaded ('curl') |! 
Function_exists ('curl _ init ') return false; return true;} private function write ($ str, $ end = false) {if (PATH_SEPARATOR = ':') echo $ str, PHP_EOL, PHP_EOL; elseecho iconv ('utf-8', 'gbk', $ str), PHP_EOL, PHP_EOL; if ($ end) die ("program exit"); sleep (OUTPUT_SPEED );}}