Sharing the collection (scraping) code for a novel site. Recently, ET used the Locomotive collector to collect novels, but its rules often turned out to be unusable. I ran into a case where the Novel 520 site uses an iframe, so I wrote a collector myself. At first, I thought two regular expressions (one for the list, one for the content) would solve the problem, but writing it turned out to be complicated. I went through several versions, and the biggest trouble was how to encapsulate the collection code.
Recently, ET used the Locomotive collector to collect novels, but its rules often turned out to be unusable. I ran into a case where the Novel 520 site uses an iframe, so I wrote a collector myself. At first, I thought two regular expressions (one for the list, one for the content) would solve the problem, but writing it turned out to be complicated.
After several revisions, the biggest headache is keeping the cost of code encapsulation low when the code is used against different sites. Here we use a small decision-maker model. The encapsulated functions are shared below. On the next collection run, chapters that have already been collected should be skipped rather than fetched again. [After all, it is very depressing when collecting from a novel site with several thousand articles is interrupted once and cannot be picked up where it left off.]
PHP code
Class grep extends Controller {var $ tableName = 'grep'; var $ pagesize = 31; var $ order_string = "grep_order desc, grep_id desc"; var $ filter_field = "grep_title "; var $ check_repeat_field = "grep_title"; var $ buttons = array (); var $ description = "[crawling novels]"; function index () {// get the story list $ story_model = "story_model"; $ this-> load-> model ($ story_model); $ where = array ("story_id <445 "); $ rows_story = $ this-> $ story_model-> get ($ where); foreach ($ rows_story as $ key => $ val_story): if ($ key <237) continue; $ url =" http://www.xiaoxiaoshuo.net/ ". $ Val_story-> storycate_vtitle. "/". $ val_story-> story_vtitle; $ src_content = file_get_contents ($ url); $ src_content = iconv ("GBK", "UTF-8 // IGNORE", $ src_content ); $ src_content = str_replace ("/style = \" border-width: 0px \ s * 1px \ s * 1px \ s * 0px; border-color: # C8D8B8; border-style: solid; padding: 3px; float: left; width: 313px; \ "/I", "", $ src_content ); $ src_content = str_replace ("style = \" BORDER-RIGHT: # c8d8b8 1px solid; PADDING-RIGHT: 3px; BORDER-TOP: # c8d8b8 0px solid; PADDING-LEFT: 3px; FLOAT: left; PADDING-BOTTOM: 3px; BORDER-LEFT: # c8d8b8 0px solid; WIDTH: 313px; PADDING-TOP: 3px; BORDER-BOTTOM: # c8d8b8 1px solid \ "", "", $ src_content); $ src_content = preg_replace ("/title = \" [^ \ "] * \"/iU ", "", $ src_content); $ src_content = preg_replace ("/title = \" [^ \ "] * \"/iU "," ", $ src_content ); $ src_content = preg_replace ("/
] *>/IU "," ", $ src_content); $ src_content = preg_replace ("/<\/LI [^>] *>/iU ","", $ src_content); $ src_content = preg_replace ("// iU", $ src_content, $ arr_dstorycate); $ dstorycate_arr = $ arr_dstorycate [1]; foreach ($ dstorycate_arr as $ key_dstorycate => $ val_dstory_cate) {preg_match_all ("/([^ <] *) <\/font>/I", $ scheme, $ dcate_title ); $ datacate ["dstorycate_pid"] = $ val_story-> story_id; $ datacate ["dstorycat E_title "] = $ dcate_title [1] [0]; // Obtain the category object. mark the previous category as downloaded $ dtitle = $ datacate [" dstorycate_title "]. $ obj_storycate = $ this-> check_dcate ($ dtitle, $ val_story); // pr ($ obj_storycate); if ($ obj_storycate-> dstorycate_ishot = 1) {$ this-> log ("$ val_story-> story_title-$ dtitle, skip"); continue;} preg_match_all ("/] *> (?! <\/A>) ([\ d \ D] *) <\/a>/iU ", $ val_dstory_cate, $ dinfo_list); $ list_story_url = $ dinfo_list [1]; $ list_story_title = $ dinfo_list [2]; $ story_url =" http://www.xiaoxiaoshuo.net/ ". $ Val_story-> storycate_vtitle. "/". $ val_story-> story_vtitle; $ this-> add_storyxxs_info ($ obj_storycate, $ list_story_url, $ list_story_title, $ story_url);} endforeach;} function status () {$ SQL = "select count (dstory_id) as all_story from dstory;"; $ query = $ this-> db-> query ($ SQL ); $ cont_all = $ query-> row (); echo $ cont_all-> all_story; $ SQL = "select count (dstory_id) as story1 from dstory where dstory_status = 1 "; $ query = $ this-> db-> query ($ SQL); $ cont_all = $ query-> row (); echo "--". $ cont_all-> story1; $ SQL = "select max (dstorycate_id) as max_id, max (dstorycate_pid) as max_pid from dstorycate "; $ query = $ this-> db-> query ($ SQL); $ cont_all = $ query-> row (); echo "--". $ cont_all-> max_id. "--". 
$ cont_all-> max_pid;}/*** download the section after 445 **/function index445 () {$ story_model = "story_model "; $ this-> load-> model ($ story_model); $ where = array ("story_id> 445 "); $ rows_story = $ this-> $ story_model-> get ($ where); foreach ($ rows_story as $ key => $ val_story ): // get the story_content $ story_url_arr = explode ("/", $ val_story-> story_url); $ story_url = $ story_url_arr [1]. "/". $ story_url_arr [2]. "/". $ story_url_arr [3]. "/". $ story_url_arr [4]; $ dest_url =" http://www.xiaoshuo520.com/ ". $ Story_url; $ src_content = CS_file_get_contents ($ dest_url); $ src_content = iconv ("GBK", "UTF-8 // IGNORE", $ src_content ); // group data by category. preg_match_all ("/([\ d \ D] *) $ val_dstory_cate) {preg_match_all ("/([\ d \ D] *) <\/div>/I ", $ val_dstory_cate, $ dcate_title); $ datacate ["dstorycate_pid"] = $ val_story-> story_id; $ datacate ["dstorycate_title"] = $ dcate_title [1] [0] // Obtain the category object. mark the previous category as downloaded $ dtitle = $ datacate ["dstory Cate_title "]; $ response = $ this-> check_dcate ($ dtitle, $ val_story); // pr ($ obj_storycate); if ($ obj_storycate-> dstorycate_ishot = 1) {$ this-> log ("$ val_story-> story_title-$ dtitle, skip"); continue;} preg_match_all ("/] *> (?! <\/A>) ([\ d \ D] *) <\/a>/iU ", $ val_dstory_cate, $ dinfo_list); $ list_story_url = $ dinfo_list [1]; $ list_story_title = $ dinfo_list [2]; $ this-> add_story520_info ($ obj_storycate, $ list_story_url, $ list_story_title, $ story_url);} endforeach ;} /*** obtain the title object based on the chapter standard and novel object **/function check_dcate ($ title, $ obj_story) {$ dstorycate_model = "dstorycate_model "; $ this-> load-> model ($ dstorycate_model); $ where = array ("dstorycate _ Pid = $ obj_story-> story_id "," dstorycate_title = '$ title' "); $ rows = $ this-> $ dstorycate_model-> get ($ where); if (! 
$ Rows) {$ datacate ["dstorycate_pid"] = $ obj_story-> story_id; $ datacate ["dstorycate_title"] = $ title; $ this-> $ dstorycate_model-> insert ($ datacate); $ obj_cate_id = $ this-> db-> insert_id (); $ where = array ("dstorycate_id = $ obj_cate_id ", "dstorycate_title = '$ title'"); $ rows = $ this-> $ dstorycate_model-> get ($ where); $ this-> log ("this book has no category, the novel $ obj_story-> story_title-$ title ");} else {$ this-> log (" already exists Related Novel category $ obj_story-> story_title-$ title, skip ");} $ obj_cate = $ rows [0]; $ SQL = "update dstorycate set dstorycate_published = 1 where dstorycate_pid = $ obj_story-> story_id & dstorycate_id <$ obj_cate-> dstorycate_id "; $ this-> db-> query ($ SQL); return $ obj_cate;}/***** add novel to */function add_story520_info ($ cate_obj, $ list_story_url, $ list_story_title, $ url) {$ dstory_model = "dstory_model"; $ this-> load-> mod El ($ dstory_model); $ min_key = intval ($ cate_obj-> dstorycate_pvcount); if (! $ Min_key) $ min_key = 0; foreach ($ list_story_url as $ key => $ val): if ($ key <$ min_key) {continue ;} $ this-> check_dstory ($ cate_obj ," http://www.xiaoshuo520.com/ ". $ Url. "/". $ val, $ list_story_title [$ key], "grep_520_info"); endforeach;} function add_storyxxs_info ($ cate_obj, $ list_story_url, $ list_story_title, $ url) {$ dstory_model = "dstory_model"; $ this-> load-> model ($ dstory_model); $ min_key = intval ($ cate_obj-> dstorycate_pvcount); if (! $ Min_key) $ min_key = 0; foreach ($ list_story_url as $ key => $ val): if ($ key <$ min_key) {$ this-> log ("$ cate_obj-> dstorycate_id No. $ cate_obj-> dstorycate_title ". $ list_story_title [$ key]. "Chapter $ key <$ min_key"); continue;} $ this-> check_dstory ($ cate_obj, $ url. "/". $ val, $ list_story_title [$ key], "grep_xxs_info"); endforeach ;}