Supergod page capture program jump
_ Init_request ($ request); $ this-> _ init_curl ();} protected function _ init_var () {$ this-> pageCount = 0 ;} protected function _ init_request (request $ request) {$ this-> request = $ request; // $ this-> request-> cache_path = _ DIR __. '/sjm_cache/'; // $ this-> request-> fetch_item_query = '# J_posts_list. subject. title a'; // $ this-> request-> fetch_page_current = '. j_page_wrap. pages strong '; // $ this-> request-> base_url =' http://bbs.sijiaomao.com/index.php?m=bbs&c=thread&fid=10&page=%d ';} Protected function _ init_curl () {$ this-> curl = new CurlMulti (); $ this-> cacheDir = $ this-> request-> cache_path. 'cache'; if (! Is_dir ($ this-> cacheDir) {mkdir ($ this-> cacheDir, 777, true);} $ this-> cacheDataDir = $ this-> request-> cache_path. 'data'; if (! Is_dir ($ this-> cacheDataDir) {mkdir ($ this-> cacheDataDir, 777, true );} $ this-> curl-> cache = array ('dir' => $ this-> cacheDir, 'on' => true, 'expire '=> 3600*24 ); $ this-> curl-> maxThread = 10; $ this-> curl-> opt [CURLOPT_CONNECTTIMEOUT] = 10;} public function fetch_list () {$ this-> _ add_fetch_list_url (); $ this-> curl-> start (); $ this-> _ save_article_list ();} public function fetch_article () {foreach ($ this-> article_list as $ k => $ v) {$ this-> curl-> add (array ('URL' => $ v ['href ']), array ($ this,' _ success_article '));} $ this-> curl-> start ();} public function display () {printf ("\ n capturing % d pages in total \ n article list % d article \ n related articles % d article \ n article directory stored in % s \ n ", $ this-> pageCount + $ this-> articleCount, $ this-> pageCount, count ($ this-> article_list), $ this-> cacheDataDir. '/list. php ');} public function fetch () {return sprintf ("\ n crawls % d pages in total \ n article list % d articles \ n related articles % d articles \ n article directories are stored in % s \ n ", $ this-> pageCount + $ this-> articleCount, $ this-> pageCount, count ($ this-> article_list), $ this-> cacheDataDir. '/list. php ');} public function _ add_fetch_list_url ($ page = 1) {$ this-> curl-> add (array ('URL' => sprintf ($ this-> request-> base_url, $ page ), 'args' => array ('page' => $ page), array ($ this, '_ success_list');} protected function _ save_article_list () {$ res = file_put_contents ($ this-> cacheDataDir. '/list. php', sprintf ("
Article_list, true); // sort correlations/* uasort ($ this-> article_list, function ($ a, $ B) {preg_match_all ('# ([a-zA-Z] +) # is', $ a ['title'], $ match ); $ a_title = strtoupper (implode ("", $ match [0]); preg_match_all ('# ([a-zA-Z] +) # AIS ', $ B ['title'], $ match); $ B _title = strtoupper (implode ("", $ match [0]); return $ a_title> $ B _title ;}); */$ res = file_put_contents ($ this-> cacheDataDir. '/list.txt', array_map (function ($ a_list ){ $ Str = sprintf ("title: % s \ t hyperlink: % s \ n", str_replace ("", "", $ a_list ['title']), $ a_list ['href ']); return $ str ;}, $ this-> article_list); return $ res;} public function _ success_article ($ r, $ param) {++ $ this-> articleCount;} public function _ success_list ($ r, $ param) {++ $ this-> pageCount; $ html = phpQuery :: newincluenthtml ($ r ['content']); $ list = $ html [$ this-> request-> fetch_item_query]; foreach ($ list as $ v) {$ v = pq ($ v); $ Item = array ("title" = >$ v-> attr ('title ')? $ V-> attr ('title'): $ v-> text (), "href" => real_url ($ v-> attr ('href '), $ this-> request-> base_url); $ this-> article_list [md5 ($ item ['href '])] = $ item ;} $ page_current = $ html [$ this-> request-> fetch_page_current]; if ($ page_current-> next ()-> text ()) {$ page = ++ $ param ['Page']; $ this-> _ add_fetch_list_url ($ page);} phpQuery: unloadDocuments ();}} class request {/* url */public $ base_url;/* cache file path */public $ cache_path;/* get CSS selector for element */public $ fetch_item_query;/* CSS selector for the current page element */public $ fetch_page_current; static $ instance; static public function getInstance () {if (empty (self: $ instance) {self: $ instance = new self;} return self: $ instance;} private function _ construct () {$ this-> _ init_base ();} function _ init_base () {$ this-> cache_path = _ DIR __. '/'. trim ($ _ POST ['cache _ path'], '/'). '/'; $ this-> fetch_item_query = $ _ POST ['Fetch _ item_query ']; $ this-> fetch_page_current = $ _ POST ['fetch _ page_current']; $ this-> base_url = $ _ POST ['URL'];} function request () {if (strstr ($ _ POST ['URL'], '? ') {$ Url = sprintf ("% s & auth = % s", $ _ POST ['URL'], $ auth );} else {$ url = sprintf ("% s? Auth = % s ", $ _ POST ['URL'], $ auth) ;}$ param = array (); if (isset ($ _ POST ['param']) {foreach ($ _ POST ['param'] as $ k => $ item) {if (! Empty ($ item ['method']) &! Empty ($ item ['name']) {$ param [$ item ['method'] [$ item ['name'] = $ item ['value'] ;}} if (isset ($ param ['get']) &! Empty ($ param ['get']) {foreach ($ param ['get'] as $ name => $ value) {$ url = sprintf ("% s & % s = % s", $ url, $ name, $ value) ;}$ post_data = null; if (isset ($ param ['post']) &! Empty ($ param ['post']) {$ post_data = $ param ['post'] ;}}?> Fetch_list (); // $ myCurl-> fetch_article (); myDebug: set_end ();} else {$ _ POST ['URL'] =' http://www.oschina.net/code/tag/php?show=time&lang=&catalog=&p=%d '; $ _ POST ['cache _ path'] = 'oschina'; $ _ POST ['fetch _ item_query'] = '. code_list ul li. code_title> A'; $ _ POST ['fetch _ page_current '] = '. pager li. current ';}?> Page crawler