Site: http://www.dy2018.com/
Database: MySQL; account: root; password: 123456
Table creation statement: CREATE TABLE dy2018_url (id INT(9) NOT NULL AUTO_INCREMENT, url VARCHAR(2048) NOT NULL, status TINYINT(2) NOT NULL, PRIMARY KEY (id));
Code:
<?php
/**
 * Multi-process crawler for http://www.dy2018.com/.
 *
 * The parent process forks one worker per category URL. Each worker walks the
 * paginated list pages of its category, opens every detail page it finds,
 * extracts FTP download links and stores them in the `dy2018_url` table.
 *
 * Requires the CLI SAPI with the pcntl, posix, mbstring and mysqli extensions.
 */
declare(ticks = 1);

pcntl_signal(SIGQUIT, 'signal_handler');
pcntl_signal(SIGTERM, 'signal_handler');

// PIDs of the running worker processes (parent only), keyed by URL index.
$crawlers_pid = array();
// Number of workers the parent has reaped so far.
$finish_count = 0;
// mysqli connection handle; only set inside a worker process.
$con = null;

/**
 * Signal handler for the parent: forwards SIGTERM to every worker, then exits.
 *
 * @param int $signal Signal number delivered by pcntl.
 * @return void
 */
function signal_handler($signal)
{
    global $crawlers_pid;

    if ($signal == SIGQUIT || $signal == SIGTERM) {
        foreach ($crawlers_pid as $pid) {
            posix_kill($pid, SIGTERM);
        }
        echo "----------Crawl task Exit----------";
        exit();
    }
}

/**
 * Fetches a page with an HTTP GET request.
 *
 * @param string $url Page address.
 * @return string|false Page body, or false when the request fails.
 */
function get_page_content($url)
{
    return file_get_contents($url);
}

/**
 * Fetches a page with an HTTP POST request.
 *
 * @param string $url Page address.
 * @param array  $arr Form fields to send, as name => value pairs.
 * @return string|false Page body, or false when the request fails.
 */
function get_page_content_by_post($url, $arr)
{
    // The original encoded the fields into $arr but then read an undefined
    // $data variable, so the request body was always empty.
    $data = http_build_query($arr);

    $opts = array(
        'http' => array(
            'method'  => 'POST',
            // Header lines must be separated by CRLF.
            'header'  => "Content-Type: application/x-www-form-urlencoded\r\n"
                       . "Content-Length: " . strlen($data) . "\r\n",
            'content' => $data,
        ),
    );
    $context = stream_context_create($opts);

    return file_get_contents($url, false, $context);
}

/**
 * Main process: forks one worker per entry URL and waits for all of them.
 *
 * @return void
 */
function run_dy2018()
{
    global $crawlers_pid;
    global $finish_count;
    global $con;

    $crawl_urls = array(
        'http://www.dy2018.com/html/tv/hytv/',
        'http://www.dy2018.com/html/tv/hepai/',
        'http://www.dy2018.com/html/tv/gangtai/',
        'http://www.dy2018.com/html/tv/oumeitv/',
        'http://www.dy2018.com/html/tv/rihantv/',
        'http://www.dy2018.com/html/tv/tvzz/',
    );
    // Numbered category pages /0/ through /20/.
    for ($n = 0; $n <= 20; $n++) {
        $crawl_urls[] = 'http://www.dy2018.com/' . $n . '/';
    }

    foreach ($crawl_urls as $i => $url) {
        $pid = pcntl_fork();

        if ($pid == -1) {
            echo "system error. Check it now! ";
            // Do not leave already-started workers running unattended.
            foreach ($crawlers_pid as $started_pid) {
                posix_kill($started_pid, SIGTERM);
            }
            exit(1);
        }

        if ($pid > 0) {
            // Parent: remember the worker and fork the next one.
            $crawlers_pid[$i] = $pid;
            continue;
        }

        // ---- Worker process ----
        // A worker must not signal its siblings, so drop the inherited PID
        // list and restore the default signal dispositions.
        $crawlers_pid = array();
        pcntl_signal(SIGQUIT, SIG_DFL);
        pcntl_signal(SIGTERM, SIG_DFL);

        // Report errors through return values on every PHP version
        // (PHP 8.1+ throws exceptions by default).
        mysqli_report(MYSQLI_REPORT_OFF);
        // Database name restored to lowercase: the scraped source shows
        // "MySQL", which is assumed to be a capitalisation artefact.
        $con = mysqli_connect('localhost', 'root', '123456', 'mysql');
        if (!$con) {
            die('Could not connect: ' . mysqli_connect_error());
        }

        crawl_process($url);

        mysqli_close($con);
        // Without this exit the worker would fall back into the fork loop and
        // spawn crawlers of its own.
        exit(0);
    }

    // A blocking pcntl_waitpid() would delay signal handling, so poll with
    // WNOHANG and sleep between polls; sleep() returns early on a signal.
    $total = count($crawlers_pid);
    while ($finish_count < $total) {
        $status = 0;
        $reaped = pcntl_waitpid(-1, $status, WNOHANG);

        if ($reaped > 0) {
            $finish_count++;
            // Forget the PID so a later signal is not sent to a recycled PID.
            $slot = array_search($reaped, $crawlers_pid);
            if ($slot !== false) {
                unset($crawlers_pid[$slot]);
            }
            continue;
        }
        if ($reaped == -1) {
            // No child left to wait for.
            break;
        }
        sleep(1);
    }

    echo "----------Crawl task Finish----------";
}

/**
 * Crawls one category: walks its list pages until one fails to load, visits
 * every detail page linked from them and stores the FTP links found there.
 *
 * Uses the global mysqli handle $con opened by the worker process.
 *
 * @param string $url Category entry URL, ending with a slash.
 * @return void
 */
function crawl_process($url)
{
    global $con;

    echo "Start handle URL:" . $url;

    if (!($con instanceof mysqli)) {
        echo "No database connection available for: " . $url . "\n";
        return;
    }

    // Bound parameter: scraped text is never spliced into the SQL string.
    $stmt = mysqli_prepare($con, 'INSERT INTO dy2018_url (url, status) VALUES (?, 0)');
    if (!$stmt) {
        echo "Prepare failed: " . mysqli_error($con) . "\n";
        return;
    }
    $ftp_link = '';
    mysqli_stmt_bind_param($stmt, 's', $ftp_link);

    // Detail page links look like /i/12345.html (the dot is now escaped).
    $info_url_pattern = '/\/i\/\d+\.html/';
    // "rmvb" is listed before "rm" and the match must end on a word boundary,
    // otherwise "file.rmvb" is cut down to "file.rm". A literal dot is
    // required before the extension.
    $ftp_url_pattern = '/ftp:\/\/.*?\.(?:swf|avi|flv|mpg|rmvb|rm|mov|wav|asf|3gp|mkv)\b/i';

    $page_idx = 1;
    while (true) {
        $page_url = get_page_index_url($url, $page_idx);
        // echo instead of printf: a "%" inside the URL must not be treated as
        // a format directive.
        echo "Start crawl URL:" . $page_url . "\n";

        $page_content = get_page_content($page_url);
        if (!is_valid_page($page_content)) {
            // First list page that cannot be loaded ends the category.
            break;
        }

        $matches_urls = array();
        preg_match_all($info_url_pattern, $page_content, $matches_urls);
        // A list page may link the same detail page more than once.
        $detail_paths = array_values(array_unique($matches_urls[0]));

        foreach ($detail_paths as $detail_path) {
            $detail_url = 'http://www.dy2018.com' . $detail_path;
            $detail_page_content = get_page_content($detail_url);

            if (is_valid_page($detail_page_content)) {
                // The site serves GBK; store links as UTF-8.
                $detail_page_content = mb_convert_encoding($detail_page_content, "UTF-8", "GBK");

                $ftp_urls = array();
                preg_match_all($ftp_url_pattern, $detail_page_content, $ftp_urls);

                foreach (array_values(array_unique($ftp_urls[0])) as $ftp_link) {
                    // $ftp_link is bound by reference to the statement.
                    if (!mysqli_stmt_execute($stmt)) {
                        echo "Insert failed: " . mysqli_stmt_error($stmt) . "\n";
                    }
                }
            }

            // Throttle: one detail page per second.
            sleep(1);
        }

        $page_idx++;
    }

    mysqli_stmt_close($stmt);
}

/**
 * Builds the URL of the list page with the given page number.
 *
 * @param string $url Category entry URL, ending with a slash.
 * @param int    $idx 1-based page number.
 * @return string index.html for page 1, index_N.html for N > 1, and the
 *                unchanged $url for any other value.
 */
function get_page_index_url($url, $idx)
{
    if ($idx == 1) {
        return $url . 'index.html';
    }
    if ($idx > 1) {
        return $url . 'index_' . $idx . '.html';
    }
    return $url;
}

/**
 * Decides from the fetched content whether a page exists.
 *
 * @param string|false $content Result of get_page_content().
 * @return bool True when the fetch returned a non-empty body.
 */
function is_valid_page($content)
{
    return $content !== false && $content !== null && $content !== '';
}

run_dy2018();
?>
Results:
Crawler _ Movie FTP