Crawler: Movie FTP download addresses

Source: Internet
Author: User
Tags: signal handler

Site: http://www.dy2018.com/

Database: MySQL; account: root; password: 123456

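Build statement: CREATE TABLE dy2018_url (id INT(9) NOT NULL AUTO_INCREMENT, url VARCHAR(1024) NOT NULL, status TINYINT(2) NOT NULL, PRIMARY KEY (id)); (The original column length for url was lost in extraction; 1024 is chosen here to leave room for long FTP links.)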

Code:

<?php
/**
 * dy2018.com crawler.
 *
 * The parent process forks one child per entry URL. Each child opens its own
 * MySQL connection, walks the paginated listing under its entry URL, opens
 * every detail page it finds and stores the FTP download links in the
 * `dy2018_url` table. The parent reaps the children and forwards
 * SIGQUIT/SIGTERM to them.
 *
 * Requires the pcntl, posix, mbstring and mysqli extensions (CLI only).
 */
declare(ticks = 1);

pcntl_signal(SIGQUIT, 'signal_handler');
pcntl_signal(SIGTERM, 'signal_handler');

// Parent: pids of the forked crawlers, keyed by entry-URL index.
// Children reset this to an empty array right after the fork.
$crawlers_pid = array();
// Parent: number of children that have been reaped so far.
$finish_count = 0;
// Child: mysqli connection used by crawl_process(); null in the parent.
$con = null;

/**
 * Signal handler for SIGQUIT / SIGTERM.
 *
 * Sends SIGTERM to every pid recorded in $crawlers_pid, closes the database
 * connection if this process has one, and exits.
 *
 * @param int $signal Signal number delivered to this process.
 * @return void
 */
function signal_handler($signal)
{
    global $crawlers_pid, $con;

    if ($signal == SIGQUIT || $signal == SIGTERM) {
        foreach ($crawlers_pid as $pid) {
            posix_kill($pid, SIGTERM);
        }
        echo "----------Crawl task Exit----------";
        if ($con instanceof mysqli) {
            $con->close();
        }
        exit();
    }
}

/**
 * Fetches a page with an HTTP GET.
 *
 * @param string $url Page address.
 * @return string|false Page body, or false when the request fails.
 */
function get_page_content($url)
{
    return file_get_contents($url);
}

/**
 * Fetches a page with an HTTP POST.
 *
 * @param string $url Page address.
 * @param array  $arr Form fields; encoded with http_build_query().
 * @return string|false Response body, or false when the request fails.
 */
function get_page_content_by_post($url, $arr)
{
    // The original encoded into $arr but then read an undefined $data.
    $data = http_build_query($arr);

    $opts = array(
        'http' => array(
            'method'  => 'POST',
            // Header lines must be CRLF-separated.
            'header'  => "Content-type: application/x-www-form-urlencoded\r\n"
                       . 'Content-Length: ' . strlen($data) . "\r\n",
            'content' => $data,
        ),
    );

    $context = stream_context_create($opts);

    return file_get_contents($url, false, $context);
}

/**
 * Main process: forks one crawler per entry URL and waits for all of them.
 *
 * Never returns; the parent exits after the last child has been reaped and
 * each child exits after finishing its entry URL.
 *
 * @return void
 */
function run_dy2018()
{
    global $crawlers_pid, $finish_count, $con;

    $crawl_urls = array(
        'http://www.dy2018.com/html/tv/hytv/',
        'http://www.dy2018.com/html/tv/hepai/',
        'http://www.dy2018.com/html/tv/gangtai/',
        'http://www.dy2018.com/html/tv/oumeitv/',
        'http://www.dy2018.com/html/tv/rihantv/',
        'http://www.dy2018.com/html/tv/tvzz/',
    );
    // Numbered category listings /0/ .. /20/.
    for ($category = 0; $category <= 20; $category++) {
        $crawl_urls[] = 'http://www.dy2018.com/' . $category . '/';
    }

    foreach ($crawl_urls as $i => $url) {
        $pid = pcntl_fork();

        if ($pid == -1) {
            echo "system error. Check it now! ";
            exit(1);
        } elseif ($pid > 0) {
            // Parent: remember the child so it can be signalled and reaped.
            $crawlers_pid[$i] = $pid;
        } else {
            // Child. Forget the sibling pids copied over by fork() so that a
            // SIGTERM delivered to this child does not make it kill siblings.
            $crawlers_pid = array();

            // Make connection/query failures return false on every PHP
            // version instead of throwing (the default changed in PHP 8.1).
            mysqli_report(MYSQLI_REPORT_OFF);

            // NOTE(review): credentials and the "mysql" schema name are the
            // ones hard-coded in the original script; move them to
            // configuration before real use.
            $con = mysqli_connect('localhost', 'root', '123456', 'mysql');
            if (!$con) {
                die('Could not connect: ' . mysqli_connect_error());
            }
            // Detail pages are converted to UTF-8 before links are stored.
            $con->set_charset('utf8');

            crawl_process($url);

            $con->close();
            // The child must stop here; otherwise it would fall back into the
            // fork loop and spawn crawlers of its own.
            exit(0);
        }
    }

    // Reap children without blocking so that signal handlers keep running
    // (the original avoided a blocking pcntl_waitpid for that reason).
    $total = count($crawlers_pid);
    while ($finish_count < $total) {
        $reaped = pcntl_waitpid(-1, $status, WNOHANG);
        if ($reaped > 0) {
            $finish_count++;
            continue;
        }
        if ($reaped == -1) {
            // No children left to wait for.
            break;
        }
        sleep(1);
    }

    echo "----------Crawl task Finish----------";
    exit();
}

/**
 * Crawls one entry URL: walks its listing pages until one cannot be fetched,
 * follows every detail link and stores the FTP links found there.
 *
 * Uses the global mysqli connection $con opened by run_dy2018().
 *
 * @param string $url Entry (listing) URL, ending in "/".
 * @return void
 */
function crawl_process($url)
{
    global $con;

    echo "Start handle URL:" . $url;

    $info_url_pattern = '/\/i\/\d+\.html/';
    // "rmvb" is listed before "rm" and the match must end on a word boundary,
    // so the lazy quantifier cannot cut ".rmvb" down to ".rm".
    $ftp_url_pattern = '/ftp:\/\/[^"\'<>\r\n]+?\.(?:swf|avi|flv|mpg|rmvb|rm|mov|wav|asf|3gp|mkv)\b/i';

    // Scraped text goes through a bound parameter, never into the SQL string.
    $insert = $con->prepare('INSERT INTO dy2018_url (url, status) VALUES (?, 0)');
    if (!$insert) {
        echo 'Prepare failed: ' . $con->error . "\n";
        return;
    }

    $page_idx = 1;
    while (true) {
        $page_url = get_page_index_url($url, $page_idx);
        echo "Start crawl URL:" . $page_url . "\n";

        $page_content = get_page_content($page_url);
        if (!is_valid_page($page_content)) {
            // First listing page that cannot be fetched ends the pagination.
            break;
        }

        preg_match_all($info_url_pattern, $page_content, $matches_urls);

        // A listing may link the same detail page more than once.
        foreach (array_unique($matches_urls[0]) as $detail_path) {
            $detail_url = 'http://www.dy2018.com' . $detail_path;
            $detail_page_content = get_page_content($detail_url);

            if ($detail_page_content !== false) {
                $detail_page_content = mb_convert_encoding($detail_page_content, 'UTF-8', 'GBK');
                preg_match_all($ftp_url_pattern, $detail_page_content, $ftp_urls);

                foreach (array_unique($ftp_urls[0]) as $ftp_link) {
                    $insert->bind_param('s', $ftp_link);
                    if (!$insert->execute()) {
                        echo 'Insert failed: ' . $insert->error . "\n";
                    }
                }
            }

            // Throttle: one detail page per second.
            sleep(1);
        }

        $page_idx++;
    }

    $insert->close();
}

/**
 * Builds the URL of listing page number $idx under an entry URL.
 *
 * @param string $url Entry URL, ending in "/".
 * @param int    $idx 1-based page number.
 * @return string "index.html" for page 1, "index_<idx>.html" for later pages;
 *                the entry URL unchanged for any other $idx.
 */
function get_page_index_url($url, $idx)
{
    $idx_url = $url;

    if ($idx == 1) {
        $idx_url = $idx_url . 'index.html';
    } elseif ($idx > 1) {
        $idx_url = $idx_url . 'index_' . $idx . '.html';
    }

    return $idx_url;
}

/**
 * Decides from the fetched content whether the page exists.
 *
 * @param string|false $content Result of get_page_content().
 * @return bool True when $content is truthy.
 */
function is_valid_page($content)
{
    return $content ? true : false;
}

run_dy2018();
?>

Results:



Crawler _ Movie FTP

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.