Node.js aitaotu bulk image downloader: Node.js crawler, version 1.00

Source: Internet
Author: User
Tags: stdin

Even among HTTPS web pages, the way each site has to be parsed is not consistent, so expect to experiment before a crawler works for a particular site.

Code:

//======================================================
// aitaotu bulk image downloader - Node.js crawler 1.00
// November 14, 2017
//
// Flow: read the first page URL from stdin, walk the gallery page by page
// collecting .jpg addresses, then download every image into a freshly
// created, timestamped folder and record the outcome in log.txt.
//======================================================

// Built-in HTTPS module; used for both page and image requests.
const https = require("https");
// Built-in file-system module; creates the output directory, image files and log.
const fs = require("fs");
// cheerio offers jQuery-like selectors for locating image addresses and the
// next-page link inside the fetched HTML.
const cheerio = require("cheerio");

// Base against which the site's next-page hrefs are resolved.
const SITE_ROOT = "https://www.aitaotu.com/";
// Link text that identifies the "next page" anchor in the pager.
// NOTE(review): this literal looks machine-translated; the live site presumably
// uses its own (non-English) link text - confirm against the real page markup.
const NEXT_PAGE_LABEL = "next page";
// Socket idle timeouts, in milliseconds.
const PAGE_TIMEOUT_MS = 5000;
const IMAGE_TIMEOUT_MS = 7500;

// Addresses of every image found so far, accumulated across all crawled pages.
const pictures = [];

// Download counters, updated as image downloads settle.
let total = 0;
let succeed = 0;
let failed = 0;

/**
 * Convert an absolute URL into the options object accepted by https.request.
 *
 * @param {string} address - Absolute URL,
 *   e.g. https://img.aitaotu.cc:8089/pics/2017/0410/03/01.jpg
 * @returns {{hostname: string, port: number, path: string, method: string}}
 * @throws {TypeError} When address is not a valid absolute URL.
 */
function toRequestOptions(address) {
  const parsed = new URL(address);
  return {
    hostname: parsed.hostname,
    // URL#port is "" when the URL carries no explicit port; fall back to HTTPS's 443.
    port: parsed.port ? Number(parsed.port) : 443,
    path: parsed.pathname + parsed.search,
    method: "GET",
  };
}

//--------------------------------------
// Crawl a gallery page: collect its image addresses, then either follow the
// next-page link or, on the last page, start downloading everything found.
// pageUrl sample: https://www.aitaotu.com/rihan/30598.html
// pageUrl sample: https://www.aitaotu.com/rihan/33405.html
//--------------------------------------
/**
 * @param {string} pageUrl - Absolute URL of the gallery page to fetch.
 */
function crawl(pageUrl) {
  console.log("Current Page=" + pageUrl);

  let options;
  try {
    options = toRequestOptions(pageUrl);
  } catch (err) {
    // Typically a mistyped URL entered on stdin.
    console.log("Request error occurred, err.code: " + err.code);
    return;
  }

  // The request is kept in a local so that the timeout handler below always
  // destroys THIS request, never one created by a later call.
  const req = https.request(options, (resp) => {
    const chunks = [];

    resp.on("data", (chunk) => {
      chunks.push(chunk);
    });

    resp.on("end", () => {
      const body = Buffer.concat(chunks).toString();
      const $ = cheerio.load(body);

      // Collect the .jpg images of the current page.
      let picCount = 0;
      $("#big-pic p a img").each((index, element) => {
        const picUrl = $(element).attr("src");
        // attr() yields undefined for an <img> without src; skip those.
        if (picUrl && picUrl.includes(".jpg")) {
          pictures.push(picUrl);
          picCount++;
        }
      });
      console.log("Find the picture" + picCount + "Zhang.");

      // Look for the next-page link in the pager.
      let nextPageUrl = null;
      $(".pages ul li a").each((index, element) => {
        const anchor = $(element);
        const href = anchor.attr("href");
        if (href && anchor.text().includes(NEXT_PAGE_LABEL)) {
          // The site omits the origin from its hrefs; resolving against the
          // site root restores it without producing a doubled slash.
          nextPageUrl = new URL(href, SITE_ROOT).href;
          console.log("Find Next page.");
        }
      });

      if (nextPageUrl === null) {
        console.log(pageUrl + "It's the last page.");
        download(pictures);
      } else {
        crawl(nextPageUrl);
      }
    });

    resp.on("error", () => {
      console.log("Get Failed");
    });
  });

  // Timeout handling: give up on a socket that has been idle too long.
  req.setTimeout(PAGE_TIMEOUT_MS, () => {
    req.destroy();
  });

  // Error handling. Destroying the request on timeout surfaces here as ECONNRESET.
  req.on("error", (err) => {
    if (err.code === "ECONNRESET") {
      console.log("Socket port connection timed out.");
    } else {
      console.log("Request error occurred, err.code: " + err.code);
    }
  });

  // Send the request.
  req.end();
}

//--------------------------------------
// Download images
//--------------------------------------
/**
 * Create a timestamped output folder and start one download per image address.
 *
 * @param {string[]} urls - Image addresses to download.
 */
function download(urls) {
  const folder = "pictures(" + getNowFormatDate() + ")";

  // Created synchronously: the log file and the images are written into this
  // folder immediately afterwards, so it has to exist before continuing.
  try {
    fs.mkdirSync("./" + folder);
  } catch (err) {
    if (err.code !== "EEXIST") {
      console.log("Cannot create directory " + folder);
      console.log(err);
      return;
    }
    console.log("Directory" + folder + "already exists");
  }

  total = urls.length;
  console.log("A total of" + total + "pictures will be downloaded.");
  appendToLogFile(folder, "A total of" + total + "pictures will be downloaded.\n");

  for (const picUrl of urls) {
    downloadPic(picUrl, folder);
  }
}

//--------------------------------------
// Write log file
//--------------------------------------
/**
 * Append text to log.txt inside the given folder; a write failure is only
 * reported on the console.
 *
 * @param {string} folder - Output folder name, relative to the working directory.
 * @param {string} text - Text to append.
 */
function appendToLogFile(folder, text) {
  fs.appendFile("./" + folder + "/log.txt", text, (err) => {
    if (err) {
      console.log("Cannot write log file");
      console.log(err);
    }
  });
}

//--------------------------------------
// Get current time
//--------------------------------------
/**
 * Format the current local time for use in a folder name.
 *
 * @returns {string} YYYY-MM-DD followed directly by H_M_S; month and day are
 *   zero-padded to two digits, the time components are not padded.
 */
function getNowFormatDate() {
  const now = new Date();
  const month = String(now.getMonth() + 1).padStart(2, "0"); // getMonth() is 0-based
  const day = String(now.getDate()).padStart(2, "0");
  const datePart = [now.getFullYear(), month, day].join("-");
  const timePart = [now.getHours(), now.getMinutes(), now.getSeconds()].join("_");
  // NOTE(review): the date and time parts are joined with an empty string, as
  // in the original expression; a separator may have been intended - confirm.
  return datePart + timePart;
}

//--------------------------------------
// Download a single photo
// picUrl sample: https://img.aitaotu.cc:8089/pics/2017/0410/03/01.jpg
//--------------------------------------
/**
 * Fetch one image and save it under the output folder, logging the outcome
 * and updating the succeed/failed counters exactly once per image.
 *
 * @param {string} picUrl - Absolute URL of the image.
 * @param {string} folder - Output folder name, relative to the working directory.
 */
function downloadPic(picUrl, folder) {
  console.log("Picture:" + picUrl + "Download Start");

  // Guards against counting one image twice, e.g. when a timeout fires after
  // the response stream has already reported an error.
  let settled = false;

  const fail = (reason) => {
    if (settled) {
      return;
    }
    settled = true;
    console.log("[downloadpic] file " + picUrl + " download failed, because " + reason);
    appendToLogFile(folder, "File" + picUrl + "Download failed.\n");
    failed++;
  };

  let options;
  try {
    options = toRequestOptions(picUrl);
  } catch (err) {
    fail(err);
    return;
  }

  // File name is everything from the last "/" of the address (slash included).
  const picName = picUrl.slice(picUrl.lastIndexOf("/"));

  // Local, per-download request: concurrent downloads must not share one handle.
  const req = https.request(options, (resp) => {
    if (resp.statusCode !== 200) {
      resp.resume(); // drain the body so the socket is released
      fail(new Error("HTTP status " + resp.statusCode));
      return;
    }

    // Raw Buffers are collected instead of a "binary" string so the bytes
    // reach the file untouched.
    const chunks = [];
    resp.on("data", (chunk) => {
      chunks.push(chunk);
    });

    // A response cut off mid-stream reports its error here.
    resp.on("error", fail);

    resp.on("end", () => {
      if (settled) {
        return;
      }
      const fileName = "./" + folder + picName;
      fs.writeFile(fileName, Buffer.concat(chunks), (err) => {
        if (err) {
          fail(err);
          return;
        }
        settled = true;
        appendToLogFile(folder, "File" + picUrl + "Download succeeded.\n");
        console.log("File" + fileName + "Download succeeded");
        succeed++;
      });
    });
  });

  // Timeout handling.
  req.setTimeout(IMAGE_TIMEOUT_MS, () => {
    req.destroy();
  });

  // Error handling.
  req.on("error", fail);

  // Send the request.
  req.end();
}

//--------------------------------------
// Program entry
//--------------------------------------
/**
 * Prompt for the first page URL on stdin and start crawling from it.
 */
function getInput() {
  // \x1b[35m switches the prompt to purple; \x1b[39m restores the default colour.
  process.stdout.write("\x1b[35m Please enter the first page url:\x1b[39m");
  process.stdin.resume();
  process.stdin.setEncoding("utf8");
  // Only the first chunk of input is wanted, hence once() rather than on().
  process.stdin.once("data", (text) => {
    process.stdin.pause(); // leave input mode
    crawl(text.trim()); // trim() is required: the input ends with a newline
  });
}

// Call getInput and the program starts.
getInput();

November 14, 2017 18:28:37

Node.js aitaotu bulk image downloader: Node.js crawler, version 1.00

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.