Even among HTTPS web pages, the markup is not consistent from one site to the next, so the way you parse each one differs and you will need to experiment.
Code:
//======================================================
// aitaotu bulk image download - Node.js crawler 1.00
// November 14, 2017
//======================================================
'use strict';

// Built-in HTTP and HTTPS modules; the transport is picked per URL scheme.
const http = require('http');
const https = require('https');

// Built-in file module, used to create the output directory, the picture
// files and the log file.
const fs = require('fs');

// Built-in URL parser, used instead of slicing URL strings by hand.
const { URL } = require('url');

// The cheerio module provides jQuery-like selectors, used to find the picture
// addresses and the next-page link in the HTML.
const cheerio = require('cheerio');

// Socket idle timeouts for page requests and picture requests.
const PAGE_TIMEOUT_MS = 5000;
const PICTURE_TIMEOUT_MS = 7500;

// Link texts that mark the "next page" anchor in the pager.
// NOTE(review): the published listing matches 'next page', which looks like a
// translation of the site's own pager text; the Chinese label is accepted as
// well - confirm against the live markup.
const NEXT_PAGE_LABELS = ['next page', '下一页'];

// Picture addresses found so far; filled by crawl(), consumed by download().
const pictures = [];

// Download counters, reported once every picture has succeeded or failed.
let total = 0;
let succeed = 0;
let failed = 0;

/**
 * Resolves `ref` against `base`.
 *
 * @param {string} ref - absolute or relative URL reference
 * @param {string} base - absolute URL used when `ref` is relative
 * @returns {?string} the absolute URL, or null when it cannot be parsed
 */
function resolveUrl(ref, base) {
  try {
    return new URL(ref, base).href;
  } catch (err) {
    return null;
  }
}

/**
 * Sends a GET request and arms an idle timeout on it.
 *
 * Each call owns its request object, so a timeout only ever destroys the
 * request it was armed for, even when many requests are in flight.
 *
 * @param {URL} url - parsed target address
 * @param {number} timeoutMs - socket idle time after which the request is destroyed
 * @param {function} onResponse - called with the response object
 * @param {function} onError - called with the error when the request fails
 */
function startGet(url, timeoutMs, onResponse, onError) {
  const transport = url.protocol === 'http:' ? http : https;
  const req = transport.request(
    {
      hostname: url.hostname,
      // An empty port means "default for the scheme"; let the module pick it.
      port: url.port || undefined,
      path: url.pathname + url.search,
      method: 'GET',
    },
    onResponse
  );

  // Timeout processing
  req.setTimeout(timeoutMs, () => {
    const err = new Error(`no response within ${timeoutMs} ms`);
    err.code = 'ETIMEDOUT';
    req.destroy(err);
  });

  // Error handling
  req.on('error', onError);

  // End of request
  req.end();
}

//--------------------------------------
// Crawl a web page, collect its picture addresses, then crawl the next page
// pageUrl sample: https://www.aitaotu.com/rihan/30598.html
// pageUrl sample: https://www.aitaotu.com/rihan/33405.html
//--------------------------------------
/**
 * Fetches `pageUrl`, appends every `.jpg` picture address found under
 * `#big-pic p a img` to `pictures`, and then either crawls the next page or,
 * when no next-page link is found, starts the download.
 *
 * @param {string} pageUrl - absolute URL of the page to crawl
 */
function crawl(pageUrl) {
  console.log(`Current page=${pageUrl}`);

  let url;
  try {
    url = new URL(pageUrl);
  } catch (err) {
    console.log(`Invalid page url: ${pageUrl}`);
    return;
  }

  const onResponse = (resp) => {
    const chunks = [];

    resp.on('data', (chunk) => {
      chunks.push(chunk);
    });

    resp.on('end', () => {
      const body = Buffer.concat(chunks).toString();
      const $ = cheerio.load(body);
      let picCount = 0;

      // Find the pictures and put them into the array.
      $('#big-pic p a img').each((index, element) => {
        const src = $(element).attr('src');
        // attr() yields undefined for an img element without a src attribute.
        if (!src || src.indexOf('.jpg') === -1) {
          return;
        }
        const picUrl = resolveUrl(src, pageUrl);
        if (picUrl !== null) {
          pictures.push(picUrl);
          picCount++;
        }
      });
      console.log(`Found ${picCount} picture(s).`);

      // Find the next page.
      let nextPageUrl = null;
      $('.pages ul li a').each((index, element) => {
        const label = $(element).text();
        const href = $(element).attr('href');
        if (!href || !NEXT_PAGE_LABELS.some((l) => label.includes(l))) {
          return;
        }
        // The pager omits the site part of the address; add it back by
        // resolving against the site root of the current page.
        nextPageUrl = resolveUrl(href, `${url.origin}/`);
        console.log('Found next page.');
      });

      if (nextPageUrl === null) {
        console.log(`${pageUrl} is the last page.`);
        download(pictures);
      } else {
        crawl(nextPageUrl);
      }
    });

    resp.on('error', () => {
      console.log('Get failed');
    });
  };

  const onError = (err) => {
    if (err.code === 'ETIMEDOUT' || err.code === 'ECONNRESET') {
      console.log('Socket connection timed out.');
    } else {
      console.log(`Request error occurred, err.code: ${err.code}`);
    }
  };

  startGet(url, PAGE_TIMEOUT_MS, onResponse, onError);
}

//--------------------------------------
// Download images
//--------------------------------------
/**
 * Creates a time-stamped directory and downloads every picture into it.
 *
 * The log entry and the downloads are started from the mkdir callback so the
 * directory is known to exist before anything is written into it.
 *
 * @param {string[]} pictures - absolute picture URLs
 */
function download(pictures) {
  const folder = `pictures(${getNowFormatDate()})`;

  total = pictures.length;
  succeed = 0;
  failed = 0;

  // Create the directory.
  fs.mkdir(`./${folder}`, (err) => {
    if (err && err.code !== 'EEXIST') {
      console.log(`Cannot create directory ${folder}`);
      console.log(err);
      return;
    }
    if (err) {
      console.log(`Directory ${folder} already exists`);
    }

    console.log(`A total of ${total} pictures will be downloaded.`);
    appendToLogFile(folder, `A total of ${total} pictures will be downloaded.\n`);

    for (const picUrl of pictures) {
      downloadPic(picUrl, folder);
    }
  });
}

//--------------------------------------
// Write log file
//--------------------------------------
/**
 * Appends `text` to `log.txt` inside `folder`; a write failure is only
 * reported on the console.
 *
 * @param {string} folder - directory name, relative to the working directory
 * @param {string} text - text to append
 */
function appendToLogFile(folder, text) {
  fs.appendFile(`./${folder}/log.txt`, text, (err) => {
    if (err) {
      console.log('Cannot write log file');
      console.log(err);
    }
  });
}

//--------------------------------------
// Get current time
//--------------------------------------
/**
 * Formats the current local time for use in a directory name.
 *
 * @returns {string} `YYYY-MM-DD HH_MM_SS`, every field after the year
 *   zero-padded to two digits
 */
function getNowFormatDate() {
  const now = new Date();
  const pad = (value) => String(value).padStart(2, '0');

  const datePart = [
    now.getFullYear(),
    pad(now.getMonth() + 1), // getMonth() is zero-based
    pad(now.getDate()),
  ].join('-');
  const timePart = [
    pad(now.getHours()),
    pad(now.getMinutes()),
    pad(now.getSeconds()),
  ].join('_');

  return `${datePart} ${timePart}`;
}

/**
 * Prints and logs the totals once every picture has either succeeded or failed.
 *
 * @param {string} folder - directory that holds the log file
 */
function reportIfDone(folder) {
  if (succeed + failed < total) {
    return;
  }
  const summary = `Finished: ${succeed} succeeded, ${failed} failed, ${total} total.`;
  console.log(summary);
  appendToLogFile(folder, `${summary}\n`);
}

//--------------------------------------
// Download a single picture
// picUrl sample: https://img.aitaotu.cc:8089/pics/2017/0410/03/01.jpg
//--------------------------------------
/**
 * Downloads one picture into `folder`, named after the last path segment of
 * its URL, and records the outcome in the log file and the counters.
 *
 * @param {string} picUrl - absolute picture URL
 * @param {string} folder - target directory, relative to the working directory
 */
function downloadPic(picUrl, folder) {
  console.log(`Picture: ${picUrl} download start`);

  // Several events can report the same broken transfer; count it only once.
  let settled = false;

  const fail = (reason) => {
    if (settled) {
      return;
    }
    settled = true;
    console.log(`[downloadPic] file ${picUrl} download failed, because ${reason}`);
    appendToLogFile(folder, `File ${picUrl} download failed.\n`);
    failed++;
    reportIfDone(folder);
  };

  const done = (fileName) => {
    if (settled) {
      return;
    }
    settled = true;
    appendToLogFile(folder, `File ${picUrl} download succeeded.\n`);
    console.log(`File ${fileName} download succeeded`);
    succeed++;
    reportIfDone(folder);
  };

  let url;
  try {
    url = new URL(picUrl);
  } catch (err) {
    fail(err);
    return;
  }

  // Last path segment, leading slash included, e.g. "/01.jpg".
  const picName = url.pathname.slice(url.pathname.lastIndexOf('/'));
  const fileName = `./${folder}${picName}`;

  const onResponse = (resp) => {
    if (resp.statusCode !== 200) {
      // Drain the body so the socket is released, and do not save an error
      // page under a picture's name.
      resp.resume();
      fail(`HTTP status ${resp.statusCode}`);
      return;
    }

    const chunks = [];

    resp.on('data', (chunk) => {
      chunks.push(chunk);
    });
    resp.on('error', fail);
    resp.on('aborted', () => {
      fail('connection aborted');
    });
    resp.on('end', () => {
      if (settled) {
        return;
      }
      // Create the file.
      fs.writeFile(fileName, Buffer.concat(chunks), (err) => {
        if (err) {
          fail(err);
        } else {
          done(fileName);
        }
      });
    });
  };

  startGet(url, PICTURE_TIMEOUT_MS, onResponse, fail);
}

//--------------------------------------
// Program entry
//--------------------------------------
/**
 * Prompts for the first page URL on stdin and starts crawling from it.
 */
function getInput() {
  // \x1b[35m switches the prompt to purple; \x1b[039m restores the default colour.
  process.stdout.write('\x1b[35m Please enter the first page url:\x1b[039m');
  process.stdin.resume();
  process.stdin.setEncoding('utf8');

  process.stdin.once('data', (text) => {
    // Leave input mode so the process can exit when the work is done.
    process.stdin.pause();
    // trim() is a must: the input ends with a line break.
    crawl(text.trim());
  });
}

// Call getInput and the program starts.
getInput();
November 14, 2017 18:28:37
Node.js: aitaotu bulk image download crawler, version 1.00