I am a complete beginner, so there is a lot of room for improvement here; any guidance is very welcome.
The goal is to crawl the listing data under one top-level category of 58.com (58 Tongcheng, a classifieds site): http://cd.58.com/caishui/?PGTID=14397169455980.9244072034489363&ClickID=1
Simple analysis:
1. Fetch the listing data for each of the second-level categories under that top-level category.
2. Paging: the page is selected by a path segment such as `pn5`, where 5 is the page number, for example:
http://cd.58.com/dailijizh/pn5/?PGTID=117742907188706554997826849&ClickID=1
3. Phone number: it actually sits inside a hidden div that a visitor only sees after clicking "contact the merchant". A program, however, can simply read it directly from the page markup.
The code is as follows:
// Crawl 58.com list pages and store the raw HTML of every page in MongoDB.
var http = require('http');
var cheerio = require('cheerio');
var mongoose = require('mongoose');

// Declared with `var` so the connection is not leaked as an implicit global.
var db = mongoose.createConnection('mongodb://127.0.0.1:27017/crawl58');
db.on('error', function (error) {
  console.log('MongoDB connection error: ' + error);
});

// Storage schema: one document per fetched list page.
var mongooseSchema = new mongoose.Schema({
  url: { type: String }, // address the page was fetched from
  type: { type: String }, // category label (set manually in crawl())
  content: { type: String }, // raw HTML of the fetched list page
  updateTime: { type: Date, default: Date.now }, // data fetch time
  flag: { type: String, default: '0' } // '0' means the detail pages have not been crawled yet
});

// Model
var mongooseModel = db.model('pagelist', mongooseSchema);

// HTTP proxies that can be rotated between requests (only used when USE_PROXY is true).
var proxy = [
  { ip: '120.203.159.14', port: '8118' },
  { ip: '111.161.246.233', port: '8118' },
  { ip: '58.30.233.196', port: '8118' },
  { ip: '113.215.0.130', port: '80' },
  { ip: '183.218.63.179', port: '8181' },
  { ip: '120.198.245.36', port: '8080' },
  { ip: '120.203.158.149', port: '8118' },
  { ip: '124.240.187.89', port: '80' },
  { ip: '218.204.140.105', port: '8118' },
  { ip: '175.1.79.63', port: '80' }
];
var proxyIndex = 5;

// The original script disabled the proxy path because it was buggy, so direct
// requests remain the default. Set to true to route requests through `proxy`.
var USE_PROXY = false;

var flag = false; // set to true once the last list page has been fetched
var pageNo = 1;

// Browser session cookie captured by the original author.
// NOTE(review): this value is stale session data and its letter casing may have
// been damaged in transit; replace it with a fresh cookie before relying on it.
var capturedCookie = 'Userid360_xml=6b337b22e8098342c5f725d4f58495c6; time_create=1442050990222; id58=05dzxvsaeacjgzn9crp9ag==; bdshare_firstime=1419409592050; Bangbangguoqi=true; Ppqp=1; Tj_ershoubiz=true; Tj_ershounobiz=true; cnzzdata30017898=cnzz_eid%3d443859762-1419406677-%26ntime%3d1431055823; AG_FID=WEYSRNDPQWUJSUJF; Myfeet_tooltip=end; Quanmyy=forfirst; __ag_cm_=1439442804516; Bangbigtip2=1; Nearby=notshow; IPCITY=CD%7C%U6210%U90FD; sessionid=4019a46c-3b78-45f9-8af1-d5d576171b60; 58HOME=CD; bangbangid=1080863912864997567; cookieuid1=05dvuvxos3ztewlzhrnmag==; __autma=253535702.1952421463.1439442813.1439598477.1439610035.5; __autmc=253535702; __autmz=253535702.1439610035.5.2.autmcsr=cd.58.com|autmccn=(referral)|autmcmd=referral|autmcct=/caishui/; final_history=19947936375429%2c20303113064713%2c16884696076038%2c18742095746434%2c22669284355361; AG_FID=WEYSRNDPQWUJSUJF; __utmt_pagetracker=1; CITY=CD; hm_lvt_3bb04d7a4ca3846dcc66a99c3e861511=1439452109,1439458833,1439516942,1439598477; hm_lpvt_3bb04d7a4ca3846dcc66a99c3e861511=1439627751; __utma=253535702.1249887847.1419409519.1439618478.1439625451.38; __utmb=253535702.20.10.1439625451; __utmc=253535702; __utmz=253535702.1439625451.38.15.utmcsr=cd.58.com|utmccn=(referral)|utmcmd=referral|utmcct=/dailijizh/pn2/; new_session=0; init_refer=http%253a%252f%252fcd.58.com%252fdailijizh%252fpn2%252f%253fpgtid%253d198304873188692623092226919; New_uv=41';

/**
 * Build the request options for fetching `url` through the currently selected proxy.
 *
 * The request is sent to the proxy's host/port and the absolute target URL is
 * used as the request path.
 *
 * @param {string} url - absolute URL of the list page to fetch
 * @returns {Object} options object for http.get / http.request
 */
function buildProxyOptions(url) {
  return {
    host: proxy[proxyIndex].ip,
    port: proxy[proxyIndex].port,
    method: 'GET',
    path: url,
    // The key must be `headers`; the original `header` key is ignored by Node.
    headers: {
      'Host': 'cd.58.com',
      'Connection': 'keep-alive',
      'Cache-Control': 'max-age=0',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
      'Referer': url,
      // Accept-Encoding is deliberately not sent: the response body below is
      // concatenated as text and never decompressed.
      'Accept-Language': 'zh-CN,zh;q=0.8',
      'Cookie': capturedCookie
    }
  };
}

/**
 * Fetch the list page numbered `pageNo`, store its HTML, then schedule the next page.
 *
 * Stops once a fetched page no longer contains a "next" link inside the pager.
 * A request error or a save error is logged and ends the crawl (no retry).
 *
 * @returns {boolean|undefined} false when the crawl has already finished, otherwise undefined
 */
function crawl() {
  if (flag) {
    // pageNo was already advanced past the last fetched page, hence the -1.
    console.log('Crawl complete. Total Pages: ' + (pageNo - 1));
    // Release the connection so the process can exit.
    db.close();
    return false;
  }

  console.log('Crawling page number: ' + pageNo);

  // The URL has to be set manually: after one category has been captured,
  // switch to the next category.
  // var url = 'http://cd.58.com/dailijizh/pn' + pageNo + '/?PGTID=1007041601886955933022299' + pageNo + '&ClickID=1';
  var url = 'http://cd.58.com/nashuishenbao/pn' + pageNo + '/?PGTID=1007041601886955933022299' + pageNo + '&ClickID=1';
  var type = 'tax declaration'; // set manually; must correspond to the category in the URL

  var target = USE_PROXY ? buildProxyOptions(url) : url;

  http.get(target, function (res) {
    // Decode as UTF-8 so multi-byte characters split across chunks stay intact.
    res.setEncoding('utf8');
    var data = '';
    res.on('data', function (chunk) {
      data += chunk;
    });
    res.on('end', function () {
      // Parse the page only to find out whether there is a next page.
      var $ = cheerio.load(data);
      if ($('a.next', 'div.pager').length < 1) {
        flag = true; // no "next" link: this was the last page
      }

      var item = { url: url, type: type, content: data };

      // Save the list page.
      // NOTE(review): callback-style create() — confirm the installed Mongoose
      // version still accepts callbacks.
      mongooseModel.create(item, function (error) {
        if (error) {
          console.log(error);
          return;
        }
        console.log('Save success Page: ' + pageNo + ' ' + url);
        // Rotate through the proxy list, wrapping at the end of the array.
        proxyIndex = (proxyIndex + 1) % proxy.length;
        pageNo = pageNo + 1;
        // Waiting a little over 5 seconds avoids being redirected to the
        // verification page. Using a proxy would be better.
        setTimeout(crawl, 5020);
      });
    });
  }).on('error', function (error) {
    console.log('Crawl error: ' + error.message);
  });
}

// Start fetching data.
crawl();
Node.js data crawling, part one (fetching the list pages)