1. Module use
(1) superagent: an HTTP request library for Node.js (every language has many equivalents, e.g. OkHttp for Java and AFNetworking for iOS).
(2) cheerio: an HTML parsing library for Node.js (a basic tool that every language has an equivalent of).
(3) async: a Node.js library for running functions in parallel/asynchronously with controlled concurrency (it is very powerful; few other languages have a library quite like it).
2. Crawling content
The Duowan League of Legends hero index page: parse the URL of each hero from that page, then request each hero's detail page and extract the required data to produce the results.
http://lol.duowan.com/hero/
The main goal is to get familiar with writing a Node.js crawler, so this is a one-star-difficulty crawler meant as a practice exercise.
3. Source code
// Crawler for the Duowan League of Legends hero pages.
// Step 1 fetches the hero index page and collects the detail-page URL of
// every hero; step 2 fetches those detail pages with bounded concurrency and
// prints the extracted fields as JSON.
var superagent = require('superagent');
var cheerio = require('cheerio');
var async = require('async');

// Number of detail-page requests currently in flight (used for logging only).
var concurrencyCount = 0;

/**
 * Entry point: request the hero index page, extract each hero's link, then
 * fetch every hero's detail page (at most 5 at a time) and print the results.
 */
function start() {
  console.log('bot started running ...');

  // First step: use superagent to GET the page that lists the heroes.
  superagent.get('http://lol.duowan.com/hero/').end(function (err, res) {
    // Without a response there is nothing to parse; report and stop instead
    // of crashing on res.text.
    if (err || !res) {
      console.log("Error is:" + err);
      return;
    }

    // Use cheerio to extract the heroes from the returned page.
    var $ = cheerio.load(res.text, { decodeEntities: false });

    // Collect each hero's link into an array, ready for the parallel requests.
    var heroes = [];
    $("a.lol_champion").each(function (i, e) {
      var href = $(e).attr("href");
      // Skip anchors without an href so no request is issued for `undefined`.
      if (href) {
        heroes.push(href);
      }
    });

    // Concurrent traversal of the hero URLs, at most 5 requests in flight.
    // For a serial traversal, use async.mapSeries(heroes, iteratee, done)
    // with the same two callbacks.
    async.mapLimit(heroes, 5, function (heroUrl, callback) {
      // Processing logic for each hero URL.
      fetchInfo(heroUrl, callback);
    }, function (mapErr, results) {
      if (mapErr) {
        console.log("Error is:" + mapErr);
      }
      // `results` is the array of values handed to each callback; guard in
      // case an error left it undefined.
      if (!Array.isArray(results)) {
        return;
      }
      console.log("Fetch end, total:" + results.length);
      results.forEach(function (hero) {
        console.log(JSON.stringify(hero));
      });
    });
  });
}

/**
 * Fetch one hero detail page and extract the hero's title, name and type.
 *
 * @param {string} heroUrl - URL of the hero detail page.
 * @param {function(?Error, Object)} callback - Node-style callback. It is
 *   always called with a null error: failures are reported as
 *   `{ succ: false }` so that one failed page does not abort the whole map.
 */
function fetchInfo(heroUrl, callback) {
  concurrencyCount++;
  console.log("... Crawling: " + heroUrl + " ... Current number of concurrent records: " + concurrencyCount);

  // Crawl and parse the detail page for this URL.
  superagent.get(heroUrl).end(function (err, res) {
    // The request has finished either way, so it no longer counts as in flight.
    concurrencyCount--;

    if (err || !res) {
      console.log("Fail");
      // The first callback argument is the error; a non-null value there
      // would interrupt the map, so failures are passed as a result instead.
      callback(null, { succ: false });
      return;
    }

    // Extract the content of the crawled hero detail page.
    var $ = cheerio.load(res.text, { decodeEntities: false });
    var heroTitle = $('.hero-title').first().text();
    var heroName = $('.hero-name').first().text();
    // NOTE(review): the first and last tags are joined with an empty string,
    // as in the source; a visible separator may have been intended - confirm.
    var heroType = $('.hero-tag').first().text() + "" + $('.hero-tag').last().text();
    console.log('Find Heroes: ' + heroTitle + ', ' + heroName + ' | ' + heroType);

    // Calling back with a null error reports this item's result without
    // ending the other in-flight requests.
    callback(null, {
      succ: true,
      title: heroTitle,
      name: heroName,
      type: heroType
    });
  });
}

start();
4. Project demo
https://github.com/rayshen/lolcrawler
A Node.js crawler that scrapes League of Legends (LoL) hero information, built with superagent + cheerio + async.