Objective
Node.js was built for concurrency, but developers accustomed to sequential programming often stumble at first. For example: variable scope is the function block (unlike C or Java); the loop variable i referenced inside a for loop body ({}) may actually hold the value it has after the loop ends, causing all kinds of "undefined" problems; and when functions are nested, results produced by an inner (asynchronous) function are not available to the outer layer in time; and so on.
First, API analysis
The public comment opens the API for querying restaurant information, which gives a cityid
corresponding relationship between city and
Link: http://m.api.dianping.com/searchshop.json?regionid=0&start=0&categoryid=10&sortid=0&cityid=110
GET
gives the restaurant information in a way (JSON format).
First, explain the meaning of the next get parameter:
1. start
for the step number, the index of obtaining the information is the step, corresponding to the nextStartIndex
field;
2, cityid
indicating the city ID, for example, Hefei corresponds to 110;
3, regionid
representing the area ID, each ID represents the meaning in the start=0
time rangeNavs
field has an explanation;
4, the categoryid
search business classification ID, for example, food corresponding to the ID of 10, the specific meaning of each ID see in the start=0
time categoryNavs
field;
5, the sortid
Order of business results, for example, 0 corresponding intelligent sorting, 2 corresponding evaluation of the best, the specific meaning of each ID see the Sortnavs field when start=0.
The list field in the JSON string returned by get is a merchant listing, the ID representing the merchant ID, as the merchant's unique identity. In the returned JSON
string there is no business taste, environment, service grading information and latitude and longitude degree;
Therefore we also need to crawl two merchant pages: http://m.dianping.com/shop/<id> and http://m.dianping.com/shop/<id>/map.
Through the above analysis, the crawl strategy is determined as follows (similar to the Dianping_crawler idea):
1, gradually crawl the Searchshop API to get the basic information list;
2, by crawling the ID of all businesses, asynchronous concurrent crawl scoring information, latitude and longitude degree;
3, the final three data through the ID to do aggregation, output into a JSON file.
Second, the realization of the crawler
The Node.js Crawler code uses the following Third-party modules:
1, superagent
the lightweight HTTP request library, imitate the browser login;
2, cheerio
using jquery syntax to parse HTML elements, similar to Python's pyquery;
3, the node.js of the async
asynchronous Process Control Library, a must-learn library.
Import Dependent libraries:
// Import dependencies: util for printf-style string formatting, superagent
// for HTTP requests, cheerio for jQuery-style HTML parsing, async for
// asynchronous flow control, fs for writing the result file.
// (Module names are case-sensitive: "cheerio" and "fs", not "Cheerio"/"FS".)
var util = require("util");
var superagent = require("superagent");
var cheerio = require("cheerio");
var async = require("async");
var fs = require("fs");
Declares a global variable for storing configuration items and intermediate results:
// Global configuration and intermediate result stores.
var cityOptions = {
  cityId: 110, // Hefei
  // All areas: Shushan District, Luyang District, Baohe District,
  // administrative district, Yaohai District, hi-tech zone, economic
  // development zone, Binhu New Area, other areas, Feixi County.
  regionIds: [0, 356, 355, 357, 8840, 354, 8839, 8841, 8843, 358, -922],
  categoryId: 10, // gourmet / food
  sortId: 2, // most popular
  threshHold: 5000 // maximum number of restaurants to crawl
};
var idVisited = {}; // used to dedupe shop ids
var ratingDict = {}; // id -> ratings (taste/surrounding/service)
var posDict = {}; // id -> position (latitude/longitude)
To determine whether an id has appeared before, look it up in an object: if it has not been recorded yet, the property read yields undefined (note: not null):
// Returns true if the shop id has been seen before; otherwise records it
// in the global idVisited map and returns false.
// A missing key reads as undefined (not null), hence the strict
// !== undefined check.
function isVisited(id) {
  if (idVisited[id] !== undefined) {
    return true;
  }
  idVisited[id] = true;
  return false;
}
By taking a callback function, the implementation order is recursively called the reptilian function in a gradual way:
// Recursively crawl the searchshop API for one region, page by page.
// Follows data.nextStartIndex until it stops advancing or the configured
// threshold is reached, then invokes callback(err, restaurants) with the
// accumulated, deduplicated list of basic restaurant records.
function dianpingSpider(regionId, start, callback) {
  console.log('crawling region=', regionId, ', start =', start);
  var searchBase = 'http://m.api.dianping.com/searchshop.json?' +
    'regionid=%s&start=%s&categoryid=%s&sortid=%s&cityid=%s';
  var url = util.format(searchBase, regionId, start,
    cityOptions.categoryId, cityOptions.sortId, cityOptions.cityId);
  superagent.get(url).end(function (err, res) {
    if (err) return console.error(err.stack);
    var restaurants = [];
    var data = JSON.parse(res.text);
    var shops = data['list'];
    // NOTE(review): field names below (branchName, priceText, shopPower,
    // categoryName, regionName, nextStartIndex) reconstructed from the
    // garbled source — confirm against a live API response.
    shops.forEach(function (shop) {
      var restaurant = {};
      if (!isVisited(shop['id'])) {
        restaurant.id = shop['id'];
        restaurant.name = shop['name'];
        restaurant.branchName = shop['branchName'];
        // Extract the first number from the price text, e.g. "¥50/person".
        // No /g flag: a global regex reused with exec() keeps a stateful
        // lastIndex and would misbehave across iterations.
        var priceRegex = /(.*?)(\d+)(.*)/;
        if (shop['priceText'].match(priceRegex)) {
          restaurant.price = parseInt(priceRegex.exec(shop['priceText'])[2], 10);
        } else {
          restaurant.price = shop['priceText'];
        }
        restaurant.star = shop['shopPower'] / 10;
        restaurant.category = shop['categoryName'];
        restaurant.region = shop['regionName'];
        restaurants.push(restaurant);
      }
    });
    var nextStart = data['nextStartIndex'];
    if (nextStart > start && nextStart < cityOptions.threshHold) {
      // Recurse into the next page and concatenate its results onto ours.
      dianpingSpider(regionId, nextStart, function (err, restaurants2) {
        if (err) return callback(err);
        callback(null, restaurants.concat(restaurants2));
      });
    } else {
      callback(null, restaurants);
    }
  });
}
Use async's mapLimit function to bound concurrency when invoking the crawler function, and async's until to ensure id consistency across the three data sets (so no record is lost because concurrent requests finish at different times):
// Entry point: crawl the basic listing first, then fetch the rating and
// map pages for each shop with bounded concurrency, and finally aggregate
// the three data sources by id into one JSON file.
dianpingSpider(0, 0, function (err, restaurants) {
  if (err) return console.error(err.stack);
  var concurrency = 0;
  // Crawl the detail and map pages for one shop, then release the
  // concurrency slot after a random 0-999 ms delay to throttle requests.
  var crawlMove = function (id, callback) {
    var delay = parseInt(Math.random() * 30000000, 10) % 1000;
    concurrency++;
    console.log('current concurrency:', concurrency,
      ', now crawling id=', id, ', costs(ms):', delay);
    parseShop(id);
    parseMap(id);
    setTimeout(function () {
      concurrency--;
      callback(null, id);
    }, delay);
  };
  // At most 5 shops are crawled at the same time.
  async.mapLimit(restaurants, 5, function (restaurant, callback) {
    crawlMove(restaurant.id, callback);
  }, function (err, ids) {
    console.log('crawled ids:', ids);
    var resultArray = [];
    // Poll once per second until every shop has both a rating entry and a
    // position entry, so no record is dropped by out-of-order completion.
    async.until(function () {
      return restaurants.length === Object.keys(ratingDict).length &&
        restaurants.length === Object.keys(posDict).length;
    }, function (callback) {
      setTimeout(function () { callback(null); }, 1000);
    }, function (err) {
      restaurants.forEach(function (restaurant) {
        var rating = ratingDict[restaurant.id];
        var pos = posDict[restaurant.id];
        // Merge basic info, rating, and position into one record.
        var result = Object.assign(restaurant, rating, pos);
        resultArray.push(result);
      });
      writeAsJson(resultArray);
    });
  });
});
Here parseShop and parseMap parse the merchant detail page and the merchant map page respectively:
// Fetch a shop's detail page and record its taste/surrounding/service
// scores into the global ratingDict.
function parseShop(id) {
  var shopBase = 'http://m.dianping.com/shop/%s';
  var shopUrl = util.format(shopBase, id);
  superagent.get(shopUrl).end(function (err, res) {
    if (err) return console.error(err.stack);
    console.log('crawling shop:', shopUrl);
    var restaurant = {};
    var $ = cheerio.load(res.text);
    // NOTE(review): selector casing reconstructed from garbled source —
    // confirm the class name against the live page markup.
    var desc = $("div.shopInfoPagelet > div.desc > span");
    // Each span looks like "label:value"; keep the value part.
    restaurant.taste = desc.eq(0).text().split(":")[1];
    restaurant.surrounding = desc.eq(1).text().split(":")[1];
    restaurant.service = desc.eq(2).text().split(":")[1];
    ratingDict[id] = restaurant;
  });
}

// Fetch a shop's map page and extract latitude/longitude from the inline
// script into the global posDict. Missing coordinates are stored as empty
// strings so the aggregation step still finds an entry for every id.
function parseMap(id) {
  var mapBase = 'http://m.dianping.com/shop/%s/map';
  var mapUrl = util.format(mapBase, id);
  superagent.get(mapUrl).end(function (err, res) {
    if (err) return console.error(err.stack);
    console.log('crawling map:', mapUrl);
    var restaurant = {};
    var $ = cheerio.load(res.text);
    var data = $("body > script").text();
    // Escaped decimal point: the original /\d+.\d+/ would match any
    // character between the digit groups.
    var latRegex = /(.*lat:)(\d+\.\d+)(.*)/;
    var lngRegex = /(.*lng:)(\d+\.\d+)(.*)/;
    if (data.match(latRegex) && data.match(lngRegex)) {
      restaurant.latitude = latRegex.exec(data)[2];
      restaurant.longitude = lngRegex.exec(data)[2];
    } else {
      restaurant.latitude = '';
      restaurant.longitude = '';
    }
    posDict[id] = restaurant;
  });
}
Finally, write each merchant's information to the JSON file, one record per line:
// Serialize each record as one JSON line and write the whole result to
// data.json (newline-delimited JSON). Errors are logged, not thrown,
// matching the best-effort error handling used elsewhere in this file.
function writeAsJson(arr) {
  fs.writeFile(
    'data.json',
    arr.map(function (data) {
      return JSON.stringify(data);
    }).join('\n'),
    function (err) {
      if (err) return console.error(err.stack);
    }
  );
}
Summarize
The above is the entire content of this article, I hope this article to learn or use node.js friends to bring certain help, if there is doubt you can message exchange.