The source code is as follows, using the ever-popular braised chicken rice (huang men ji mi fan) as an example — you can copy it into the Shenjianshou cloud crawler (http://www.shenjianshou.cn/) and run it directly:
// Crawl merchant info for all "braised chicken rice" shops from Dianping.
// Runs on the Shenjianshou cloud-crawler platform, which supplies the
// globals used below: extract(), extractList(), site, page, Crawler,
// and getProvinceNameByRegion().
var keywords = "braised chicken rice";
var scanUrls = [];
// Domestic city IDs run 1..2323, i.e. 2323 seed URLs in total.
// As a sample this is limited to 1, crawling only the Shanghai shops:
// for (var i = 1; i <= 2323; i++) {
for (var i = 1; i <= 1; i++) {
  scanUrls.push("http://www.dianping.com/search/keyword/" + i + "/0_" + keywords);
}

var configs = {
  domains: ["dianping.com"],
  scanUrls: scanUrls,
  // Search result pages are helper pages; shop /editmember pages hold the data.
  helperUrlRegexes: ["http://www\\.dianping\\.com/search/keyword/\\d+/0_.*"],
  contentUrlRegexes: ["http://www\\.dianping\\.com/shop/\\d+/editmember"],
  enableProxy: true,
  interval: 5000, // ms between requests, to avoid being rate-limited
  fields: [
    {
      name: "shop_name",
      selector: "//div[contains(@class,'shop-review-wrap')]/div/h3/a/text()"
    },
    {
      // Extracted as the shop URL; trimmed to the numeric ID in afterExtractField.
      name: "id",
      selector: "//div[contains(@class,'shop-review-wrap')]/div/h3/a/@href"
    },
    {
      name: "create_time",
      selector: "//div[contains(@class,'block raw-block')]/ul/li[1]/span"
    },
    {
      name: "region_name",
      selector: "//div[@class='breadcrumb']/b[1]/a/span/text()",
      required: true
    },
    {
      // Same breadcrumb node as region_name; mapped to a province afterwards.
      name: "province_name",
      selector: "//div[@class='breadcrumb']/b[1]/a/span/text()"
    }
  ]
};

/**
 * Called for each helper (search-result) page.
 * Queues every non-branch shop's /editmember page, plus the next two
 * result pages derived from the "next" link, then returns false so the
 * platform does not also auto-discover links.
 */
configs.onProcessHelperUrl = function (url, content, site) {
  // Shop links, excluding branch-shop entries.
  var urls = extractList(content,
    "//div[@class='tit']/a[not(contains(@class,'shop-branch'))]/@href");
  for (var i = 0; i < urls.length; i++) {
    site.addUrl(urls[i] + "/editmember");
  }

  var nextPage = extract(content, "//div[@class='page']/a[@class='next']/@href");
  if (nextPage) {
    site.addUrl(nextPage);
    // The next-page URL ends in its page number; also queue the two
    // pages after it to crawl ahead.
    var match = /\d+$/.exec(nextPage);
    if (match) {
      var pageNum = match[0];
      var prefix = nextPage.substr(0, nextPage.length - pageNum.length);
      site.addUrl(prefix + (parseInt(pageNum, 10) + 1));
      site.addUrl(prefix + (parseInt(pageNum, 10) + 2));
    }
  }
  return false;
};

/**
 * Post-processes each extracted field.
 * - id: keep only the trailing numeric shop ID from the href.
 * - shop_name: skip pages whose name does not mention the keyword.
 * - create_time: normalize a trailing YY-MM-DD date to 20YY-MM-DD.
 * - region_name / province_name: truncate at the "county"/"city" marker,
 *   strip the "Restaurant" suffix, and map region to province.
 */
configs.afterExtractField = function (fieldName, data, page) {
  if (fieldName == "id") {
    var idMatch = /\d+$/.exec(data);
    if (idMatch) {
      data = idMatch[0];
    }
  } else if (fieldName == "shop_name") {
    if (data.indexOf("Braised chicken rice") == -1) {
      page.skip();
    }
  } else if (fieldName == "create_time") {
    var dateMatch = /\d{2}-\d{2}-\d{2}$/.exec(data);
    // NOTE(review): the published source was garbled here (`data = "+result[0"`);
    // prepending the century is the most plausible intent. Also guard against
    // a non-matching date, which the original would have crashed on.
    if (dateMatch) {
      data = "20" + dateMatch[0];
    }
  } else if (fieldName == "province_name" || fieldName == "region_name") {
    // Cut the breadcrumb down to the administrative-division name.
    var position = data.indexOf("county");
    if (position != -1 && position < data.length - 1) {
      data = data.substr(0, position + 1);
    }
    position = data.indexOf("city");
    if (position != -1 && position < data.length - 1) {
      data = data.substr(0, position + 1);
    }
    data = data.replace("Restaurant", "");
    if (fieldName == "province_name") {
      data = getProvinceNameByRegion(data);
    }
  }
  return data;
};

var crawler = new Crawler(configs);
crawler.start();
Dianping (dianping.com) merchant data collection — crawler implementation source code