When I first started writing crawlers with Node's HTTP module I ran into a lot of pitfalls. The main one is character encoding: many Chinese websites use the GB2312 encoding, which makes parsing the fetched pages quite painful.
Because Node handles UTF-8 well but not GB2312 natively, we need to perform an encoding conversion for GB2312 websites.
Here I use Mongoose, so have Node connect to the test database first.
This example crawls Baidu Yuedu (reading) data. Note that the image links in the downloaded page do not work locally, because Baidu's image server filters such requests.
The code is as follows:
/** * Created by MyCo on 2016/3/15.*//** The Iconv-lite module can be used in conjunction with the HTTP module and the request module, but not directly with the SuperAgent module. * Because SuperAgent is to UTF8 to fetch data, and then use Iconv turn is also not. * The page is GBK encoded, Sres.text has been the result of decode, * that is, it has been converted to UTF8, and then converted to buffer out of the results must be incorrect. */varHTTP = require (' http ');//provides the functionality of jquery for DOM node operationsvarCheerio = require (' Cheerio '));varMongoose = require (' Mongoose '));//Pure JavaScript Conversion coded module Iconv-litevarIconv = require (' Iconv-lite '));//Bufferhelper is a reinforced class that operates in buffervarBufferhelper =require (' Bufferhelper '));//The table structure instance object that is currently MongonvarBookmodel = require ('.. /models/model/bookmodel ');//Create a connection to a databaseMongoose.connect (' Mongodb://localhost/test ');//set the access address, Baidu is gb2312 encoded data, so must use Iconv.decodevarurl = ' http://yuedu.baidu.com/'http.get (URL,function(res) {varBufferhelper =NewBufferhelper (); Res.on (' Data ',function(data) {
All returned are buffer data Console.log ('------------Download ' +buffer.isbuffer (data) + '-------------'); Bufferhelper.concat (data); }); Res.on (' End ',function() {Console.log ('------------End-------------'); varhtml = Iconv.decode (Bufferhelper.tobuffer (), ' GBK ');filtehtml (HTML); });}). On (' Error ',function() {Console.log (' Failed to get data! ‘);})//HTML documents filter out valid informationfunctionfiltehtml (HTML) {//The Cheerio itself is turned to entity by default, so ensure that the conversion succeeds with the parameter {Decodeentities:false}, which is independent of the encodingvar$ = cheerio.load (html,{decodeentities:false}); varCollist= $ ('. Yd-reco-wrap '));Console.log ('------------data collection-------------'); Console.log ('------------collist data length: ' +collist.length+ '-------------'); vardata = []; for(varI= 0,l=collist.length;i<l;i++){ vardocobj=$ (collist[i]); varitem = {}; Item.bookcolname= Docobj.find ('. Mod-title ')). text ();Item.categoryid = 999999; varListobj = Docobj.find ('. Book ')); varBooklist = []; for(varq= 0,ql=listobj.length;q<ql;q++){ varBookobj =$ (listobj[q]); varBookData = {}; Bookdata.title= Bookobj.find ('. Book-title ')). text (); Bookdata.currentprice= Bookobj.find ('. Book-price '). Text (). Replace (' ¥ ', ' "); BOOKDATA.SRC= Bookobj.find ('. Book-cover. book-img ') [0].attribs[' Data-src ']; Bookdata.author= Bookobj.find ('. Book-card-author ')). text (); varurl = Bookobj.find ('. 
Book-card-wrap ') [0].attribs.href; if(URL) {bookdata.id= Url.replace (/\/ebook\/|\?fr=index/g, "); Bookdata.url=URL; } add (BookData); Booklist.push (bookdata);} item.booklist=Booklist; Data.push (item); }}functionAdd (bookdata) {if(bookdata.url) {Http.get (' http://yuedu.baidu.com/' +bookdata.url,function(res) {varBufferhelper =NewBufferhelper (); Res.on (' Data ',function(data) {bufferhelper.concat (data); }); Res.on (' End ',function(){varhtml = Iconv.decode (Bufferhelper.tobuffer (), ' GBK '); Console.log (HTML); var$ = cheerio.load (html,{decodeentities:false}); varContent = $ (' #bd. Main. scaling-content P '). text (); Console.log (content); }); }). On (' Error ',function() {Console.log (' Failed to get data! ‘); }) }}
The bookmodel.js file is as follows. It instantiates the MongoDB table structure and names the collection "book". (The code requires the bookschema.js file below — if you reuse this code, mind the JS file paths.)
var mongoose = require (' Mongoose '); var bookschema = require ('.. /schema/bookschema.js '); // specifies that the database table name is book var Bookmodel = Mongoose.model (' book ', Bookschema, ' book '= Bookmodel;
The bookschema.js file is as follows. It uses a Mongoose Schema to define the MongoDB table structure, constraints, and default values.
varMongoose = require (' Mongoose '));varBookschema =NewMongoose. Schema ({id:string, src:string,//Image AddressTitle:{type:string,required:true},//title, add name non-null constraintContent:string,//contentAuthor:string,//authorRq:{type:number,default: 0},//Reading Volumeprice:{type:number,min:0,max:1000},//price, add price constraintIsshow:{type:boolean,default:true},//whether the constraint is displayedclassify:{type:string,enum:[' Youth ', ' literature ', ' history ', ' sci-fi ', ' fiction ', ' romance ', ' military ',default: ' Youth ',//type, enum-qualified typeCurrentprice:{type:number,default: 0},//Current PriceComments_count:{type:number,default: 0},//Number of commentsmeta:{//Object type Time Objectscreatedate:{Type:date,default:D Ate.now ()}, updatedate:{type:date,default:D Ate.now ()}}},{versionkey:false}); Module.exports= Bookschema;
This uses Node's HTTP module to implement the crawler functionality and stores the crawled data in MongoDB.