Blog Address: http://blog.whattoc.com/2013/09/19/nodejs_api_http_2/
Detailed Node.js API series: HTTP module (2) - implementing a simple cnodejs crawler
var http = require('http');

http.get("http://cnodejs.org/", function (res) {
  var size = 0;
  var chunks = [];
  res.on('data', function (chunk) {
    size += chunk.length;
    chunks.push(chunk);
  });
  res.on('end', function () {
    var data = Buffer.concat(chunks, size);
    console.log(data.toString());
  });
}).on('error', function (e) {
  console.log("Got error: " + e.message);
});
http.get(options, callback): http://cnodejs.org/ is the destination address to crawl. res.on('data') listens for the data event. res.on('end') fires when all of the data has been received. Buffer.concat(chunks, size) joins the multiple data buffers. data.toString() converts the binary data to a UTF-8 string; if the page is GBK-encoded, use the iconv module to convert it, since native Node.js does not support GBK.
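For GBK pages, here is a minimal sketch of the decoding step, assuming the iconv-lite package (a commonly used alternative to the iconv module mentioned above) has been installed with npm; the URL is only illustrative:

var http = require('http');
var iconv = require('iconv-lite');  // assumed: npm install iconv-lite

http.get('http://example.com/gbk-page', function (res) {  // hypothetical GBK-encoded page
  var chunks = [];
  res.on('data', function (chunk) {
    chunks.push(chunk);
  });
  res.on('end', function () {
    var buf = Buffer.concat(chunks);
    var html = iconv.decode(buf, 'gbk');  // decode the raw bytes as GBK instead of calling buf.toString()
    console.log(html);
  });
}).on('error', function (e) {
  console.log('Got error: ' + e.message);
});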
Design objectives:
- Make the crawler URL rules
- Analyze the page information
- Clean up useless data
- Store useful data
Make a crawler URL rule
Observe the URL rule of http://cnodejs.org/: http://cnodejs.org/?page= page number. Given this rule, the processing idea is straightforward: first get the topic paths from a single listing page, then crawl the topic content through those paths. The iterator pattern is the most convenient fit: next() advances the page index, and hasNext() checks whether the page is still within the valid range; once out of range, iteration stops. The following is pseudo-code.
var Urls = function (start_url) {
  this.start_url = start_url;   // base url
  this.page = 0;                // url page
  this.targetPage = '';         // topic page
};

Urls.prototype.next = function () {
  var data;
  if (!this.hasNext()) {
    return null;
  }
  this.page += 1;
  data = request.get(this.targetPage);  // get topic page
  return data;
};

Urls.prototype.hasNext = function () {
  // http://cnodejs.org/?page=[1,2,3,4]
  var url = this.start_url + this.page;
  // if getting the page from url succeeds, return true, otherwise return false
  // get topic page url
};

// main
var urls = new Urls();
while (urls.hasNext()) {
  console.log(urls.next());
}
Analyze page Data
The main task when analyzing the page is to examine its elements and extract the target content, such as text and comments. Here we use the third-party library cheerio, which provides a jQuery-like DOM selector; we use that DOM selector to extract the information.
npm install cheerio
Project address: https://github.com/MatthewMueller/cheerio
Official Demo Example
var cheerio = require('cheerio'),
    $ = cheerio.load('<h2 class="title">Hello world</h2>');

$('h2.title').text('Hello there!');
$('h2').addClass('welcome');

$.html();
//=> <h2 class="title welcome">Hello there!</h2>
Extract the cnodejs topic links
$ = cheerio.load(data);  // data is the page html
topics = $('.cell .topic_wrap a');
for (var i = 0; i < topics.length; i++) {
  console.log(topics[i].attribs['href']);
}

Result:

/topic/52386d26101e574521a12ccd
/topic/5232cd39101e57452106ce5a
/topic/52390cdb101e574521b1e252
/topic/521b1dcabee8d3cb128c56dd
/topic/5238c6d2101e574521aaca13
/topic/52380b4e101e57452193617c
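As a side note, here is a minimal sketch (continuing from the snippet above, so $ is assumed to already hold the loaded listing page) of turning these relative hrefs into absolute topic URLs; base_url is the same constant used later in url.js:

var base_url = 'http://cnodejs.org';
var topics = $('.cell .topic_wrap a');  // same selector as above
var topicUrls = [];
for (var i = 0; i < topics.length; i++) {
  topicUrls.push(base_url + topics[i].attribs['href']);
}
console.log(topicUrls);
// => [ 'http://cnodejs.org/topic/52386d26101e574521a12ccd', ... ]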
Content Information Extraction
Extract the cnodejs post content and title
$ = cheerio.load(data);
var topic = $('.inner.topic');
console.log(topic.children('h3').text());  // title
var content = topic.children('.topic_content').text();
console.log(content);  // article content
Clean up the useless data
The crawled content may contain HTML tags or emoticon information that does not belong to the target content, so this step filters it out. Here I recommend the validator module, which can filter XSS attacks, strip whitespace from strings, validate content attributes, and so on. For details, see the project address: https://github.com/chriso/node-validator
Installation
npm install validator
Demo Example
var check = require('validator').check,
    sanitize = require('validator').sanitize;

// Validate
check('test@email.com').len(6).isEmail();        // Methods are chainable
check('abc').isInt();                            // Throws 'Invalid integer'
check('abc', 'Please enter a number').isInt();   // Throws 'Please enter a number'
check('abcdefghijklmnopzrtsuvqxyz').is(/^[a-z]+$/);

// Set a message per validator
check('foo', {
  isNumeric: 'This is not a number',
  contains: 'The value doesn\'t have a 0 in it'
}).isNumeric().contains('0');

// Referencing validator args from the message
check('foo', 'The message needs to be between %1 and %2 characters long (you passed "%0")').len(2, 6);

// Sanitize / Filter
var int = sanitize('0123').toInt();              // 123
var bool = sanitize('true').toBoolean();         // true
var str = sanitize(' \t\r hello \n').trim();     // 'hello'
var str = sanitize('aaaaaaaaab').ltrim('a');     // 'b'
var str = sanitize(large_input_str).xss();
var str = sanitize('&lt;a&gt;').entityDecode();  // '<a>'
Filter the content we just crawled, mainly stripping whitespace:
var topic = $('.inner.topic');
title = topic.children('h3').text();  // title
sanitize(title).trim();
Storing useful data
To each their own: the crawled data can be stored as plain text or saved to a database. This time, to keep things simple, no database is used; the data is recorded as text in JSON format.
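As a minimal sketch of that choice (the file name and record shape are only illustrative), each crawled topic can be appended to a text file as one JSON line using the fs module:

var fs = require('fs');

// topic is assumed to look like { title: '...', content: '...' }
function saveTopic(topic, cb) {
  var line = JSON.stringify(topic) + '\n';
  fs.appendFile('topics.json', line, function (err) {  // illustrative file name
    cb(err);
  });
}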
That completes the crawler workflow. Now let's look at the implementation code.
vi url.js
var http = require('http');
var cheerio = require('cheerio');
var sanitize = require('validator').sanitize;
var async = require('async');
var fs = require('fs');

var base_url = 'http://cnodejs.org';
var scrapy = {};

/**
 * get page from url.
 * Examples: scrapy.get('http://www.baidu.com', cb);  // => 'baidu page html'
 * @interface
 * @param {String} url  ex: http://www.baidu.com
 * @param {Function} cb
 * @private
 */
scrapy.get = function (url, cb) {
  http.get(url, function (res) {
    var size = 0;
    var chunks = [];
    res.on('data', function (chunk) {
      size += chunk.length;
      chunks.push(chunk);
    });
    res.on('end', function () {
      var data = Buffer.concat(chunks, size);
      cb(null, data);
    });
  }).on('error', function (e) {
    cb(e, null);
  });
};

var Urls = function (startUrl) {
  this.startUrl = startUrl;
  this.page = 0;
  this.homePage = '';
};

Urls.prototype.next = function (cb) {
  var self = this;
  this.hasNext(function (err, bRet) {
    if (!bRet) { return null; }
    self.homeParse(function (err, topics) {
      self.page += 1;
      cb(null, topics);
    });
  });
};

Urls.prototype.hasNext = function (cb) {
  var self = this;
  var url = this.startUrl + this.page;
  scrapy.get(url, function (err, data) {
    var html = data.toString();
    $ = cheerio.load(html);
    self.homePage = $('.cell .topic_wrap a');
    if (self.homePage.length === 0) {
      return cb(null,