var http = require('http');
var Promise = require('bluebird'); // Third-party Promise module
var cheerio = require('cheerio'); // HTML parsing module used to analyse crawled pages
var bufferhelper = require('bufferhelper'); // Buffer assembly module
var iconv = require('iconv-lite'); // Character transcoding module
// Base URL of a course page; a course ID is appended to it.
var baseUrl = 'http://www.imooc.com/learn/';
// IDs of the courses to crawl.
var courseids = [348, 637, 259, 75, 197];
// One pending download per course; each entry is the promise returned by Grabpageasync.
var pagesarr = [];

// Bulk-crawl the course pages: every request is started here without waiting for the others.
courseids.forEach(function (cid) {
  pagesarr.push(Grabpageasync(baseUrl + cid));
});
/**
 * Asynchronously crawl the HTML of a page.
 *
 * Logs progress to the console when the request starts, succeeds or fails.
 *
 * @param {string} url - Address of the page to fetch with http.get.
 * @returns {Promise<string>} Resolves with the response body decoded as UTF-8;
 *   rejects with the error emitted by the request.
 */
function Grabpageasync(url) {
  return new Promise(function (resolve, reject) {
    console.log('crawling ' + url);
    http.get(url, function (res) {
      // Deliberately not named `bufferhelper`: a local `var` of that name would
      // shadow the module-level constructor and make `new bufferhelper()` fail.
      var chunks = new bufferhelper();
      res.on('data', function (chunk) {
        chunks.concat(chunk);
      });
      res.on('end', function () {
        console.log('crawl ' + url + ' success');
        var fullbuffer = chunks.toBuffer();
        // iconv.decode already returns a string, so no further toString() is needed.
        var html = iconv.decode(fullbuffer, 'UTF-8');
        resolve(html);
      });
    }).on('error', function (e) {
      // Crawl failed
      console.log('crawl ' + url + ' fail');
      reject(e);
    });
  });
}
// Once every page has been downloaded, extract the course information and print it.
Promise
  .all(pagesarr)
  .then(function (pages) {
    var coursesdata = [];
    pages.forEach(function (html) {
      // Extract the course information from one page
      var courses = Filterchapters(html);
      coursesdata.push(courses);
    });
    // Print the course information
    Printcourseinfo(coursesdata);
  })
  .catch(function (e) {
    // Promise.all rejects as soon as one download fails; report it instead of
    // leaving the rejection unhandled.
    console.log('crawl aborted: ' + (e && e.message ? e.message : e));
  });
/**
 * Extract course information from the HTML of a course page.
 *
 * NOTE(review): the CSS selectors below were restored from text whose spacing and
 * letter case had been damaged ('. Chapter ', '. HD. L ', '. Studyvideo ', ...).
 * Class selectors are case-sensitive, so confirm them against the real page markup.
 *
 * @param {string} html - Markup of one course page.
 * @returns {{title: string, number: number, videos: Array<{chaptertitle: string,
 *   videos: Array<{title: string, id: (string|undefined)}>}>}} The course title,
 *   the number parsed from the fourth '.meta-value strong' element (NaN when that
 *   text is not numeric) and, per chapter, its title and list of videos.
 */
function Filterchapters(html) {
  var $ = cheerio.load(html);
  var $chapters = $('.chapter');
  var title = $('.hd .l').text();
  var number = parseInt($($('.meta-value strong')[3]).text().trim(), 10);
  var coursedata = {
    title: title,
    number: number,
    videos: []
  };

  $chapters.each(function () {
    var $chapter = $(this);
    var chapterdata = {
      chaptertitle: $chapter.find('strong').text(),
      videos: []
    };

    $chapter.find('.video').children('li').each(function () {
      var $video = $(this).find('.studyvideo');
      // attr() yields undefined when the list item has no matching link;
      // guard so one odd item does not abort the whole extraction.
      var href = $video.attr('href');
      chapterdata.videos.push({
        title: $video.text(),
        // The video id is whatever follows 'video/' in the link target.
        id: href ? href.split('video/')[1] : undefined
      });
    });

    coursedata.videos.push(chapterdata);
  });

  return coursedata;
}
/**
 * Print the crawled course information to the console.
 *
 * For every course a headline and a separator are printed, followed by each
 * chapter title and the trimmed titles of its videos. When the argument is not
 * a non-empty array, 'No course information' is printed instead.
 *
 * NOTE(review): the headline and video-line format strings were damaged in the
 * original text (unbalanced quotes); the wording and the two-space video indent
 * used here are a reconstruction — confirm the intended output format.
 *
 * @param {Array<{title: string, number: number, videos: Array<{chaptertitle: string,
 *   videos: Array<{title: string}>}>}>} coursesdata - Course records to print.
 * @returns {void}
 */
function Printcourseinfo(coursesdata) {
  if (!Array.isArray(coursesdata) || coursesdata.length === 0) {
    console.log('No course information');
    return;
  }

  coursesdata.forEach(function (coursedata) {
    console.log('\n(' + coursedata.number + ') people have learned ' + coursedata.title);
    console.log('----------------------------------------------');
    coursedata.videos.forEach(function (item) {
      console.log('\n' + item.chaptertitle);
      item.videos.forEach(function (video) {
        console.log('  ' + video.title.trim());
      });
    });
  });
}
// Simple web crawler