Simple web crawler

Source: Internet
Author: User


// Module dependencies. The variable names are kept exactly as the rest of the
// script declares them; only the module specifiers and comments are repaired.
var http = require('http');
var Promise = require('bluebird'); // Third-party Promise implementation
var cheerio = require('cheerio'); // HTML parsing / crawler analysis module
var bufferhelper = require('bufferhelper'); // Buffer assembly module (constructor)
var iconv = require('iconv-lite'); // Character transcoding module

// Every course page URL is this prefix followed by the numeric course id.
var baseUrl = 'http://www.imooc.com/learn/';
var courseids = [348, 637, 259, 75, 197]; // The course IDs to crawl
var pagesarr = []; // One pending promise per course page, each resolving to that page's HTML

// Bulk crawl: start every request immediately and keep its promise so the
// results can be awaited together later.
courseids.forEach(function (cid) {
    pagesarr.push(Grabpageasync(baseUrl + cid));
});

/**
 * Asynchronously fetch a page over HTTP and resolve with its HTML text.
 *
 * The response body is accumulated chunk by chunk, joined into one buffer and
 * decoded as UTF-8 before the promise is resolved.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise} Resolves with the decoded HTML string; rejects with the
 *   error emitted by the request or by the response stream.
 */
function Grabpageasync (url) {
    return new Promise(function (resolve, reject) {
        console.log('crawling ' + url);

        http.get(url, function (res) {
            // Named differently from the module-level `bufferhelper` constructor
            // so the local does not shadow the thing it instantiates.
            var chunks = new bufferhelper();

            res.on('data', function (chunk) {
                chunks.concat(chunk);
            });

            res.on('end', function () {
                console.log('crawl ' + url + ' success');

                // iconv.decode already returns a string, so no extra toString() is needed.
                var html = iconv.decode(chunks.toBuffer(), 'UTF-8');
                resolve(html);
            });

            // A response that fails mid-transfer would otherwise leave the promise pending.
            res.on('error', function (e) {
                console.log('crawl ' + url + ' fail');
                reject(e);
            });
        }).on('error', function (e) {
            // Crawl failure: the request itself could not be completed.
            console.log('crawl ' + url + ' fail');
            reject(e);
        });
    });
}

// Once every page has been downloaded, extract the course information from
// each one and print the combined result.
Promise
    .all(pagesarr)
    .then(function (pages) {
        // One extracted course object per downloaded page, in request order.
        var coursesdata = pages.map(function (html) {
            return Filterchapters(html);
        });

        Printcourseinfo(coursesdata);
    })
    .catch(function (e) {
        // Promise.all rejects as soon as any single page fails; report it
        // instead of leaving an unhandled rejection.
        console.error('crawl aborted: ' + (e && e.message ? e.message : e));
    });

/**
 * Extract course information from the HTML of one course page.
 *
 * NOTE(review): the CSS selectors below were reconstructed from a garbled copy
 * of the script ('.chapter', '.hd .l', '.meta-value strong', '.video',
 * '.studyvideo') - confirm them against the live page markup.
 *
 * @param {string} html - Full HTML of a course page.
 * @returns {{title: string, number: number, videos: Array}} The course title,
 *   the integer parsed from the fourth '.meta-value strong' element (NaN when
 *   that text is not numeric), and one entry per chapter of the form
 *   { chaptertitle, videos: [{ title, id }] }.
 */
function Filterchapters (html) {
    var $ = cheerio.load(html);
    var coursedata = {
        title: $('.hd .l').text(),
        number: parseInt($('.meta-value strong').eq(3).text().trim(), 10),
        videos: []
    };

    $('.chapter').each(function () {
        var $chapter = $(this);
        var chapterdata = {
            chaptertitle: $chapter.find('strong').text(),
            videos: []
        };

        $chapter.find('.video').children('li').each(function () {
            var $video = $(this).find('.studyvideo');
            // An <li> without a .studyvideo link has no href; fall back to an
            // empty string so split() cannot throw (id is then undefined).
            var href = $video.attr('href') || '';

            chapterdata.videos.push({
                title: $video.text(),
                id: href.split('video/')[1]
            });
        });

        coursedata.videos.push(chapterdata);
    });

    return coursedata;
}

/**
 * Print every crawled course, its chapters and its videos to the console.
 *
 * NOTE(review): the exact wording and punctuation of the printed lines were
 * reconstructed from a garbled copy of the script - adjust if the original
 * format is known.
 *
 * @param {Array} coursesdata - Course objects of the form
 *   { title, number, videos: [{ chaptertitle, videos: [{ title }] }] }.
 *   Any value that is not a non-empty array prints 'No course information'.
 */
function Printcourseinfo (coursesdata) {
    // Array.isArray replaces the Object.prototype.toString tag comparison,
    // which depended on an exact, case-sensitive '[object Array]' string.
    if (!Array.isArray(coursesdata) || coursesdata.length === 0) {
        console.log('No course information');
        return;
    }

    coursesdata.forEach(function (coursedata) {
        console.log('\n(' + coursedata.number + ') people have learned ' + coursedata.title);
        console.log('----------------------------------------------');

        coursedata.videos.forEach(function (item) {
            console.log('\n' + item.chaptertitle);

            item.videos.forEach(function (video) {
                // Titles scraped from HTML carry surrounding whitespace.
                console.log('    ' + video.title.trim());
            });
        });
    });
}

Simple web crawler

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If anything on this page is confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support; 6 Free Tickets per Quarter; Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.