Before refactoring the code, you need to understand what HTTPS is.
The HTTPS protocol is the HTTP protocol layered on top of SSL/TLS: all data is transmitted inside the SSL/TLS protocol.
In other words, HTTPS takes the HTTP protocol and adds the SSL/TLS handshake and encrypted data transfer,
and this is the biggest difference between the two.
The `https` module is dedicated to handling encrypted access; the difference from the `http` module is that an SSL certificate is required when setting up an HTTPS server.
Simulating the setup of an HTTPS server:
// Minimal HTTPS server example: identical to an HTTP server except that a
// private key and certificate must be supplied when the server is created.
var https = require('https');
var fs = require('fs'); // file system module

var options = {
  // Read the SSL private key and certificate synchronously so that both are
  // available before the server is created.
  // NOTE(review): the .pem files are assumed to sit in the working directory.
  key: fs.readFileSync('ssh_key.pem'),
  cert: fs.readFileSync('ssh_cert.pem')
};

// The HTTPS server can be started once the certificate above has been read.
https.createServer(options, function (req, res) {
  res.writeHead(200);
  res.end('Hello Https');
}).listen(8090);
Refactoring the crawler code:
var http = require('http');
var cheerio = require('cheerio');
// Newer Node.js versions already ship a built-in Promise; Bluebird is used
// here and deliberately shadows the global within this script.
var Promise = require('bluebird');

var baseUrl = 'http://www.imooc.com/learn/';
var videoIds = [348, 259, 197, 134, 751];

/**
 * Parse the HTML of one course page into a plain data object.
 *
 * @param {string} html - Raw HTML of a course page.
 * @returns {{title: string, number: number, videos: Array}} Course title,
 *   number of learners (0 when it cannot be parsed) and one entry per chapter,
 *   each holding `chapterTitle` and its `videos` (`{title, id}`).
 */
function filterChapters(html) {
  var $ = cheerio.load(html);
  var chapters = $('.mod-chapters');

  // Course title
  var title = $('#main .path span').text();

  // Number of learners, parsed to an integer so courses can be sorted by it.
  // NOTE(review): assumes the element carries both classes
  // ("meta-value js-learn-num") -- confirm against the page markup.
  var learners = parseInt($('.meta-value.js-learn-num').text().trim(), 10);

  var courseData = {
    title: title,
    number: Number.isNaN(learners) ? 0 : learners,
    videos: []
  };

  // Walk every chapter and collect its videos.
  chapters.each(function () {
    var chapter = $(this);
    var chapterTitle = chapter.find('strong').text();
    console.log(chapterTitle);

    var videos = chapter.find('.video').children('li');
    var chapterData = {
      chapterTitle: chapterTitle,
      videos: []
    };

    videos.each(function () {
      var video = $(this).find('.J-media-item');
      // The link may be absent; guard so a missing href does not throw.
      var href = video.attr('href');

      chapterData.videos.push({
        title: video.text(),
        id: href ? href.split('video/')[1] : undefined
      });
    });

    courseData.videos.push(chapterData);
  });

  return courseData;
}

/**
 * Print a summary line per course, then the video list of every course.
 *
 * @param {Array} coursesData - Objects as returned by filterChapters().
 */
function printCourseInfo(coursesData) {
  coursesData.forEach(function (courseData) {
    console.log(courseData.number + ' people learned ' + courseData.title + '\n');
  });

  coursesData.forEach(function (courseData) {
    console.log('###' + courseData.title + '\n');

    courseData.videos.forEach(function (item) {
      item.videos.forEach(function (video) {
        console.log('"' + video.id + '" ' + video.title);
      });
    });
  });
}

/**
 * Fetch a page over HTTP.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<string>} Resolves with the response body once the
 *   response ends; rejects when the request or the response emits 'error'.
 */
function getPageAsync(url) {
  return new Promise(function (resolve, reject) {
    console.log('fetching ' + url);

    function onError(e) {
      console.log('get page error');
      reject(e);
    }

    http.get(url, function (res) {
      var html = '';

      // Decode as UTF-8 in the stream so a multi-byte character split across
      // two chunks is not corrupted by string concatenation.
      res.setEncoding('utf8');

      res.on('data', function (data) {
        html += data;
      });

      // When the response is complete, hand the page over through resolve.
      res.on('end', function () {
        resolve(html);
      });

      res.on('error', onError);
    }).on('error', onError); // connection-level failures surface on the request
  });
}

// One pending fetch per course id.
var fetchCourseArray = videoIds.map(function (id) {
  return getPageAsync(baseUrl + id);
});

// All pages are requested at once; continue only when every one has arrived.
Promise.all(fetchCourseArray)
  .then(function (pages) {
    var coursesData = pages.map(function (html) {
      return filterChapters(html);
    });

    // Most-learned course first. The comparator must return a number, not a
    // boolean, for Array.prototype.sort to order the items.
    coursesData.sort(function (a, b) {
      return b.number - a.number;
    });

    printCourseInfo(coursesData);
  })
  .catch(function (e) {
    console.error('crawl failed: ' + (e && e.message ? e.message : e));
  });
Running the script produces the following output:
<img src="https://s2.51cto.com/wyfs02/M01/8F/50/wKiom1jahlPyvW7uAABnQMXU45E114.jpg" title="36020170315204858576.jpg" alt="wKiom1jahlPyvW7uAABnQMXU45E114.jpg" />
This article is from the "IT Rookie" blog; please keep this source attribution when reposting: http://mazongfei.blog.51cto.com/3174958/1911261
Node.js (13) -- Refactoring the crawler code with Promises