Downloading web pages with Node.js through PhantomJS

Source: Internet
Author: User
This article introduces how Node.js can use PhantomJS to download web pages. The approach is actually very simple: phantomjs.exe collects the URLs of the resources that a page loads, and then starts Node.js as a child process to download all of those resources. For CSS resources, the CSS content is additionally scanned for url(...) references, and the resources they point to are downloaded as well.

Of course, the functionality is still very basic. With responsive design and asynchronous loading, many resources will still not be downloaded and need to be handled according to the actual situation.

First, download nodejs and phantomjs.

The following is down.js, which is executed by phantomjs.exe.

// down.js - run with PhantomJS (not Node):  phantomjs down.js <some URL>
//
// Opens the page given on the command line, records the URL of every
// resource the page loads, and then starts Node.js as a child process
// (downHtml.js, expected in the current working directory) to download them.
//
// ES5 syntax (`var`, function expressions) is kept on purpose because this
// file is executed by PhantomJS rather than by Node.js.
var page = require('webpage').create();
var system = require('system');
var spawn = require('child_process').spawn;

if (system.args.length === 1) {
  // system.args[0] is the script name, so length 1 means no URL was given.
  console.log('Usage: down.js <some URL>');
  phantom.exit(1);
} else {
  var urls = [];
  page.address = system.args[1];

  page.onResourceReceived = function (res) {
    // Only the 'start' stage is recorded so each response is listed once.
    if (res.stage === 'start') {
      urls.push(res.url);
    }
  };

  page.open(page.address, function (status) {
    if (status !== 'success') {
      console.log('FAIL to load the address');
      phantom.exit(1);
      return;
    }

    console.log('down resource ' + urls.length + ' urls.');

    // The URL list is handed over as ONE comma-separated argument, which is
    // the format downHtml.js splits on. NOTE(review): a URL that itself
    // contains a comma would be split in two by the receiver.
    var child = spawn('node', ['--harmony', 'downHtml.js', urls.join(',')]);

    child.stdout.on('data', function (data) {
      console.log(String(data).replace(/\s+$/, ''));
    });
    child.stderr.on('data', function (data) {
      console.log(String(data).replace(/\s+$/, ''));
    });
    // Keep PhantomJS alive until the downloader has finished.
    child.on('exit', function (code) {
      phantom.exit();
    });
  });
}
 

Below is the corresponding downHtml.js, which is run by Node.js.

"Use strict"; var fs = require ('fs'); var http = require ('http'); var path = require ('path '); var r_url = require ('url'); var dirCache ={}; // judge function makedir (pathStr, callback) by reducing the cache {if (dirCache [pathStr] = 1) {callback ();} else {fs. exists (pathStr, function (exists) {if (exists = true) {dirCache [pathStr] = 1; callback () ;}else {makedir (path. dirname (pathStr), function () {fs. mkdir (pathStr, Function () {dirCache [pathStr] = 1; callback () ;}) ;}}}; var reg =/[:,] \ s * url \ (['"]?. *? (\ 1) \)/gvar reg2 =/\ (['"]?) (.*?) (\ 1) \)/var isDownMap ={}; var downImgFromCss = function (URL) {http. get (URL, function (res) {// console. log (path. resolve (process. cwd (), 'index.min.css ') // res. pipe (fs. createWriteStream (path. resolve (process. cwd (), 'index.min.css '); var body = ""; res. setEncoding ('utf8'); res. on ('data', function (chunk) {body + = chunk;}); res. on ('end', function () {var match = body. match (reg); for (var I = 0, l En = match. length; I <len; I ++) {var m = match [I]. match (reg2); if (m & m [2]) {var url = m [2]; let imgUrl = r_url.resolve (URL, url); if (! IsDownMap [imgUrl]) {var uo = r_url.parse (imgUrl); let filepath = CWD + '/' + uo. hostname + uo. pathname; makedir (path. dirname (filepath), function () {http. get (imgUrl, function (res) {res. pipe (fs. createWriteStream (filepath) ;}) isDownMap [imgUrl] = 1 ;}}}) ;}) ;}var URLS = process. argv [2]. split (','); var CWD = process. cwd (); // download the resource URLS. forEach (function (URL) {var uo = r_url.parse (URL); var fi Lepath; if (uo. pathname = '/' | uo. pathname = '') {filepath = CWD + '/' + uo. hostname + '/index.html';} else {filepath = CWD + '/' + uo. hostname + uo. pathname;} makedir (path. dirname (filepath), function () {http. get (URL, function (res) {if (URL.indexOf('.css ')! =-1 | (res. headers ["content-type"] & res. headers ["content-type"]. indexOf ('text/css ')! =-1) {console. log ('down images form css file: '+ URL + '. 
'); downImgFromCss (URL);} res. pipe (fs. createWriteStream (filepath ));})});});

Put down.js and downHtml.js in the same folder and run them with the following command:

D:\phantomjs-2.0.0-windows\bin\phantomjs.exe down.js http://www.youku.com/

The above is all the content of this article. I hope you will like it.

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.