This article shows how to use Node.js together with PhantomJS to download a web page along with the resources it loads; the details follow below. The idea is very simple: phantomjs.exe opens the page and collects the URLs of every resource the page loads, then starts Node.js as a child process to download all of those resources. For CSS resources, the CSS content is additionally scanned for url(...) references, and the resources they point to are downloaded as well.
Of course, this is still a very basic tool. With responsive designs and asynchronously loaded content, many resources will not be captured and downloaded, and such cases need to be handled according to the actual situation.
First, download and install Node.js and PhantomJS.
The following is down.js, the script executed by phantomjs.exe:
/**
 * down.js - PhantomJS entry script.
 *
 * Usage: phantomjs down.js <some URL>
 *
 * Opens the given page in PhantomJS, records the URL of every resource the
 * page starts receiving, then spawns `node --harmony downHtml.js <urls>`
 * (the URLs are passed as ONE comma-separated argument) and exits with the
 * child's exit code once the child has finished.
 */
var page = require('webpage').create();
var system = require('system');
var spawn = require('child_process').spawn;

if (system.args.length === 1) {
    console.log('Usage: down.js <some URL>');
    phantom.exit(1);
} else {
    var urls = [];
    page.address = system.args[1];

    // Each resource produces a 'start' and an 'end' notification; recording
    // only the 'start' stage keeps one entry per received resource.
    page.onResourceReceived = function (res) {
        if (res.stage !== 'start') {
            return;
        }
        // Only http(s) resources can be fetched by the Node side; this also
        // drops data: URIs, whose payload would corrupt the argument list.
        if (!/^https?:\/\//i.test(res.url)) {
            return;
        }
        // The list is handed over comma-separated, so a literal comma inside
        // a URL would split it in two. Percent-encode it instead (assumes the
        // server treats ',' and '%2C' alike, which is the common case).
        urls.push(res.url.replace(/,/g, '%2C'));
    };

    page.open(page.address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
            phantom.exit(1);
            return;
        }

        console.log('down resource ' + urls.length + ' urls.');

        // downHtml.js is resolved relative to the current working directory,
        // so both scripts must be started from the folder that contains them.
        var child = spawn('node', ['--harmony', 'downHtml.js', urls.join(',')]);

        // Relay everything the downloader prints.
        child.stdout.on('data', function (data) {
            console.log(data);
        });
        child.stderr.on('data', function (data) {
            console.log(data);
        });

        // Leave PhantomJS only after the downloader is done, and propagate
        // its exit status so callers can detect failures.
        child.on('exit', function (code) {
            phantom.exit(code || 0);
        });
    });
}
Below is the corresponding downHtml.js, which is run by Node.js:
"use strict";

/**
 * downHtml.js - Node.js downloader started by down.js.
 *
 * Usage: node --harmony downHtml.js <url>[,<url>...]
 *
 * Every URL in the comma-separated argument is saved below the current
 * working directory as <hostname>/<pathname>. URLs whose path ends in '/'
 * are saved as index.html inside that directory. For CSS responses the
 * stylesheet is fetched again, scanned for url(...) references, and the
 * referenced resources are downloaded too.
 */
const fs = require('fs');
const http = require('http');
const https = require('https');
const path = require('path');
const r_url = require('url');

// Root folder for everything that gets saved.
const CWD = process.cwd();

// Directories already known to exist, so they are not created twice.
const dirCache = {};

// Local file paths that already have a download in flight or finished.
// Keyed by file path (not URL) so two write streams never target one file.
const isDownMap = {};

/**
 * Creates pathStr and any missing parent directories.
 *
 * @param {string} pathStr - Directory to create.
 * @param {function(?Error)} callback - Called with null on success (also
 *     when the directory already exists) or with the mkdir error.
 */
function makedir(pathStr, callback) {
    if (dirCache[pathStr] === 1) {
        callback(null);
        return;
    }
    // Try to create first and inspect the error afterwards; checking for
    // existence beforehand races with the other downloads running in parallel.
    fs.mkdir(pathStr, function (err) {
        if (!err || err.code === 'EEXIST') {
            dirCache[pathStr] = 1;
            callback(null);
            return;
        }
        const parent = path.dirname(pathStr);
        // ENOENT means a parent is missing; anything else is a real failure.
        // parent === pathStr means the filesystem root was reached.
        if (err.code !== 'ENOENT' || parent === pathStr) {
            callback(err);
            return;
        }
        makedir(parent, function (parentErr) {
            if (parentErr) {
                callback(parentErr);
                return;
            }
            makedir(pathStr, callback);
        });
    });
}

/**
 * Maps a resource URL to the local file it is saved as.
 *
 * @param {string} resourceUrl - Absolute URL of the resource.
 * @returns {?string} The local path, or null when the URL is not http(s),
 *     has no host, or would resolve outside its host folder.
 */
function localPathFor(resourceUrl) {
    const uo = r_url.parse(resourceUrl);
    if ((uo.protocol !== 'http:' && uo.protocol !== 'https:') || !uo.hostname) {
        return null;
    }
    let pathname = uo.pathname || '/';
    if (pathname.charAt(pathname.length - 1) === '/') {
        // A directory-style URL cannot be written as a file of that name.
        pathname += 'index.html';
    }
    const hostRoot = path.join(CWD, uo.hostname);
    const filepath = path.join(hostRoot, pathname);
    // path.join collapses '..' segments; refuse anything that escaped.
    if (filepath.indexOf(hostRoot + path.sep) !== 0) {
        return null;
    }
    return filepath;
}

/**
 * Issues a GET request with the client matching the URL's protocol and logs
 * request errors instead of letting them crash the process.
 *
 * @param {string} resourceUrl - Absolute http(s) URL.
 * @param {function(http.IncomingMessage)} onResponse - Response handler.
 */
function httpGet(resourceUrl, onResponse) {
    const client = r_url.parse(resourceUrl).protocol === 'https:' ? https : http;
    client.get(resourceUrl, onResponse).on('error', function (err) {
        console.error('FAIL to download ' + resourceUrl + ': ' + err.message);
    });
}

/**
 * Downloads one resource into its local file, at most once per file.
 *
 * @param {string} resourceUrl - Absolute URL of the resource.
 * @param {function(http.IncomingMessage)=} inspect - Optional hook that
 *     sees the response before its body is written to disk.
 */
function download(resourceUrl, inspect) {
    const filepath = localPathFor(resourceUrl);
    if (!filepath || isDownMap[filepath]) {
        return;
    }
    isDownMap[filepath] = 1;

    makedir(path.dirname(filepath), function (dirErr) {
        if (dirErr) {
            console.error('FAIL to create folder for ' + filepath + ': ' + dirErr.message);
            return;
        }
        httpGet(resourceUrl, function (res) {
            if (inspect) {
                inspect(res);
            }
            const out = fs.createWriteStream(filepath);
            out.on('error', function (writeErr) {
                console.error('FAIL to write ' + filepath + ': ' + writeErr.message);
            });
            res.pipe(out);
        });
    });
}

/**
 * Collects the absolute URLs of all url(...) references in a stylesheet.
 *
 * @param {string} cssText - Stylesheet source.
 * @param {string} baseUrl - URL of the stylesheet; relative references are
 *     resolved against it.
 * @returns {string[]} Resolved URLs in order of appearance.
 */
function extractCssUrls(cssText, baseUrl) {
    // Optional matching quotes around the reference; no requirement on what
    // precedes url(, so "background: #fff url(x)" and "@import url(x)" match.
    const pattern = /url\(\s*(['"]?)(.*?)\1\s*\)/g;
    const found = [];
    let m;
    while ((m = pattern.exec(cssText)) !== null) {
        const ref = m[2].trim();
        if (ref) {
            found.push(r_url.resolve(baseUrl, ref));
        }
    }
    return found;
}

/**
 * Fetches a stylesheet and downloads every resource it references through
 * url(...).
 *
 * @param {string} cssUrl - Absolute URL of the stylesheet.
 */
function downImgFromCss(cssUrl) {
    httpGet(cssUrl, function (res) {
        let body = '';
        res.setEncoding('utf8');
        res.on('data', function (chunk) {
            body += chunk;
        });
        res.on('end', function () {
            extractCssUrls(body, cssUrl).forEach(function (imgUrl) {
                download(imgUrl);
            });
        });
    });
}

// Resource list handed over by down.js as one comma-separated argument.
const URLS = (process.argv[2] || '').split(',').filter(function (entry) {
    return entry !== '';
});

if (URLS.length === 0) {
    console.error('Usage: node downHtml.js <url>[,<url>...]');
    process.exitCode = 1;
} else {
    // Download every resource; stylesheets additionally trigger a scan for
    // the resources they reference.
    URLS.forEach(function (resourceUrl) {
        download(resourceUrl, function (res) {
            const contentType = res.headers['content-type'] || '';
            if (resourceUrl.indexOf('.css') !== -1 || contentType.indexOf('text/css') !== -1) {
                console.log('down images form css file: ' + resourceUrl + '.');
                downImgFromCss(resourceUrl);
            }
        });
    });
}
Put down.js and downHtml.js in the same folder, open a command prompt in that folder, and run the following command:
D:\phantomjs-2.0.0-windows\bin\phantomjs.exe down.js http://www.youku.com/
The above is all the content of this article. I hope you will like it.