nodejs通過phantomjs實現下載網頁

來源:互聯網
上載者:User

nodejs通過phantomjs實現下載網頁

   這篇文章主要介紹了nodejs通過phantomjs實現下載網頁的方法,有需要的小夥伴可以參考下。

  功能其實很簡單:通過 phantomjs.exe 採集 url 載入的資源,再以子進程的方式啟動 nodejs 下載所有的資源;對於 css 資源,則匹配 css 內容,下載裡面引用的 url 資源

  當然功能還是很簡單的,在響應式設計和非同步載入的情況下,還是有很多資源沒有能夠下載,需要根據實際情況處理下

  首先當然是下載 nodejs 和 phantomjs

  下面是 phantomjs.exe 執行的 down.js

  ?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

/*
 * down.js - PhantomJS script.
 *
 * Opens the URL given as the first command-line argument, records the URL of
 * every resource the page receives, then spawns `node downHtml.js` with the
 * recorded URLs joined by commas so that the Node script can download them.
 *
 * Written in ES5 on purpose: it is executed by PhantomJS, not by Node.
 */
var page = require('webpage').create(),
    system = require('system');
var spawn = require("child_process").spawn;

if (system.args.length === 1) {
    console.log('Usage: down.js <some URL>');
    phantom.exit(1);
} else {
    var urls = [];
    var seen = {}; // URLs already recorded, so each one is forwarded only once

    page.address = system.args[1];

    page.onResourceReceived = function (res) {
        // Only the 'start' stage is recorded, so a response is counted once.
        if (res.stage !== 'start') {
            return;
        }
        // The list is handed over as one comma-separated argument and fetched
        // over HTTP(S); anything else (for example a data: URI, which itself
        // contains commas) would corrupt the list and cannot be downloaded.
        if (!/^https?:\/\//i.test(res.url)) {
            return;
        }
        if (seen[res.url]) {
            return;
        }
        seen[res.url] = true;
        urls.push(res.url);
    };

    page.open(page.address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
            phantom.exit(1);
            return;
        }

        console.log('down resource ' + urls.length + ' urls.');

        var child = spawn("node", ["--harmony", "downHtml.js", urls.join(',')]);

        // Relay everything the downloader prints.
        child.stdout.on("data", function (data) {
            console.log(data);
        });
        child.stderr.on("data", function (data) {
            console.log(data);
        });

        // Finish once the downloader is done, passing its exit code along.
        child.on("exit", function (code) {
            phantom.exit(code || 0);
        });
    });
}

  下面是對應的、由 node 執行的 downHtml.js

  ?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

"use strict";

var fs = require('fs');
var http = require('http');
var https = require('https');
var path = require('path');
var r_url = require('url');

 

// Directories already known to exist; lets makedir skip the filesystem check.
var dirCache = {};

/**
 * Ensure that the directory `pathStr` exists, creating missing parent
 * directories first.
 *
 * This is best-effort: `callback` is always invoked with no arguments, even
 * when a directory could not be created (the failure is logged to stderr).
 *
 * @param {string} pathStr - Directory path to create.
 * @param {Function} callback - Called with no arguments when the attempt is over.
 */
function makedir (pathStr, callback) {
    if (dirCache[pathStr] === 1) {
        callback();
        return;
    }

    fs.stat(pathStr, function (statErr) {
        if (!statErr) {
            // Path already exists: remember it so later calls skip the stat.
            dirCache[pathStr] = 1;
            callback();
            return;
        }

        var parent = path.dirname(pathStr);
        if (parent === pathStr) {
            // dirname returned its input ('/' or '.'): nothing left to climb to.
            callback();
            return;
        }

        makedir(parent, function () {
            fs.mkdir(pathStr, function (mkdirErr) {
                // EEXIST means a concurrent makedir call created it first.
                if (!mkdirErr || mkdirErr.code === 'EEXIST') {
                    dirCache[pathStr] = 1;
                } else {
                    console.error('mkdir failed: ' + pathStr + ' (' + mkdirErr.message + ')');
                }
                callback();
            });
        });
    });
}

 

// Matches a CSS `url(...)` value that follows ':' or ','. Group 1 captures the
// optional opening quote so that `\1` can require the same closing quote.
var reg = /[:,]\s*url\((['"]?).*?\1\)/g;
// Applied to one match of `reg`: m[2] is the URL text between the parentheses.
var reg2 = /\((['"]?)(.*?)(\1)\)/;
// Absolute resource URLs that have already been handled.
var isDownMap = {};

/**
 * Fetch the stylesheet at `URL`, find the `url(...)` references in it and
 * download each referenced resource to `CWD/<hostname><pathname>`.
 *
 * Each resolved resource URL is handled once per process (see `isDownMap`).
 * References that do not resolve to an http(s) URL are skipped.
 *
 * @param {string} URL - Absolute URL of the CSS file.
 */
var downImgFromCss = function (URL) {
    var cssClient = r_url.parse(URL).protocol === 'https:' ? https : http;

    cssClient.get(URL, function (res) {
        var body = "";
        res.setEncoding('utf8');
        res.on('data', function (chunk) {
            body += chunk;
        });
        res.on('end', function () {
            // String#match returns null when the stylesheet has no url() at all.
            var refs = body.match(reg) || [];

            refs.forEach(function (ref) {
                var m = ref.match(reg2);
                if (!m || !m[2]) {
                    return;
                }

                var imgUrl = r_url.resolve(URL, m[2]);
                if (isDownMap[imgUrl]) {
                    return;
                }
                isDownMap[imgUrl] = 1;

                var uo = r_url.parse(imgUrl);
                if (uo.protocol !== 'http:' && uo.protocol !== 'https:') {
                    // e.g. an inline data: URI - nothing to fetch.
                    console.log('skip non-http resource (' + uo.protocol + ') in ' + URL);
                    return;
                }

                var imgClient = uo.protocol === 'https:' ? https : http;
                var filepath = CWD + '/' + uo.hostname + uo.pathname;

                makedir(path.dirname(filepath), function () {
                    imgClient.get(imgUrl, function (imgRes) {
                        imgRes.pipe(fs.createWriteStream(filepath));
                    }).on('error', function (err) {
                        console.error('download failed: ' + imgUrl + ' (' + err.message + ')');
                    });
                });
            });
        });
    }).on('error', function (err) {
        console.error('download failed: ' + URL + ' (' + err.message + ')');
    });
};

 

// Resource URLs to download, passed as ONE comma-separated command-line argument.
var URLS = process.argv[2] ? process.argv[2].split(',') : [];
var CWD = process.cwd();

if (URLS.length === 0) {
    console.error('Usage: node downHtml.js <url>[,<url>...]');
}

/**
 * Map a parsed URL to the local file it is saved as:
 * `CWD/<hostname><pathname>`. A pathname that is empty or ends with '/'
 * names a directory, so `index.html` is appended to get a writable file path.
 *
 * @param {Object} uo - Result of `r_url.parse(...)`; `hostname` and `pathname` are read.
 * @returns {string} Local file path for the resource.
 */
function localPathFor (uo) {
    var pathname = uo.pathname || '/';
    if (pathname.charAt(pathname.length - 1) === '/') {
        pathname += 'index.html';
    }
    return CWD + '/' + uo.hostname + pathname;
}

// Download every resource
URLS.forEach(function (URL) {
    var uo = r_url.parse(URL);
    if (uo.protocol !== 'http:' && uo.protocol !== 'https:') {
        console.log('skip unsupported url: ' + URL);
        return;
    }

    var client = uo.protocol === 'https:' ? https : http;
    var filepath = localPathFor(uo);

    makedir(path.dirname(filepath), function () {
        client.get(URL, function (res) {
            var contentType = res.headers["content-type"] || '';
            // Stylesheets are additionally scanned for the resources they reference.
            if (URL.indexOf('.css') !== -1 || contentType.indexOf('text/css') !== -1) {
                console.log('down images form css file:' + URL + '.');
                downImgFromCss(URL);
            }
            res.pipe(fs.createWriteStream(filepath));
        }).on('error', function (err) {
            console.error('download failed: ' + URL + ' (' + err.message + ')');
        });
    });
});

  down.js downHtml.js 放在同一個檔案夾下 通過下列 cmd 運行

  D:\phantomjs-2.0.0-windows\bin\phantomjs.exe down.js http://www.youku.com/

  以上所述就是本文的全部內容了,希望大家能夠喜歡。

        注:更多精彩教程請關注幫客之家編程

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.