Using Node.js to capture beautiful pictures in batches through sitemap.xml

Source: Internet
Author: User

Using Node.js to capture beautiful pictures in batches through sitemap.xml

This article shows how to use Node.js to download pictures in batches by walking a site's sitemap.xml, and includes the related code. Refer to it if you need it.

I have seen many versions of this before, so I wrote one of my own.

1. You can specify the directory in which the files are saved.

2. The images of each article are stored in a separate directory.

3. You can set the maximum number of parallel downloads.

Next time you are free to download the entire site.

package.json

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

{
  "name": "me2sex-images",
  "version": "0.0.1",
  "description": "Batch download images from http://me2-sex.lofter.com",
  "main": "index.js",
  "author": "Fay",
  "license": "MIT",
  "dependencies": {
    "async": "^0.9.0",
    "cheerio": "^0.18.0",
    "mkdirp": "^0.5.0",
    "request": "^2.51.0",
    "url": "^0.10.2",
    "xml2js": "^0.4.4"
  }
}

index.js

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

/**
 * Registry of the modules used by the downloader, grouped under one object
 * so the rest of the file can refer to them as `node.<module>`.
 *
 * Keys are lowercase because the rest of the file accesses them as
 * `node.async`, `node.fs`, `node.path`, and so on; module names must not
 * carry stray whitespace or `require` cannot resolve them.
 */
var node = {
    async: require('async'),
    cheerio: require('cheerio'),
    fs: require('fs'),
    mkdirp: require('mkdirp'),
    path: require('path'),
    request: require('request'),
    url: require('url'),
    xml2js: require('xml2js')
};

 

/**
 * Batch image downloader driven by a site's sitemap.xml.
 *
 * Pipeline: download sitemap.xml -> parse it into a list of posts ->
 * for every post: create a directory, fetch the page, extract the image
 * URLs and download them with a bounded number of parallel requests.
 *
 * All third-party and core modules are reached through the file-level
 * `node` object (node.async, node.cheerio, node.fs, node.mkdirp,
 * node.path, node.request, node.url, node.xml2js).
 */
var Me2SexImages = {

    /**
     * Configuration options
     */
    options: {
        // Website map address
        sitemap: 'http://sexy.faceks.com/sitemap.xml',
        // Save to this folder
        saveTo: '/Users/Fay/Pictures/me2sex',
        // Maximum number of parallel image downloads
        downLimit: 5
    },

    // Posts parsed from the sitemap; filled in by sitemapJSON
    posts: [],

    /**
     * Start download (program entry function).
     * Runs the three top-level steps in sequence and logs the outcome.
     */
    start: function () {
        var self = this;
        var async = node.async;

        async.waterfall([
            self.wrapTask(self.sitemapXML),
            self.wrapTask(self.sitemapJSON),
            self.wrapTask(self.downAllImages)
        ], function (err) {
            if (err) {
                console.log('error: %s', err.message);
            } else {
                console.log('success: Download succeeded');
            }
        });
    },

    /**
     * Wrap a task so that it always runs with a specific `this`.
     * @param {Function} task    a task function in the async.js calling convention
     * @param {Any}      context object to use as `this`; defaults to this object
     * @param {Array}    exArgs  additional arguments placed before the call-time arguments
     * @return {Function} a task function in the async.js calling convention
     */
    wrapTask: function (task, context, exArgs) {
        var self = this;
        return function () {
            var args = [].slice.call(arguments);
            args = exArgs ? exArgs.concat(args) : args;
            // Logical OR, not bitwise: a bitwise `|` would coerce both
            // operands to a number and the task would lose its `this`.
            task.apply(context || self, args);
        };
    },

    /**
     * Retrieve the site's sitemap.xml.
     * @param {Function} callback called as callback(err, body)
     */
    sitemapXML: function (callback) {
        console.log('start downloading sitemap.xml');
        node.request(this.options.sitemap, function (err, res, body) {
            if (!err) console.log('Download sitemap.xml successfully');
            callback(err, body);
        });
    },

    /**
     * Convert sitemap.xml to JSON and store the resulting posts in `this.posts`.
     * The first <url> entry is dropped, as in the original implementation.
     * @param {String}   sitemapXML raw XML text
     * @param {Function} callback   called as callback(err, posts)
     */
    sitemapJSON: function (sitemapXML, callback) {
        var self = this;
        console.log('start parsing sitemap.xml');
        node.xml2js.parseString(sitemapXML, {explicitArray: false}, function (err, json) {
            if (!err) {
                var urlset = json && json.urlset;
                var urls = urlset && urlset.url;
                // With explicitArray:false a single <url> is parsed as a plain
                // object and a missing one as undefined; normalise both to an
                // array so shift()/length below are always safe.
                self.posts = urls ? [].concat(urls) : [];
                self.posts.shift();
                console.log('Resolution of sitemap.xml succeeded, %d pages in total', self.posts.length);
            }
            callback(err, self.posts);
        });
    },

    /**
     * Download the images of every post in `this.posts`, one post at a time.
     *
     * Accepts both downAllImages(callback) and downAllImages(posts, callback):
     * inside async.waterfall the previous step (sitemapJSON) passes the posts
     * along, so the completion callback arrives as the second argument.
     * @param {Array|Function} posts    posts forwarded by the previous step, or the callback
     * @param {Function}       callback completion callback when `posts` is given
     */
    downAllImages: function (posts, callback) {
        var self = this;
        var async = node.async;
        var done = typeof posts === 'function' ? posts : callback;
        console.log('start batch download');
        async.eachSeries(self.posts, self.wrapTask(self.downPostImages), done);
    },

    /**
     * Download the images of a single post.
     * @param {Object}   post     article entry from the sitemap
     * @param {Function} callback called when the post is finished or failed
     */
    downPostImages: function (post, callback) {
        var self = this;
        var async = node.async;

        async.waterfall([
            // The first waterfall task receives no data, so bind the post explicitly
            self.wrapTask(self.mkdir, self, [post]),
            self.wrapTask(self.getPost),
            self.wrapTask(self.parsePost),
            self.wrapTask(self.downImages)
        ], callback);
    },

    /**
     * Create the target directory for a post and record it as `post.dir`.
     * The directory name is the last path segment of the post URL.
     * @param {Object}   post     article entry; `post.loc` is its URL
     * @param {Function} callback called as callback(err, post)
     */
    mkdir: function (post, callback) {
        var path = node.path;
        var url = node.url.parse(post.loc);
        post.dir = path.join(this.options.saveTo, path.basename(url.pathname));

        console.log('Prepare to create directory: %s', post.dir);
        if (node.fs.existsSync(post.dir)) {
            console.log('Directory: %s already exists', post.dir);
            callback(null, post);
            return;
        }
        node.mkdirp(post.dir, function (err) {
            // Only report success when the directory was really created
            if (!err) console.log('Directory: %s created successfully', post.dir);
            callback(err, post);
        });
    },

    /**
     * Get post content and store it as `post.html`.
     * @param {Object}   post     article entry; `post.loc` is its URL
     * @param {Function} callback called as callback(err, post)
     */
    getPost: function (post, callback) {
        console.log('start request page: %s', post.loc);
        node.request(post.loc, function (err, res, body) {
            if (!err) {
                post.html = body;
                console.log('request page succeeded: %s', post.loc);
            }
            callback(err, post);
        });
    },

    /**
     * Parse the post and obtain the image list in the post.
     * Stores the cheerio handle as `post.$` and the URLs as `post.images`.
     * @param {Object}   post     article entry with `post.html` set
     * @param {Function} callback called as callback(null, post)
     */
    parsePost: function (post, callback) {
        var $ = post.$ = node.cheerio.load(post.html);
        post.images = $('.img')
            .map(function () { return $(this).attr('bigimgsrc'); })
            .toArray();
        callback(null, post);
    },

    /**
     * Download images from the post image list, at most
     * `options.downLimit` at a time.
     * @param {Object}   post     article entry with `post.images` and `post.dir` set
     * @param {Function} callback called when all images are done or one failed
     */
    downImages: function (post, callback) {
        console.log('found %d sister pictures, ready to download...', post.images.length);
        node.async.eachLimit(
            post.images,
            this.options.downLimit,
            this.wrapTask(this.downImage, this, [post]),
            callback
        );
    },

    /**
     * Download a single image into `post.dir`.
     * @param {Object}   post     article entry with `post.dir` set
     * @param {String}   imgsrc   image URL
     * @param {Function} callback called once, with an error if the download failed
     */
    downImage: function (post, imgsrc, callback) {
        var url = node.url.parse(imgsrc);
        var fileName = node.path.basename(url.pathname);
        var toPath = node.path.join(post.dir, fileName);
        var out = node.fs.createWriteStream(toPath);
        var finished = false;

        // Both streams can emit events after a failure; report only once.
        function done(err) {
            if (finished) return;
            finished = true;
            if (!err) console.log('image download succeeded: %s', imgsrc);
            callback(err);
        }

        console.log('start downloading image: %s, save to: %s, file name: %s', imgsrc, post.dir, fileName);
        node.request(imgsrc)
            .on('error', function (err) {
                // pipe() does not end the destination when the source fails,
                // so close the file here to release its handle.
                out.end();
                done(err);
            })
            .pipe(out);
        out.on('close', function () {
            done();
        });
        out.on('error', done);
    }
};

 

// Program entry point: kick off the sitemap-driven batch download.
Me2SexImages.start();

The above is all the content of this article. I hope you will like it.

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.