Overview
This post is the sequel to the previous post, PhantomJS+Node.js+MySQL data fetching (1. Fetching data):
http://blog.csdn.net/jokerkon/article/details/50868880
Please read that article before this one, because some of the code here relies on the crawl results produced in the previous part.
Okay, now let's start crawling the images in earnest.
First, take a look at the code:
var page = require('webpage').create();
var address = 'http://product.pconline.com.cn/notebook/series/417764.html';
var fs = require('fs');
var mypath = 'version/server/server.txt';
var stream = null;
var streams = null;
var files = null;
var k = 1;
var line = '';
var cate = '';
var url = '';
var dragPath = 'version/server/server_img.txt';
phantom.outputEncoding = "GBK";
page.settings.userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko";

function start(url){
    page.open(url, function(status){
        setTimeout(function(){
            if(status == 'success'){
                console.log('open success!');
                console.log('==========begin work!=============');
                stream = page.evaluate(function(){
                    var title = document.querySelector('.pro-info').innerText;
                    // title = title.replace('图片','');
                    var cont = document.querySelectorAll('.pics>li>a>img')[1].src;
                    // var imgUrls = document.querySelectorAll('.pics>li>a>img')[0].src;
                    var href = document.querySelector('.pics>li>a');
                    return title + ':' + cont + ':' + href + '\r\n';
                });
                console.log(stream);
                try{
                    fs.write(dragPath, stream, 'a');
                }catch(e){
                    console.log(e);
                    fs.write(dragPath, null, 'a');
                }
            }else{
                console.log('page open fail!');
            }
            before();
        }, 100);
    });
}

function readFile(status){
    streams = fs.open(mypath, 'r');
    before();
}

function before(){
    console.log('=========work in befor===========' + k);
    k++;
    if(!streams.atEnd()){
        console.log('=========work in befor get Next line===========');
        line = streams.readLine();
        cate = line.split(',');
        var imgUrl = cate[1].replace('http://product.pconline.com.cn/server/', '');
        var imgs = imgUrl.split('/');
        var imgsUrl = imgs[1].split('.');
        imgsUrl = 'http://product.pconline.com.cn/pdlib/' + imgsUrl[0] + '_picture.html';
        console.log(imgsUrl);
        start(imgsUrl);
    }else{
        console.log('end!!!!!!!!!!!!');
        phantom.exit();
    }
}

page.open(address, function(status){
    readFile(status);
});
This code is very similar in structure to the previous part's, so let's go through it piece by piece.
page.open(address,function(status){ readFile(status);})
As before, this is the entry point where the program starts. It then calls the readFile function:
function readFile(status){
    streams = fs.open(mypath, 'r');
    before();
}
Here we use the fs module that ships with PhantomJS, mainly to handle reading the file:
var fs = require('fs');
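For reference, here is a minimal sketch of the PhantomJS fs calls this script relies on (the file names are just the ones used above). Note that the real script reads one line per page load rather than in a tight loop:

var fs = require('fs');
var s = fs.open('version/server/server.txt', 'r');    // open for reading
while (!s.atEnd()) {                                   // until end of file
    console.log(s.readLine());                         // read one line at a time
}
s.close();
fs.write('version/server/server_img.txt', 'one record\r\n', 'a');   // 'a' appends to the file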
Once the file is open, we process the data:
function before(){
    console.log('=========work in befor===========' + k);
    k++;
    if(!streams.atEnd()){
        console.log('=========work in befor get Next Line===========');
        line = streams.readLine();
        cate = line.split(',');
        var imgUrl = cate[1].replace('http://product.pconline.com.cn/server/', '');
        var imgs = imgUrl.split('/');
        var imgsUrl = imgs[1].split('.');
        imgsUrl = 'http://product.pconline.com.cn/pdlib/' + imgsUrl[0] + '_picture.html';
        console.log(imgsUrl);
        start(imgsUrl);
    }else{
        console.log('end!!!!!!!!!!!!');
        phantom.exit();
    }
}
Let's first look at what the raw data looks like:
联想ThinkServer TS130 S1225/2G/500O 价格:¥5417,http://product.pconline.com.cn/server/lenovo/514943.html
This is one line read from the file; it is the record for one server model. After reading it, we stitch together the picture-page URL:
http://product.pconline.com.cn/pdlib/514943_picture.html
This is the destination URL we actually want. My stitching here is fairly crude; readers can work out the site's URL pattern themselves and improve on it.
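To make the stitching explicit, here is a small standalone sketch of what before() does with the product URL (a restatement of the code above, not new logic):

var productUrl = 'http://product.pconline.com.cn/server/lenovo/514943.html';
var rest = productUrl.replace('http://product.pconline.com.cn/server/', '');   // 'lenovo/514943.html'
var id = rest.split('/')[1].split('.')[0];                                     // '514943'
var pictureUrl = 'http://product.pconline.com.cn/pdlib/' + id + '_picture.html';
console.log(pictureUrl);   // http://product.pconline.com.cn/pdlib/514943_picture.html

Next, the start() function opens each of these picture pages: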
function start(url){
    page.open(url, function(status){
        setTimeout(function(){
            if(status == 'success'){
                console.log('open success!');
                console.log('==========begin work!=============');
                stream = page.evaluate(function(){
                    var title = document.querySelector('.pro-info').innerText;
                    // title = title.replace('图片','');
                    var cont = document.querySelectorAll('.pics>li>a>img')[1].src;
                    // var imgUrls = document.querySelectorAll('.pics>li>a>img')[0].src;
                    var href = document.querySelector('.pics>li>a');
                    return title + ':' + cont + ':' + href + '\r\n';
                });
                console.log(stream);
                try{
                    fs.write(dragPath, stream, 'a');
                }catch(e){
                    console.log(e);
                    fs.write(dragPath, null, 'a');
                }
            }else{
                console.log('page open fail!');
            }
            before();
        }, 100);
    });
}
Finally, the data-extraction code runs inside page.evaluate:
var title = document.querySelector('.pro-info').innerText;
// title = title.replace('图片','');
var cont = document.querySelectorAll('.pics>li>a>img')[1].src;
// var imgUrls = document.querySelectorAll('.pics>li>a>img')[0].src;
var href = document.querySelector('.pics>li>a');
return title + ':' + cont + ':' + href + '\r\n';
This pulls out the data we want: the title, the absolute address of the thumbnail image, and the URL of the large-picture page (concatenating the anchor element with a string converts it to its href).
联想ThinkServer TS130 S1225/2G/500O图片:http://img.pconline.com.cn/images/product/5149/514938/TS130-b_sn8.jpg:http://product.pconline.com.cn/pdlib/514943_bigpicture7748163.html
This is one record of the crawled data. After extraction it is appended to the output file, and before() is called again, looping until every line of the input file has been read.
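A quick note: the record separator is ':', but the URLs themselves also contain ':'. If you later split these records yourself, splitting on ':http' instead of every ':' avoids cutting the URLs apart. A minimal sketch using the record above:

var record = '联想ThinkServer TS130 S1225/2G/500O图片:http://img.pconline.com.cn/images/product/5149/514938/TS130-b_sn8.jpg:http://product.pconline.com.cn/pdlib/514943_bigpicture7748163.html';
var firstSep = record.indexOf(':http');
var title = record.substring(0, firstSep);        // product title
var rest = record.substring(firstSep + 1);        // both URLs
var secondSep = rest.indexOf(':http');
var smallImg = rest.substring(0, secondSep);      // thumbnail URL
var bigPage = rest.substring(secondSep + 1);      // large-picture page URL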
That is the entire image-capture process. There is also a script for grabbing the large pictures, but since it is extremely similar to the code above I won't list it here; readers can adapt this article's code to crawl the large pictures themselves.
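If you want a starting point, here is a rough sketch of what such a large-picture grab could look like, reusing the page and fs setup from the script above and called with the *_bigpicture*.html URLs collected earlier. The selector inside page.evaluate is purely an assumption on my part, since the original code is not listed here; inspect the big-picture page and adjust it before using.

function grabBigPicture(bigPageUrl) {
    page.open(bigPageUrl, function(status) {
        if (status == 'success') {
            var bigSrc = page.evaluate(function() {
                // assumed selector -- check the actual page markup
                var img = document.querySelector('#big-pic img');
                return img ? img.src : null;
            });
            fs.write(dragPath, bigPageUrl + ':' + bigSrc + '\r\n', 'a');
        }
        before();   // move on to the next record, as in the small-picture loop
    });
}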
Node.js Picture Download
Next, let's look at how to download the images from the absolute addresses we just scraped.
First, the code:
var request = require('request');
var lineReader = require('line-reader');
var fs = require('fs');
var i = 0;

lineReader.eachLine('imgs.txt', {encoding: 'utf8'}, function(line, last) {
    var cate = line.split(':');                  // split the record on ':' (see the note above about ':' inside URLs)
    var url1 = cate[1];                          // image URL field
    var tt = cate[0].replace(/\//g, ',');        // title, with '/' replaced
    i++;
    console.log(tt + '==============>' + i);
    if(!(url1 == 'null')){
        tt = tt.replace(/\s/g, '');
        tt = tt.replace(/[^a-z\d]/ig, "");
        var filename1 = 'images/router_large/' + tt + 'bPic.jpg';
        request(url1).pipe(fs.createWriteStream(filename1));
    }
});
Yes, the code really is that short. Let's break it down:
lineReader.eachLine('imgs.txt', {encoding: 'utf8'}, function(line, last)
This is the entry point of the download script. It relies on the Node.js module
var lineReader = require('line-reader');
which is used to read the file line by line.
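The second argument the callback receives, last, tells you when the final line has been reached; the script above ignores it, but it is handy if you want to know when the whole file has been processed. A small sketch:

var lineReader = require('line-reader');
lineReader.eachLine('imgs.txt', {encoding: 'utf8'}, function(line, last) {
    console.log(line);
    if (last) {
        console.log('all lines read');
    }
});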
tt = tt.replace(/\s/g, '');
tt = tt.replace(/[^a-z\d]/ig, "");
Here I mainly clean up the file name, stripping whitespace, special symbols, and Chinese characters so the names are easier to store in the database.
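As a worked example, here is what those replacements do to the title from the sample record above:

var tt = '联想ThinkServer TS130 S1225/2G/500O图片'.replace(/\//g, ',');
// -> '联想ThinkServer TS130 S1225,2G,500O图片'
tt = tt.replace(/\s/g, '');          // strip whitespace
tt = tt.replace(/[^a-z\d]/ig, '');   // keep only letters and digits (drops the Chinese characters and commas)
console.log(tt);                     // 'ThinkServerTS130S12252G500O'
var filename1 = 'images/router_large/' + tt + 'bPic.jpg';
// -> 'images/router_large/ThinkServerTS130S12252G500ObPic.jpg'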
request(url1).pipe(fs.createWriteStream(filename1));
Finally, this line performs the actual download by piping the HTTP response into a file write stream.
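If you want to be a bit more defensive (my addition, not in the original script), the request stream emits an error event on network failures, and logging it keeps a failed download from passing silently:

request(url1)
    .on('error', function(err) {
        console.log('download failed: ' + url1 + ' -> ' + err.message);
    })
    .pipe(fs.createWriteStream(filename1));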
That covers the whole image-grabbing process. Thanks for reading.