1. Main process
// 1. Main process: fetch the article-list page, then crawl each article
// and hand the extracted data to a pooled worker process for saving.
const http = require('http');
const fs = require('fs');
const cheerio = require('cheerio');
const request = require('request'); // NOTE(review): required but unused here — confirm before removing
const makePool = require('./pooler');

// doWork(job, cb) backed by a pool of './worker' child processes
const runJob = makePool('./worker');

var i = 0;
var url = 'http://xxx.com/articles/'; // initial list-page URL
let g = ''; // generator instance; assigned in getUrl(), driven via g.next()

// Wraps startRequest: logs the URL and skips empty entries by
// advancing the generator to the next URL.
function fetchPage(x) {
  console.log(x);
  if (!x || x === '') {
    g.next();
    return;
  }
  startRequest(x);
}

// Issues an HTTP GET for one article page, scrapes every `.content p`
// that contains a <strong> tag, and dispatches each record to the pool.
function startRequest(x) {
  return http
    .get(x, function (res) {
      var html = ''; // accumulates the full HTML of the page
      var titles = [];
      res.setEncoding('utf-8'); // avoid garbled multi-byte characters

      // collect the response body one chunk at a time
      res.on('data', function (chunk) {
        html += chunk;
      });

      // once the whole page has arrived, parse and extract
      res.on('end', function () {
        var $ = cheerio.load(html); // parse the HTML with cheerio
        var time = new Date();
        var p = $('.content p');

        p.each((index, item) => {
          if ($(item).find('strong').length) {
            var fexItem = {
              // article title
              title: $(item).find('strong').text().trim(),
              // timestamp of this crawl
              time: time,
              // URL of the current article
              link: $($(item).children('a').get(0)).attr('href'),
              // description: text left after removing child elements
              des: $(item).children().remove() && $(item).text(),
              // 1-based counter of how many articles were obtained
              i: index + 1,
            };

            runJob(fexItem, (err, data) => {
              if (err) {
                console.error('get link error');
                return;
              }
              console.log('get link ok');
            });
          }
        });

        // this page is done — pull the next URL from the generator
        g.next();
      });
    })
    .on('error', function (err) {
      console.log(err);
      g.next(); // keep the crawl moving even when one request fails
    });
}

// Generator that yields one fetch per article URL; each completed
// (or failed) fetch calls g.next() to resume the loop.
function* gen(urls) {
  let len = urls.length;
  for (var j = 0; j < len; j++) {
    yield fetchPage(urls[j]);
  }
}

// Fetches the article-list page, collects the article URLs from
// `.articles .post-list li`, then starts the crawl generator.
function getUrl(x) {
  http
    .get(x, function (res) {
      var html = ''; // accumulates the full HTML of the list page
      var titles = [];
      res.setEncoding('utf-8'); // avoid garbled multi-byte characters

      res.on('data', function (chunk) {
        html += chunk;
      });

      res.on('end', function () {
        var $ = cheerio.load(html);
        var time = new Date();
        var lists = $('.articles .post-list li');
        var urls = [];

        lists.each(function (index, item) {
          if ($(item).find('a').length) {
            var articleUrl =
              'http://xxxx.com' + $($(item).children('a').get(0)).attr('href');
            if (articleUrl) urls.push(articleUrl);
          }
        });

        // the main crawl starts running here
        g = gen(urls);
        g.next();
      });
    })
    .on('error', function (err) {
      console.log(err);
    });
}

getUrl(url);
2. Create a process pool
Const CP = require (' child_process ') const CPUs= Require (' OS '). CPUs (). Length;module.exports=functionPooler (workmodule) {Let awaiting= [],readypool = [],poolsize = 0; return functionDoWork (JOB,CB) {if(!readypool.length&&poolsize>CPUs)returnAwaiting.push ([DOWORK,JOB,CB]) let child= Readypool.length? Readypool.shift ():(poolsize++, Cp.fork (workmodule)) Let cbtriggered=false; Child.removealllisteners (). Once (' Error ',function(err) {if(!cbtriggered) {CB (ERR) cbtriggered=true} child.kill ()}). Once (' Eixt ',function(){ if(!cbtriggered) CB (NewError (' Childe exited with code: ' +code)) Poolsize--; Let Childidx=readypool.indexof (Child)if(Childidx >-1) readypool.splice (childidx,1)}). Once (' Message ',function(msg) {CB (NULL, msg) cbtriggered=trueReadypool.push (Child)if(awaiting.length) setimmediate.apply (NULL, Awaiting.shift ())}) . Send (Job)}}
3. The worker process receives the message and processes the content
Const FS = require (' FS ') Process.on (' message ',function(Job) { = job = ' TITLE: ' +_job.title+ ' \ n ' + ' LINK: ' +_job.link + ' \ n DES: ' +_job.des+ ' \ n save-time: ' +_job.time fs.writefile ( function (err) { if (err) { console.log (err); } }); Process.send (' Finish ')})
From the "write a crawler from zero" series — a Node.js crawler that saves data using a process pool.