Node crawler from scratch series: writing data with a process pool


1. Main process

The main process first fetches the article list page and collects the article URLs, then walks them one at a time with a generator. Every parsed article is handed off to the process pool through runJob.

const http = require('http');
const fs = require('fs');
const cheerio = require('cheerio');
const request = require('request');
const makePool = require('./pooler');
const runJob = makePool('./worker');

var i = 0;
var url = 'http://xxx.com/articles/'; // initial URL
let g = '';

function fetchPage(x) { // thin wrapper so the generator can skip empty URLs
    console.log(x);
    if (!x || x === '') {
        g.next();
        return;
    }
    startRequest(x);
}

function startRequest(x) { // issue a GET request to the server with the http module
    return http.get(x, function (res) {
        var html = ''; // holds the full HTML of the requested page
        var titles = [];
        res.setEncoding('utf-8'); // prevent garbled Chinese characters

        // listen for the data event and collect one chunk at a time
        res.on('data', function (chunk) {
            html += chunk;
        });

        // once the whole page has arrived, parse it
        res.on('end', function () {
            var $ = cheerio.load(html); // parse the HTML with cheerio
            var time = new Date();
            var p = $('.content p');
            p.each((index, item) => {
                if ($(item).find('strong').length) {
                    var fex_item = {
                        // the article title
                        title: $(item).find('strong').text().trim(),
                        // the time the article was saved
                        time: time,
                        // the URL of the current article
                        link: $($(item).children('a').get(0)).attr('href'),
                        // strip child elements, keep the remaining text as the description
                        des: $(item).children().remove() && $(item).text(),
                        // used to count how many articles were collected
                        i: index + 1
                    };
                    runJob(fex_item, (err, data) => {
                        if (err) console.error('get link error');
                        console.log('get link ok');
                    });
                }
            });
            g.next(); // resume the generator: crawl the next URL
        });
    }).on('error', function (err) {
        console.log(err);
        g.next();
    });
}

function* gen(urls) {
    let len = urls.length;
    for (var i = 0; i < len; i++) {
        yield fetchPage(urls[i]);
    }
}

function getUrl(x) { // fetch the article list page and collect the article URLs
    http.get(x, function (res) {
        var html = ''; // holds the full HTML of the requested page
        var titles = [];
        res.setEncoding('utf-8'); // prevent garbled Chinese characters

        res.on('data', function (chunk) {
            html += chunk;
        });

        res.on('end', function () {
            var $ = cheerio.load(html); // parse the HTML with cheerio
            var time = new Date();
            var lists = $('.articles .post-list li');
            var urls = [];
            lists.each(function (index, item) {
                if ($(item).find('a').length) {
                    var url = 'http://xxxx.com' + $($(item).children('a').get(0)).attr('href');
                    if (url) urls.push(url);
                }
            });
            // the main program starts running here
            g = gen(urls);
            g.next();
        });
    }).on('error', function (err) {
        console.log(err);
    });
}

getUrl(url);
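The part of the main process that is easiest to miss is the generator: each yield issues one request, and the page's end and error handlers call g.next() to resume the loop, so pages are crawled strictly one at a time instead of all at once. Below is a minimal sketch of the same pattern in isolation; fakeFetch is a hypothetical stand-in for http.get.

function fakeFetch(url, done) {
    // simulate an async request that completes later
    setTimeout(function () {
        console.log('fetched', url);
        done();
    }, 100);
}

function* gen(urls) {
    for (var i = 0; i < urls.length; i++) {
        // the completion callback resumes the generator,
        // so only one request is in flight at any moment
        yield fakeFetch(urls[i], function () { g.next(); });
    }
}

var g = gen(['/a', '/b', '/c']);
g.next(); // kick off the first fetch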

2. Create a process pool

The pooler module forks at most roughly one child per CPU. Idle children are kept in a ready pool and reused; jobs that arrive while every child is busy are queued and replayed with setImmediate as children become free.

const cp = require('child_process');
const cpus = require('os').cpus().length;

module.exports = function pooler(workModule) {
    let awaiting = [], readyPool = [], poolSize = 0;

    return function doWork(job, cb) {
        // no idle child and the pool is at capacity: queue the job
        if (!readyPool.length && poolSize > cpus)
            return awaiting.push([doWork, job, cb]);

        // reuse an idle child, or fork a new one
        let child = readyPool.length
            ? readyPool.shift()
            : (poolSize++, cp.fork(workModule));
        let cbTriggered = false;

        child.removeAllListeners()
            .once('error', function (err) {
                if (!cbTriggered) {
                    cb(err);
                    cbTriggered = true;
                }
                child.kill();
            })
            .once('exit', function (code) {
                if (!cbTriggered)
                    cb(new Error('Child exited with code: ' + code));
                poolSize--;
                let childIdx = readyPool.indexOf(child);
                if (childIdx > -1) readyPool.splice(childIdx, 1);
            })
            .once('message', function (msg) {
                cb(null, msg);
                cbTriggered = true;
                readyPool.push(child);
                // drain the waiting queue: each entry is a [doWork, job, cb] triple
                if (awaiting.length) setImmediate.apply(null, awaiting.shift());
            })
            .send(job);
    };
};
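The pool can be exercised on its own, without the crawler. A minimal sketch, assuming the module above is saved as pooler.js and the worker from the next section as worker.js:

const makePool = require('./pooler');
const runJob = makePool('./worker');

// dispatch a fake job; the pool forks (or reuses) a child,
// sends it the job, and calls back when the child replies
runJob({ title: 't', link: 'http://xxx.com/a', des: 'demo', time: new Date() }, function (err, msg) {
    if (err) return console.error(err);
    console.log('worker replied:', msg); // 'finish'
    // pooled children stay alive for reuse, so exit explicitly in this demo
    process.exit(0);
});

Queued jobs are stored as [doWork, job, cb] triples so that setImmediate can replay the exact call once a child frees up.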

3. The worker process receives the message and writes the content

Each worker receives a job object over IPC, formats it into a text record, writes it to disk, and reports back with 'finish' so the pool can mark the child as ready again.

const fs = require('fs');

process.on('message', function (job) {
    const _job = job;
    const content = 'TITLE: ' + _job.title + '\n' +
                    'LINK: ' + _job.link + '\n' +
                    'DES: ' + _job.des + '\n' +
                    'SAVE-TIME: ' + _job.time + '\n';
    // the output path was lost in the original post; './articles.txt' is a stand-in
    // (fs.appendFile would keep every record, while writeFile overwrites the file)
    fs.writeFile('./articles.txt', content, function (err) {
        if (err) {
            console.log(err);
        }
        // notify the pool only after the write finishes
        process.send('finish');
    });
});
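Because the worker is a plain module that communicates only via process.on('message') and process.send, it can be tested without the pool by forking it directly. A minimal sketch, assuming the file is saved as worker.js:

const cp = require('child_process');

const child = cp.fork('./worker');
child.on('message', function (msg) {
    console.log('got:', msg); // 'finish' once the record is written
    child.kill();
});
child.send({ title: 'demo', link: 'http://xxx.com/a', des: 'demo article', time: new Date() });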
