Analysis of the Injector class source code from Apache Nutch

Source: Internet
Author: User

(1) Normalize and filter the URL set, eliminating invalid URLs, setting each URL's status (unfetched), and initializing its score via the configured scoring filters;

(2) Merge URLs to eliminate duplicate URL entries;

(3) store the URL, its status, and its score in the crawldb database. If an entry already exists in the original database, the old entry is replaced with the new one.

Inject Method

Public void inject (path crawldb, path urldir) throws ioexception {// create a temporary directory path tempdir = New Path (getconf (). get ("mapred. temp. dir ",". ") +"/inject-temp-"+ integer. tostring (new random (). nextint (integer. max_value); // map text input file to a <URL, crawler> file jobconf sortjob = new nutchjob (getconf (); sortjob. setjobname ("inject" + urldir); fileinputformat. addinputpath (sortjob, urldir); sortjob. setmapperclass (injectmapper. class); fileoutputformat. setoutputpath (sortjob, tempdir); sortjob. setoutputformat (sequencefileoutputformat. class); sortjob. setoutputkeyclass (text. class); sortjob. setoutputvalueclass (crawldatum. class); sortjob. setlong ("injector. current. time ", system. currenttimemillis (); jobclient. runjob (sortjob); // merge with existing crawl dB jobconf mergejob = crawler LDB. createjob (getconf (), crawler LDB); fileinputformat. addinputpath (mergejob, tempdir); mergejob. setreducerclass (injectreducer. class); jobclient. runjob (mergejob); crawldb. install (mergejob, crawler LDB); // Delete the temporary file filesystem FS = filesystem. get (getconf (); FS. delete (tempdir, true); long end = system. currenttimemillis ();}

Injectmapper class

Public static class injectmapper implements mapper <writablecomparable, text, text, crawler ldatum> {private urlnormalizers; // URL standardization tool private int interval; // set the crawling interval private float scoreinjected; // URL-based page score: Private jobconf; private urlfilters filters; // URL Filter: Private scoringfilters scfilters; // The scoring tool private long curtime; Public void configure (jobconf job) {This. jobconf = Job; urlnormalizers = new urlnormalizers (job, urlnormalizers. scope_inject); interval = jobconf. getint ("DB. fetch. interval. default ", 2592000); filters = new urlfilters (jobconf); scfilters = new scoringfilters (jobconf); scoreinjected = jobconf. getfloat ("DB. score. injected ", 1.0f); curtime = job. getlong ("injector. current. time ", system. currenttimemillis ();} public void close () {} public void map (writa Blecomparable key, text value, outputcollector <text, crawldatum> output, reporter) throws ioexception {string url = value. tostring (); // value is line of text if (URL! = NULL & URL. trim (). startswith ("#") {return;} // If tabs: metadata that cocould be stored // must be name = value and separated by \ t float customscore =-1f; int custominterval = interval; Map <string, string> metadata = new treemap <string, string> (); If (URL. indexof ("\ t ")! =-1) {string [] splits = URL. split ("\ t"); url = splits [0]; for (int s = 1; S <splits. length; s ++) {// find separation between name and value int indexequals = splits [s]. indexof ("="); If (indexequals =-1) {// skip anything without a = continue;} string metaname = splits [s]. substring (0, indexequals); string metavalue = splits [s]. substring (indexequals + 1); If (metaname. equals (nutchscoremdname) {try {Cus Tomscore = float. parsefloat (metavalue);} catch (numberformatexception NFE) {}} else if (metaname. equals (nutchfetchintervalmdname) {try {custominterval = integer. parseint (metavalue);} catch (numberformatexception NFE) {}} else metadata. 
put (metaname, metavalue);} Try {// URL standardization url = urlnormalizers. normalize (URL, urlnormalizers. scope_inject); // filter Invalid URL url = filters. filter (URL);} catch (excep Tion E) {If (log. iswarnenabled () {log. Warn ("skipping" + URL + ":" + E) ;}url = NULL;} If (URL! = NULL) {// if it passes value. set (URL); // collect it // crawldatum stores the Injection Status, capture interval, capture time, score, and so on. crawldatum datum = new crawldatum (crawldatum. status_injected, custominterval); datum. setfetchtime (curtime); // now add the metadata iterator <string> keysiter = metadata. keyset (). iterator (); While (keysiter. hasnext () {string keymd = keysiter. next (); string valuemd = metadata. get (keymd); datum. getmetadata (). Put (new text (keymd), new text (valuemd);} If (customscore! =-1) datum. setscore (customscore); else datum. setscore (scoreinjected); try {scfilters. injectedscore (value, datum);} catch (scoringfilterexception e) {If (log. iswarnenabled () {log. warn ("cannot filter injected score for URL" + URL + ", using default (" + E. getmessage () + ")") ;}} output. collect (value, datum );}}}

Injectreducer class

Public static class injectreducer implements reducer <text, crawldatum, text, crawldatum> {public void configure (jobconf job) {} public void close () {} private crawler ldatum old = new crawler ldatum (); Private crawler ldatum injected = new crawler ldatum (); Public void reduce (Text key, iterator <crawler ldatum> values, outputcollector <text, crawldatum> output, reporter) throws ioexception {Boolean oldset = false; while (values. hasnext () {crawldatum val = values. next (); // If a URL has been injected into the crawldb if (Val. getstatus () = fig. status_injected) {injected. set (VAL); // set the status to injected. setstatus (fig. status_db_unfetched);} else {old. set (VAL); oldset = true ;}} crawldatum res = NULL; If (oldset) RES = old; // don't overwrite existing value else res = Injected; output. collect (Key, Res );}}

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If the content of the page is confusing, please write us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.