Data deduplication:
The goal of data deduplication is that every distinct record appears exactly once in the output. The record itself is used as the key throughout: the map function emits each input line as the key with an empty value, and the reduce function, which receives each distinct key only once, writes that key straight to the output and again leaves the value empty. The procedure is otherwise similar to WordCount:
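For illustration (the data here is hypothetical, not taken from the original text), suppose the input contains the following records, one per line; after the job runs, each distinct record appears exactly once:

    input file:  a, b, a, c, b   (one record per line)
    output:      a, b, c         (each distinct record written once)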
Tip: the input/output paths are configured inside the program (see the main method below).
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Dedup {

    /**
     * @param XD
     */
    // The map function uses each input line directly as the key and an empty Text as the value.
    public static class Map extends Mapper<Object, Text, Text, Text> {
        private static Text line = new Text();

        // map function
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            line = value;
            context.write(line, new Text(""));
        }
    }

    // The reduce function writes each distinct key once and discards the values.
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Initialize the configuration
        Configuration conf = new Configuration();

        /*
         * Instead of reading the default args, the paths are configured directly in the
         * program, so no parameters need to be added to the Arguments tab of the Eclipse
         * run configuration. You can still pass them on the command line if you prefer.
         */
        // Set the input/output paths
        String[] ioArgs = new String[] {
                "hdfs://localhost:9000/home/XD/hadoop_tmp/dedupin",
                "hdfs://localhost:9000/home/XD/hadoop_tmp/dedupout" };
        String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: Data Deduplication <in> <out>");
            System.exit(2);
        }

        // Set up the job
        Job job = new Job(conf, "Dedup job");
        job.setJarByClass(Dedup.class);

        // Set the classes that handle map, combine, and reduce
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);

        // Set the output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Set the input/output paths
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        /*
         * To read the paths from the command line instead:
         * FileInputFormat.addInputPath(job, new Path(args[0]));
         * FileOutputFormat.setOutputPath(job, new Path(args[1]));
         */
        job.waitForCompletion(true);

        // Print related information
        System.out.println("Task Name: " + job.getJobName());
        System.out.println("Task Succeeded: " + (job.isSuccessful() ? "Yes" : "No"));
    }
}
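Note the design choice of registering the Reduce class as the combiner as well (job.setCombinerClass(Reduce.class)): because the reducer simply re-emits the key with an empty value, running it as a combiner is safe and removes duplicates locally on each map node before the data is shuffled, which cuts down network traffic.

On Hadoop 2.x the new Job(conf, name) constructor is deprecated. As a minimal sketch (assuming a Hadoop 2.x installation), the job setup in main could equivalently use the Job.getInstance factory method; the rest of the program stays the same:

    // Create the job via the factory method instead of the deprecated constructor
    Job job = Job.getInstance(conf, "Dedup job");
    job.setJarByClass(Dedup.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);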
Data Sorting: MapReduce already sorts keys during the shuffle stage, so the ordering itself is done by the time the reduce stage runs. The map function only needs to parse each number and emit it as an IntWritable key; the reduce function then receives the keys in ascending order and, on output, prefixes each number with a row number (its rank), as shown in the following example:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class DataSort {

    /**
     * @param XD
     */
    // The map function parses each line into an IntWritable key; the shuffle stage sorts these keys.
    public static class Map extends Mapper<Object, Text, IntWritable, IntWritable> {
        private static IntWritable data = new IntWritable();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            data.set(Integer.parseInt(line));
            context.write(data, new IntWritable(1));
        }
    }

    // The reduce function receives the keys in ascending order and writes each one with a row number.
    public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        private static IntWritable lineNum = new IntWritable(1);

        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            for (IntWritable val : values) {
                context.write(lineNum, key);
                lineNum = new IntWritable(lineNum.get() + 1);
            }
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Initialize the configuration
        Configuration conf = new Configuration();

        /*
         * Instead of reading the default args, the paths are configured directly in the
         * program, so no parameters need to be added to the Arguments tab of the Eclipse
         * run configuration. You can still pass them on the command line if you prefer.
         */
        // Set the input/output paths
        String[] ioArgs = new String[] {
                "hdfs://localhost:9000/home/XD/hadoop_tmp/sort_in",
                "hdfs://localhost:9000/home/XD/hadoop_tmp/sort_out" };
        String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: Data Sort <in> <out>");
            System.exit(2);
        }

        // Set up the job
        Job job = new Job(conf, "DataSort job");
        job.setJarByClass(DataSort.class);

        // Set the classes that handle map and reduce (no combiner here)
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        // Set the output key/value types
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Set the input/output paths
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        /*
         * To read the paths from the command line instead:
         * FileInputFormat.addInputPath(job, new Path(args[0]));
         * FileOutputFormat.setOutputPath(job, new Path(args[1]));
         */
        job.waitForCompletion(true);

        // Print related information
        System.out.println("Task Name: " + job.getJobName());
        System.out.println("Task Succeeded: " + (job.isSuccessful() ? "Yes" : "No"));
    }
}
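For illustration (again with hypothetical data), if the input file contains the numbers 32, 654, 32, 15, one per line, the reduce output is:

    1	15
    2	32
    3	32
    4	654

Duplicates are kept, and each occurrence receives its own row number because reduce writes one line per value in the Iterable. Also note that the job relies on the default single reducer: with more than one reducer, each output file would only be sorted within its own partition unless a total-order partitioner were used.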