1. Diagram of MapReduce
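(Figure: the input is split across map tasks; the intermediate <key,value> pairs they emit are sorted, optionally combined, and shuffled to reduce tasks, which write the final output.)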
2. Execution process:
Input:
Hello World Bye World
Hello Hadoop Bye Hadoop
Bye Hadoop Hello Hadoop
Map:
<Hello,1>
<World,1>
<Bye,1>
<World,1>
<Hello,1>
<Hadoop,1>
<Bye,1>
<Hadoop,1>
<Bye,1>
<Hadoop,1>
<Hello,1>
<Hadoop,1>
Sort:
<Bye,1>
<Bye,1>
<Bye,1>
<Hadoop,1>
<Hadoop,1>
<Hadoop,1>
<Hadoop,1>
<Hello,1>
<Hello,1>
<Hello,1>
<World,1>
<World,1>
Combine:
<Bye,1,1,1>
<Hadoop,1,1,1,1>
<Hello,1,1,1>
<World,1,1>
Reduce:
<Bye,3>
<Hadoop,4>
<Hello,3>
<World,2>
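In the trace above, the Combine step only groups the 1s emitted for each key; the summation itself happens in Reduce (in the code below, the same Reduce class is also registered as the combiner, so partial sums can already be computed on the map side). The pipeline can also be reproduced in plain Java without Hadoop; here is a minimal sketch (the class name WordCountTrace is made up for illustration) that emits <word,1> pairs, sorts them by key, and sums the grouped values, printing exactly the Reduce output listed above.

import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;

public class WordCountTrace
{
    public static void main(String[] args)
    {
        String[] lines = {
            "Hello World Bye World",
            "Hello Hadoop Bye Hadoop",
            "Bye Hadoop Hello Hadoop"
        };
        // Map: emit a <word,1> pair for every token.
        List<Map.Entry<String, Integer>> pairs = new ArrayList<>();
        for (String line : lines)
        {
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens())
                pairs.add(new AbstractMap.SimpleEntry<>(tokenizer.nextToken(), 1));
        }
        // Sort: order the intermediate pairs by key.
        pairs.sort(Map.Entry.comparingByKey());
        // Combine/Reduce: group by key and sum the 1s.
        Map<String, Integer> counts = new TreeMap<>();
        for (Map.Entry<String, Integer> pair : pairs)
            counts.merge(pair.getKey(), pair.getValue(), Integer::sum);
        counts.forEach((word, count) ->
            System.out.println("<" + word + "," + count + ">"));
    }
}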
3. Code example:
package com.felix;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

/**
 * Description: WordCount explained by Felix
 * @author Hadoop Dev Group
 */
public class WordCount
{
    /**
     * MapReduceBase: base class for Mapper and Reducer implementations
     * (its methods implement the interfaces but do nothing themselves).
     * Mapper interface: maps input key/value pairs to intermediate pairs.
     * WritableComparable interface: classes implementing WritableComparable
     * can be compared with one another; every class used as a key should
     * implement this interface.
     * Reporter can be used to report the progress of the whole application;
     * it is not used in this example.
     */
    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, IntWritable>
    {
        /**
         * LongWritable, IntWritable and Text are Hadoop classes that wrap
         * Java data types and implement the WritableComparable interface, so
         * they can be serialized for data exchange in a distributed
         * environment. Think of them as replacements for long, int and String.
         */
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        /**
         * The map method of the Mapper interface:
         * void map(K1 key, V1 value, OutputCollector<K2,V2> output, Reporter reporter)
         * Maps a single input k/v pair to an intermediate k/v pair.
         * The output pair need not have the same type as the input pair, and
         * one input pair may map to zero or more output pairs.
         * OutputCollector interface: collects the <k,v> pairs emitted by the
         * mapper and reducer; its collect(k, v) method adds a (k,v) pair to
         * the output.
         */
        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException
        {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens())
            {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, IntWritable>
    {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException
        {
            int sum = 0;
            while (values.hasNext())
            {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception
    {
        /**
         * JobConf: the map/reduce job configuration class; it describes to
         * the Hadoop framework the work that map-reduce should perform.
         * Constructors: JobConf(), JobConf(Class exampleClass),
         * JobConf(Configuration conf), etc.
         */
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("WordCount");                  // set a user-defined job name

        conf.setOutputKeyClass(Text.class);            // set the key class for the job's output data
        conf.setOutputValueClass(IntWritable.class);   // set the value class for the job's output data

        conf.setMapperClass(Map.class);                // set the Mapper class for the job
        conf.setCombinerClass(Reduce.class);           // set the Combiner class for the job
        conf.setReducerClass(Reduce.class);            // set the Reducer class for the job

        conf.setInputFormat(TextInputFormat.class);    // set the InputFormat implementation for the job
        conf.setOutputFormat(TextOutputFormat.class);  // set the OutputFormat implementation for the job

        /**
         * InputFormat describes the input specification of a map-reduce job.
         * FileInputFormat.setInputPaths(): sets the array of paths that form
         * the job's input list.
         * FileOutputFormat.setOutputPath(): sets the path of the job's output
         * directory.
         */
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);                        // run the job
    }
}
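The listing above uses the classic org.apache.hadoop.mapred API. For comparison, a minimal sketch of the same job against the newer org.apache.hadoop.mapreduce API might look like the following; it assumes a Hadoop 2.x-era runtime (Job.getInstance), and the class name NewApiWordCount is made up for illustration.

package com.felix;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NewApiWordCount
{
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable>
    {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // Context replaces OutputCollector and Reporter from the old API.
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException
        {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens())
            {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable>
    {
        private IntWritable result = new IntWritable();

        // Values arrive as an Iterable instead of an Iterator.
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException
        {
            int sum = 0;
            for (IntWritable val : values)
            {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(NewApiWordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Either version, once packaged into a jar, is typically submitted with something like hadoop jar wordcount.jar com.felix.WordCount <input-dir> <output-dir>, where the jar name and the two HDFS directory arguments are hypothetical.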