After reading chapter 3 of Data-Intensive Text Processing with MapReduce today, I wrote a version of WordCount that is optimized on the map side.
In distributed data-intensive processing, one important factor affecting processing speed is the intermediate output of the map phase. While this data is being transferred to the reducers, a large amount of intermediate data must be exchanged, processed, and routed to the corresponding reduce task. The intermediate data is transmitted over the network, and it must first be written to local disk before being sent. Because network bandwidth and disk I/O are very expensive compared with other operations, reducing the amount of intermediate data transferred improves the efficiency of the algorithm. The number of key-value pairs can be reduced with a combiner function or by other means; the following is an improved WordCount algorithm that does this inside the mapper.
The basic idea is:
Define an associative array inside the mapper. While processing the document, add each &lt;word, count&gt; pair to the associative array: if the word already exists, increment its count by 1; if it does not exist, insert it with a count of 1. After all input records of the task have been processed, emit the accumulated results (in the run/cleanup function).
Pseudocode:
Class mapper
Method map (docid A, Doc D)
H = new associativearray
For all term t belongs to Doc d do
H {t} = H {t} + 1;
For all term t belongs to h do
Emit (term T, Count H {t })
Class Reducer
Method reduce (term T, counts [C1, C2,...])
Sum = 0
For all count C belongs to counts [C1, C2,...] Do
Sum + = C
Emit (term T, Count sum)
The Code is as follows:
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
public class Mapper extends
org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, IntWritable> {
int c;
HashMap<String,IntWritable> map=new HashMap<String,IntWritable>();
@Override
protected void map(LongWritable key, Text value,
Context context)
throws IOException, InterruptedException {
String str=value.toString();
StringTokenizer token=new StringTokenizer(str);
while(token.hasMoreTokens()){
String value1=token.nextToken();
if(map.containsKey(value1)){
//System.out.println("ni");
int p=map.get(value1).get()+1;
map.remove(value1);
map.put(value1, new IntWritable(p));
}
else{
//System.out.println("ni");
map.put(value1, new IntWritable(1));
}
}
// TODO Auto-generated method stub
c++;
System.out.println(c);
}
@Override
protected void cleanup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
System.out.println("cleanup");
super.cleanup(context);
}
@Override
public void run(Context context) throws IOException, InterruptedException {
// TODO Auto-generated method stub
super.run(context);
System.out.println("run");
Iterator it=map.entrySet().iterator();
while(it.hasNext()){
//System.out.println("nihe");
Map.Entry<String, IntWritable> entry=(Map.Entry<String, IntWritable>) it.next();
//System.out.println("nihe");
context.write(new Text(entry.getKey()), entry.getValue());
}
}
@Override
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
// System.out.println(context.getInputSplit().toString());
// System.out.println(context.getJobID());
// FileSplit input=(FileSplit)context.getInputSplit();
// String path=input.getPath().toString();
// Configuration conf=new Configuration();
// System.out.println(input.getPath().toString());
// FileSystem fs=FileSystem.get(URI.create(path), conf);
// FSDataInputStream filein=fs.open(input.getPath());
// LineReader in=new LineReader(filein,conf);
// Text line=new Text();
// int cd=in.readLine(line);
// System.out.println(line);
}
}
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
public class Reducer extends
org.apache.hadoop.mapreduce.Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
// TODO Auto-generated method stub
int sum=0;
for(IntWritable it:values){
sum+=it.get();
}
context.write(key, new IntWritable(sum));
}
}
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Word {
    /**
     * Driver for the in-mapper-combining WordCount job.
     *
     * @param args args[0] = input path, args[1] = output path (deleted first if present)
     * @throws IOException            on filesystem or job-submission failure
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Bug fix: the original built a Configuration but constructed the Job
        // with new Job(), so the configuration was never applied.
        Job job = new Job(conf);
        job.setJarByClass(Word.class);

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        // Remove a stale output directory; Hadoop refuses to start a job whose
        // output path already exists. delete(Path, boolean) replaces the
        // deprecated single-argument delete().
        FileSystem fs = FileSystem.get(URI.create(args[1]), conf);
        fs.delete(out, true);

        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setMapperClass(Mapper.class);
        // Bug fix: the reducer was never registered, so the framework ran the
        // identity reducer and the per-split partial counts were never summed.
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Exit nonzero on job failure so shell scripts can detect it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}