Hadoop WordCount (Streaming, Python, Java)

First, Streaming
Map tasks:
#!/bin/bash
awk 'BEGIN {FS="[ ,.]"; OFS="\t"}
{
    for (i = 1; i <= NF; ++i) {
        dict[$i] += 1
    }
}
END {
    for (key in dict) {
        print key, dict[key]
    }
}'
Reducer tasks:
#!/bin/bash
awk 'BEGIN {FS="\t"}
{
    dict[$1] += $2
}
END {
    for (key in dict) {
        print key, dict[key]
    }
}'
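Before submitting the job, the two awk scripts can be sanity-checked locally by piping a sample file through them; this is only a sketch, and words.txt is a made-up sample file, not part of the job above. The sort between the two stages stands in for the shuffle that Hadoop Streaming performs between map and reduce:

cat words.txt | sh mapper.sh | sort -k1,1 | sh reducer.sh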
Startup script:
#!/bin/bash
hadoop fs -rm -r /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output

hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar \
    -input /data/apps/zhangwenchao/mapreduce/streaming/wordcount/input \
    -output /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output \
    -mapper "sh -x mapper.sh" \
    -reducer "sh -x reducer.sh" \
    -file mapper.sh \
    -file reducer.sh \
    -jobconf mapred.job.name=WordCount \
    -jobconf mapred.job.tasks=5 \
    -jobconf mapred.reduce.tasks=3
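After the job finishes, the result files in the output directory can be listed and inspected with the standard HDFS shell commands; a minimal sketch:

hadoop fs -ls /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output
hadoop fs -cat /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output/part-*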
Second, Python
Map tasks:
#!/usr/bin/python
import sys
import re

for line in sys.stdin:
    wordlist = re.split('[;,.?]', line)
    for words in wordlist:
        words = words.strip()
        tmp = words.split()
        for item in tmp:
            print "%s\t%s" % (item, 1)
Reducer tasks:
#!/usr/bin/env python
from operator import itemgetter
import sys

current_word = None
current_count = 0
word = None

for line in sys.stdin:
    line = line.strip()
    word, count = line.split('\t', 1)
    try:
        count = int(count)
    except ValueError:
        continue
    if current_word == word:
        current_count += count
    else:
        if current_word:
            print '%s\t%s' % (current_word, current_count)
        current_count = count
        current_word = word

if word == current_word:
    print "%s\t%s" % (current_word, current_count)
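Unlike the awk reducer, this reducer keeps only a running count for the current key, so it depends on its input arriving sorted by word; Hadoop Streaming guarantees that ordering, and a local sort can imitate it. A quick local check, assuming a made-up sample file words.txt and that the scripts are written for Python 2 (print statements):

cat words.txt | python mapper.py | sort -k1,1 | python reducer.py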
Startup script:
#!/bin/bash
hadoop fs -rm -r /data/apps/zhangwenchao/mapreduce/python/wordcount/output

hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar \
    -input /data/apps/zhangwenchao/mapreduce/python/wordcount/input \
    -output /data/apps/zhangwenchao/mapreduce/python/wordcount/output \
    -mapper "mapper.py" \
    -reducer "reducer.py" \
    -file mapper.py \
    -file reducer.py \
    -jobconf mapred.job.name=WordCount \
    -jobconf mapred.job.tasks=5 \
    -jobconf mapred.reduce.tasks=3
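Because -mapper "mapper.py" and -reducer "reducer.py" invoke the scripts directly through their shebang lines, the task will fail if the files are not executable. One common precaution before submitting (alternatively, invoke them through the interpreter, e.g. -mapper "python mapper.py"):

chmod +x mapper.py reducer.py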
Third, Java
Map:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
Reduce:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
Main:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {

    public static void main(String[] args) throws Exception {
        String input = "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/intput";
        String output = "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/output";

        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJobName("Test4");
        job.setJarByClass(Main.class);

        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setMapperClass(MyMap.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setCombinerClass(MyReduce.class);
        job.setNumReduceTasks(3);

        job.waitForCompletion(true);
    }
}
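One way to compile, package, and launch the Java version; this is only a sketch, assuming the three classes sit in the default package in the current directory, and the jar name wordcount.jar is arbitrary:

mkdir -p classes
javac -classpath "$(hadoop classpath)" -d classes MyMap.java MyReduce.java Main.java
jar cf wordcount.jar -C classes .
hadoop jar wordcount.jar Main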