- One: Understand how secondary sort works and describe it in your own words (including custom data types, partitioning, grouping, and sorting).
- Two: Implement secondary sort in code and provide the source files.
- Three: Understand the several ways to perform a MapReduce join, implement a reduce-side join in code, provide the source code, and explain the idea.
One: Describe secondary sort in your own words (including custom data types, partitioning, grouping, sorting) 1.1 How secondary sort works
1. 当客户端提交一个作业的时候,hadoop 会开启yarn 接收并进行数据拷贝处理,之后交由yarn 框架上的服务resourcemanager 接收,同时指派任务给nodemanager ,nodemanager 会启动 applicationmaster 处理任务,同时分配 container——它是任务运行环境的抽象,封装了CPU、内存等多维资源以及环境变量、启动命令等任务运行相关的信息。之后输入数据,对输入数据进行inputsplit分割,然后调用mapper基类将数据解析成key-value键值对,之后调用map()方法对每个key-value 对进行处理,之后经过shuffle 过程,map 的输出就是reduce 端的输入,经过reduce端处理后数据即可输出到hdfs 上面。 二次排序 就是首先按照第一字段排序,然后再对第一字段相同的行按照第二字段排序。 2. 在shuffle 过程中,会对数据进行分割(split),分区(partitioner),排序(sort),合并(combine),压缩(compress),分组(group) 之后输出到reduce端。
1.2 How each shuffle step is configured on the job:
1) partitioner job.setPartitionerClass(FirstPartitioner.class); 2) sort job.setSortComparatorClass(cls); 3) combine job.setCombinerClass(cls); 4) compress set by configuration 5) group job.setGroupingComparatorClass(FirstGroupingComparator.class);
Two: Implement secondary sort in code and provide the source files. 2.1 Secondary sort requirements
1. 利用mapreduce 默认会对key 进行排序的方法对job 进行第一次排序 2. 把key和需要排序的第二个字段进行组合
2.2 Java code for secondary sort
// SecondarySortMapReduce.java
package org.apache.hadoop.studyhadoop.sort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Secondary-sort job: input lines of the form {@code first,second} are emitted
 * grouped by {@code first}, with the {@code second} values of each group in
 * ascending order.
 *
 * <p>The composite key {@link PairWritable} carries both fields so the framework's
 * key sort orders by (first, second); {@link FirstPartitioner} and
 * {@link FirstGroupingComparator} then partition and group on {@code first} only.
 *
 * @author zhangyy
 */
public class SecondarySortMapReduce extends Configured implements Tool {

    /** Paths used by {@link #main(String[])} when none are given on the command line. */
    private static final String DEFAULT_INPUT = "hdfs://namenode01.hadoop.com:8020/input/sort";
    private static final String DEFAULT_OUTPUT = "hdfs://namenode01.hadoop.com:8020/output";

    // step 1: mapper class
    /**
     * Turns each {@code first,second} line into a ({@link PairWritable}, second) pair.
     * Lines that do not have exactly two comma-separated fields, or whose second
     * field is not an integer, are skipped.
     */
    public static class SecondarySortMapper
            extends Mapper<LongWritable, Text, PairWritable, IntWritable> {

        private final PairWritable mapOutputKey = new PairWritable();
        private final IntWritable mapOutputValue = new IntWritable();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] strs = value.toString().split(",");

            // invalid record: wrong number of fields
            if (strs.length != 2) {
                return;
            }

            final int second;
            try {
                second = Integer.parseInt(strs[1]);
            } catch (NumberFormatException ignored) {
                // Malformed records are dropped, same as records with a bad field count.
                return;
            }

            mapOutputKey.set(strs[0], second);
            mapOutputValue.set(second);
            context.write(mapOutputKey, mapOutputValue);
        }
    }

    // step 2: reducer class
    /**
     * Writes one (first, second) line per value. All values that share the same
     * {@code first} arrive in a single call because of the grouping comparator.
     */
    public static class SecondarySortReducer
            extends Reducer<PairWritable, IntWritable, Text, IntWritable> {

        private final Text outputKey = new Text();

        @Override
        public void reduce(PairWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            outputKey.set(key.getFirst());
            for (IntWritable value : values) {
                context.write(outputKey, value);
            }
        }
    }

    // step 3: driver
    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @return 0 on success, 1 if the job failed, 2 if the arguments are missing
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println(
                    "Usage: " + getClass().getSimpleName() + " <input path> <output path>");
            return 2;
        }

        // 1: get configuration
        Configuration configuration = super.getConf();

        // 2: create job
        Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
        job.setJarByClass(this.getClass());

        // 3: set job: input -> map -> reduce -> output
        // 3.1: input
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // 3.2: mapper
        job.setMapperClass(SecondarySortMapper.class);
        job.setMapOutputKeyClass(PairWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // =========================== shuffle ===========================
        // 1) partition on the first field only
        job.setPartitionerClass(FirstPartitioner.class);
        // 2) sort: default, uses PairWritable.compareTo
        // 3) combine: not used
        // 4) compress: set by configuration
        // 5) group on the first field only
        job.setGroupingComparatorClass(FirstGroupingComparator.class);
        // =========================== shuffle ===========================

        // 3.3: reducer
        job.setReducerClass(SecondarySortReducer.class);
        // Must match the reducer's declared output key type (Text).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // set reducer number
        job.setNumReduceTasks(2);

        // 3.4: output
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 4: submit job
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    /**
     * Entry point. Uses the command-line paths when two are supplied, otherwise
     * falls back to the default HDFS input/output paths.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            args = new String[] {DEFAULT_INPUT, DEFAULT_OUTPUT};
        }

        // create configuration
        Configuration configuration = new Configuration();

        // run job
        int status = ToolRunner.run(configuration, new SecondarySortMapReduce(), args);

        // exit program
        System.exit(status);
    }
}
// PairWritable.java
package org.apache.hadoop.studyhadoop.sort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;

import org.apache.hadoop.io.WritableComparable;

/**
 * Composite key for secondary sort: a string {@code first} and an int {@code second},
 * ordered by {@code first} and then by {@code second}.
 *
 * <p>Serialized form is {@code writeUTF(first)} followed by a 4-byte int. The int is
 * stored with an {@code Integer.MAX_VALUE} bias; the accessors add and remove the
 * bias (wrapping on overflow), so callers always see the plain value.
 */
public class PairWritable implements WritableComparable<PairWritable> {

    private String first;
    /** Stored with the {@code Integer.MAX_VALUE} bias applied; see {@link #setSecond(int)}. */
    private int second;

    /** No-arg constructor; fields are left unset until {@code set} or {@code readFields}. */
    public PairWritable() {
    }

    public PairWritable(String first, int second) {
        this.set(first, second);
    }

    /** Sets both fields at once. */
    public void set(String first, int second) {
        this.first = first;
        this.setSecond(second);
    }

    public String getFirst() {
        return first;
    }

    public void setFirst(String first) {
        this.first = first;
    }

    /** Returns the plain (unbiased) second value. */
    public int getSecond() {
        return second - Integer.MAX_VALUE;
    }

    /** Stores {@code second} with the bias applied. */
    public void setSecond(int second) {
        this.second = second + Integer.MAX_VALUE;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(first);
        out.writeInt(second);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.first = in.readUTF();
        this.second = in.readInt();
    }

    /** Orders by {@code first}, then by the plain {@code second} value. */
    @Override
    public int compareTo(PairWritable o) {
        int comp = this.first.compareTo(o.getFirst());
        if (comp != 0) {
            return comp;
        }
        // Integer.compare avoids boxing both values.
        return Integer.compare(this.getSecond(), o.getSecond());
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof PairWritable)) {
            return false;
        }
        PairWritable other = (PairWritable) obj;
        return second == other.second && Objects.equals(first, other.first);
    }

    @Override
    public int hashCode() {
        return Objects.hash(first, second);
    }

    @Override
    public String toString() {
        return first + "," + getSecond();
    }
}
firstpartitioner.javapackage org.apache.hadoop.studyhadoop.sort;import Org.apache.hadoop.io.intwritable;import Org.apache.hadoop.mapreduce.partitioner;public class FirstPartitioner Extends partitioner<pairwritable,intwritable> {@Override public int getpartition (pairwritable key, Intwritabl e value, int numpartitions) {return (Key.getfirst (). Hashcode () & integer.max_value)% numpartitions ; }}
FirstGroupingComparator.javapackage org.apache.hadoop.studyhadoop.sort;import org.apache.hadoop.io.RawComparator;import org.apache.hadoop.io.WritableComparator;public class FirstGroupingComparator implements RawComparator<PairWritable> { // object compare public int compare(PairWritable o1, PairWritable o2) { return o1.getFirst().compareTo(o2.getFirst()); } // bytes compare public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { return WritableComparator.compareBytes(b1, 0, l1 - 4, b2, 0, l2 - 4); }}
2.3 Output Test
上传数据处理: hdfs dfs -put sort /input 运行输出:
Three: Understand the several ways to perform a MapReduce join, implement a reduce-side join in code, provide the source code, and explain the idea. 3.1 There are three types of MapReduce joins:
3.1.1 map 端的join 之所以存在reduce side join,是因为在map阶段不能获取所有需要的join字段,即:同一个key对应的字段可能位于不同map中。Reduce side join是非常低效的,因为shuffle阶段要进行大量的数据传输。Map side join是针对以下场景进行的优化:两个待连接表中,有一个表非常大,而另一个表非常小,以至于小表可以直接存放到内存中。这样,我们可以将小表复制多份,让每个map task内存中存在一份(比如存放到hash table中),然后只扫描大表:对于大表中的每一条记录key/value,在hash table中查找是否有相同的key的记录,如果有,则连接后输出即可。为了支持文件的复制,Hadoop提供了一个类DistributedCache 去实现。 3.1.2 reduce 端的join 在map阶段,map函数同时读取两个文件File1和File2,为了区分两种来源的key/value数据对,对每条数据打一个标签(tag),比如:tag=0表示来自文件File1,tag=2表示来自文件File2。即:map阶段的主要任务是对不同文件中的数据打标签。在reduce阶段,reduce函数获取key相同的来自File1和File2文件的value list, 然后对于同一个key,对File1和File2中的数据进行join(笛卡尔乘积)。即:reduce阶段进行实际的连接操作。 3.1.3 SemiJoin SemiJoin,也叫半连接,是从分布式数据库中借鉴过来的方法。它的产生动机是:对于reduce side join,跨机器的数据传输量非常大,这成了join操作的一个瓶颈,如果能够在map端过滤掉不会参加join操作的数据,则可以大大节省网络IO。实现方法很简单:选取一个小表,假设是File1,将其参与join的key抽取出来,保存到文件File3中,File3文件一般很小,可以放到内存中。在map阶段,使用DistributedCache将File3复制到各个TaskTracker上,然后将File2中不在File3中的key对应的记录过滤掉,剩下的reduce阶段的工作与reduce side join相同。
3.2 Programming Code:
DataJoinMapReduce.java
// DataJoinMapReduce.java
package org.apache.hadoop.studyhadoop.join;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Reduce-side join of customer and order records on the customer id.
 *
 * <p>The mapper tags every record with its source ("customer" or "order") and keys
 * it by customer id; the reducer then combines the customer record with each order
 * record that shares the same id.
 *
 * @author zhangyy
 */
public class DataJoinMapReduce extends Configured implements Tool {

    /** Tag for records with three fields: {@code cid,name,phone}. */
    private static final String TAG_CUSTOMER = "customer";
    /** Tag for records with four fields: {@code cid,name,price,date}. */
    private static final String TAG_ORDER = "order";

    /** Paths used by {@link #main(String[])} when none are given on the command line. */
    private static final String DEFAULT_INPUT = "hdfs://namenode01.hadoop.com:8020/join";
    private static final String DEFAULT_OUTPUT = "hdfs://namenode01.hadoop.com:8020/output3/";

    // step 1: mapper
    /**
     * Tags each input line by its field count and emits it keyed by customer id.
     * Lines with any other field count, or a non-numeric id, are skipped.
     */
    public static class WordCountMapper
            extends Mapper<LongWritable, Text, LongWritable, DataJoinWritable> {

        private final LongWritable mapOutputKey = new LongWritable();
        private final DataJoinWritable mapOutputValue = new DataJoinWritable();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] strs = value.toString().split(",");

            // invalid record: neither a customer (3 fields) nor an order (4 fields)
            if (strs.length != 3 && strs.length != 4) {
                return;
            }

            final long cid;
            try {
                cid = Long.parseLong(strs[0]);
            } catch (NumberFormatException ignored) {
                // Malformed ids are dropped, same as records with a bad field count.
                return;
            }
            mapOutputKey.set(cid);

            String name = strs[1];
            if (strs.length == 3) {
                // customer: cid,name,phone
                String phone = strs[2];
                mapOutputValue.set(TAG_CUSTOMER, name + "," + phone);
            } else {
                // order: cid,name,price,date
                String price = strs[2];
                String date = strs[3];
                mapOutputValue.set(TAG_ORDER, name + "," + price + "," + date);
            }

            context.write(mapOutputKey, mapOutputValue);
        }
    }

    // step 2: reducer
    /**
     * For one customer id, emits a {@code cid,customerInfo,order} line per order.
     * If no customer record was seen for the id, the customer part is empty.
     */
    public static class WordCountReducer
            extends Reducer<LongWritable, DataJoinWritable, NullWritable, Text> {

        private final Text outputValue = new Text();

        @Override
        public void reduce(LongWritable key, Iterable<DataJoinWritable> values, Context context)
                throws IOException, InterruptedException {
            String customerInfo = "";
            List<String> orderList = new ArrayList<String>();

            // Orders are buffered because the customer record may arrive after them.
            for (DataJoinWritable value : values) {
                if (TAG_CUSTOMER.equals(value.getTag())) {
                    customerInfo = value.getData();
                } else if (TAG_ORDER.equals(value.getTag())) {
                    orderList.add(value.getData());
                }
            }

            for (String order : orderList) {
                outputValue.set(key.toString() + "," + customerInfo + "," + order);
                context.write(NullWritable.get(), outputValue);
            }
        }
    }

    // step 3: job
    /**
     * Configures and runs the join job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @return 0 on success, 1 if the job failed, 2 if the arguments are missing
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println(
                    "Usage: " + getClass().getSimpleName() + " <input path> <output path>");
            return 2;
        }

        // 1: get configuration
        Configuration configuration = super.getConf();

        // 2: create job
        Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
        job.setJarByClass(DataJoinMapReduce.class);

        // 3: set job: input -> map -> reduce -> output
        // 3.1: input
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // 3.2: mapper
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(DataJoinWritable.class);

        // ==================== shuffle ====================
        // partition, sort, combine and group all use the framework defaults;
        // compression is set by configuration.
        // ==================== shuffle ====================

        // 3.3: reducer
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // 3.4: output
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 4: submit job
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    /**
     * Entry point. Uses the command-line paths when two are supplied, otherwise
     * falls back to the default HDFS input/output paths.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            args = new String[] {DEFAULT_INPUT, DEFAULT_OUTPUT};
        }

        // get configuration
        Configuration configuration = new Configuration();

        // run job
        int status = ToolRunner.run(configuration, new DataJoinMapReduce(), args);

        // exit program
        System.exit(status);
    }
}
// DataJoinWritable.java
package org.apache.hadoop.studyhadoop.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;

import org.apache.hadoop.io.Writable;

/**
 * Map output value for the reduce-side join: a {@code tag} naming the source of the
 * record and the record's {@code data} payload. Serialized as two
 * {@code writeUTF} strings, tag first.
 */
public class DataJoinWritable implements Writable {

    private String tag;
    private String data;

    /** No-arg constructor; fields are left unset until {@code set} or {@code readFields}. */
    public DataJoinWritable() {
    }

    public DataJoinWritable(String tag, String data) {
        this.set(tag, data);
    }

    /** Sets both fields at once. */
    public void set(String tag, String data) {
        this.setTag(tag);
        this.setData(data);
    }

    public String getTag() {
        return tag;
    }

    public void setTag(String tag) {
        this.tag = tag;
    }

    public String getData() {
        return data;
    }

    public void setData(String data) {
        this.data = data;
    }

    @Override
    public int hashCode() {
        // Same value as the hand-written 31-based hash over (data, tag).
        return Objects.hash(data, tag);
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        DataJoinWritable other = (DataJoinWritable) obj;
        return Objects.equals(data, other.data) && Objects.equals(tag, other.tag);
    }

    /** Writes tag then data; both must be non-null because {@code writeUTF} rejects null. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.getTag());
        out.writeUTF(this.getData());
    }

    /** Reads tag then data, in the order {@link #write(DataOutput)} wrote them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.setTag(in.readUTF());
        this.setData(in.readUTF());
    }

    @Override
    public String toString() {
        return tag + "," + data;
    }
}
3.3 Running and testing the code
上传文件:hdfs dfs -put customers.txt /join hdfs dfs -put orders.txt /join 运行结果:
Secondary sort in MapReduce