A multi-table join is similar to a single-table join: it is also a matter of processing the raw data and extracting the information of interest from it. The problem is as follows.
The input consists of two files. One represents the factory table, with a factory-name column and an address-ID column; the other represents the address table, with an address-ID column and an address-name column.
The task is to find the correspondence between factory names and address names in the input data and to output a "factory name - address name" table.
The sample is as follows:
Factory
<span style= "FONT-SIZE:14PX;" >factoryname addressed
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1
</span>
Address
<span style= "FONT-SIZE:14PX;" >addressid addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
</span>
Results:
<span style= "FONT-SIZE:14PX;" >factoryname addressname
Beijing Red Star Beijing
Beijing rising Beijing
Bank of Beijing Beijing
Guangzhou Honda Guangzhou
Guangzhou Development Bank Guangzhou
Shenzhen Thunder Shenzhen
Tencent Shenzhen
</span>
The code is as follows:
<span style= "FONT-SIZE:14PX;"
>import java.io.IOException;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Mtjoin {public static int time = 0; * * In the MAP to distinguish between the input row belongs to the left or right table, and then split the two columns, * Save the Connection column in the key value, the remaining columns and left and right table flag in value, the last output/public static class MAP ext Ends Mapper<object, text, text, text> {//implementation map function </span>
<span style= "FONT-SIZE:14PX;" > public void Map (Object key, Text value, Context context) throws IOException, interruptedexcept
Ion {string = Value.tostring ();//Line file String Relationtype = new string ();//Left Table identity Enter the first line of the file, do not process if (Line.contains ("factoryname") = = True | | line.contains ("addressed")
= = True) {return;
A line of preprocessing text entered StringTokenizer ITR = new StringTokenizer (lines);
String mapkey = new string ();
String mapvalue = new string ();
int i = 0;
while (Itr.hasmoretokens ()) {//First read a word String token = Itr.nexttoken ();
Determine that the address ID is saved to the "values[0]" if (Token.charat (0) >= ' 0 ' && token.charat (0) <= ' 9 ') {
Mapkey = token; if (i > 0) {RelationtypE = "1";
else {relationtype = "2";
} continue;
//Save Factory name Mapvalue + + token + "";
i++;
//output about table Context.write (new text (Mapkey), new text (Relationtype + "+" + mapvalue));
}/* Reduce resolves the map output, saves the data in value by the left and right tables, and then finds the Cartesian product and outputs it. */public static class Reduce extends Reducer<text, text, text, text> {///implement Reduce function publ IC void reduce (Text key, iterable<text> values, context) throws IOException, Interruptedexc eption {//Output header if (0 = time) {context.write (New Text ("Factoryname"), new
Text ("Addressname"));
time++;
int factorynum = 0;
String[] Factory = new STRING[10]; int Addressnum= 0;
string[] address = new STRING[10];
Iterator ite = Values.iterator ();
while (Ite.hasnext ()) {String record = Ite.next (). toString ();
int len = Record.length ();
int i = 2;
if (0 = len) {continue;
//Get left and right table identification char Relationtype = record.charat (0);
Left table if (' 1 ' = = Relationtype) {Factory[factorynum] = record.substring (i);
factorynum++; //Right table if (' 2 ' = = Relationtype) {Address[addressnum] = record.
substring (i);
addressnum++;
}//Cartesian product if (0!= factorynum && 0!= addressnum) { for (int m = 0; m < factorynum. m++) {for (int n = 0; n < Addressnum;
n++) {//Output result Context.write (new Text (factory[m)),
New Text (Address[n]));
"}}}} public static void Main (string[] args) throws Exception {
Configuration conf = new Configuration ();
This sentence is critical//Conf.set ("Mapred.job.tracker", "192.168.1.2:9001");
You can use args//string[] Ioargs = new string[] {"mtjoin_in", "Mtjoin_out"};
string[] Otherargs = new Genericoptionsparser (conf, args). Getremainingargs ();
if (otherargs.length!= 2) {System.err.println ("usage:multiple Table Join <in> <out>");
System.exit (2);
Job Job = new Job (conf, "multiple Table Join");
Job.setjarbyclass (Mtjoin.class);
Set up MAP and reduce processing class job.setmapperclass (Map.class); Job.setreducerclass (Reduce.class);
Set Output Type Job.setoutputkeyclass (Text.class);
Job.setoutputvalueclass (Text.class);
Set the input and output directory Fileinputformat.addinputpath (Job, New Path (otherargs[0));
Fileoutputformat.setoutputpath (Job, New Path (otherargs[1));
System.exit (Job.waitforcompletion (true)? 0:1); }} </span>
<span style= "FONT-SIZE:14PX;" >javac -classpath hadoop-core-1.1.2.jar:/opt/hadoop-1.1.2/lib/commons-cli-1.2.jar -d firstProject firstProject/Mtjoin.java
</span>
<span style= "FONT-SIZE:14PX;" >jar -cvf Mtjoin.jar -C firstProject/ . </span>
<span style= "FONT-SIZE:14PX;" >
</span>
Delete an existing output
<span style= "FONT-SIZE:14PX;" >hadoop fs -rmr output
</span>
<span style= "FONT-SIZE:14PX;" >hadoop fs -mkdir input
</span>
<span style= "FONT-SIZE:14PX;" >hadoop fs -put Factory input
</span>
<span style= "FONT-SIZE:14PX;" >hadoop fs -put Address input
</span>
Run
<span style= "FONT-SIZE:14PX;" >hadoop jar Mtjoin.jar Mtjoin input output
</span>
View Results
<span style= "FONT-SIZE:14PX;" >hadoop fs -cat output/part-r-00000</span>