import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.ContextFactory;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
Public class multifileout extends configured implements tool {
Private Final Static string [] shengarry = {"Beijing", "Tianjin", "Shanxi", "Inner Mongolia", "Liaoning ",
"Jilin", "Heilongjiang", "Shanghai", "Jiangsu", "Zhejiang", "Anhui", "Fujian", "Jiangxi", "Shandong", "Henan ", "Hubei ",
"Hunan", "Guangdong", "Guangxi", "Hainan", "Chongqing", "Sichuan", "Guizhou", "Yunnan", "Tibet", "Shaanxi Province ", "Gansu ",
"Qinghai", "Ningxia", "Xinjiang", "Hebei "};
Private Final Static string [] sexary = {"M", "F "};
Public static string getpinyin (string SRC ){
Char [] srcary = NULL;
Srcary = SRC. tochararray ();
String [] strtmp = new string [srcary. Length];
// Set the output format of Chinese characters and Pinyin
Hanyupinyinoutputformat formatstr = new hanyupinyinoutputformat ();
Formatstr. setcasetype (hanyupinyincasetype. lowercase );
Formatstr. settonetype (hanyupinyintonetype. without_tone );
Formatstr. setvchartype (hanyupinyinvchartype. with_v );
String resultstr = "";
Int T0 = srcary. length;
Try {
For (INT I = 0; I <t0; I ++ ){
// Determine whether it can be a Chinese character
If (character. tostring (srcary [I]). Matches ("[\ u4e00-\ u9fa5] + ")){
Strtmp = pinyinhelper. tohanyupinyinstringarray (srcary [I],
Formatstr); // save all types of Chinese characters to the T2 array.
Resultstr + = strtmp [0]; // + ""; // obtain the first pronunciation of the Chinese character and connect it to the string T4.
} Else {
// If it is not a Chinese character, indirectly retrieve the character and connect it to the string T4
Resultstr + = character. tostring (srcary [I]);
}
}
} Catch (badhanyupinyinoutputformatcombination e ){
E. printstacktrace ();
}
Return resultstr;
}
Private Static class provincemapper extends
Mapper <object, text> {
@ Override
Protected void map (Object key, text value, context)
Throws ioexception, interruptedexception {
String STR = value. tostring ();
String outkey = "";
Boolean isfind = false;
If (Str. indexof ("name")> = 0)
Return;
String [] strarray = Str. Split (",");
If (strarray. length! = 33)
Return;
String sex = strarray [5];
String ADDR = strarray [7];
For (INT I = 0; I <shengarry. length; I ++ ){
For (Int J = 0; j <sexary. length; j ++ ){
Int Index = ADDR. indexof (shengarry [I]);
If (index> = 0) & (index <= 3)
& (Sex. indexof (sexary [J])> = 0 )){
Isfind = true;
Outkey = getpinyin (shengarry [I]) + sexary [J];
Break;
}
}
If (isfind)
Break;
}
If (isfind ){
Context. Write (new text (outkey), value );
} Else {
System. Out. println ("Error Data" + value. tostring ());
}
}
}
Private Static class provincereducer extends
CER <text, text, nullwritable, text> {
Private multipleoutputs MOS = NULL;
@ Override
Protected void setup (context) throws ioexception,
Interruptedexception {
Mos = new multipleoutputs (context );
}
@ Override
Protected void cleanup (context) throws ioexception,
Interruptedexception {
Mos. Close ();
}
@ Override
Protected void reduce (Text key, iterable <text> values, context)
Throws ioexception, interruptedexception {
Text value = new text ("");
String valuetmp = "";
For (Text VA: values ){
Value. Set (va. tostring ());
Try {
Mos. Write (key. tostring (), nullwritable. Get (), value );
} Catch (exception e ){
// System. Out. println ("exception" + key );
}
}
}
}
Public static void main (string [] ARGs) throws exception {
Toolrunner. Run (new configuration (), new multifileout (), argS );
}
@ Override
Public int run (string [] ARGs) throws exception {
Int result = 0;
Configuration conf = new configuration ();
String [] argarray = new genericoptionsparser (Conf, argS)
. Getremainingargs ();
If (argarray. length! = 2 ){
System. Err. println ("Usage: multifileout <in> <out> ");
System. Exit (1 );
}
Job job = new job (Conf, "multifileout ");
Job. setjarbyclass (multifileout. Class );
Job. setmapperclass (provincemapper. Class );
Job. setreducerclass (provincereducer. Class );
Job. setmapoutputkeyclass (text. Class );
Job. setmapoutputvalueclass (text. Class );
Job. setoutputkeyclass (nullwritable. Class );
Job. setoutputvalueclass (text. Class );
// Job. setoutputformatclass (wordcountoutputformat. Class );
Fileinputformat. addinputpath (job, new path (argarray [0]);
Fileoutputformat. setoutputpath (job, new path (argarray [1]);
For (INT I = 0; I <shengarry. length; I ++ ){
For (Int J = 0; j <sexary. length; j ++ ){
Multipleoutputs. addnamedoutput (job, getpinyin (shengarry [I])
+ Sexary [J], textoutputformat. Class, text. Class,
Text. Class );
}
}
Try {
Result = job. waitforcompletion (true )? 0: 1;
} Catch (classnotfoundexception | interruptedexception e ){
E. printstacktrace ();
}
Return result;
}
}