Hadoop classification output

Source: Internet
Author: User

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.StringTokenizer;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.ContextFactory;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

Public class multifileout extends configured implements tool {
Private Final Static string [] shengarry = {"Beijing", "Tianjin", "Shanxi", "Inner Mongolia", "Liaoning ",
"Jilin", "Heilongjiang", "Shanghai", "Jiangsu", "Zhejiang", "Anhui", "Fujian", "Jiangxi", "Shandong", "Henan ", "Hubei ",
"Hunan", "Guangdong", "Guangxi", "Hainan", "Chongqing", "Sichuan", "Guizhou", "Yunnan", "Tibet", "Shaanxi Province ", "Gansu ",
"Qinghai", "Ningxia", "Xinjiang", "Hebei "};
Private Final Static string [] sexary = {"M", "F "};

Public static string getpinyin (string SRC ){
Char [] srcary = NULL;
Srcary = SRC. tochararray ();
String [] strtmp = new string [srcary. Length];

// Set the output format of Chinese characters and Pinyin
Hanyupinyinoutputformat formatstr = new hanyupinyinoutputformat ();
Formatstr. setcasetype (hanyupinyincasetype. lowercase );
Formatstr. settonetype (hanyupinyintonetype. without_tone );
Formatstr. setvchartype (hanyupinyinvchartype. with_v );
String resultstr = "";
Int T0 = srcary. length;
Try {
For (INT I = 0; I <t0; I ++ ){
// Determine whether it can be a Chinese character
If (character. tostring (srcary [I]). Matches ("[\ u4e00-\ u9fa5] + ")){
Strtmp = pinyinhelper. tohanyupinyinstringarray (srcary [I],
Formatstr); // save all types of Chinese characters to the T2 array.
Resultstr + = strtmp [0]; // + ""; // obtain the first pronunciation of the Chinese character and connect it to the string T4.
} Else {
// If it is not a Chinese character, indirectly retrieve the character and connect it to the string T4
Resultstr + = character. tostring (srcary [I]);
}
}
} Catch (badhanyupinyinoutputformatcombination e ){
E. printstacktrace ();
}
Return resultstr;
}

Private Static class provincemapper extends
Mapper <object, text> {
@ Override
Protected void map (Object key, text value, context)
Throws ioexception, interruptedexception {
String STR = value. tostring ();
String outkey = "";
Boolean isfind = false;
If (Str. indexof ("name")> = 0)
Return;

String [] strarray = Str. Split (",");
If (strarray. length! = 33)
Return;

String sex = strarray [5];
String ADDR = strarray [7];
For (INT I = 0; I <shengarry. length; I ++ ){
For (Int J = 0; j <sexary. length; j ++ ){
Int Index = ADDR. indexof (shengarry [I]);
If (index> = 0) & (index <= 3)
& (Sex. indexof (sexary [J])> = 0 )){
Isfind = true;
Outkey = getpinyin (shengarry [I]) + sexary [J];
Break;
}
}

If (isfind)
Break;
}

If (isfind ){
Context. Write (new text (outkey), value );
} Else {
System. Out. println ("Error Data" + value. tostring ());
}
}
}

Private Static class provincereducer extends
CER <text, text, nullwritable, text> {
Private multipleoutputs MOS = NULL;

@ Override
Protected void setup (context) throws ioexception,
Interruptedexception {
Mos = new multipleoutputs (context );
}

@ Override
Protected void cleanup (context) throws ioexception,
Interruptedexception {
Mos. Close ();
}

@ Override
Protected void reduce (Text key, iterable <text> values, context)
Throws ioexception, interruptedexception {
Text value = new text ("");
String valuetmp = "";

For (Text VA: values ){
Value. Set (va. tostring ());

Try {
Mos. Write (key. tostring (), nullwritable. Get (), value );
} Catch (exception e ){
// System. Out. println ("exception" + key );
}
}
}
}

Public static void main (string [] ARGs) throws exception {
Toolrunner. Run (new configuration (), new multifileout (), argS );
}

@ Override
Public int run (string [] ARGs) throws exception {
Int result = 0;
Configuration conf = new configuration ();
String [] argarray = new genericoptionsparser (Conf, argS)
. Getremainingargs ();
If (argarray. length! = 2 ){
System. Err. println ("Usage: multifileout <in> <out> ");
System. Exit (1 );
}

Job job = new job (Conf, "multifileout ");
Job. setjarbyclass (multifileout. Class );
Job. setmapperclass (provincemapper. Class );
Job. setreducerclass (provincereducer. Class );
Job. setmapoutputkeyclass (text. Class );
Job. setmapoutputvalueclass (text. Class );
Job. setoutputkeyclass (nullwritable. Class );
Job. setoutputvalueclass (text. Class );
// Job. setoutputformatclass (wordcountoutputformat. Class );
Fileinputformat. addinputpath (job, new path (argarray [0]);
Fileoutputformat. setoutputpath (job, new path (argarray [1]);

For (INT I = 0; I <shengarry. length; I ++ ){
For (Int J = 0; j <sexary. length; j ++ ){
Multipleoutputs. addnamedoutput (job, getpinyin (shengarry [I])
+ Sexary [J], textoutputformat. Class, text. Class,
Text. Class );
}
}

Try {
Result = job. waitforcompletion (true )? 0: 1;
} Catch (classnotfoundexception | interruptedexception e ){
E. printstacktrace ();
}

Return result;
}
}

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support; 6 Free Tickets per Quarter; Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.