Package Com.jumei.robot.mapreduce.tfidf;import Java.io.ioexception;import Java.util.collection;import Java.util.comparator;import Java.util.map.entry;import Java.util.set;import Java.util.stringtokenizer;import Java.util.treemap;import Org.apache.hadoop.conf.configuration;import Org.apache.hadoop.fs.filesystem;import Org.apache.hadoop.fs.path;import Org.apache.hadoop.io.longwritable;import Org.apache.hadoop.io.text;import Org.apache.hadoop.mapreduce.job;import Org.apache.hadoop.mapreduce.lib.input.fileinputformat;import Org.apache.hadoop.mapreduce.lib.input.filesplit;import Org.apache.hadoop.mapreduce.lib.input.keyvaluetextinputformat;import Org.apache.hadoop.mapreduce.lib.input.textinputformat;import Org.apache.hadoop.mapreduce.lib.output.fileoutputformat;import Org.apache.hadoop.mapreduce.lib.output.multipleoutputs;import Org.apache.hadoop.mapreduce.lib.output.textoutputformat;import Org.springframework.context.ApplicationContext; Import Org.springframework.context.support.ClassPathXmLapplicationcontext;import Com.jumei.robot.common.beans.word;import Com.jumei.robot.preprocess.ifilterstopwordservice;import com.jumei.robot.preprocess.iwordsegservice;/** * <pre> * MapReduce implementation of the TF-IDF algorithm, split into 3 jobs: * Job 1: Count the number of occurrences of a word in a document (n) and the total number of words in that document (N) * Job 2: Count the number of documents that contain the word (d) and, given the total number of documents (D), calculate the TF-IDF value * Job 3: Sort the output of job 2 and output the top n words with the highest TF-IDF value * Mathematical formula: * TF = n / N * IDF = Math.log(D / d) * TF-IDF = TF * IDF * </pre> * @author Deyin * */public class Tfidfmapreduce {private static Configuration conf; public static void Main (string[] args) throws Exception {conf = new Configuration (); if (Args.length < 3) {System.err.println ("arguments invalid, usgae:hadoop jar Tfidf.jar Com.jumei.robot.mapreduc E.tfidf.tfidfmapreduce TF-IDF Hadoop Map Reduce