To get the top N values of each group, first group the records by key, then sort the values within each group and take the first N of them.
Test data:
Hadoop
Spark
Java Spark Spark (Hadoop)
hadoop
spark
Spark 88
Spark-
Hadoop-Hadoop
Java
70
1.1 Step one: convert the source data into (key, value) pairs so that it can easily be grouped by key.
sparkconf conf = new sparkconf (). Setmaster ("local"). Setappname ("WordCount");
Internal actual call of Sparkcontext
javasparkcontext JSC = new Javasparkcontext (conf);
Read the file, converting each row of data to
javardd<string> lines = jsc.textfile ("C:\\users\\12285\\desktop\\test");//hadooprdd
javapairrdd<string, integer> pairs = Lines.maptopair (new pairfunction<string, String , integer> ( {
private static final long serialversionuid = 1L;
Public tuple2<string, integer> call (String line) throws Exception {return
new tuple2<string, integer> ( Line.split ("") [0], integer.valueof (Line.split ("") [1]);
}
);
1.2 Step two: group the pairs by key.
Grouped by subject
javapairrdd<string, iterable<integer>> grouppairs = Pairs.groupbykey ();
1.3 Step three: sort the values within each group.
1.3.1 After grouping, the RDD has type JavaPairRDD<String, Iterable<Integer>>: the key is the subject name and the value is the collection of scores for that subject. We therefore use mapToPair to sort within each group.
javapairrdd<string, iterable<integer>> top5pairs = Grouppairs.maptopair (New PairFunction<Tuple2< ; String,iterable<integer>>, String, iterable<integer>> () {private static final long Serialv
Ersionuid = 1L; Public tuple2<string, Iterable<integer>> call (tuple2<string, iterable<integer>> GroupedPair
) throws Exception {integer[] top5 = new INTEGER[5];
String Groupedkey = groupedpair._1;
iterator<integer> Groupedvalue = Groupedpair._2.iterator ();
while (Groupedvalue.hasnext ()) {Integer value = Groupedvalue.next (); for (int i = 0; i < top5.length i++) {if (top5[i] = null) {TOP5
[I] = value;
Break
The value at}else if (Top5[i] > value) {//Index moves backwards after the value in the position for (int j = 4; j > i; j--) {top5[j] = top5[j-1];
} Top5[i] = value;
Break //Otherwise, the value at//index is smaller than value, and the TOP5 always continue backward comparison}} Retu
RN New tuple2<string, iterable<integer>> (Groupedkey, Arrays.aslist (TOP5)); }
});
1.4 Full code
Package com.chb.sparkDemo.TopNGroup;
Import Java.util.Arrays;
Import Java.util.Iterator;
Import org.apache.spark.SparkConf;
Import Org.apache.spark.api.java.JavaPairRDD;
Import Org.apache.spark.api.java.JavaRDD;
Import Org.apache.spark.api.java.JavaSparkContext;
Import org.apache.spark.api.java.function.PairFunction;
Import org.apache.spark.api.java.function.VoidFunction; Import Scala.
Tuple2; public class Topngrouptest {public static void main (string[] args) {sparkconf conf = new sparkconf (). Setmast
ER ("local"). Setappname ("WordCount");
Internal actual call of Sparkcontext javasparkcontext JSC = new Javasparkcontext (conf);
Read the file, converting each row of data to javardd<string> lines = Jsc.textfile ("C:\\users\\12285\\desktop\\test");//hadooprdd
javapairrdd<string, integer> pairs = Lines.maptopair (new pairfunction<string, String, integer> () {
Private static final long serialversionuid = 1L; Public tuple2<string, IntegeR> Call (String line) throws Exception {return to New tuple2<string, integer> (Line.split ("") [0], in
Teger.valueof (Line.split ("") [1]));
}
});
Grouped by subject javapairrdd<string, iterable<integer>> grouppairs = Pairs.groupbykey (); javapairrdd<string, iterable<integer>> top5pairs = Grouppairs.maptopair (New PairFunction<Tuple2< String,iterable<integer>>, String, iterable<integer>> () {private static final long Serialv
Ersionuid = 1L; Public tuple2<string, Iterable<integer>> call (tuple2<string, iterable<integer>> GroupedPair
) throws Exception {integer[] top5 = new INTEGER[5];
String Groupedkey = groupedpair._1;
iterator<integer> Groupedvalue = Groupedpair._2.iterator ();
while (Groupedvalue.hasnext ()) {Integer value = Groupedvalue.next (); for (int i = 0; i < top5.length i++) {if (top5[i] = = null) {
Top5[i] = value;
Break The value at the}else if (Top5[i] > value) {//index is moved backward for value greater than//after the position for T j = 4; J > i;
j--) {Top5[j] = top5[j-1];
} Top5[i] = value;
Break //Otherwise, the value at//index is smaller than value, and the TOP5 always continue backward comparison}} Retu
RN New tuple2<string, iterable<integer>> (Groupedkey, Arrays.aslist (TOP5));
}
}); Top5pairs.foreach (New voidfunction<tuple2<string,iterable<integer>>> () {private static Fin
Al long serialversionuid = 1L; public void Call (Tuple2<string, Iterable<iNteger>> pair) throws Exception {String groupedkey = pair._1;
System.out.println ("Grouped Key:" + groupedkey);
iterator<integer> Groupedvalue = Pair._2.iterator ();
while (Groupedvalue.hasnext ()) {Integer value = Groupedvalue.next ();
System.out.println (value);
} System.out.println ("==================================");
}
});
}
}