Tags: value, function, set, compute, BSP, int, Context, split, broadcast
TopN algorithm: Spark implementation
Packagecom.kangaroo.studio.algorithms.topn;ImportOrg.apache.spark.api.java.JavaPairRDD;ImportOrg.apache.spark.api.java.JavaRDD;ImportOrg.apache.spark.api.java.JavaSparkContext;Importorg.apache.spark.api.java.function.FlatMapFunction;ImportOrg.apache.spark.api.java.function.Function2;Importorg.apache.spark.api.java.function.PairFunction;ImportOrg.apache.spark.broadcast.Broadcast;ImportScala. Tuple2;Importjava.io.Serializable;ImportJava.util.*; Public classTopnsparkImplementsSerializable {PrivateJavasparkcontext JSC; Broadcast<Integer>Topnum; PrivateString InputPath; /** constructor * 1. Initialize Javasparkcontext * 2. Initialize the number of broadcast variables TOPN, can be shared by all partition * 3. Initialize input path **/ PublicTopnspark (Integer Num, String path) {JSC=NewJavasparkcontext (); Topnum=Jsc.broadcast (Num); InputPath=path; } /** Program Entry function **/ Public voidrun () {/** Read data into InputPath **/Javardd<String> lines = Jsc.textfile (InputPath, 1); /** The RDD specification to 9 partitions **/Javardd<String> Rdd = LINES.COALESCE (9); /** Convert input to KV format * key is the primary key of the protocol, value is the number of sort references * Note: The key here is not unique, that is, the same key may have multiple records, so below our statute key into a unique key * Input: line, Output: KV **/Javapairrdd<string, integer> kv = Rdd.maptopair (NewPairfunction<string, String, integer>() { PublicTuple2<string, Integer> call (String s)throwsException {string[] tokens= S.split (","); return NewTuple2<string, Integer> (Tokens[0], Integer.parseint (tokens[1])); } }); /** Protocol primary key becomes unique key * Input: kv, Output: KV **/Javapairrdd<string, integer> Uniquekeys = Kv.reducebykey (NewFunction2<integer, Integer, integer>() { PublicInteger call (integer i1, integer i2)throwsException {returnI1 +I2; } }); /** Calculate the TopN of each partition * Here the number of TopN is obtained by broadcast variable, each partition is 
reserved TopN, total number of partitions: Partitionnum * TopN * Input: kv, output : sortmap, Length TOPN **/Javardd<sortedmap<integer, string>> partitions = uniquekeys.mappartitions (NewFlatmapfunction<iterator<tuple2<string,integer>>, Sortedmap<integer, String>>() { PublicIterable<sortedmap<integer, string>> call (iterator<tuple2<string, integer>> iter)throwsException {Final intN =Topnum.getvalue (); SortedMap<integer, string> TopN =NewTreemap<integer, string>(); while(Iter.hasnext ()) {Tuple2<string, integer> tuple =Iter.next (); Topn.put (tuple._2, tuple._1); if(Topn.size () >N) {topn.remove (Topn.firstkey ()); } } returncollections.singletonlist (TopN); } }); /** protocol all partitions of TOPN Sortmap, get the final sortmap, length topn * Reduce, data has been to the local cache, this is the final result * Input: Sortmap, Length TOPN, of course there are partitionnum, output: sortmap, length TOPN **/SortedMap<integer, string> finaltopn = Partitions.reduce (NewFunction2<sortedmap<integer, String>, Sortedmap<integer, String>, Sortedmap<integer, String> >() { PublicSortedmap<integer, string> call (Sortedmap<integer, string> M1, Sortedmap<integer, String> m2)throwsException {Final intN =Topnum.getvalue (); SortedMap<integer, string> TopN =NewTreemap<integer, string>(); for(Map.entry<integer, string>Entry:m1.entrySet ()) {Topn.put (Entry.getkey (), Entry.getvalue ()); if(Topn.size () >N) {topn.remove (Topn.firstkey ()); } } for(Map.entry<integer, string>Entry:m2.entrySet ()) {Topn.put (Entry.getkey (), Entry.getvalue ()); if(Topn.size () >N) {topn.remove (Topn.firstkey ()); } } returnTopN; } }); /** Print the final result of the local cache **/ for(Map.entry<integer, string>Entry:finalTopN.entrySet ()) {System.out.println (Entry.getkey ()+ " -- " +Entry.getvalue ()); } } Public Static voidMain (string[] args) {String InputPath= Args[0]; Topnspark Topnmapper=NewTopnspark (10, InputPath); Topnmapper.run (); }}
Big Data Algorithm Design Patterns (1): TopN Spark Implementation