package cn.spark.study.core;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;
/**
 * Examples of Spark transformation operators (Java API):
 * map, filter, flatMap, groupByKey, reduceByKey, sortByKey, join.
 *
 * @author DD
 */
public class Transformationoperation {
public static void Main (string[] args) {
Maptest ();
Filtertest ();
Flatmaptest ();
Groupbykeytest ();
Reducebykeytest ();
Sortbykeytest ();
Jointest ();
}
/** * Map operator Case: * Multiply the elements in the collection by 2 */private static void Maptest () {sparkconf conf = new sparkconf (). SetA Ppname ("Map"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); list<integer> numbers = arrays.aslist (1,2,3,4,5); javardd<integer> Numberrdd = sc.parallelize (numbers); javardd<integer> Multiplenumberrdd = Numberrdd.map (New Function<integer, integer> () {private static fi nal long serialversionuid = 1L; @Override public integer Call (integer arg0) throws Exception {//TODO auto-generated method stub return arg0*2; } }); Multiplenumberrdd.foreach (New voidfunction<integer> () {@Override public void call (Integer arg0) throw s Exception {//TODO auto-generated Method stub System.out.print (arg0+ ""); } }); Sc.close ();} /** * Filter Operator Case: * Filter even */private static void Filtertest () in the collection {Sparkconf conf =new sparkconf (). Setappname ("filter"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); list<integer> numbers = arrays.aslist (1,2,3,4,5,6,7,8,9,10); javardd<integer> Numberrdd = sc.parallelize (numbers); The return value of the filter operator that is also the Function,call method is a Boolean//element in each initial RDD is passed into the call method and returns True if you want to keep the element in the new RDD, otherwise false javardd< integer> Evennumberrdd = numberrdd.filter (New Function<integer, boolean> () {private static final long ser Ialversionuid = 1L; @Override Public Boolean Call (Integer arg0) throws Exception {//TODO auto-generated method stub return arg0% 2 = = 0; } }); Evennumberrdd.foreach (New voidfunction<integer> () {private static final long serialversionuid = 1L; @Override public void Call (Integer arg0) throws Exception {System.out.println (arg0); } }); Sc.close ();} /** * FLATMAP Zi * Splits a line of text into a word */private static void Flatmaptest () {sparkconf conf = new sparkconf (). Setappname ( "Faltmap"). 
Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); List<string> linelist = arrays.aslist ("Hello", "Hello Me", "Hello World"); Javardd<string> lines = sc.parallelize (linelist); /* * Flatmap operator for RDD splits each line of text into multiple words * flatmap is actually receiving each element in the original RDD and performing various processing to return multiple elements, namely encapsulated in a new rdd in Iterable, which encapsulates all the new elements, With the new RDD size must be greater than the original RDD */javardd<string> words = Lines.flatmap (new flatmapfunction<string, string> () { Private static final long serialversionuid = 1L; @Override Public iterable<string> Call (String arg0) throws Exception {//TODO auto-generated Meth OD stub return Arrays.aslist (Arg0.split ("")); } }); Words.foreach (New voidfunction<string> () {private static final long serialversionuid = 1L; @Override public void Call (String arg0) throws Exception {//TODO auto-generated method stub System.out . println (ARG0); } }); Sc.close ();} /** * Groupbykey operator * Case: Grouping results by class */private static void Groupbykeytest () {sparkconf conf = new sparkconf () . Setappname ("Groupbykey"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); List<tuple2<string, integer>> scores = arrays.aslist (New tuple2<string, integer> ("cl Ass1 ",", New tuple2<string, integer> ("Class2", "page"), New tuple2<string, Int Eger> ("Class1"), New tuple2<string, integer> ("Class2", 65)); Create Javapairrdd javapairrdd<string, integer> Scoresrdd = Sc.parallelizepairs (scores); javapairrdd<string, iterable<integer>> groupscores = Scoresrdd.groupbykey (); Groupscores.foreach (New voidfunction<tuple2<String,iterable<integer>>> () {@Override public void call (Tuple2<string, iterable<integer& Gt;> arg0) throws Exception {//TODO auto-generated Method Stub System.out.println ("Class:" +arg0 . 
_1); Iterator<integer> it = Arg0._2.iterator (); while (It.hasnext ()) {System.out.println (It.next ()); } System.out.println ("===================================="); } }); Sc.close ();} /** * Reducebykey operator * Case: Ask each class total */private static void Reducebykeytest () {sparkconf conf = new sparkconf (). Setappna Me ("Reducebykey"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); List<tuple2<string, integer>> scores = arrays.aslist (New tuple2<string, integer> ("Class1", 80 ), New tuple2<string, integer> ("Class2", "page"), New tuple2<string, integer> ("Class1", 90), New Tuple2<string, INteger> ("Class2", 65)); javapairrdd<string, integer> Scoresrdd = Sc.parallelizepairs (scores); javapairrdd<string, integer> totalscores = Scoresrdd.reducebykey (new Function2<integer, Integer, Integer> () {private static final long serialversionuid = 1L; @Override public integer Call (integer arg0, integer arg1) throws Exception {//TODO auto-generated Meth OD stub return arg0+arg1; } }); Totalscores.foreach (New voidfunction<tuple2<string,integer>> () {@Override public void call (TUPL E2<string, integer> arg0) throws Exception {//TODO auto-generated method stub System.out.pri Ntln (arg0._1+ ":" +arg0._2); } }); Sc.close ();} /** * Sortbykey operator * Case study: Sort student scores */private static void Sortbykeytest () {sparkconf conf = new sparkconf (). Setappname ("Sortbykey"). 
Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); List<tUple2<integer, string>> scores = arrays.aslist (New Tuple2<integer, string> (Ten, "Leo"), New Tuple2<integer, string> ("KSC"), New Tuple2<integer, string> ("my"), new TUPL E2<integer, string> ("Jack")); Javapairrdd<integer, string> Scoresrdd = Sc.parallelizepairs (scores); Default true Ascending, False descending javapairrdd<integer, string> Sortedrdd = Scoresrdd.sortbykey (); Sortedrdd.foreach (New voidfunction<tuple2<integer,string>> () {@Override public void call (Tuple2 <integer, string> arg0) throws Exception {System.out.println (arg0._1+ ":" +arg0._2); } }); Sc.close ();} /** * Join * case: Print student results */private static void Jointest () {sparkconf conf = new sparkconf (). Setappnam E ("Joinandcogroup"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); List<tuple2<integer, string>> StudeNtslist = arrays.aslist (New Tuple2<integer, string> (1, "Leo"), New Tuple2<integer, STRING&G t; (2, "Jack"), New Tuple2<integer, string> (3, "Tom")); List<tuple2<integer, integer>> scoreslist = arrays.aslist (New Tuple2<integer, Integer> (1,100 ), New Tuple2<integer, integer> (2,90), New Tuple2<integer, integer> (3,60)); Parallelization of two sets Javapairrdd<integer, string> Studentsrdd = Sc.parallelizepairs (studentslist); Javapairrdd<integer, integer> Scoresrdd = Sc.parallelizepairs (scoreslist); Use the join operator to correlate two Rdd Javapairrdd<integer, tuple2<string, integer>> studentscores = Studentsrdd.join ( SCORESRDD); Studentscores.foreach (New voidfunction<tuple2<integer,tuple2<string,integer>>> () {@Override public void Call (Tuple2<integer, tuple2<string, integer>> arg0) throws Exception { TODO auto-Generated method stub System.out.println ("Student ID:" +arg0._1); System.out.println ("Student Name:" +arg0._2._1); System.out.println ("Student score:" +arg0._2._2); System.out.println ("=========================================="); } });}
}
// Various transformation operators in Spark (Java edition)