package cn.spark.study.core;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;
/**
 * Examples of Spark transformation operators (Java API):
 * map, filter, flatMap, groupByKey, reduceByKey, sortByKey, join.
 *
 * @author DD
 */
public class Transformationoperation {
public static void Main (string[] args) {
Maptest ();
Filtertest ();
Flatmaptest ();
Groupbykeytest ();
Reducebykeytest ();
Sortbykeytest ();
Jointest ();
}
/** * Map operator Case: * Multiply the elements in the collection by 2 */private static void Maptest () {sparkconf conf = new sparkconf (). SetA Ppname ("Map"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); list<integer> numbers = arrays.aslist (1,2,3,4,5); javardd<integer> Numberrdd = sc.parallelize (numbers); javardd<integer> Multiplenumberrdd = Numberrdd.map (New Function<integer, integer> () {private static fi nal long serialversionuid = 1L; @Override public integer Call (integer arg0) throws Exception {//TODO auto-generated method stub return arg0*2; } }); Multiplenumberrdd.foreach (New voidfunction<integer> () {@Override public void call (Integer arg0) throw s Exception {//TODO auto-generated Method stub System.out.print (arg0+ ""); } }); Sc.close ();} /** * Filter Operator Case: * Filter even */private static void Filtertest () in the collection {Sparkconf conf =new sparkconf (). Setappname ("filter"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); list<integer> numbers = arrays.aslist (1,2,3,4,5,6,7,8,9,10); javardd<integer> Numberrdd = sc.parallelize (numbers); The return value of the filter operator that is also the Function,call method is a Boolean//element in each initial RDD is passed into the call method and returns True if you want to keep the element in the new RDD, otherwise false javardd< integer> Evennumberrdd = numberrdd.filter (New Function<integer, boolean> () {private static final long ser Ialversionuid = 1L; @Override Public Boolean Call (Integer arg0) throws Exception {//TODO auto-generated method stub return arg0% 2 = = 0; } }); Evennumberrdd.foreach (New voidfunction<integer> () {private static final long serialversionuid = 1L; @Override public void Call (Integer arg0) throws Exception {System.out.println (arg0); } }); Sc.close ();} /** * FLATMAP Zi * Splits a line of text into a word */private static void Flatmaptest () {sparkconf conf = new sparkconf (). Setappname ( "Faltmap"). 
Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); List<string> linelist = arrays.aslist ("Hello", "Hello Me", "Hello World"); Javardd<string> lines = sc.parallelize (linelist); /* * Flatmap operator for RDD splits each line of text into multiple words * flatmap is actually receiving each element in the original RDD and performing various processing to return multiple elements, namely encapsulated in a new rdd in Iterable, which encapsulates all the new elements, With the new RDD size must be greater than the original RDD */javardd<string> words = Lines.flatmap (new flatmapfunction<string, string> () { Private static final long serialversionuid = 1L; @Override Public iterable<string> Call (String arg0) throws Exception {//TODO auto-generated Meth OD stub return Arrays.aslist (Arg0.split ("")); } }); Words.foreach (New voidfunction<string> () {private static final long serialversionuid = 1L; @Override public void Call (String arg0) throws Exception {//TODO auto-generated method stub System.out . println (ARG0); } }); Sc.close ();} /** * Groupbykey operator * Case: Grouping results by class */private static void Groupbykeytest () {sparkconf conf = new sparkconf () . Setappname ("Groupbykey"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); List<tuple2<string, integer>> scores = arrays.aslist (New tuple2<string, integer> ("cl Ass1 ",", New tuple2<string, integer> ("Class2", "page"), New tuple2<string, Int Eger> ("Class1"), New tuple2<string, integer> ("Class2", 65)); Create Javapairrdd javapairrdd<string, integer> Scoresrdd = Sc.parallelizepairs (scores); javapairrdd<string, iterable<integer>> groupscores = Scoresrdd.groupbykey (); Groupscores.foreach (New voidfunction<tuple2<String,iterable<integer>>> () {@Override public void call (Tuple2<string, iterable<integer& Gt;> arg0) throws Exception {//TODO auto-generated Method Stub System.out.println ("Class:" +arg0 . 
_1); Iterator<integer> it = Arg0._2.iterator (); while (It.hasnext ()) {System.out.println (It.next ()); } System.out.println ("===================================="); } }); Sc.close ();} /** * Reducebykey operator * Case: Ask each class total */private static void Reducebykeytest () {sparkconf conf = new sparkconf (). Setappna Me ("Reducebykey"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); List<tuple2<string, integer>> scores = arrays.aslist (New tuple2<string, integer> ("Class1", 80 ), New tuple2<string, integer> ("Class2", "page"), New tuple2<string, integer> ("Class1", 90), New Tuple2<string, INteger> ("Class2", 65)); javapairrdd<string, integer> Scoresrdd = Sc.parallelizepairs (scores); javapairrdd<string, integer> totalscores = Scoresrdd.reducebykey (new Function2<integer, Integer, Integer> () {private static final long serialversionuid = 1L; @Override public integer Call (integer arg0, integer arg1) throws Exception {//TODO auto-generated Meth OD stub return arg0+arg1; } }); Totalscores.foreach (New voidfunction<tuple2<string,integer>> () {@Override public void call (TUPL E2<string, integer> arg0) throws Exception {//TODO auto-generated method stub System.out.pri Ntln (arg0._1+ ":" +arg0._2); } }); Sc.close ();} /** * Sortbykey operator * Case study: Sort student scores */private static void Sortbykeytest () {sparkconf conf = new sparkconf (). Setappname ("Sortbykey"). 
Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); List<tUple2<integer, string>> scores = arrays.aslist (New Tuple2<integer, string> (Ten, "Leo"), New Tuple2<integer, string> ("KSC"), New Tuple2<integer, string> ("my"), new TUPL E2<integer, string> ("Jack")); Javapairrdd<integer, string> Scoresrdd = Sc.parallelizepairs (scores); Default true Ascending, False descending javapairrdd<integer, string> Sortedrdd = Scoresrdd.sortbykey (); Sortedrdd.foreach (New voidfunction<tuple2<integer,string>> () {@Override public void call (Tuple2 <integer, string> arg0) throws Exception {System.out.println (arg0._1+ ":" +arg0._2); } }); Sc.close ();} /** * Join * case: Print student results */private static void Jointest () {sparkconf conf = new sparkconf (). Setappnam E ("Joinandcogroup"). Setmaster ("local"); Javasparkcontext sc = new Javasparkcontext (conf); List<tuple2<integer, string>> StudeNtslist = arrays.aslist (New Tuple2<integer, string> (1, "Leo"), New Tuple2<integer, STRING&G t; (2, "Jack"), New Tuple2<integer, string> (3, "Tom")); List<tuple2<integer, integer>> scoreslist = arrays.aslist (New Tuple2<integer, Integer> (1,100 ), New Tuple2<integer, integer> (2,90), New Tuple2<integer, integer> (3,60)); Parallelization of two sets Javapairrdd<integer, string> Studentsrdd = Sc.parallelizepairs (studentslist); Javapairrdd<integer, integer> Scoresrdd = Sc.parallelizepairs (scoreslist); Use the join operator to correlate two Rdd Javapairrdd<integer, tuple2<string, integer>> studentscores = Studentsrdd.join ( SCORESRDD); Studentscores.foreach (New voidfunction<tuple2<integer,tuple2<string,integer>>> () {@Override public void Call (Tuple2<integer, tuple2<string, integer>> arg0) throws Exception { TODO auto-Generated method stub System.out.println ("Student ID:" +arg0._1); System.out.println ("Student Name:" +arg0._2._1); System.out.println ("Student score:" +arg0._2._2); System.out.println ("=========================================="); } });}
}
// Various transformation operators in Spark (Java edition)