In my opinion, an action operator in Spark programming acts like a trigger for the transformation operators that precede it. Transformations are lazily evaluated: defining one does not execute it, and all of the preceding transformations run only when an action operator is invoked. The common action operators are shown in the Java listing below.
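First, a minimal sketch of that laziness (it reuses the imports from the listing below; the names "lazyDemo" and "doubled" are illustrative, not from the original article):

    SparkConf conf = new SparkConf().setAppName("lazyDemo").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // map is a transformation: nothing runs when this line is evaluated
    JavaRDD<Integer> doubled = sc.parallelize(Arrays.asList(1, 2, 3))
            .map(new Function<Integer, Integer>() {
                @Override
                public Integer call(Integer v) throws Exception {
                    return v * 2;
                }
            });

    // count is an action: only now does Spark schedule a job and run the map above
    long n = doubled.count();    // n == 3
    sc.close();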
package cn.spark.study.core;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;

import scala.Tuple2;
/**
 * Action operators in practice
* @author DD
*
*/
public class ActionOperation {
    public static void main(String[] args) {
        reduceTest();
        collectTest();
        countTest();
        takeTest();
        countByKeyTest();
    }
    /**
     * Reduce operator
     * Case: summing a list of numbers
     */
    private static void reduceTest() {
        SparkConf conf = new SparkConf()
                .setAppName("reduce")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        JavaRDD<Integer> numbersRDD = sc.parallelize(numberList);

        // use the reduce operation to accumulate the numbers in the collection
        int sum = numbersRDD.reduce(new Function2<Integer, Integer, Integer>() {

            @Override
            public Integer call(Integer arg0, Integer arg1) throws Exception {
                return arg0 + arg1;
            }
        });

        System.out.println(sum);

        sc.close();
    }

    /**
     * Collect operator
     * Pulls the data on the cluster to the local machine for traversal (not recommended)
     */
    private static void collectTest() {
        SparkConf conf = new SparkConf()
                .setAppName("collect")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        JavaRDD<Integer> numbersRDD = sc.parallelize(numberList);

        JavaRDD<Integer> doubleNumbers = numbersRDD.map(new Function<Integer, Integer>() {

            @Override
            public Integer call(Integer arg0) throws Exception {
                return arg0 * 2;
            }
        });

        // The foreach action traverses the elements of the RDD on the remote cluster,
        // whereas collect pulls the RDD's distributed data back to the local machine.
        // collect is generally not recommended: if the RDD holds a lot of data (say,
        // more than 10,000 elements), performance will be poor because of the heavy
        // network transfer, and pulling the data to the local machine can even cause
        // an OOM (out-of-memory) exception. It is recommended to use the foreach
        // operation to process the final RDD instead (a sketch of that alternative
        // appears after this listing).
        List<Integer> doubleNumList = doubleNumbers.collect();
        for (Integer num : doubleNumList) {
            System.out.println(num);
        }
        sc.close();
    }

    /**
     * Count operator
     * Counts the number of elements in the RDD
     */
    private static void countTest() {
        SparkConf conf = new SparkConf()
                .setAppName("count")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        JavaRDD<Integer> numbersRDD = sc.parallelize(numberList);

        // use the count operation to count the number of elements in the RDD
        long count = numbersRDD.count();
        System.out.println(count);

        sc.close();
    }

    /**
     * Take operator
     * Pulls the first n elements of the remote RDD to the local machine
     */
    private static void takeTest() {
        SparkConf conf = new SparkConf()
                .setAppName("take")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        JavaRDD<Integer> numbersRDD = sc.parallelize(numberList);

        // The take operation is similar to collect in that it also fetches RDD data
        // from the remote cluster, but while collect gets all of the RDD's data,
        // take gets only the first n elements.
        List<Integer> top3Numbers = numbersRDD.take(3);
        for (Integer num : top3Numbers) {
            System.out.println(num);
        }
        sc.close();
    }

    /**
     * SaveAsTextFile operator
     */
    private static void saveAsTextFileTest() {
        SparkConf conf = new SparkConf()
                .setAppName("saveAsTextFile");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        JavaRDD<Integer> numbersRDD = sc.parallelize(numberList);

        JavaRDD<Integer> doubleNumbers = numbersRDD.map(new Function<Integer, Integer>() {

            @Override
            public Integer call(Integer arg0) throws Exception {
                return arg0 * 2;
            }
        });

        // The saveAsTextFile operator can store the data in the RDD directly in HDFS,
        // but we can only specify the directory to save to; the data is actually
        // written as the file double_number.txt/part-00000 under that directory.
        doubleNumbers.saveAsTextFile("hdfs://spark1:9000/double_number.txt");

        sc.close();
    }

    /**
     * CountByKey operator
     */
    private static void countByKeyTest() {
        SparkConf conf = new SparkConf()
                .setAppName("countByKey")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, String>> studentsList = Arrays.asList(
                new Tuple2<String, String>("class1", "leo"),
                new Tuple2<String, String>("class2", "jack"),
                new Tuple2<String, String>("class1", "marry"),
                new Tuple2<String, String>("class2", "tom"),
                new Tuple2<String, String>("class2", "david"));

        JavaPairRDD<String, String> studentsRDD = sc.parallelizePairs(studentsList);

        // The countByKey operator counts the number of elements for each key;
        // the type it returns is Map<String, Object>.
        Map<String, Object> studentsCounts = studentsRDD.countByKey();

        for (Map.Entry<String, Object> studentsCount : studentsCounts.entrySet()) {
            System.out.println(studentsCount.getKey() + ": " + studentsCount.getValue());
        }
        sc.close();
    }
}
Source: "Various action operator operations in Spark (Java edition)", http://blog.csdn.net/kongshuchen/article/details/51344124