Spark Java Sample Code WordCount

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * WordCount program developed in Java, for testing Spark locally.
 *
 * @author DT Big Data Dream Factory
 *         http://weibo.com/ilovepains
 */
public class WordCountSpk {

    public static void main(String[] args) {
        /*
         * Step 1: Create a SparkConf object that holds the runtime configuration of the
         * Spark program. For example, setMaster sets the URL of the master of the Spark
         * cluster the program connects to; setting it to "local" runs the program
         * locally, which is especially suitable for beginners on machines with very
         * modest resources (e.g. only 1 GB of memory).
         */
        SparkConf conf = new SparkConf().setAppName("Spark WordCount written by Java").setMaster("local");

        /*
         * Step 2: Create a JavaSparkContext.
         * SparkContext is the sole entry point to all Spark functionality: whether the
         * program is written in Scala, Java, Python or R, it must have a SparkContext
         * (the concrete class name differs by language; in Java it is JavaSparkContext).
         * Its core role is to initialize the components the application needs to run,
         * including DAGScheduler, TaskScheduler and SchedulerBackend; it is also
         * responsible for registering the program with the Master. SparkContext is the
         * most important object in the entire Spark application.
         */
        JavaSparkContext sc = new JavaSparkContext(conf); // under the hood this wraps Scala's SparkContext

        /*
         * Step 3: Create a JavaRDD from a concrete data source (HDFS, HBase, local FS,
         * DB, S3, etc.) through the JavaSparkContext. There are three ways to create an
         * RDD: from an external data source (such as HDFS), from a Scala collection, or
         * by transforming another RDD. The RDD divides the data into a series of
         * partitions, and the data assigned to each partition is the processing scope
         * of one task.
         * Note: on Windows the file path must not use backslashes (\); use forward
         * slashes as on Linux.
         */
        JavaRDD<String> lines = sc.textFile("D:/hu.txt");

        /*
         * Step 4: Apply transformation-level processing (higher-order functions such as
         * map and filter) to the initial JavaRDD to perform the actual computation.
         * Step 4.1: split each line into individual words.
         */
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            // In Scala, SAM conversion makes this a one-liner:
            //   val words = lines.flatMap { line => line.split(" ") }
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" "));
            }
        });

        /*
         * Step 4.2: on the basis of the word split, count each word instance as 1,
         * i.e. word => (word, 1).
         */
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        /*
         * Step 4.3: sum the per-instance counts to get the total number of occurrences
         * of each word in the file.
         */
        JavaPairRDD<String, Integer> wordsCount = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            // For the same key, accumulate the values (the reduction happens both
            // locally on the map side and at the reducer level).
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        wordsCount.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            public void call(Tuple2<String, Integer> pair) throws Exception {
                System.out.println(pair._1 + " : " + pair._2);
            }
        });

        sc.close();
    }
}
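The comments above note that Scala's SAM conversion turns each of these transformations into a one-liner; with Java 8 lambdas the Java version becomes just as compact. Below is a minimal sketch of the same job (the class name WordCountLambda is ours), assuming Java 8 and the Spark 1.x API used above, where FlatMapFunction.call returns an Iterable; on Spark 2.x it returns an Iterator, so you would append .iterator() to the Arrays.asList(...) call.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class WordCountLambda {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Spark WordCount written by Java 8").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<String, Integer> wordsCount = sc.textFile("D:/hu.txt")
                // split each line into words (Spark 2.x: append .iterator())
                .flatMap(line -> Arrays.asList(line.split(" ")))
                // word => (word, 1)
                .mapToPair(word -> new Tuple2<>(word, 1))
                // sum the counts per word
                .reduceByKey((v1, v2) -> v1 + v2);

        wordsCount.foreach(pair -> System.out.println(pair._1 + " : " + pair._2));

        sc.close();
    }
}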
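For concreteness, here is a worked run with a hypothetical input file; either version of the program behaves the same. If D:/hu.txt contains:

spark is fast
spark is simple

the job prints one "word : count" line per distinct word, for example:

spark : 2
is : 2
fast : 1
simple : 1

The ordering is not guaranteed, since the pairs are spread across partitions. Note also that foreach executes on the executors; its println output lands in the driver's console here only because the master is "local". On a real cluster you would typically collect() the result to the driver, or persist it with saveAsTextFile, rather than print from foreach.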