1 Packagestuspark.com;2 3 ImportScala. Tuple2;4 5 Importorg.apache.spark.SparkConf;6 ImportOrg.apache.spark.api.java.JavaPairRDD;7 ImportOrg.apache.spark.api.java.JavaRDD;8 ImportOrg.apache.spark.api.java.JavaSparkContext;9 Importorg.apache.spark.api.java.function.FlatMapFunction;Ten Importorg.apache.spark.api.java.function.Function; One ImportOrg.apache.spark.api.java.function.Function2; A Importorg.apache.spark.api.java.function.PairFunction; - ImportOrg.apache.spark.storage.StorageLevel; - the Importjava.util.Arrays; - ImportJava.util.Iterator; - Importjava.util.List; - ImportJava.util.regex.Pattern; + - Public Final classJavawordcount { + Private Static FinalPattern SPACE = Pattern.compile (""); A //the Pattern object is a compiled representation of a regular expression at //The compile () method means that the regular expression RegExp is compiled, returning regexp the compiled pattern - - Public Static voidMain (string[] args)throwsException { - - //the file represents the local path, and vice versa represents the HDFs path -String FilePath = "File:\\e:\\test.txt"; in -sparkconf sparkconf =NewSparkconf (). Setappname ("Javawordcount") to. Setmaster ("local[2]"); + //Set the program name setting local mode -Javasparkcontext CTX =NewJavasparkcontext (sparkconf); the //Creating a Javasparkcontext object instance SC * $Javardd<string> lines = Ctx.textfile (FilePath, 1);Panax Notoginseng //convert sc.parallelize directly from a collection (List (1,2,3,4,5,6,7,8,9,10)) - //convert sc.textfile from HDFs file ("HDFs://") the //convert from local file to Sc.textfile ("file:/") + A Lines.cache (); the lines.persist (Storagelevel.memory_only ()); + //Persistent Rdd - /* $ * The cache () method means that the data of the RDD is persisted to memory in a non-serialized manner, $ * The cache is a transformtion, is lazy and must be triggered by an action, - * In order to actually cache the RDD into memory. 
- * the * persist () method means: Manually select the persistence level and persist using the specified method - * disk_only: DiskWuyi disk_only_2: disk; double copy the memory_only: Memory, deserialization, store the RDD as a deserialization, and if the contents of the Rdd do not exist, the remaining partitions will be recalculated at a later time and will not be brushed to disk. - memory_only_2: Memory, deserialization, double copy Wu Memory_only_ser: Memory, serialization, this serialization, each partition in byte data storage, the benefit is to bring better space storage, but the CPU is expensive - memory_only_ser_2: Memory; serialization; double Copy About memory_and_disk: Memory + disk, deserialization, double copy, Rdd to deserialize memory, if the contents of the RDD can not be saved, the remaining will be saved to disk $ memory_and_disk_2: Memory + disk; deserialization; double copy - memory_and_disk_ser: Memory + disk; serialization - memory_and_disk_ser_2: Memory + disk; serialization; double Copy - * */ A + the //parallelization Collections - //an important parameter in a parallel array is partitions, which describes the number of data sets that the array is cut. Spark will run a task on every partitions $list<integer> data = arrays.aslist (1, 2, 3, 4, 5); the //Array connection list, when updating one of them, another automatic update theJavardd<integer> Distdata =ctx.parallelize (data); the //Distributing local Scala collections to form an RDD the //Initializes a collection that already exists - in //The filter () parameter is a function that filters out elements that are not eligible, and the return value is a new Rdd theLines.filter (NewFunction<string,boolean>(){ the About PublicBoolean Call (String arg0)throwsException { the //TODO auto-generated Method Stub the return NULL; the } + - }); the //The map parameter is a function that applies to each of the RDD elements, and the return value is the new RddBayiJavardd<integer> linelengths =Lines the. 
Map (NewFunction<string, integer>() { the PublicInteger Call (String s) { - returns.length (); - } the }); the //reduce aggregates, but the incoming function is two parameter inputs that return a value that must satisfy the Exchange law and the binding law the intTotallength =linelengths the. Reduce (NewFunction2<integer, Integer, integer>() { - Publicinteger Call (integer A, integer b) { the returnA +b; the } the });94 //Flatmap is similar to map, but Flatmap produces multiple results thejavardd<string> words =Lines the. FLATMAP (NewFlatmapfunction<string, string>() { the //of the iterable iteration98 PublicIterable<string>Call (String s) { About returnArrays.aslist (Space.split (s)); - }101 });102 //Maptopair to save collection data as key value103javapairrdd<string, integer> ones =words104. Maptopair (NewPairfunction<string, String, integer>() { the PublicTuple2<string, integer>Call (String s) {106 return NewTuple2<string, Integer> (S, 1);107 }108 });109 //Reducebykey The value according to the key aggregation theJavapairrdd<string, integer> counts =ones111. Reducebykey (NewFunction2<integer, Integer, integer>() { the Publicinteger Call (integer i1, integer i2) {113 returnI1 +I2; the } the }); the //Collect package returns an array117list<tuple2<string, integer>> output =Counts.collect ();118 for(tuple2<?,? >tuple:output) {119System.out.println (tuple._1 () + ":" +tuple._2 ()); - }121 ctx.stop ();122 }123}
This is a simple Java project; the jar packages it requires are available from the Baidu network disk below.
Link: https://pan.baidu.com/s/1jqWwBBNIm1kbQoFSCppEZQ Password: Y4XR
Study notes — getting started with Spark development in Java.