Java code:
Note: do not bundle any dependency jars when packaging the application; the Spark and Hadoop jars are already available on the cluster at run time.
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public final class JavaWordCount {

    private static final Pattern SPACE = Pattern.compile(" ");

    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: JavaWordCount <input> <output>");
            System.exit(1);
        }

        SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);

        JavaRDD<String> lines = ctx.textFile(args[0], 1);

        // Split each line into words
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = -5362343741541430068L;

            @Override
            public Iterable<String> call(String s) {
                return Arrays.asList(SPACE.split(s));
            }
        });

        // Map each word to a (word, 1) pair
        JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
            }
        });

        // Sum the counts for each word
        JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 3338227964635666047L;

            @Override
            public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
            }
        });

        List<Tuple2<String, Integer>> output = counts.collect();
        for (Tuple2<String, Integer> tuple : output) {
            System.out.println(tuple._1() + ": " + tuple._2());
        }

        // Output mode 1: save as a plain text file:
        // counts.saveAsTextFile(args[1]);

        // Output mode 2: save to HDFS:
        counts.saveAsHadoopFile(args[1], Text.class, IntWritable.class, TextOutputFormat.class);

        ctx.stop();
    }
}
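Before submitting, the application jar and the input file need to be in place. A minimal preparation sketch, assuming a standard Maven build and an hdfs client configured for the cluster (the target location of the jar and the input path match the launch script below; the copy step is illustrative):

# Build a thin application jar (no dependency jars bundled, as noted above)
mvn clean package -DskipTests

# Copy the jar to the location referenced by the launch script
cp target/spark-example-1.0.0-SNAPSHOT.jar /usr/local/spark/

# Upload the test input file to HDFS
hdfs dfs -put 1.txt hdfs://cluster1:9000/tmp/1.txt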
Spark on YARN run script:

#!/bin/sh
echo $SPARK_JAR
spark-submit --class org.apache.spark.examples.JavaWordCount \
  --master yarn-client \
  --num-executors 3 \
  --driver-memory 400m \
  --executor-memory 500m \
  --executor-cores 1 \
  /usr/local/spark/spark-example-1.0.0-SNAPSHOT.jar \
  hdfs://cluster1:9000/tmp/1.txt \
  hdfs://cluster1:9000/out/spark/wordcount
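Once the job finishes, the counts written by saveAsHadoopFile can be inspected directly on HDFS. A quick check, assuming the hdfs client is on the PATH (note that saveAsHadoopFile fails if the output directory already exists, so remove it before re-running):

# List the output files produced by the job
hdfs dfs -ls hdfs://cluster1:9000/out/spark/wordcount

# Print the word counts (tab-separated key/value pairs written by TextOutputFormat)
hdfs dfs -cat hdfs://cluster1:9000/out/spark/wordcount/part-*

# Remove the output directory before re-running the job
hdfs dfs -rm -r hdfs://cluster1:9000/out/spark/wordcount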
Execution result:
Spark example: WordCount (modified version)