1. Adding the Maven dependencies
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.irt.rootcloud</groupId>
  <artifactId>beam-spark</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>beam-spark</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <spark.version>1.6.2</spark.version>
  </properties>

  <dependencies>
    <!-- Beam Java SDK core. -->
    <dependency>
      <groupId>org.apache.beam</groupId>
      <artifactId>beam-sdks-java-core</artifactId>
      <version>0.4.0</version>
    </dependency>
    <!-- Beam Spark runner, used to execute the pipeline on Spark. -->
    <dependency>
      <groupId>org.apache.beam</groupId>
      <artifactId>beam-runners-spark</artifactId>
      <version>0.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <!-- Required by the Spark runner for Scala/Jackson interop. -->
    <dependency>
      <groupId>com.fasterxml.jackson.module</groupId>
      <artifactId>jackson-module-scala_2.10</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Build a fat ("shaded") jar so the pipeline can be spark-submitted
           with all of its dependencies bundled. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <configuration>
          <createDependencyReducedPom>false</createDependencyReducedPom>
          <filters>
            <filter>
              <artifact>*:*</artifact>
              <!-- Strip signature files from dependency jars; stale signatures
                   in a merged jar cause a SecurityException at runtime. -->
              <excludes>
                <exclude>META-INF/*.SF</exclude>
                <exclude>META-INF/*.DSA</exclude>
                <exclude>META-INF/*.RSA</exclude>
              </excludes>
            </filter>
          </filters>
        </configuration>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <shadedArtifactAttached>true</shadedArtifactAttached>
              <shadedClassifierName>shaded</shadedClassifierName>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
2. Writing the WordCount test code
Import Org.apache.beam.runners.spark.SparkRunner;
Import Org.apache.beam.sdk.Pipeline;
Import Org.apache.beam.sdk.io.TextIO;
Import Org.apache.beam.sdk.options.Default;
Import org.apache.beam.sdk.options.Description;
Import org.apache.beam.sdk.options.PipelineOptions;
Import Org.apache.beam.sdk.options.PipelineOptionsFactory;
Import org.apache.beam.sdk.options.Validation.Required;
Import Org.apache.beam.sdk.runners.PipelineRunner;
Import Org.apache.beam.sdk.transforms.Aggregator;
Import Org.apache.beam.sdk.transforms.Count;
Import Org.apache.beam.sdk.transforms.DoFn;
Import org.apache.beam.sdk.transforms.MapElements;
Import Org.apache.beam.sdk.transforms.PTransform;
Import Org.apache.beam.sdk.transforms.ParDo;
Import org.apache.beam.sdk.transforms.SimpleFunction;
Import Org.apache.beam.sdk.transforms.Sum;
Import Org.apache.beam.sdk.values.KV;
Import org.apache.beam.sdk.values.PCollection;
Import Com.codahale.metrics.DerivativeGauge; /** * An example, counts words in Shakespeare and includes Beam best practices. * * <p>this class, {@link WordCount}, is the second in a series of four successively more detailed * ' Word count ' Examples.
You could first want to take a look at {@link Minimalwordcount}. * After your ' ve looked at this example and then see the {@link debuggingwordcount} * pipeline, for introduction of additional
Concepts. * * <p>for A detailed walkthrough of this example, see * <a href= "http://beam.apache.org/use/walkthroughs/" &G
T * http://beam.apache.org/use/walkthroughs/* </a> * * <p>basic concepts, also in the Minimalwordcount ex Ample: * Reading text files; Counting a pcollection; Writing to text files * * <p>new Concepts: * <pre> * 1. Executing a Pipeline both locally and using the selected Runner * 2. Using ParDo with static Dofns defined Out-of-line * 3. Building a composite transform * 4. Defining your own pipeline options * </pre> * <p>concept #1: You can ExecuTe this pipeline either locally or using by selecting another runner.
* These is now command-line options and isn't hard-coded as they were in the minimalwordcount * example.
* * <p>to Change the runner, specify: * <pre>{@code *--runner=your_selected_runner *} * </pre> * * <p>to Execute this pipeline, specify a local output file (if using the * {@code directrunner}) or output Prefi
X on a supported distributed file system. * <pre>{@code *--output=[your_local_file | Your_output_prefix] *}</pre> * * <p>the input file defaults to a public data set containing the text of King Lear, * by William Shakespeare.
You can override it and choose your own input with {@code--inputfile}. */public class WordCount {/** * Concept #2: A can make your pipeline assembly code less verbose by defining your DOFNS * Statically out-of-line. This dofn tokenizes lines of the text into individual words; We pass it * to a ParDo in thePipeline. */Static Class EXTRACTWORDSFN extends Dofn<string, string> {private final aggregator<long, long> empty
Lines = Createaggregator ("Emptylines", New Sum.sumlongfn ()); @ProcessElement public void Processelement (Processcontext c) {if (C.element (). Trim (). IsEmpty ()) {Empty
Lines.addvalue (1L);
}//Split the line into words.
string[] Words = C.element (). Split ("[^a-za-z ']+");
Output each of the word encountered into the output pcollection.
for (String word:words) {if (!word.isempty ()) {c.output (word); }}}}/** a simplefunction that converts a Word and Count into a printable string. */public static class Formatastextfn extends Simplefunction<kv<string, long>, string> {@Override p
Ublic String Apply (kv<string, long> input) {return Input.getkey () + ":" + input.getvalue (); }}/** * A ptransform that converts a Pcollection containing lines of text into a pcollection of * formatted word counts. * * <p>concept #3: This is a custom composite transform that bundles the transforms (ParDo and * Count) as a R Eusable Ptransform Subclass.
Using Composite transforms allows for easy reuse, * Modular testing, and an improved monitoring experience. */public static class CountWords extends Ptransform<pcollection<string>, pcollection<kv<string, Lo ng>>> {@Override public pcollection<kv<string, long>> expand (pcollection<string> lines
) {//Convert lines of text into individual words.
pcollection<string> words = lines.apply (Pardo.of (New EXTRACTWORDSFN ()));
Count the number of times each word occurs.
pcollection<kv<string, long>> wordcounts = words.apply (Count.<string>perelement ());
return wordcounts; }}/** * Options supported by {@link WORDCOunt}. * * <p>concept #4: Defining your own configuration options. Here, you can add your own arguments * to is processed by the command-line parser, and specify default values for them.
You can then * access the options values in your pipeline code.
* * <p>inherits standard configuration options. */public Interface Wordcountoptions extends Pipelineoptions {/** * By default, this example reads from a pub LIC DataSet containing the text of * King Lear.
Set This option to choose a different input file or glob.
*/@Description ("Path of the file to read from") @Default. String ("D:\\dubbo.xsd") string Getinputfile ();
void Setinputfile (String value);
/** * Set This required option to specify where to write the output.
*/@Description ("Path of the file to write to") @Required String getoutput ();
void Setoutput (String value); } public static void Main (string[] args) {args=new string[]{"--output=d:\\apache-beam-workdcount.txt", "--runner=sparkrunner", "--sparkmaster=local[4]"};
Wordcountoptions options = Pipelineoptionsfactory.fromargs (args). Withvalidation (). as (Wordcountoptions.class);
Pipeline p = pipeline.create (options); Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the/static FORMATASTEXTF
N () to the ParDo transform. P.apply ("ReadLines", TextIO.Read.from (Options.getinputfile ())). Apply (new CountWords ()). Apply (Mapelements.via (n
EW FORMATASTEXTFN ()). Apply ("Writecounts", TextIO.Write.to (Options.getoutput ()));
P.run (). Waituntilfinish (); }
}
3. Note
args = new String[]{"--output=D:\\apache-beam-wordcount.txt", "--runner=SparkRunner", "--sparkMaster=local[4]"};
This line of code is only a convenience for local testing, where the parameters are assigned manually. When the job is actually submitted to a Spark cluster, this line is not needed and should be removed; instead, specify the parameters on the spark-submit command line, for example:
spark-submit --class com.irt.rootcloud.beam.spark.WordCount --master spark://host:port target/beam-examples-1.0.0-shaded.jar --runner=SparkRunner