Command:
java -jar myspark-1.0-SNAPSHOT.jar myspark-1.0-SNAPSHOT.jar hdfs://single:9000/input/word.txt hdfs://single:9000/output/out1
Error message:
..........
14/11/23 06:14:18 INFO SparkDeploySchedulerBackend: Granted executor ID app-20141123061418-0011/0 on hostPort single:8091 with 8 cores, 200.0 MB RAM
14/11/23 06:14:18 INFO AppClient$ClientActor: Executor updated: app-20141123061418-0011/0 is now RUNNING
Exception in thread "main" java.io.IOException: No FileSystem for scheme: hdfs
    at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2421)
    at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2428)
    at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:88)
    at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2467)
    at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2449)
    at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:367)
    at org.apache.hadoop.fs.Path.getFileSystem(Path.java:287)
    at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:221)
    at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:270)
    at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:140)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:207)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:205)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:205)
    at org.apache.spark.rdd.MappedRDD.getPartitions(MappedRDD.scala:28)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:207)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:205)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:205)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:898)
    at org.apache.spark.rdd.RDD.count(RDD.scala:726)
    at youling.studio.Main$.main(Main.scala:33)
    at youling.studio.Main.main(Main.scala)
Scala code:
package youling.studio

import org.apache.spark.SparkContext._
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer

/**
 * Word-count driver that is launched directly with `java -jar`.
 *
 * Expects exactly three arguments:
 *   - args(0): comma-separated list of jar paths handed to `SparkConf.setJars`
 *   - args(1): input path read with `SparkContext.textFile`
 *   - args(2): output path written with `saveAsTextFile`
 *
 * Created by Administrator on 2014/11/23.
 */
object Main {

  /**
   * Counts the lines of the input, prints that count, then writes the
   * (word, count) pairs ordered by descending count to the output path.
   *
   * @param args jars, input path and output path, in that order
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      // Three arguments are required, so the usage line names all three.
      println("CMD: java -jar *.jar <jars> <input> <output>")
      // A usage error is a failure; report it with a non-zero status.
      System.exit(1)
    }

    // Collect the jar list without abusing `map` for its side effect.
    val jars = ListBuffer[String]()
    jars ++= args(0).split(',')

    val conf = new SparkConf()
      .setMaster("spark://single:8081")
      .setSparkHome("/cloud/spark-0.9.1-bin-hadoop2")
      .setAppName("word count")
      .setJars(jars)
      // Configuration keys are case-sensitive: the key must start with lowercase "spark.".
      .set("spark.executor.memory", "200m")

    val sc = new SparkContext(conf)
    try {
      val data = sc.textFile(args(1))
      // The RDD is used twice (count + word count), so keep it cached.
      data.cache()
      println(data.count())

      data
        .flatMap(_.split(" "))
        .map(word => (word, 1))
        .reduceByKey(_ + _)
        // Swap to (count, word) so that sortByKey orders by the count.
        .map { case (word, count) => (count, word) }
        .sortByKey(ascending = false)
        // Swap back to (word, count) for the output.
        // NOTE(review): the published listing is garbled at this point
        // ("x=>, x._1"); this swap is the presumed original - confirm.
        .map { case (count, word) => (word, count) }
        .saveAsTextFile(args(2))
    } finally {
      // Release the connection to the cluster even if the job fails.
      sc.stop()
    }
  }
}
Cause: when the jar built from the IntelliJ IDEA (Maven) project is run, no file system implementation for the hdfs:// scheme can be found at runtime.
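Workaround: modify the Maven project's pom.xml and manually add the section that was highlighted in red in the original post (the highlighting is lost in this copy; it is presumably the maven-shade-plugin transformer for META-INF/services/org.apache.hadoop.fs.FileSystem).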
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <!-- Builds the Spark word-count driver as a single shaded (uber) jar that can be started with java -jar. -->
  <modelVersion>4.0.0</modelVersion>
  <groupId>youling.studio.spark</groupId>
  <artifactId>myspark</artifactId>
  <version>1.0-SNAPSHOT</version>
  <inceptionYear>2008</inceptionYear>
  <properties>
    <scala.version>2.10.3</scala.version>
  </properties>
  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>scala-tools Maven2 repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
  </repositories>
  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>scala-tools Maven2 repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>
  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>0.9.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>0.9.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.2.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.2.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.2.0</version>
    </dependency>
  </dependencies>
  <build>
    <sourceDirectory>src/main/</sourceDirectory>
    <testSourceDirectory>src/test/</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <!-- Reuse the scala.version property so the compiler and the scala-library dependency cannot drift apart. -->
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.2</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <!-- Drop signature files of signed dependencies; paths are case-sensitive. -->
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
              <transformers>
                <!-- Concatenate every reference.conf found in the dependencies instead of keeping only one. -->
                <transformer
                    implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                  <resource>reference.conf</resource>
                </transformer>
                <!-- Make the shaded jar runnable with java -jar. -->
                <transformer
                    implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <manifestEntries>
                    <Main-Class>youling.studio.Main</Main-Class>
                  </manifestEntries>
                </transformer>
                <!-- The workaround for "No FileSystem for scheme: hdfs": append the FileSystem service
                     registration files of all dependencies rather than letting one overwrite the others. -->
                <transformer
                    implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                  <resource>META-INF/services/org.apache.hadoop.fs.FileSystem</resource>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>
Spark program exception: Exception in thread "main" java.io.IOException: No FileSystem for scheme: hdfs