package com.yl.wordcount

import java.io.File

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.Iterator
import scala.io.Source
/**
 * Word count over a text file that skips stop words and saves the resulting
 * (count, word) pairs ordered by descending count.
 */
object Wordcountstopwords {

  /**
   * Reads the stop-word file, one entry per line.
   *
   * @param file the stop-word file
   * @return the distinct lines of `file`, or an empty set when the file does not exist
   */
  private def loadStopWords(file: File): Set[String] =
    if (!file.exists()) {
      // A missing stop-word file means "filter nothing" rather than a null dereference.
      Set.empty[String]
    } else {
      val source = Source.fromFile(file)
      // getLines is lazy, so the lines must be materialised before the source is closed.
      try source.getLines().toSet
      finally source.close()
    }

  /**
   * Runs the job: counts the words of the input file, drops empty tokens and
   * stop words, and saves the (count, word) pairs sorted by descending count.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the paths below are reproduced verbatim from the original;
    // confirm their casing before running on a case-sensitive file system.
    val conf = new SparkConf().setMaster("spark://localhost:7077").setAppName("WordCount")
    val sc = new SparkContext(conf)
    try {
      val outFile = "/users/admin/spark/sparkoutput"
      val stopWordsFile = new File("/users/admin/src" + "/tingyongci.txt")
      // A Set gives constant-time membership tests inside the filter closure.
      val stopWords = loadStopWords(stopWordsFile)

      val textFile = sc.textFile("/users/admin/spark/spark-1.5.1-bin-hadoop2.4/readme.md")
      val result = textFile
        // Split on whitespace; splitting on "" would yield single characters, not words.
        .flatMap(_.split("\\s+"))
        .filter(word => !word.isEmpty && !stopWords.contains(word))
        .map(word => (word, 1))
        .reduceByKey(_ + _)
        // Swap to (count, word) so that sortByKey orders by count.
        .map(_.swap)
        .sortByKey(ascending = false)

      result.saveAsTextFile(outFile)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
// III. Spark Getting Started: find the 5 most frequently used words in a text, excluding common stop words