1.
val lines = sc.textFile("hdfs://")                          // loaded in as an RDD (Resilient Distributed Dataset)
val errors = lines.filter(_.startsWith("ERROR"))            // transformation
errors.persist()                                            // cache the RDD
val mysql_error = errors.filter(_.contains("MySQL")).count  // action
val http_error = errors.filter(_.contains("http")).count    // action
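A small sketch to make the transformation/action split above concrete (it reuses the names above; the last filter string is made up): transformations only record lineage, nothing is read from HDFS until the first action runs, and persist() lets later actions reuse the cached errors RDD instead of re-reading the file.
lines.filter(_.startsWith("ERROR"))          // transformation: nothing executed yet, only lineage recorded
errors.count                                 // action: triggers the read + filter, and caches errors because of persist()
errors.filter(_.contains("timeout")).count   // another action: reuses the cached errors RDD (filter string is illustrative)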
2.
map returns one output element per input element (so mapping a split gives one array per line), while flatMap flattens those arrays into a single sequence of elements; see the sketch after this item.
val rdd = sc.parallelize(List(2,4,6,7,8))   // initialize the RDD
val rdd1 = rdd.map(2 * _)                   // multiply each element by 2
rdd1.collect                                // show rdd1
A more experienced user would chain it in one line: val rdd_1 = sc.parallelize(List(3,4,6,8,9)).map(3 * _).filter(_ > 20).collect
// val rdd_count = rdd_1.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _)
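A quick spark-shell sketch of the map/flatMap difference described in this item (the input strings are made up for illustration):
val lines = sc.parallelize(List("a,b", "c,d,e"))
lines.map(_.split(",")).collect       // Array(Array(a, b), Array(c, d, e)) -- one array per line
lines.flatMap(_.split(",")).collect   // Array(a, b, c, d, e)               -- the arrays flattened into one sequence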
3. val s2 = sc.textFile("/luo/s1.txt").flatMap(line => line.split(",")).map(word => (word, 1)).reduceByKey(_ + _)   // the path is an HDFS path
s2.saveAsTextFile("/luo/result")
4. val s1 = sc.textFile("/luo/s1.txt")
val rdd_count1 = s1.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _)
val rdd_count2 = rdd_count1.groupByKey   // group by key
rdd_count2.count   // how many records there are
val rdd_count3 = s1.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).map(x => (x._2, x._1)).sortByKey(false).map(x => (x._2, x._1)).saveAsTextFile("/luo/result1")   // swap to (count, word), sort descending, swap back, then save
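As a possible alternative to the swap-sort-swap pattern above, assuming a Spark version (1.2+) where RDD.sortBy exists, the counts can be sorted directly (the output path here is hypothetical):
s1.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _)
  .sortBy(_._2, ascending = false)         // sort by the count field, descending
  .saveAsTextFile("/luo/result1_sorted")   // hypothetical output path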
5. val rd1 = sc.parallelize(List(('a', 1), ('a', 2))); val rd2 = sc.parallelize(List(('b', 1), ('b', 2)))
val rd3 = rd1 union rd2   // result: Array[(Char, Int)] = Array((a,1), (a,2), (b,1), (b,2))
rd1.lookup('a')           // returns the values for key 'a'
6. val r1 = sc.parallelize(List(('a', 3), ('a', 5), ('b', 6), ('b', 9))); val r2 = sc.parallelize(List(('a', 6), ('a', 12), ('b', 23), ('b', 34)))
val r3 = r1 join r2   // result: Array[(Char, (Int, Int))] = Array((b,(6,23)), (b,(6,34)), (b,(9,23)), (b,(9,34)), (a,(3,6)), (a,(3,12)), (a,(5,6)), (a,(5,12))); i.e. for each key, the Cartesian product of its values on the two sides
r1.lookup('a')
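For contrast with join, a sketch of cogroup on the same r1/r2: instead of emitting the per-key Cartesian pairs, it groups all values for each key on each side (output shown approximately):
val r4 = r1 cogroup r2
r4.collect   // roughly: Array((a,(Seq(3, 5),Seq(6, 12))), (b,(Seq(6, 9),Seq(23, 34))))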
7. val sum = sc.parallelize(List(1,2,4,5,65))
val sum1 = sum.reduce(_ + _)   // summation
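The next item uses an RDD called data that is never defined in these notes; presumably it was built from a tab-separated text file along these lines (the path and layout are hypothetical):
val data = sc.textFile("/luo/data.txt")   // hypothetical tab-separated input, e.g. "date\tfield\tvalue"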
8. data.map(_.split('\t')(0)).filter(_ < "201202012").count   // (0) means take the first element
data.filter(_.split('\t').length == 3).map(_.split('\t')(1)).map((_, 1)).reduceByKey(_ + _).map(x => (x._2, x._1)).sortByKey(false).map(x => (x._2, x._1)).saveAsTextFile("/luo")   // after reduceByKey each record is a (key, count) tuple
9. val rdd = sc.parallelize(List(2,4,6,7,8))
val evenfiy = (x: Int) => if (x % 2 == 0) x else None   // maps odd numbers to None
10. spark-submit --master spark://127.0.0.1:7077 --name wordcountbyscala --class spark.wordcount --executor-memory 1G --total-executor-cores 2 /home/luozt/spark/spark08241.jar /home/luozt/spark/readme.md /home/luozt/spark_data
val tt = rdd.map(evenfiy)   // tt becomes an RDD[Any]: even numbers stay as Int, odd ones become None
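Because the if/else above mixes Int and None, tt ends up as RDD[Any]; a cleaner sketch of the same idea returns an Option and flattens it away:
val evenOpt = (x: Int) => if (x % 2 == 0) Some(x) else None   // keep evens, drop odds
rdd.flatMap(x => evenOpt(x)).collect                          // Array(2, 4, 6, 8) -- the None values disappear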
10. val list = "hello.world".toCharArray
val list1 = (1 to 10).toList
list.zip(list1)               // pairs elements up; stops at the shorter collection (10 pairs)
list.zipAll(list1, 'q', 2)    // pads the shorter side: list1 is padded with 2, giving 11 pairs
list.zipWithIndex             // pairs each character with its index, starting from 0
Spark SQL
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
import hiveContext._
hiveContext.sql("show tables").take(10)   // fetch the first ten tables to have a look
1. val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext._
case class Person(name: String, age: Int)
val people = data.map(_.split(",")).map(p => Person(p(0), p(1).toInt))
people.saveAsParquetFile("/luo/luo.parquet")              // write
val par = hiveContext.parquetFile("/luo/luo.parquet")     // read
par.registerAsTable("par")
val ps = sql("select name from par where age > 20")
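A hedged follow-up sketch (old Spark SQL 1.x API, as in the lines above): the query result is a SchemaRDD, so ordinary RDD operations work on it, with each row's columns accessed by position:
ps.map(row => "name: " + row(0)).collect.foreach(println)   // row(0) is the name column selected above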
2. case class Word(wid: Int, aid: Int, times: Int)
val wo = sc.textFile("/luo/aa.txt").map(_.split('\t')).filter(_.length == 3).map(w => Word(w(0).toInt, w(1).toInt, w(2).toInt))
wo.registerAsTable("wor")
sql("select * from wor")
3. DSL
'wid refers to the wid column
e.g.: val rs = wo.where('times > 50).where('wid > 2).select('aid).limit(10)
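For comparison, a sketch of the same query in plain SQL, assuming the data was registered as the table wor as in item 2:
val rs2 = sql("select aid from wor where times > 50 and wid > 2 limit 10")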
4. Spark operating on Hive
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
import hiveContext._
hql("show tables")
hql("select * from words where times > 50 limit").collect
Spark-shell Study Notes