package rdd

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by Legotime on 2016/5/5.
 */
object PairRDD {

  def myfunc1(index: Int, iter: Iterator[String]): Iterator[String] = {
    iter.toList.map(x => "[partID:" + index + ", val:" + x + "]").iterator
  }

  def myfunc2(index: Int, iter: Iterator[(Int, String)]): Iterator[String] = {
    iter.toList.map(x => "[partID:" + index + ", val:" + x + "]").iterator
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("pair RDD").setMaster("local")
    val sc = new SparkContext(conf)

    val singleRDD = sc.parallelize(List("scala", "python", "java", "spark", "hadoop"), 2)
    singleRDD.mapPartitionsWithIndex(myfunc1).collect.foreach(println)
    //[partID:0, val:scala]
    //[partID:0, val:python]
    //[partID:1, val:java]
    //[partID:1, val:spark]
    //[partID:1, val:hadoop]

    //----------------------------- single PairRDD -----------------------------
    val pairRDD = singleRDD.map(x => (x.length, x))
    pairRDD.mapPartitionsWithIndex(myfunc2).collect.foreach(println)
    //[partID:0, val:(5,scala)]
    //[partID:0, val:(6,python)]
    //[partID:1, val:(4,java)]
    //[partID:1, val:(5,spark)]
    //[partID:1, val:(6,hadoop)]

    // merge values with the same key: values are first combined locally within
    // each partition, then the partial results are merged across partitions
    pairRDD.reduceByKey(_ + _).collect().foreach(println)
    //(4,java)
    //(6,pythonhadoop)
    //(5,scalaspark)

    // group values with the same key
    pairRDD.groupByKey().collect.foreach(println)
    //(4,CompactBuffer(java))
    //(6,CompactBuffer(python, hadoop))
    //(5,CompactBuffer(scala, spark))

    // apply a function to each value, leaving the keys untouched
    pairRDD.mapValues(x => "I am " + x).collect.foreach(println)
    //(5,I am scala)
    //(6,I am python)
    //(4,I am java)
    //(5,I am spark)
    //(6,I am hadoop)

    // flatMapValues flattens each mapped value; a String flattens into its characters
    pairRDD.flatMapValues(x => "I am " + x).collect.foreach(print)
    //(5,I)(5, )(5,a)(5,m)(5, )(5,s)(5,c)(5,a)(5,l)(5,a)(6,I)(6, )(6,a)(6,m)(6, )(6,p)(6,y)(6,t)(6,h)(6,o)(6,n)(4,I)(4, )(4,a)(4,m)(4, )(4,j)(4,a)(4,v)(4,a)(5,I)(5, )(5,a)(5,m)(5, )(5,s)(5,p)(5,a)(5,r)(5,k)(6,I)(6, )(6,a)(6,m)(6, )(6,h)(6,a)(6,d)(6,o)(6,o)(6,p)

    pairRDD.keys.collect.foreach(println)
    pairRDD.values.collect.foreach(println)

    pairRDD.sortByKey().collect.foreach(println)
    //(4,java)
    //(5,scala)
    //(5,spark)
    //(6,python)
    //(6,hadoop)

    //----------------------------- two PairRDDs -----------------------------
    val tempPairRDD = sc.parallelize(List((5, "Flink")))
    tempPairRDD.collect.foreach(println)

    pairRDD.subtract(tempPairRDD).mapPartitionsWithIndex(myfunc2).collect.foreach(println)

    pairRDD.join(tempPairRDD).collect.foreach(println)

    // every key of pairRDD is kept; keys with no match in tempPairRDD map to None (left outer join)
    pairRDD.leftOuterJoin(tempPairRDD).collect.foreach(println)
    //(4,(java,None))
    //(6,(python,None))
    //(6,(hadoop,None))
    //(5,(scala,Some(Flink)))
    //(5,(spark,Some(Flink)))

    // every key of tempPairRDD is kept; keys with no match in pairRDD would map to None (right outer join)
    pairRDD.rightOuterJoin(tempPairRDD).collect.foreach(println)
    //(5,(Some(scala),Flink))
    //(5,(Some(spark),Flink))

    pairRDD.cogroup(tempPairRDD).collect.foreach(println)
    //(4,(CompactBuffer(java),CompactBuffer()))
    //(6,(CompactBuffer(python, hadoop),CompactBuffer()))
    //(5,(CompactBuffer(scala, spark),CompactBuffer(Flink)))
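    // Hedged aside, not in the original listing: join can be expressed as
    // cogroup followed by flattening the two value buffers, which shows why
    // the three cogroup results above line up with the join results.
    pairRDD.cogroup(tempPairRDD)
      .flatMapValues { case (vs, ws) => for (v <- vs; w <- ws) yield (v, w) }
      .collect.foreach(println)
    // should match pairRDD.join(tempPairRDD): (5,(scala,Flink)) and (5,(spark,Flink))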
    //============================== PairRDD action operations ==============================
    val actionRDD = sc.parallelize(List((1, 2), (3, 4), (5, 6), (1, 6)), 2)

    actionRDD.countByValue.foreach(println)
    //((1,2),1)
    //((5,6),1)
    //((1,6),1)
    //((3,4),1)

    actionRDD.countByKey.foreach(println)
    //(1,2)
    //(3,1)
    //(5,1)

    actionRDD.collectAsMap().foreach(println)
    //(5,6)
    //(1,6)
    //(3,4)

    actionRDD.lookup(1).foreach(println)
    //2
    //6
  }
}
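One caveat worth a sketch (an addition, not part of the original listing): countByKey, countByValue, and collectAsMap are actions that return their whole result to the driver, so they only suit small result sets. For large key sets the same counts can stay distributed as an RDD; a minimal sketch, assuming the actionRDD defined above:

    // distributed equivalent of countByKey: map every value to 1, then let
    // reduceByKey combine counts locally before the shuffle; the result stays
    // an RDD instead of a driver-side Map
    actionRDD.mapValues(_ => 1L).reduceByKey(_ + _).collect.foreach(println)
    // should match countByKey above: (1,2), (3,1), (5,1)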
For more examples of the RDD API, see: http://homepage.cs.latrobe.edu.au/zhe/ZhenHeSparkRDDAPIExamples.html
RDD transformations and actions (2): PairRDD operations