Common RDD API operations in Spark (Scala API)

package com.xxx

import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

// RDD API examples in Spark
object RddTest {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("RDD API Test")
    val sc = SparkContext.getOrCreate(conf)
    // mapTest(sc)
    // distinctTest(sc)
    // filterTest(sc)
    // keyByTest(sc)
    // sortByTest(sc)
    // topNTest(sc)
    // repartitionTest(sc)
    // groupByTest(sc)
    aggSumTest(sc)
    sc.stop()
  }

  def mapTest(sc: SparkContext) = {
    val file = sc.textFile("file:///g:\\bd14\\user-logs-large.txt", 3)
    val mapResult = file.map(x => {
      // map produces exactly one output record per input record;
      // if the function returns nothing, the corresponding output is ()
      val info = x.split("\\t")
      (info(0), info(1)) // convert the line to a tuple
    })
    // take is an action: it ships the first n records to the driver,
    // so it is typically used only during development and testing
    mapResult.take(10).foreach(println)

    // difference between map and mapPartitions: map converts one record at a time,
    // mapPartitions converts one partition at a time
    val mapPartitionResult = file.mapPartitions(x => {
      // one input partition produces one output partition
      var info = new Array[String](3)
      for (line <- x) yield {
        // yield collects every produced record into a single collection that is returned
        info = line.split("\\t")
        (info(0), info(1))
      }
    })
    mapPartitionResult.take(10).foreach(println)

    // turning one row into several rows: use flatMap to expand a new_tweet record
    // into two login records
    val flatMapResult = file.flatMap(x => {
      val info = x.split("\\t")
      info(1) match {
        case "new_tweet" => for (i <- 1 to 2) yield s"${info(0)} login ${info(2)}"
        case _           => Array(x)
      }
    })
    flatMapResult.take(10).foreach(println)

    println(file.count())
    println(flatMapResult.count())
  }

  // distinct: deduplication; it removes duplicate records, so it is an aggregation
  // of the data rather than a one-to-one conversion
  def distinctTest(sc: SparkContext) = {
    val file = sc.textFile("file:///g:\\bd14\\user-logs-large.txt", 3)
    val userRdd = file.map(x => x.split("\\t")(0)).distinct()
    userRdd.foreach(println)
  }

  // filter: filtering
  def filterTest(sc: SparkContext) = {
    val file = sc.textFile("file:///g:\\bd14\\user-logs-large.txt", 3)
    val loginFilter = file.filter(x => x.split("\\t")(1) == "login")
    loginFilter.take(10).foreach(println)
    println(loginFilter.count())
  }

  // keyBy: the input record becomes the value and the key is computed from it
  def keyByTest(sc: SparkContext) = {
    val file = sc.textFile("file:///g:\\bd14\\user-logs-large.txt", 3)
    val userActionType = file.keyBy(x => {
      val info = x.split("\\t")
      s"${info(0)}--${info(1)}"
    })
    userActionType.take(10).foreach(println)
  }

  // sortBy: sorting
  def sortByTest(sc: SparkContext) = {
    val file = sc.textFile("file:///c:\\users\\zuizui\\desktop\\readme.txt")
    // if the data set is small and a global order is wanted, set numPartitions to 1
    // the default order is ascending; for descending order set the second parameter to false
    // val sortBy = file.sortBy(x => x.split("\\s+")(1).toInt, numPartitions = 1)
    // when fields are separated by a varying number of spaces, split on \\s+
    val sortBy = file.sortBy(x => x.split("\\s+")(1).toInt, false, numPartitions = 1)
    sortBy.foreach(println)
  }

  def topNTest(sc: SparkContext) = {
    val list = List(1, 23, 34, 54, 56, 100)
    // parallelize (or makeRDD) turns a local collection into an RDD
    val rdd = sc.parallelize(list, 2)
    // add an implicit Ordering so that the order used by takeOrdered and top is reversed
    implicit val topNOrdered: Ordering[Int] = new Ordering[Int] {
      override def compare(x: Int, y: Int): Int = y.compareTo(x)
    }
    // takeOrdered(n) normally returns the n smallest elements;
    // with the reversed ordering above it returns the n largest
    val takeOrdered = rdd.takeOrdered(3)
    takeOrdered.foreach(println)
    // top(n) normally returns the n largest elements;
    // with the reversed ordering above it returns the n smallest
    val topN = rdd.top(3)
    topN.foreach(println)
  }

  // repartitioning
  def repartitionTest(sc: SparkContext) = {
    val file = sc.textFile("file:///g:\\bd14\\user-logs-large.txt")
    // repartition is a wide dependency: the data of every partition of the original RDD
    // is spread across every partition of the new RDD.
    // narrow dependency: the data of one original partition is written entirely into
    // one partition of the new RDD; narrow dependencies reduce network transfer
    val result = file.repartition(5)
    file.foreachPartition(x => {
      var sum = 0
      x.foreach(_ => sum += 1)
      println(s"this partition holds $sum records")
    })
    result.foreachPartition(x => {
      var sum = 0
      x.foreach(_ => sum += 1)
      println(s"this partition holds $sum records")
    })
    // coalesce uses narrow dependencies: going from five partitions down to three,
    // one partition stays unchanged and the other four are merged in pairs
    // into the remaining two new partitions
    val coalesce = result.coalesce(3)
    coalesce.foreachPartition(x => {
      var sum = 0
      x.foreach(_ => sum += 1)
      println(s"after coalesce this partition holds $sum records")
    })
  }

  def groupByTest(sc: SparkContext) = {
    val file = sc.textFile("file:///g:\\bd14\\user-logs-large.txt")
    // groupBy is prone to data skew
    val groupedBy = file.groupBy(x => x.split("\\t")(0))
    groupedBy.foreachPartition(x => {
      println(s"a groupBy RDD partition holds ${x.size} records in total")
    })
    groupedBy.foreach(x => {
      println(s"one groupBy RDD record: key ${x._1}, value collection with ${x._2.size} records")
    })
    groupedBy.foreach(x => {
      var sum = 0
      x._2.foreach(line => {
        line.split("\\t")(1) match {
          case "login" => sum += 1
          case _       =>
        }
      })
      println(s"user ${x._1} logged in $sum times")
    })
  }

  def aggSumTest(sc: SparkContext) = {
    val list = List(1, 2, 4, 5)
    val rdd = sc.parallelize(list, 3)
    // reduce: compute the sum
    val reduceResult = rdd.reduce((v1, v2) => v1 + v2)
    // fold: compute the sum, starting from the zero value
    val foldResult = rdd.fold(0)((v1, v2) => v1 + v2)
    // aggregate: concatenate the elements into a string
    val aggResult = rdd.aggregate("")((c, v) => {
      c match {
        case "" => v.toString
        case _  => s"$c,$v"
      }
    }, (c1, c2) => {
      c1 match {
        case "" => c2
        case _  => s"$c1,$c2"
      }
    })
    println(s"reduceResult: $reduceResult")
    println(s"foldResult: $foldResult")
    println(s"aggResult: $aggResult")
  }

  def persistTest(sc: SparkContext) = {
    val file = sc.textFile("file:///g:\\bd14\\user-logs-large.txt")
    // file.cache()
    file.persist(StorageLevel.MEMORY_ONLY) // equivalent to cache(): keep the RDD in memory
    // left to do here: count the number of users, count the number of IPs,
    // and count the number of records per user per IP (a sketch follows the listing)
  }
}
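The three analyses that persistTest leaves as comments could be filled in roughly as follows. This is a minimal sketch of a method that could be added to RddTest, not part of the original listing: the name persistAnalysisTest is made up, and it assumes the log lines are tab-separated with the user name in the first field and the IP address in the third, as the mapTest and flatMap examples suggest.

  // hypothetical follow-up to persistTest: the RDD is reused three times, so persisting pays off
  def persistAnalysisTest(sc: SparkContext) = {
    val file = sc.textFile("file:///g:\\bd14\\user-logs-large.txt")
    file.persist(StorageLevel.MEMORY_ONLY)

    val fields = file.map(_.split("\\t"))

    // number of distinct users (assumes the user name is the first field)
    val userCount = fields.map(_(0)).distinct().count()
    // number of distinct IPs (assumes the IP address is the third field)
    val ipCount = fields.map(_(2)).distinct().count()
    // number of records per (user, IP) pair
    val userIpCounts = fields.map(info => ((info(0), info(2)), 1)).reduceByKey(_ + _)

    println(s"users: $userCount, ips: $ipCount")
    userIpCounts.take(10).foreach(println)
  }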
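As a worked note on aggSumTest: with List(1, 2, 4, 5) split across three partitions, reduce and fold both yield 12, while aggregate first builds a string per partition with seqOp and then joins the partition strings with combOp. The snippet below is a quick way to check this in spark-shell; the values in the comments are what the calls should produce, with the caveat that the element order inside the aggregate result can vary because partition results are merged in whatever order the tasks finish.

  val rdd = sc.parallelize(List(1, 2, 4, 5), 3)
  rdd.reduce(_ + _)   // 12
  rdd.fold(0)(_ + _)  // 12
  rdd.aggregate("")(
    (c, v)   => if (c.isEmpty) v.toString else s"$c,$v", // seqOp: append within a partition
    (c1, c2) => if (c1.isEmpty) c2 else s"$c1,$c2"       // combOp: join partition strings
  )                   // e.g. "1,2,4,5" (ordering may vary)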