Compared to Java, Scala's code is much leaner:
import org.apache.spark._
import SparkContext._

/**
 * Secondary sort with Spark.
 *
 * Reads tab-separated lines, takes the first two fields of each line as a
 * (key, value) pair, groups the values by key, sorts the keys in ascending
 * order and the values of each key in descending order, and finally flattens
 * the groups back into (key, value) pairs.
 */
object SecondarySort {

  /**
   * Job entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Secondary Sort")
    sparkConf.set("mapreduce.framework.name", "yarn")
    sparkConf.set("spark.rdd.compress", "true")
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.set("spark.storage.memoryFraction", "0.5")
    // NOTE(review): the source also set "spark.akka.frameSize", but its value
    // was lost (the string literal is broken in the source text). Re-add the
    // setting here with the intended value if it is still required.
    sparkConf.set("spark.default.parallelism", "1")

    val sc = new SparkContext(sparkConf)
    try {
      val file = sc.textFile("hdfs://namenode:9000/test/secsortdata")

      // (key, values) with keys ascending and each key's values descending.
      val rdd = file
        .map(line => line.split("\t"))
        .map(x => (x(0), x(1)))
        .groupByKey()
        .sortByKey(ascending = true)
        .map(x => (x._1, x._2.toList.sortWith(_ > _)))

      // Flatten every (key, sortedValues) group back into (key, value) pairs,
      // keeping the order of the values within each key.
      // NOTE(review): no action (collect, saveAsTextFile, ...) is called on
      // rdd2 in the source, so the result is never consumed; add one as needed.
      val rdd2 = rdd.flatMap { case (key, values) =>
        values.map(value => (key, value))
      }
    } finally {
      // Always release the context, even if building the job fails.
      sc.stop()
    }
  }
}
Isn't that simple? Compared with the hundreds of lines of code that MapReduce requires, this is really convenient and fast.
Spark secondary sort