標籤:key var color hash 基本 toms 結合 介紹 imp
一、自訂分區
1.概述
Spark 預設使用 Hash 分區策略,這點和 Hadoop 類似。具體的分區介紹,參見:68491115
2.實現
package cn.itcast.spark.day3

import java.net.URL

import org.apache.spark.{HashPartitioner, Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

/**
 * Counts hits per URL from a tab-separated log file and, using one custom
 * partition per host, writes out the two most-visited URLs of each partition.
 *
 * Created by root on 2016/5/18.
 */
object UrlCountPartition {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UrlCountPartition").setMaster("local[2]")
    val sc = new SparkContext(conf)

    try {
      // rdd1 splits each line on tabs and emits (URL, 1); the URL is the second field.
      // Lines with fewer than two fields are skipped instead of failing the whole job
      // with an ArrayIndexOutOfBoundsException.
      val rdd1 = sc.textFile("c://itcast.log")
        .map(_.split("\t"))
        .filter(_.length > 1)
        .map(fields => (fields(1), 1))

      // Total hit count per URL.
      val rdd2 = rdd1.reduceByKey(_ + _)

      // Re-key by host: (host, (url, count)). A malformed URL still fails the job
      // with a MalformedURLException, exactly as before.
      val rdd3 = rdd2.map { case (url, hits) =>
        val host = new URL(url).getHost
        (host, (url, hits))
      }

      // The distinct hosts are collected to the driver to size and build the partitioner.
      val hosts = rdd3.map(_._1).distinct().collect()
      val hostParitioner = new HostParitioner(hosts)

      // Alternative using the built-in partitioner:
      // val rdd4 = rdd3.partitionBy(new HashPartitioner(hosts.length))
      val rdd4 = rdd3.partitionBy(hostParitioner).mapPartitions { it =>
        // Sort ascending by hit count, reverse, and keep the top two entries.
        it.toList.sortBy(_._2._2).reverse.take(2).iterator
      }

      rdd4.saveAsTextFile("c://out4")
      // println(rdd4.collect().toBuffer)
    } finally {
      // Release the context even when the job throws.
      sc.stop()
    }
  }
}

/**
 * Decides which partition a record goes to, based on its key's string form.
 *
 * Each entry of `ins` is assigned the partition number equal to its position in
 * the array; keys that are not in `ins` (or are null) fall back to partition 0.
 *
 * @param ins the keys (host names in [[UrlCountPartition]]) that each get their own partition
 */
class HostParitioner(ins: Array[String]) extends Partitioner {

  // key -> partition index. Kept as a mutable.HashMap so the member's type is unchanged,
  // but it is only written here during construction.
  val parMap: mutable.HashMap[String, Int] =
    (mutable.HashMap.empty[String, Int] ++= ins.zipWithIndex)

  override def numPartitions: Int = ins.length

  override def getPartition(key: Any): Int = key match {
    case null => 0 // avoid a NullPointerException on key.toString
    case k    => parMap.getOrElse(k.toString, 0)
  }

  // Spark compares partitioners with equals to decide whether two RDDs are already
  // partitioned the same way; without this, equivalent instances never compare equal.
  override def equals(other: Any): Boolean = other match {
    case that: HostParitioner =>
      that.numPartitions == numPartitions && that.parMap == parMap
    case _ => false
  }

  override def hashCode: Int = 31 * parMap.hashCode + numPartitions
}
// 與Hadoop相通,不再贅述
二、自訂排序
基本上就是結合之前介紹的隱式轉換:(這裡使用範例類(case class),不需要 new 就能得到執行個體,另外也可以用於模式比對)
package cn.itcast.spark.day3

import org.apache.spark.{SparkConf, SparkContext}

/** Holds the implicit ordering used to sort [[Girl]] values. */
object OrderContext {

  /**
   * Orders by `faceValue` ascending; when `faceValue` is equal, the larger `age`
   * sorts first. Returns 0 when both fields are equal, as the `Ordering` contract
   * requires (`compare(x, x) == 0`, and `compare(x, y)` is the opposite sign of
   * `compare(y, x)`).
   */
  implicit val girlOrdering: Ordering[Girl] = new Ordering[Girl] {
    override def compare(x: Girl, y: Girl): Int = {
      val byFaceValue = Integer.compare(x.faceValue, y.faceValue)
      // Arguments swapped on purpose: a larger age compares as "smaller".
      if (byFaceValue != 0) byFaceValue else Integer.compare(y.age, x.age)
    }
  }
}

/**
 * Sorts tuples with a custom rule: first by faceValue, then by age.
 *
 * The tuples are (name, faceValue, age, n); the meaning of the fourth element is
 * not stated in this file.
 *
 * Created by root on 2016/5/18.
 */
object CustomSort {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CustomSort").setMaster("local[2]")
    val sc = new SparkContext(conf)

    try {
      val rdd1 = sc.parallelize(List(
        ("yuihatano", 90, 28, 1),
        ("angelababy", 90, 27, 2),
        ("JuJingYi", 95, 22, 3)
      ))

      // Brings girlOrdering into implicit scope for sortBy.
      import OrderContext._

      // Descending sort reverses girlOrdering: highest faceValue first and, among
      // equal faceValue, the smaller age first.
      val rdd2 = rdd1.sortBy(x => Girl(x._2, x._3), ascending = false)
      println(rdd2.collect().toBuffer)
    } finally {
      // Release the context even when the job throws.
      sc.stop()
    }
  }
}

// First approach (kept for reference, not compiled): make the key itself Ordered.
//
// case class Girl(val faceValue: Int, val age: Int) extends Ordered[Girl] with Serializable {
//   override def compare(that: Girl): Int = {
//     if (this.faceValue == that.faceValue) {
//       that.age - this.age
//     } else {
//       this.faceValue - that.faceValue
//     }
//   }
// }

/**
 * Second approach: a plain sort key whose ordering is supplied implicitly
 * (see `OrderContext.girlOrdering`).
 *
 * @param faceValue primary sort field
 * @param age secondary sort field
 */
case class Girl(faceValue: Int, age: Int) extends Serializable
大資料入門第二十二天——spark(三)自訂分區、排序與尋找