package com.profile.main

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import com.profile.tools.{DateTools, JdbcTools, LogTools, SparkTools}
import com.dhd.comment.Constant
import com.profile.comment.Comments
/**
 * Test class: use a DataFrame to solve the Spark top-N problem (group, sort, take the top N rows per group).
 * @author
 * Date 2017-09-27 14:55
 */
object Test {
def main(args: Array[String]): Unit = {
val sc = SparkTools.getSparkContext
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
// Sample data: (hour, category, totalvalue)
val df = sc.parallelize(Seq(
  (0, "cat26", 30.9), (0, "cat13", 22.1), (0, "cat95", 19.6), (0, "cat105", 1.3),
  (1, "cat67", 28.5), (1, "cat4", 26.8), (1, "cat13", 12.6), (1, "cat23", 5.3),
  (2, "cat56", 39.6), (2, "cat40", 29.7), (2, "cat187", 27.9), (2, "cat68", 9.8),
  (3, "cat8", 35.6))).toDF("hour", "category", "totalvalue")
df.show
/*
+----+--------+----------+
| hour| category| totalvalue|
+----+--------+----------+
| 0| cat26| 30.9|
| 0| cat13| 22.1|
| 0| cat95| 19.6|
| 0| cat105| 1.3|
| 1| cat67| 28.5|
| 1| cat4| 26.8|
| 1| cat13| 12.6|
| 1| cat23| 5.3|
| 2| cat56| 39.6|
| 2| cat40| 29.7|
| 2| cat187| 27.9|
| 2| cat68| 9.8|
| 3| cat8| 35.6|
+----+--------+----------+
*/
// Window spec: partition by hour, order by totalvalue descending
val w = Window.partitionBy($"hour").orderBy($"totalvalue".desc)
// Note: the ranking function is rowNumber() in Spark 1.x and row_number() in Spark 2.x.
// Take the top 1 per group
val dfTop1 = df.withColumn("rn", rowNumber().over(w)).where($"rn" === 1).drop("rn")
// Take the top 3 per group
val dfTop3 = df.withColumn("rn", rowNumber().over(w)).where($"rn" <= 3).drop("rn")
dfTop1.show
/*
+----+--------+----------+
| hour| category| totalvalue|
+----+--------+----------+
| 1| cat67| 28.5|
| 3| cat8| 35.6|
| 2| cat56| 39.6|
| 0| cat26| 30.9|
+----+--------+----------+
*/
dfTop3.show
/*
+----+--------+----------+
| hour| category| totalvalue|
+----+--------+----------+
| 1| cat67| 28.5|
| 1| cat4| 26.8|
| 1| cat13| 12.6|
| 3| cat8| 35.6|
| 2| cat56| 39.6|
| 2| cat40| 29.7|
| 2| cat187| 27.9|
| 0| cat26| 30.9|
| 0| cat13| 22.1|
| 0| cat95| 19.6|
+----+--------+----------+
*/
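/*
A minimal Spark 2.x equivalent of the window query above (a sketch for
reference only; this job targets 1.x). In 2.x, rowNumber() is renamed
row_number(), and a SparkSession (assumed here under the name `spark`)
replaces the SQLContext:

  val spark = org.apache.spark.sql.SparkSession.builder().getOrCreate()
  import spark.implicits._
  val dfTop3_2x = df.withColumn("rn", row_number().over(w)).where($"rn" <= 3).drop("rn")
*/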
// Solve the Spark top-N problem with an RDD: grouping, sorting, fetching the top N
val rdd1 = sc.parallelize(Seq(
  (0, "cat26", 30.9), (0, "cat13", 22.1), (0, "cat95", 19.6), (0, "cat105", 1.3),
  (1, "cat67", 28.5), (1, "cat4", 26.8), (1, "cat13", 12.6), (1, "cat23", 5.3),
  (2, "cat56", 39.6), (2, "cat40", 29.7), (2, "cat187", 27.9), (2, "cat68", 9.8),
  (3, "cat8", 35.6)))
// Key by hour and group the (category, totalvalue) pairs
val rdd2 = rdd1.map(x => (x._1, (x._2, x._3))).groupByKey()
/*
rdd2.collect
res9: Array[(Int, Iterable[(String, Double)])] = Array(
  (0,CompactBuffer((cat26,30.9), (cat13,22.1), (cat95,19.6), (cat105,1.3))),
  (1,CompactBuffer((cat67,28.5), (cat4,26.8), (cat13,12.6), (cat23,5.3))),
  (2,CompactBuffer((cat56,39.6), (cat40,29.7), (cat187,27.9), (cat68,9.8))),
  (3,CompactBuffer((cat8,35.6))))
*/
val N_value = 3 // fetch the top 3 per group (the outputs below use N = 3)
val rdd3 = rdd2.map(x => {
  val i2 = x._2.toBuffer
  // Sort ascending by value, then drop all but the last N_value entries,
  // leaving the N_value largest values per key
  val i2_2 = i2.sortBy(_._2)
  if (i2_2.length > N_value) i2_2.remove(0, i2_2.length - N_value)
  (x._1, i2_2.toIterable)
})
/*
rdd3.collect
res8: Array[(Int, Iterable[(String, Double)])] = Array(
  (0,ArrayBuffer((cat95,19.6), (cat13,22.1), (cat26,30.9))),
  (1,ArrayBuffer((cat13,12.6), (cat4,26.8), (cat67,28.5))),
  (2,ArrayBuffer((cat187,27.9), (cat40,29.7), (cat56,39.6))),
  (3,ArrayBuffer((cat8,35.6))))
*/
// Flatten each group back into (hour, category, totalvalue) triples
val rdd4 = rdd3.flatMap(x => {
  val y = x._2
  for (w <- y) yield (x._1, w._1, w._2)
})
rdd4.collect
/*
res3: Array[(Int, String, Double)] = Array((0,cat95,19.6), (0,cat13,22.1), (0,cat26,30.9),
  (1,cat13,12.6), (1,cat4,26.8), (1,cat67,28.5),
  (2,cat187,27.9), (2,cat40,29.7), (2,cat56,39.6),
  (3,cat8,35.6))
*/
rdd4.toDF("hour", "category", "totalvalue").show
/* +----+--------+----------+
| hour| category| totalvalue|
+----+--------+----------+
| 0| cat95| 19.6|
| 0| cat13| 22.1|
| 0| cat26| 30.9|
| 2| cat187| 27.9|
| 2| cat40| 29.7|
| 2| cat56| 39.6|
| 1| cat13| 12.6|
| 1| cat4| 26.8|
| 1| cat67| 28.5|
| 3| cat8| 35.6|
+----+--------+----------+*/
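
// Alternative sketch: groupByKey buffers every group in memory, so a skewed
// key can overwhelm an executor. aggregateByKey keeps only the N_value largest
// pairs per key while combining, so each partition ships at most N_value
// records per key. (topNPerKey is an illustrative name, not from the original.)
val topNPerKey = rdd1
  .map(x => (x._1, (x._2, x._3)))
  .aggregateByKey(List.empty[(String, Double)])(
    (acc, v) => (v :: acc).sortBy(-_._2).take(N_value), // fold one value into the running top N
    (a, b) => (a ++ b).sortBy(-_._2).take(N_value)      // merge two partial top-N lists
  )
// topNPerKey.collect yields the same groups as rdd3, largest value first.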
}
}