package sogolog

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Helper for reading a GBK-encoded Sogou log file into an RDD of lines.
 */
class RddFile {

  /**
   * Reads the file at `path` into an RDD of decoded lines.
   *
   * Uses `sc.hadoopFile` with an explicit GBK charset conversion instead of
   * `sc.textFile` (which assumes UTF-8) to avoid garbled Chinese characters.
   *
   * @param path path to the input file (local or HDFS)
   * @return one RDD element per line, decoded from GBK
   */
  def readFileToRdd(path: String): RDD[String] = {
    val conf = new SparkConf().setMaster("local").setAppName("sougoDemo")
    val sc = new SparkContext(conf)
    // BUG FIX: the original ignored the `path` parameter and used a
    // hard-coded Windows path here; now the caller-supplied path is used.
    sc.hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text])
      .map { pair =>
        // Decode the raw bytes of each line explicitly as GBK.
        new String(pair._2.getBytes, 0, pair._2.getLength, "GBK")
      }
  }
}
package sogolog

import org.apache.spark.rdd.RDD

/**
 * Lists users who searched for 10 or more distinct keywords, together with
 * their search keywords.
 */
object UserSearchKeyWordLT3 {
  def main(args: Array[String]): Unit = {
    // 1. Read the log file into an RDD of lines (GBK-decoded).
    val textFile = new RddFile().readFileToRdd("j:\\scala\\workspace\\first-spark-demo\\sougofile\\sogouq.reduced")

    // 2. Map each line to a (user, keyword) pair.
    //    Sogou log fields are tab-separated; field 1 is the user id,
    //    field 2 is the search keyword.
    val userKeywordTuple: RDD[(String, String)] = textFile.map(line => {
      // BUG FIX: the delimiter was "\ t" (literal backslash-space-t);
      // the fields are separated by a tab character.
      val arr = line.split("\t")
      (arr(1), arr(2))
    })

    // 3. Reduce: merge all keywords of the same user into one
    //    comma-separated string, skipping duplicates.
    val userKeywordReduced = userKeywordTuple.reduceByKey((acc, keyword) => {
      // BUG FIX: the original used substring containment (acc.contains(keyword)),
      // which wrongly dropped a keyword that is a substring of one already
      // collected (e.g. "car" when "carpet" is present). Compare whole
      // keywords instead.
      if (acc.split(",").contains(keyword)) acc else acc + "," + keyword
    })

    // 4. Keep only users with at least 10 distinct keywords.
    val finalResult = userKeywordReduced.filter(entry => {
      entry._2.split(",").length >= 10
    })

    // 5. Print the qualifying (user, keywords) pairs.
    finalResult.collect().foreach(println)
  }
}
Operation Result:
Using Spark for Sogou log analysis — an example that lists the users who searched for 10 or more distinct keywords, together with their search keywords.