package com.xh.movies

import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

/**
 * Created by ssss on 3/11/2017.
 *
 * User-behavior analysis for a movie-review system (MovieLens-style data).
 * Need to understand the relationship between DataSet & RDD.
 * The small Occupations data set should be broadcast.
 * Production env should use parquet, but that is not easy for a user to read.
 *
 * Four input files are used:
 *  1. "Ratings.dat":     UserID::MovieID::Rating::Timestamp
 *  2. "Users.dat":       UserID::Gender::Age::OccupationID::Zip-code
 *  3. "Movies.dat":      MovieID::Title::Genres
 *  4. "Occupations.dat": OccupationID::OccupationName
 */
object MovieReviewsSystemUserBehaviorAnalysis {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR) // only print ERROR-level logs

    // Defaults for running locally (study/testing on a laptop).
    var masterUrl = "local[4]"    // 4 cores by default
    var dataPath = "data/medium/" // where the data files are saved

    // On PROD the job runs in jar mode: spark-submit must pass at least
    // two params — the master URL and the data path.
    if (args.length > 1) {
      masterUrl = args(0)
      dataPath = args(1)
    }

    // The SparkContext must be created before anything else.
    val conf = new SparkConf().setMaster(masterUrl).setAppName("UserAnalysis")
    val sscc = new SparkContext(conf)

    /* 0001 — with the SC in hand, read the raw files (local FS or Hadoop) as RDD[String]. */
    val usersRdd: RDD[String] = sscc.textFile(dataPath + "Users.dat")
    val moviesRdd: RDD[String] = sscc.textFile(dataPath + "Movies.dat")
    val occupationsRdd: RDD[String] = sscc.textFile(dataPath + "Occupations.dat")
    val ratingsRdd: RDD[String] = sscc.textFile(dataPath + "Ratings.dat")

    // According to a movie id, get the info of the users who rated it.
    // Users keyed by OccupationID so they can be joined with the occupation names.
    val basicUserRdd = usersRdd.map(_.split("::")).map(user => (user(3), (user(0), user(1), user(2))))
    val occupation = occupationsRdd.map(_.split("::")).map(job => (job(0), job(1)))
    val userInfo = basicUserRdd.join(occupation)
    // userInfo.collect().foreach(println)
    // result --> (occupationId, ((userId, gender, age), occupationName)),
    // e.g. (4,((2175,M,25),college/grad student))

    // Ratings.dat gives (UserID, MovieID); keep only the ratings of movie 1139.
    val targetMovie = ratingsRdd.map(_.split("::")).map(x => (x(0), x(1))).filter(_._2.equals("1139"))
    // Re-key the joined user info by userId so it can be joined with the ratings.
    val targetUser = userInfo.map(x => (x._2._1._1, x._2))
    val finalInfo = targetMovie.join(targetUser)
    // finalInfo.collect().foreach(println)
    // result e.g. (3551,(1139,((3518,F,18),executive/managerial)))
    println(finalInfo.count())

    /* 0002 — most popular movies by average rating, via reduceByKey.
     * "Ratings.dat": UserID::MovieID::Rating::Timestamp
     */
    val populateRdd = ratingsRdd.map(_.split("::")).map(x => (x(1), x(0), x(2)))
    val getTotalPoint = populateRdd
      .map(x => (x._1, (x._3.toInt, 1)))                        // (movieId, (ratingPoint, onePerson))
      .reduceByKey((v1, v2) => (v1._1 + v2._1, v1._2 + v2._2))  // total points and people count per movie
      .map(x => (x._2._1.toDouble / x._2._2, x._1))             // average rating per movie as sort key
      .sortByKey(false)                                         // descending
      .map(x => (x._2, x._1))                                   // back to (movieId, average)
    // getTotalPoint.cache()
    // getTotalPoint.collect().foreach(println)

    // Most popular movies = how many people saw each movie; top 15.
    val mustPopulate = ratingsRdd.map(_.split("::"))
      .map(x => (x(1), 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, false)
      .take(15)
    mustPopulate.foreach(println) // e.g. (2858,3428) (260,2991)

    /* 0003 — top movies by gender. Rating rows carry no gender, so the user
     * table has to be joined in (an aggregate operation).
     * Shuffle is the killer in a distributed system; a map join would not cause
     * a shuffle. A map join is essentially a broadcast of a small table or file.
     *  1. "Ratings.dat": UserID::MovieID::Rating::Timestamp
     *  2. "Users.dat":   UserID::Gender::Age::OccupationID::Zip-code
     */
    // First create a wide table from ratings and users.
    val splitUser = usersRdd.map(_.split("::")).map(x => (x(0), x(1), x(2), x(3)))
    val splitRating = ratingsRdd.map(_.split("::")).map(x => (x(0), x(1), x(2), x(3)))
    val wide = splitUser.map(x => (x._1, x._2)) // (userId, gender)
    val genderWideInfo = splitRating.map(x => (x._1, (x._1, x._2, x._3))).join(wide)
    genderWideInfo.cache() // reused twice below (male and female passes)
    // genderWideInfo.take(10).foreach(println) // e.g. (2828,((2828,3948,5),M))

    val maleWideInfo = genderWideInfo.filter(x => x._2._2.equals("M"))
      .map(x => (x._2._1._2, (x._2._1._3.toDouble, 1)))         // (movieId, (ratingPoint, onePerson))
      .reduceByKey((v1, v2) => (v1._1 + v2._1, v1._2 + v2._2))
      .map(x => (x._2._1 / x._2._2, x._1))                      // average per movie
      .sortByKey(false)                                         // descending
      .map(x => (x._2, x._1))
      .take(15)                                                 // get top
    val femaleWideInfo = genderWideInfo.filter(x => x._2._2.equals("F"))
      .map(x => (x._2._1._2, (x._2._1._3.toDouble, 1)))
      .reduceByKey((v1, v2) => (v1._1 + v2._1, v1._2 + v2._2))
      .map(x => (x._2._1 / x._2._2, x._1))
      .sortByKey(false)
      .map(x => (x._2, x._1))
      .take(15)
    maleWideInfo.foreach(println)
    femaleWideInfo.foreach(println)

    /* Top N movies for one age group. Age has already been ETL'd into buckets:
     *  1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44",
     * 45: "45-49", 50: "50-55", 56: "56+"
     */
    val userAgeInterval = usersRdd.map(_.split("::"))
      .map(x => (x(0), x(2)))    // (userId, ageBucket)
      .filter(_._2.equals("18")) // NOTE(review): bucket literal garbled in source; "18" (18-24) assumed — confirm

    // Only a little data remains after the age filter, so collect just the user
    // ids to the driver and broadcast the set to every executor (one or many
    // tasks per executor) — this is the map-join side of the lookup below.
    val finalUserIdSet = mutable.HashSet[String]() ++ userAgeInterval.map(_._1).collect()
    val finalUserIdSetBroadCast = sscc.broadcast(finalUserIdSet)

    // "Movies.dat": MovieID::Title — small enough to collect into a driver-side map.
    val movieId2Name = moviesRdd.map(_.split("::")).map(x => (x(0), x(1))).collect().toMap
    val finalUserInfos = ratingsRdd.map(_.split("::"))
      .map(x => (x(0), x(1)))                                  // (userId, movieId)
      .filter(x => finalUserIdSetBroadCast.value.contains(x._1))
      .map(x => (x._2, 1))                                     // count ratings per movie
      .reduceByKey(_ + _)
      .sortBy(_._2, false)                                     // descending
      .take(10)
      .map(x => (movieId2Name.getOrElse(x._1, x._1), x._2))    // fall back to the raw id, not null
    // e.g. (Men in Black (1997),971)
    println("Top N by age:")
    finalUserInfos.foreach(println)

    // Keep the driver alive so the job status stays inspectable in the web UI.
    // NOTE(review): this loop never exits, so sscc.stop() below is unreachable —
    // same as the original empty busy-wait, but without burning a CPU core.
    while (true) {
      Thread.sleep(1000)
    }
    sscc.stop()
  }
}