import java.io.PrintWriter

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithSGD}
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.Map

/**
 * Simple recommender training job.
 *
 * Reads a tab-separated sample file (column 0 = label, column 1 = features as
 * `name:value` pairs separated by `;`), maps every distinct feature name to a
 * dictionary index, encodes each sample as a sparse 0/1 vector, trains a
 * logistic-regression model with SGD, and writes each feature's learned weight
 * to a text file (one `feature<TAB>weight` line per feature).
 *
 * Created by Root on 2016/9/17.
 */
object Recommonder {

  def main(args: Array[String]): Unit = {
    // Silence Spark's INFO/WARN chatter; only errors are logged.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)

    val sc = new SparkContext(new SparkConf().setAppName("Rcmd").setMaster("local"))

    // Read the input file; split on \t. Index 0 is the label, index 1 the feature string.
    // NOTE(review): path reconstructed from a garbled original — confirm it matches the data location.
    val data: RDD[Array[String]] =
      sc.textFile("D:\\Program files\\feiq\\recv files\\spark20160827\\recommendationSystem\\DataGenerator\\000000_0")
        .map(_.split("\t"))

    // Build the feature dictionary: flatten all feature names, deduplicate, and
    // attach a stable index so every sample can be mapped to a sparse vector.
    val dict: Map[String, Long] =
      data.flatMap(_(1).split(";"))
        .map(_.split(":")(0))
        .distinct()
        .zipWithIndex()
        .collectAsMap()

    // Build the training set; each sample becomes a LabeledPoint (label + features).
    val trainData: RDD[LabeledPoint] = data.map { sample =>
      // MLlib's binary classifier only accepts 1.0 / 0.0 labels, so pattern-match here.
      val label = sample(0) match {
        case "1" => 1.0
        case _   => 0.0
      }

      // Look up each feature of the current sample in the dictionary to get the
      // indices of the non-zero entries (-1 if a feature is somehow missing).
      val indexs: Array[Int] = sample(1).split(";")
        .map(_.split(":")(0))
        .map { feature =>
          val index: Long = dict.getOrElse(feature, -1L)
          index.toInt
        }

      // Boolean-style encoding: every present feature gets the value 1.0.
      val value: Array[Double] = Array.fill(indexs.length)(1.0)

      new LabeledPoint(label, new SparseVector(dict.size, indexs, value))
    }

    // Train with logistic regression via SGD: 10 iterations, step size 0.9 (both tunable).
    val model: LogisticRegressionModel = LogisticRegressionWithSGD.train(trainData, 10, 0.9)

    // Per-feature weights, positionally aligned with the dictionary indices.
    val weights: Array[Double] = model.weights.toArray

    // Flip the dictionary so we can map a weight's index back to its feature name.
    val map: Map[Long, String] = dict.map(x => (x._2, x._1))

    val pt = new PrintWriter("D:\\output\\a.txt")
    // Weight index i corresponds to dictionary index i; emit "feature\tweight" lines.
    for (i <- 0 until weights.length) {
      val str = map.getOrElse(i, "") + "\t" + weights(i)
      println(str)
      pt.write(str)
      pt.println()
    }
    pt.flush()
    pt.close()
  }
}