Using the XGBoost package (xgboost4j-spark) from Scala Spark

Tags: diff, xgboost
XGBoost test

Invocation
/opt/app/spark-1.6.1/bin/spark-shell --master yarn-client \
  --conf spark.executor.extraJavaOptions='-XX:PermSize=1024M' \
  --driver-memory 6g --num-executors 80 --executor-memory 6g --executor-cores 1 \
  --jars /opt/app/spark-1.6.1/lib/xgboost4j-spark-0.5-jar-with-dependencies.jar

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import ml.dmlc.xgboost4j.scala.spark.XGBoost

val df = sqlContext.sql("select * from databasename.tmp_ym_hotel_multiple_features_table_new_train")
val data = df.select(df("order_cii_notcancelcii"), df("city"), df("order_cii_ahead_1day"), df("order_cii_ahead_3days_avg"),
  df("order_cii_ahead_7days_avg"), df("order_cii_30days_avg"), df("order_cii_ahead_sameoneweek"), df("order_cii_ahead_sametwoweeks_avg"),
  df("star"), df("goldstar"), df("level"), df("ratingservice"), df("novoters"), df("week_day"), df("working_day"), df("cii_ahead_sameoneweek"),
  df("cii_ahead_sametwoweeks_avg"), df("cii_ahead_samethreeweeks_avg"), df("cii_ahead_samefourweeks_avg"), df("simple_estimate_constant"),
  df("cii_ahead_1day_avg"), df("cii_ahead_3days_avg"), df("cii_ahead_7days_avg"), df("order_ahead_lt_1days"), df("order_ahead_lt_2days"),
  df("order_ahead_lt_3days"), df("order_ahead_lt_7days"), df("order_ahead_lt_14days"), df("order_alldays"), df("click_ahead_1day"),
  df("click_ahead_2days"), df("click_ahead_3days"), df("click_ahead_7days"), df("click_ahead_14days"), df("browse_0day_uv"),
  df("browse_1day_uv"), df("browse_2day_uv"), df("browse_3day_uv"), df("browse_4day_uv"), df("browse_5day_uv"),
  df("browse_6day_uv"), df("browse_7_14day_uv"), df("browse_14daymore_uv"), df("order_cii_14days_avg"), df("order_cii_21days_avg"),
  df("order_cii_ahead_samethreeweeks_avg"), df("order_cii_ahead_samefourweeks_avg"))


import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

val trainData = data.map { line =>
  // Column 0 is the label (order_cii_notcancelcii); columns 1..46 are the feature columns selected above
  val label = line(0).toString.toDouble
  val values = (1 to 46).map(i => line(i).toString.toDouble)
  val featureVector = Vectors.dense(values.toArray)
  LabeledPoint(label, featureVector)
}
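Since boosting makes many passes over this RDD (numRound = 800 below), it can help to cache it and run a quick sanity check before training. A minimal sketch using only standard Spark APIs:

// Cache the training RDD so repeated passes during boosting do not rescan the Hive table,
// then materialize it once and inspect the first record.
trainData.cache()
println(trainData.count())
println(trainData.first())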


Training

val numRound = 800
val paramMap = List(
  "eta" -> 0.1f,
  "max_depth" -> 6,            // maximum depth of a tree; default 6, range [1, ∞]
  "silent" -> 0,               // 0 prints run-time messages, 1 runs silently; default 0
  "objective" -> "reg:linear", // learning task and its objective
  "eval_metric" -> "rmse",     // evaluation metric for the validation data
  "nthread" -> 1               // number of threads XGBoost uses at run time; default is the maximum available
).toMap

// nWorkers = 80 is an assumption here, matching --num-executors in the spark-shell invocation above
val model = XGBoost.train(trainData, paramMap, numRound, nWorkers = 80, useExternalMemory = false)

val sql_test = "select * from databasename.tmp_ym_hotel_multiple_features_table_test_7days"  // October 31 to November 6
val df1 = sqlContext.sql(sql_test)
val data1 = df1.select(df1("masterhotel"), df1("city"), df1("order_cii_ahead_1day"), df1("order_cii_ahead_3days_avg"), df1("order_cii_ahead_7days_avg"),
  df1("order_cii_30days_avg"), df1("order_cii_ahead_sameoneweek"), df1("order_cii_ahead_sametwoweeks_avg"), df1("star"), df1("goldstar"),
  df1("level"), df1("ratingservice"), df1("novoters"), df1("week_day"), df1("working_day"), df1("cii_ahead_sameoneweek"), df1("cii_ahead_sametwoweeks_avg"),
  df1("cii_ahead_samethreeweeks_avg"), df1("cii_ahead_samefourweeks_avg"), df1("simple_estimate_constant"), df1("cii_ahead_1day_avg"),
  df1("cii_ahead_3days_avg"), df1("cii_ahead_7days_avg"), df1("order_ahead_lt_1days"), df1("order_ahead_lt_2days"),
  df1("order_ahead_lt_3days"), df1("order_ahead_lt_7days"), df1("order_ahead_lt_14days"), df1("order_alldays"),
  df1("click_ahead_1day"), df1("click_ahead_2days"), df1("click_ahead_3days"), df1("click_ahead_7days"),
  df1("click_ahead_14days"), df1("browse_0day_uv"), df1("browse_1day_uv"), df1("browse_2day_uv"), df1("browse_3day_uv"),
  df1("browse_4day_uv"), df1("browse_5day_uv"), df1("browse_6day_uv"), df1("browse_7_14day_uv"), df1("browse_14daymore_uv"),
  df1("order_cii_14days_avg"), df1("order_cii_21days_avg"), df1("order_cii_ahead_samethreeweeks_avg"), df1("order_cii_ahead_samefourweeks_avg"))



Test feature vector construction (unlike the training set, only the raw vectors are needed here, without labels)
val testData = data1.map { line =>
  // Column 0 is masterhotel (the hotel ID); columns 1..46 are the same feature columns as in training
  val values = (1 to 46).map(i => line(i).toString.toDouble)
  val featureVector = Vectors.dense(values.toArray)
  featureVector
}
val predTrain = model.predict(testData)
val s = predTrain.collect()(0)  // prediction array for the first partition of the test set


s.length  // number of predicted rows in that partition
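The indexing used here and further below (s is the first collected element, and pr_1(i)(0) later) assumes the test set lives in a single partition and that predict returns one array of per-row predictions per partition. A sketch (an assumption inferred from that indexing, not part of the original) that flattens the predictions into a plain RDD[Double] regardless of partitioning:

// Flatten the per-partition prediction arrays into one predicted value per test row.
val predFlat = predTrain.flatMap(batch => batch.map(row => row(0).toDouble))
println(predFlat.count())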


True values
val data2 = df1.select(df1("masterhotel"), df1("order_cii_notcancelcii"), df1("rank1"), df1("orderdate"))
val actual_frame = data2.toDF()

Build a DataFrame-typed result set
case class ResultSet(masterhotel: Int,   // master (parent) hotel ID
                     quantity: Double,   // actual value
                     rank: Int,          // rank
                     date: String,       // date
                     frcst_cii: Double)  // forecast value

import sqlContext.implicits._  // needed for .toDF() on local collections and for the $-column syntax below

val ac_1 = actual_frame.collect()
val pr_1 = predTrain.collect()(0)

val output0 = (0 until ac_1.length).map(i =>
  ResultSet(ac_1(i)(0).toString.toInt,
            ac_1(i)(1).toString.toDouble,
            ac_1(i)(2).toString.toInt,
            ac_1(i)(3).toString,
            pr_1(i)(0).toString.toDouble)).toDF()


Add a column with the absolute error
import org.apache.spark.sql.functions.abs
val output = output0.withColumn("diff", abs($"quantity" - $"frcst_cii"))
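Before aggregating, it is worth eyeballing a few rows of the actual and predicted values side by side; a minimal check using the standard DataFrame API:

// Quick look at the per-hotel error column that was just added.
output.select("masterhotel", "date", "quantity", "frcst_cii", "diff").show(10)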

Compute MAE@100 and MAE@500 (mean absolute error per date over the top 100 / top 500 hotels)
val mae100 = output.filter($"rank" <= 100).groupBy("date").avg("diff")
val mae500 = output.filter($"rank" <= 500).groupBy("date").avg("diff")
val mae100Rows = mae100.sort("date").collect()
val mae500Rows = mae500.sort("date").collect()


Print results
mae100Rows.foreach(i => println("MAE100", i))
mae500Rows.foreach(i => println("MAE500", i))
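The loops above print one average per date; collapsing them into a single overall figure is a one-liner with the standard aggregation API (a sketch, not part of the original):

import org.apache.spark.sql.functions.avg
// Overall mean absolute error across all dates for the top-100 hotels.
val overallMae100 = output.filter($"rank" <= 100).agg(avg("diff")).collect()(0)(0)
println(("Overall MAE100", overallMae100))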

