Using the XGBoost package (xgboost4j-spark) from Scala Spark

Tags: diff, xgboost
XGBoost test

Invocation
/opt/app/spark-1.6.1/bin/spark-shell --master yarn-client \
  --conf spark.executor.extraJavaOptions='-XX:PermSize=1024M' \
  --driver-memory 6g --num-executors 80 --executor-memory 6g --executor-cores 1 \
  --jars /opt/app/spark-1.6.1/lib/xgboost4j-spark-0.5-jar-with-dependencies.jar

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import ml.dmlc.xgboost4j.scala.spark.XGBoost

val df = sqlContext.sql("select * from databasename.tmp_ym_hotel_multiple_features_table_new_train")
val data = df.select(df("order_cii_notcancelcii"), df("city"), df("order_cii_ahead_1day"), df("order_cii_ahead_3days_avg"),
  df("order_cii_ahead_7days_avg"), df("order_cii_30days_avg"), df("order_cii_ahead_sameoneweek"), df("order_cii_ahead_sametwoweeks_avg"),
  df("star"), df("goldstar"), df("level"), df("ratingservice"), df("novoters"), df("week_day"), df("working_day"), df("cii_ahead_sameoneweek"),
  df("cii_ahead_sametwoweeks_avg"), df("cii_ahead_samethreeweeks_avg"), df("cii_ahead_samefourweeks_avg"), df("simple_estimate_constant"),
  df("cii_ahead_1day_avg"), df("cii_ahead_3days_avg"), df("cii_ahead_7days_avg"), df("order_ahead_lt_1days"), df("order_ahead_lt_2days"),
  df("order_ahead_lt_3days"), df("order_ahead_lt_7days"), df("order_ahead_lt_14days"), df("order_alldays"), df("click_ahead_1day"),
  df("click_ahead_2days"), df("click_ahead_3days"), df("click_ahead_7days"), df("click_ahead_14days"), df("browse_0day_uv"),
  df("browse_1day_uv"), df("browse_2day_uv"), df("browse_3day_uv"), df("browse_4day_uv"), df("browse_5day_uv"),
  df("browse_6day_uv"), df("browse_7_14day_uv"), df("browse_14daymore_uv"), df("order_cii_14days_avg"), df("order_cii_21days_avg"),
  df("order_cii_ahead_samethreeweeks_avg"), df("order_cii_ahead_samefourweeks_avg"))


import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

val trainData = data.map { line =>
  // Column 0 is the label (order_cii_notcancelcii); columns 1..46 are the feature columns selected above
  val label = line(0).toString.toDouble
  val values = (1 to 46).map(i => line(i).toString.toDouble)
  val featureVector = Vectors.dense(values.toArray)
  LabeledPoint(label, featureVector)
}
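Since boosting makes many passes over this RDD (numRound = 800 below), it can help to cache it and run a quick sanity check before training. A minimal sketch using only standard Spark APIs:

// Cache the training RDD so repeated passes during boosting do not rescan the Hive table,
// then materialize it once and inspect the first record.
trainData.cache()
println(trainData.count())
println(trainData.first())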


Training

val numRound = 800
val paramMap = List(
  "eta" -> 0.1f,
  "max_depth" -> 6,            // maximum depth of a tree; default 6, range [1, ∞]
  "silent" -> 0,               // 0 prints run-time messages, 1 runs silently; default 0
  "objective" -> "reg:linear", // learning task and its objective
  "eval_metric" -> "rmse",     // evaluation metric for the validation data
  "nthread" -> 1               // number of threads XGBoost uses at run time; default is the maximum available
).toMap

// nWorkers = 80 is an assumption here, matching --num-executors in the spark-shell invocation above
val model = XGBoost.train(trainData, paramMap, numRound, nWorkers = 80, useExternalMemory = false)

val sql_test = "select * from databasename.tmp_ym_hotel_multiple_features_table_test_7days"  // October 31 to November 6
val df1 = sqlContext.sql(sql_test)
val data1 = df1.select(df1("masterhotel"), df1("city"), df1("order_cii_ahead_1day"), df1("order_cii_ahead_3days_avg"), df1("order_cii_ahead_7days_avg"),
  df1("order_cii_30days_avg"), df1("order_cii_ahead_sameoneweek"), df1("order_cii_ahead_sametwoweeks_avg"), df1("star"), df1("goldstar"),
  df1("level"), df1("ratingservice"), df1("novoters"), df1("week_day"), df1("working_day"), df1("cii_ahead_sameoneweek"), df1("cii_ahead_sametwoweeks_avg"),
  df1("cii_ahead_samethreeweeks_avg"), df1("cii_ahead_samefourweeks_avg"), df1("simple_estimate_constant"), df1("cii_ahead_1day_avg"),
  df1("cii_ahead_3days_avg"), df1("cii_ahead_7days_avg"), df1("order_ahead_lt_1days"), df1("order_ahead_lt_2days"),
  df1("order_ahead_lt_3days"), df1("order_ahead_lt_7days"), df1("order_ahead_lt_14days"), df1("order_alldays"),
  df1("click_ahead_1day"), df1("click_ahead_2days"), df1("click_ahead_3days"), df1("click_ahead_7days"),
  df1("click_ahead_14days"), df1("browse_0day_uv"), df1("browse_1day_uv"), df1("browse_2day_uv"), df1("browse_3day_uv"),
  df1("browse_4day_uv"), df1("browse_5day_uv"), df1("browse_6day_uv"), df1("browse_7_14day_uv"), df1("browse_14daymore_uv"),
  df1("order_cii_14days_avg"), df1("order_cii_21days_avg"), df1("order_cii_ahead_samethreeweeks_avg"), df1("order_cii_ahead_samefourweeks_avg"))



Test feature vector construction (unlike the training set, only the raw vectors are needed here, without labels)
val testData = data1.map { line =>
  // Column 0 is masterhotel (the hotel ID); columns 1..46 are the same feature columns as in training
  val values = (1 to 46).map(i => line(i).toString.toDouble)
  val featureVector = Vectors.dense(values.toArray)
  featureVector
}
val predTrain = model.predict(testData)
val s = predTrain.collect()(0)  // prediction array for the first partition of the test set


s.length  // number of predicted rows in that partition
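The indexing used here and further below (s is the first collected element, and pr_1(i)(0) later) assumes the test set lives in a single partition and that predict returns one array of per-row predictions per partition. A sketch (an assumption inferred from that indexing, not part of the original) that flattens the predictions into a plain RDD[Double] regardless of partitioning:

// Flatten the per-partition prediction arrays into one predicted value per test row.
val predFlat = predTrain.flatMap(batch => batch.map(row => row(0).toDouble))
println(predFlat.count())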


True values
val data2 = df1.select(df1("masterhotel"), df1("order_cii_notcancelcii"), df1("rank1"), df1("orderdate"))
val actual_frame = data2.toDF()

Build a DataFrame-typed result set
case class ResultSet(masterhotel: Int,   // master (parent) hotel ID
                     quantity: Double,   // actual value
                     rank: Int,          // rank
                     date: String,       // date
                     frcst_cii: Double)  // forecast value

import sqlContext.implicits._  // needed for .toDF() on local collections and for the $-column syntax below

val ac_1 = actual_frame.collect()
val pr_1 = predTrain.collect()(0)

val output0 = (0 until ac_1.length).map(i =>
  ResultSet(ac_1(i)(0).toString.toInt,
            ac_1(i)(1).toString.toDouble,
            ac_1(i)(2).toString.toInt,
            ac_1(i)(3).toString,
            pr_1(i)(0).toString.toDouble)).toDF()


Add a column with the absolute error
import org.apache.spark.sql.functions.abs
val output = output0.withColumn("diff", abs($"quantity" - $"frcst_cii"))
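Before aggregating, it is worth eyeballing a few rows of the actual and predicted values side by side; a minimal check using the standard DataFrame API:

// Quick look at the per-hotel error column that was just added.
output.select("masterhotel", "date", "quantity", "frcst_cii", "diff").show(10)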

Compute MAE@100 and MAE@500 (mean absolute error per date over the top 100 / top 500 hotels)
val mae100 = output.filter($"rank" <= 100).groupBy("date").avg("diff")
val mae500 = output.filter($"rank" <= 500).groupBy("date").avg("diff")
val mae100Rows = mae100.sort("date").collect()
val mae500Rows = mae500.sort("date").collect()


Print results
mae100Rows.foreach(i => println("MAE100", i))
mae500Rows.foreach(i => println("MAE500", i))
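The loops above print one average per date; collapsing them into a single overall figure is a one-liner with the standard aggregation API (a sketch, not part of the original):

import org.apache.spark.sql.functions.avg
// Overall mean absolute error across all dates for the top-100 hotels.
val overallMae100 = output.filter($"rank" <= 100).agg(avg("diff")).collect()(0)(0)
println(("Overall MAE100", overallMae100))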

