基於 Python 的 XGBoost 模型實現

來源:互聯網
上載者:User
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 28 13:22:08 2016
@author: yy
"""

## part1: 模型訓練
import pandas as pd
import numpy as np
import os
from datetime import datetime
import xgboost as xgb

# Load the raw hotel-order data and parse the order date so we can split by time.
traindata = pd.read_csv("/home/op1/yy/traindata.csv")
traindata["add_date"] = pd.to_datetime(traindata["orderdate"])  # convert to datetime

# Time-based split: everything strictly before 2016-10-31 is used for training.
is_train = traindata["add_date"] < '2016-10-31'
traindata_a = traindata[is_train]

# The following week (2016-10-31 .. 2016-11-06), restricted to the top-500
# ranked hotels, is held out as the prediction set.
in_test_window = (traindata["add_date"] >= '2016-10-31') & (traindata["add_date"] <= '2016-11-06')
testdata_a = traindata[in_test_window & (traindata["rank1"] <= 500)]
                     
#特徵列表
# Model columns: the first entry ('order_cii_notcancelcii') is the regression
# target; every remaining entry is an input feature.
colnames = [
    'order_cii_notcancelcii',
    'city',
    'order_cii_ahead_1day',
    'order_cii_ahead_3days_avg',
    'order_cii_ahead_7days_avg',
    'order_cii_30days_avg',
    'order_cii_ahead_sameoneweek',
    'order_cii_ahead_sametwoweeks_avg',
    'star',
    'goldstar',
    'level',
    'ratingservice',
    'novoters',
    'week_day',
    'working_day',
    'cii_ahead_sameoneweek',
    'cii_ahead_sametwoweeks_avg',
    'cii_ahead_samethreeweeks_avg',
    'cii_ahead_samefourweeks_avg',
    'simple_estimate_constant',
    'cii_ahead_1day_avg',
    'cii_ahead_3days_avg',
    'cii_ahead_7days_avg',
    'order_ahead_lt_1days',
    'order_ahead_lt_2days',
    'order_ahead_lt_3days',
    'order_ahead_lt_7days',
    'order_ahead_lt_14days',
    'order_alldays',
    'click_ahead_1day',
    'click_ahead_2days',
    'click_ahead_3days',
    'click_ahead_7days',
    'click_ahead_14days',
    'browse_0day_uv',
    'browse_1day_uv',
    'browse_2day_uv',
    'browse_3day_uv',
    'browse_4day_uv',
    'browse_5day_uv',
    'browse_6day_uv',
    'browse_7_14day_uv',
    'browse_14daymore_uv',
    'order_cii_14days_avg',
    'order_cii_21days_avg',
    'order_cii_ahead_samethreeweeks_avg',
    'order_cii_ahead_samefourweeks_avg',
]

# --- Training-matrix construction -----------------------------------------
# colnames[0] is the target column; the remaining columns are the features.
# (Removed an unused duplicate `label` variable and dead commented-out code.)
dtrain = xgb.DMatrix(traindata_a[colnames[1:]], label=traindata_a[colnames[0]])

# Training configuration: squared-error regression with shallow trees and a
# conservative learning rate; RMSE is reported during training.
num_round = 800
params = {
    "objective": "reg:linear",  # NOTE(review): renamed "reg:squarederror" in xgboost >= 0.83
    "eta": 0.1,                 # learning rate
    "max_depth": 4,             # shallow trees to limit overfitting
    "eval_metric": "rmse",
    "silent": 0,                # 0 = print progress messages while training
}

bst = xgb.train(list(params.items()), dtrain, num_round)

# --- Prediction on the hold-out set ---------------------------------------
dtest = xgb.DMatrix(testdata_a[colnames[1:]])
y_bar = bst.predict(dtest)

# --- Evaluation ------------------------------------------------------------
# BUG FIX: the original assigned new columns to a slice of `testdata_a`,
# which triggers pandas' SettingWithCopyWarning and may silently fail to
# write. Take an explicit copy before adding columns.
actual_values = testdata_a[["masterhotel", "add_date", "order_cii_notcancelcii", "rank1"]].copy()
actual_values["y_bar"] = y_bar
actual_values["mae"] = (actual_values["y_bar"] - actual_values["order_cii_notcancelcii"]).abs()

# Per-day mean absolute error for the top-100 hotels and the full top-500 set.
top100 = actual_values[actual_values["rank1"] <= 100]
mae100 = top100.groupby("add_date").mean()
mae500 = actual_values.groupby("add_date").mean()

## part2: persist the model so it can be reloaded later without retraining
bst.save_model('xgb.model')

# Reload into a fresh Booster to verify the round trip.
# BUG FIX: the original loaded '/home/op1/yuanmin/xgb.model' — a different
# path from the file just saved — despite its own comment warning that the
# names must match. Load the file we actually wrote.
bst = xgb.Booster()
bst.load_model('xgb.model')

y_bar = bst.predict(dtest)


## Feature importance
# Compute the fscore Series once (the original rebuilt and re-sorted it
# three times), then normalize so importances sum to 1.
importance = pd.Series(bst.get_fscore()).sort_values(ascending=False)
importance_normalized = importance / importance.sum()


附兩篇參考文章:

1、https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

2、http://www.cnblogs.com/haobang008/p/5909207.html


相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.