# Related article: https://github.com/JohnLangford/vowpal_wabbit/wiki
# Input format:    https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format
# Download the Titanic data set from the Internet (tab-separated text).
# stringsAsFactors = TRUE keeps the text columns as factors, as in the
# str() output shown further down; since R 4.0 the default is FALSE.
titanicDF <- read.csv(
  "http://math.ucdenver.edu/RTutorial/titanic.txt",
  sep = "\t",
  stringsAsFactors = TRUE
)
# Optionally save a local copy of the data:
# write.table(titanicDF, "titanicDF.txt", row.names = FALSE)
# write.csv(titanicDF, "titanicDF.csv", row.names = FALSE)

# Create a new variable Title from the passenger name.
# The trailing space in "Mr " and "Mrs " matters: without it every "Mrs"
# would already match the "Mr" test. Any other title (e.g. "Master") becomes
# "Nothing".
titanicDF$Title <- ifelse(
  grepl("Mr ", titanicDF$Name), "Mr",
  ifelse(
    grepl("Mrs ", titanicDF$Name), "Mrs",
    ifelse(grepl("Miss", titanicDF$Name), "Miss", "Nothing")
  )
)
# Convert to factor type
titanicDF$Title <- as.factor(titanicDF$Title)
head(titanicDF)
##                                            Name PClass   Age    Sex
## 1                  Allen, Miss Elisabeth Walton    1st 29.00 female
## 2                   Allison, Miss Helen Loraine    1st  2.00 female
## 3           Allison, Mr Hudson Joshua Creighton    1st 30.00   male
## 4 Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1st 25.00 female
## 5                 Allison, Master Hudson Trevor    1st  0.92   male
## 6                            Anderson, Mr Harry    1st 47.00   male
##   Survived   Title
## 1        1    Miss
## 2        0    Miss
## 3        0      Mr
## 4        0     Mrs
## 5        1 Nothing
## 6        1      Mr
# Show the column types of the data frame
str(titanicDF)
## 'data.frame':    1313 obs. of  6 variables:
##  $ Name    : Factor w/ 1310 levels "Abbing, Mr Anthony",..: 22 25 26 27 24 31 45 46 50 54 ...
##  $ PClass  : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age     : num  29 2 30 25 0.92 47 63 39 58 71 ...
##  $ Sex     : Factor w/ 2 levels "female","male": 1 1 2 1 2 2 1 2 1 2 ...
##  $ Survived: int  1 0 0 0 1 1 1 0 1 0 ...
##  $ Title   : Factor w/ 4 levels "Miss","Mr","Mrs",..: 1 1 2 3 4 2 1 2 3 2 ...
# Inspect missing values
library(dfexplore)
## Loading required package: ggplot2
dfplot(titanicDF)
sum(is.na(titanicDF$Age)) # Age is missing in 557 records
## [1] 557
# Fill the missing Age values with the median of the observed ages
titanicDF$Age[is.na(titanicDF$Age)] <- median(titanicDF$Age, na.rm = TRUE)
# Plot again to check whether titanicDF still has missing values
dfplot(titanicDF)
# Reorder the columns so the target "Survived" comes last; Name is dropped.
# Column names are case-sensitive.
# NOTE(review): confirm the exact spelling against names(titanicDF).
titanicDF <- titanicDF[c("PClass", "Age", "Sex", "Title", "Survived")]
head(titanicDF)
##   PClass   Age    Sex   Title Survived
## 1    1st 29.00 female    Miss        1
## 2    1st  2.00 female    Miss        0
## 3    1st 30.00   male      Mr        0
## 4    1st 25.00 female     Mrs        0
## 5    1st  0.92   male Nothing        1
## 6    1st 47.00   male      Mr        1
# Binarize all factors: dummy-variable (one-hot) encoding of every factor
# column. fullRank = FALSE keeps one indicator column per factor level.
library(caret)
## Loading required package: lattice
titanicDummy <- dummyVars(~ ., data = titanicDF, fullRank = FALSE)
titanicDF <- as.data.frame(predict(titanicDummy, newdata = titanicDF))
head(titanicDF)
##   PClass.1st PClass.2nd PClass.3rd   Age Sex.female Sex.male Title.Miss
## 1          1          0          0 29.00          1        0          1
## 2          1          0          0  2.00          1        0          1
## 3          1          0          0 30.00          0        1          0
## 4          1          0          0 25.00          1        0          0
## 5          1          0          0  0.92          0        1          0
## 6          1          0          0 47.00          0        1          0
##   Title.Mr Title.Mrs Title.Nothing Survived
## 1        0         0             0        1
## 2        0         0             0        0
## 3        1         0             0        0
## 4        0         1             0        0
## 5        0         0             1        1
## 6        1         0             0        1
# Vowpal Wabbit input format:
# https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format
# [Label] [Importance [Tag]]|Namespace Features |Namespace Features ... |Namespace Features
# Settings for the conversion to Vowpal Wabbit format
isClassification <- TRUE            # recode the label to -1 / 1 below
outcomeName <- "Survived"           # column holding the label
labelName <- titanicDF[, "Survived"] # label values; NULL means "no label"
weightName <- ""                    # importance-weight column; "" means none
objDF <- titanicDF                  # working copy that is turned into text
# Every column that is neither the label nor the weight is a feature
predictors <- names(objDF)[!names(objDF) %in% c(outcomeName, weightName)]
# Labels & importance: turn the outcome column into the text that precedes
# the "|" separator on each output line.
if (is.null(labelName)) {
  # No label available: use a placeholder column with a constant "0 |"
  outcomeName <- "ignoreme"
  objDF[, outcomeName] <- "0 |"
} else {
  if (isClassification) {
    # Everything should be -1 / 1 for classification
    objDF[, outcomeName] <- ifelse(objDF[, outcomeName] > 0, 1, -1)
  }
  if (weightName != "") {
    # "<label> <importance> |"
    objDF[, outcomeName] <- paste(objDF[, outcomeName], objDF[, weightName], "|")
  } else {
    # "<label> |"
    objDF[, outcomeName] <- paste(objDF[, outcomeName], "|")
  }
}
# Pair column names with data, adding one blank character before each
# feature so that the columns can later be written without a separator:
#   value == 1  ->  " name"        (feature name only)
#   value == 0  ->  ""             (feature omitted)
#   otherwise   ->  " name:value"
for (i in predictors) {
  objDF[, i] <- ifelse(
    objDF[, i] == 1, paste0(" ", i),
    ifelse(objDF[, i] == 0, "", paste0(" ", i, ":", objDF[, i]))
  )
}
# Reorder columns: label first, then the features
objDF <- objDF[c(outcomeName, predictors)]
head(objDF)
##   Survived  PClass.1st PClass.2nd PClass.3rd       Age  Sex.female
## 1      1 |  PClass.1st                          Age:29  Sex.female
## 2     -1 |  PClass.1st                           Age:2  Sex.female
## 3     -1 |  PClass.1st                          Age:30
## 4     -1 |  PClass.1st                          Age:25  Sex.female
## 5      1 |  PClass.1st                        Age:0.92
## 6      1 |  PClass.1st                          Age:47
##    Sex.male  Title.Miss  Title.Mr  Title.Mrs  Title.Nothing
## 1            Title.Miss
## 2            Title.Miss
## 3  Sex.male              Title.Mr
## 4                                  Title.Mrs
## 5  Sex.male                                   Title.Nothing
## 6  Sex.male              Title.Mr
# Write the rows without quotes, row names, header or separator; each cell
# is pasted directly after the previous one.
write.table(
  objDF, "vw.txt",
  sep = "", quote = FALSE, row.names = FALSE, col.names = FALSE
)
# Converting data to the Vowpal Wabbit input file format in R