Convert data to format Vowpal Wabbit Input Files in R language

Source: Internet
Author: User

Related articles: https://github.com/JohnLangford/vowpal_wabbit/wiki

https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format

# Download the Titanic data set (tab-separated text with a header row).
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0, which is
# what the str() output shown further down expects.
# NOTE(review): column names are case-sensitive; this script assumes the file
# header is Name, PClass, Age, Sex, Survived -- confirm with names(titanicDF).
titanicDF <- read.csv(
  "http://math.ucdenver.edu/RTutorial/titanic.txt",
  sep = "\t",
  stringsAsFactors = TRUE
)

# Optionally save a local copy of the data source
# write.table(titanicDF, "titanicDF.txt", row.names = FALSE)
# write.csv(titanicDF, "titanicDF.csv", row.names = FALSE)

# Create a new variable Title from the honorific found in the passenger name.
# The trailing space in "Mr " and "Mrs " matters: without it "Mr" would also
# match every "Mrs" and the "Mrs" branch would never be reached.
titanicDF$Title <- ifelse(
  grepl("Mr ", titanicDF$Name), "Mr",
  ifelse(
    grepl("Mrs ", titanicDF$Name), "Mrs",
    ifelse(grepl("Miss", titanicDF$Name), "Miss", "Nothing")
  )
)

# Convert to factor type so it is dummy-encoded together with the other factors
titanicDF$Title <- as.factor(titanicDF$Title)
head(titanicDF)
##                                            Name PClass   Age    Sex
## 1                  Allen, Miss Elisabeth Walton    1st 29.00 female
## 2                   Allison, Miss Helen Loraine    1st  2.00 female
## 3           Allison, Mr Hudson Joshua Creighton    1st 30.00   male
## 4 Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1st 25.00 female
## 5                 Allison, Master Hudson Trevor    1st  0.92   male
## 6                            Anderson, Mr Harry    1st 47.00   male
##   Survived   Title
## 1        1    Miss
## 2        0    Miss
## 3        0      Mr
## 4        0     Mrs
## 5        1 Nothing
## 6        1      Mr
# Show the structure (column types and first values) of the data frame
str(titanicDF)
## 'data.frame':	1313 obs. of  6 variables:
##  $ Name    : Factor w/ 1310 levels "Abbing, Mr Anthony",..: 22 25 26 27 24 31 45 46 50 54 ...
##  $ PClass  : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age     : num  29 2 30 25 0.92 47 63 39 58 71 ...
##  $ Sex     : Factor w/ 2 levels "female","male": 1 1 2 1 2 2 1 2 1 2 ...
##  $ Survived: int  1 0 0 0 1 1 1 0 1 0 ...
##  $ Title   : Factor w/ 4 levels "Miss","Mr","Mrs",..: 1 1 2 3 4 2 1 2 3 2 ...
# Inspect where data is missing by plotting the data frame
# (dfplot() is provided by the dfexplore package)
library(dfexplore)
## Loading required package: ggplot2
dfplot(titanicDF)




# Count the missing ages: 557 records have no Age value
sum(is.na(titanicDF$Age))
## [1] 557

# Fill the missing values of the Age variable with the median of the known ages
# (na.rm = TRUE is required, otherwise the median itself would be NA)
titanicDF$Age[is.na(titanicDF$Age)] <- median(titanicDF$Age, na.rm = TRUE)

# Plot again to check whether titanicDF still has missing values
dfplot(titanicDF)




# Keep only the modelling columns and put the target "Survived" last.
# Name is dropped here; the Title feature derived from it is kept instead.
titanicDF <- titanicDF[c("PClass", "Age", "Sex", "Title", "Survived")]
head(titanicDF)
##   PClass   Age    Sex   Title Survived
## 1    1st 29.00 female    Miss        1
## 2    1st  2.00 female    Miss        0
## 3    1st 30.00   male      Mr        0
## 4    1st 25.00 female     Mrs        0
## 5    1st  0.92   male Nothing        1
## 6    1st 47.00   male      Mr        1
# Binarize all factors: create one dummy (0/1) column per factor level.
# fullRank = FALSE keeps a column for every level instead of dropping one
# per factor; numeric columns (Age, Survived) pass through unchanged.
library(caret)
## Loading required package: lattice
titanicDummy <- dummyVars(~ ., data = titanicDF, fullRank = FALSE)
titanicDF <- as.data.frame(predict(titanicDummy, newdata = titanicDF))
head(titanicDF)
##   PClass.1st PClass.2nd PClass.3rd   Age Sex.female Sex.male Title.Miss
## 1          1          0          0 29.00          1        0          1
## 2          1          0          0  2.00          1        0          1
## 3          1          0          0 30.00          0        1          0
## 4          1          0          0 25.00          1        0          0
## 5          1          0          0  0.92          0        1          0
## 6          1          0          0 47.00          0        1          0
##   Title.Mr Title.Mrs Title.Nothing Survived
## 1        0         0             0        1
## 2        0         0             0        0
## 3        1         0             0        0
## 4        0         1             0        0
## 5        0         0             1        1
## 6        1         0             0        1
Vowpal Wabbit input format reference: https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format

Each line of a Vowpal Wabbit input file has the form:

[Label] [Importance [Tag]]|Namespace Features |Namespace Features ... |Namespace Features


# Turn the dummy-encoded data frame into Vowpal Wabbit input columns:
# the first column holds "<label> [importance] |" and every predictor column
# holds either " name", " name:value" or an empty string.
isClassification <- TRUE
outcomeName <- "Survived"
labelName <- titanicDF[, "Survived"]
weightName <- ""   # name of an importance-weight column; "" means none
objDF <- titanicDF
predictors <- names(objDF)[!names(objDF) %in% c(outcomeName, weightName)]

# LABELS & IMPORTANCE
if (is.null(labelName)) {
  # No label available: emit a placeholder label column
  outcomeName <- "Ignoreme"
  objDF[, outcomeName] <- "0 |"
} else {
  if (isClassification) {
    # Everything should be -1 / 1 for classification
    objDF[, outcomeName] <- ifelse(objDF[, outcomeName] > 0, 1, -1)
  }
  if (weightName != "") {
    # Label followed by its importance weight, then the separator bar
    objDF[, outcomeName] <- paste(
      objDF[, outcomeName], objDF[, weightName], "|"
    )
  } else {
    objDF[, outcomeName] <- paste(objDF[, outcomeName], "|")
  }
}

# Pair column names with data, adding one blank character before each
# variable so the columns can later be written with sep = "":
#   value 1     -> " name"        (only the feature name is written)
#   value 0     -> ""             (feature omitted from the line)
#   other value -> " name:value"
for (i in predictors) {
  objDF[, i] <- ifelse(
    objDF[, i] == 1,
    paste0(" ", i),
    ifelse(objDF[, i] == 0, "", paste0(" ", i, ":", objDF[, i]))
  )
}

# Reorder columns: label column first, then the predictors
objDF <- objDF[c(outcomeName, predictors)]
head(objDF)
##   Survived  PClass.1st PClass.2nd PClass.3rd       Age  Sex.female
## 1      1 |  PClass.1st                          Age:29  Sex.female
## 2     -1 |  PClass.1st                           Age:2  Sex.female
## 3     -1 |  PClass.1st                          Age:30
## 4     -1 |  PClass.1st                          Age:25  Sex.female
## 5      1 |  PClass.1st                        Age:0.92
## 6      1 |  PClass.1st                          Age:47
##    Sex.male  Title.Miss  Title.Mr  Title.Mrs  Title.Nothing
## 1            Title.Miss
## 2            Title.Miss
## 3  Sex.male              Title.Mr
## 4                                  Title.Mrs
## 5  Sex.male                                   Title.Nothing
## 6  Sex.male              Title.Mr
# Write one Vowpal Wabbit example per row. sep = "" concatenates the columns
# directly (each feature already carries its own leading blank); quotes, row
# names and the header are suppressed so only the raw lines are written.
write.table(
  objDF, "Vw.txt",
  sep = "", quote = FALSE, row.names = FALSE, col.names = FALSE
)

Convert data to format Vowpal Wabbit Input Files in R language

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of the page confusing, please write us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.