# Chapter 1: Using the R Language
# Machine Learning for Hackers
# Chapter 1
# Load the plotting (ggplot2) and split-apply-combine (plyr) packages.
# R is case-sensitive, so both the function and the package names must be
# lowercase, and each call needs its own statement.
library(ggplot2)
library(plyr)
# Read the raw UFO sightings file into a data frame.
# - .tsv files are tab-delimited, hence sep = "\t".
# - stringsAsFactors = FALSE keeps strings as character vectors instead of
#   converting them to factors.
# - header = FALSE stops the first row from being treated as column names.
# - na.strings = "" turns empty strings into NA.
# NOTE(review): the path is kept exactly as found; its all-uppercase casing
# looks suspicious -- confirm it against the directory layout on disk.
ufo <- read.delim(
  "ML_FOR_HACKERS/01-INTRODUCTION/DATA/UFO/UFO_AWESOME.TSV",
  sep = "\t",
  stringsAsFactors = FALSE,
  header = FALSE,
  na.strings = ""
)
# head() shows the first 6 rows of a dataset
# tail() shows the last 6 rows
# names() can both set and read column names; here it labels the six
# unnamed columns that were read with header = FALSE.
names(ufo) <- c(
  "dateoccurred", "datereported", "Location",
  "Shortdescription", "Duration", "LongDescription"
)
# as.Date() converts a string to a Date object; the input format can be
# specified (see ?as.Date).
# Converting the raw columns directly fails with an "input too long" error,
# which points to malformed data.
# Handle the malformed data: show the first few dateoccurred values from
# rows where either date string is not exactly 8 characters long.
head(ufo[which(nchar(ufo$dateoccurred) != 8 |
                 nchar(ufo$datereported) != 8), 1])
# Build a logical vector in which FALSE marks a row that fails the
# requirement that both date strings are exactly 8 characters long.
# Negating the "bad row" condition is equivalent to
# ifelse(condition, FALSE, TRUE).
good.rows <- !(nchar(ufo$dateoccurred) != 8 |
                 nchar(ufo$datereported) != 8)

# Count the rows that fail the requirement ...
length(which(!good.rows))

# ... and keep only the rows that pass.
ufo <- ufo[good.rows, ]
# Note: this run reports 731 such rows, whereas the book says 371; the book's figure is probably a typo.
# Convert both date columns from 8-character strings to Date objects.
# %Y (4-digit year) is required: the strings are 8 characters long
# (4 + 2 + 2), and %y would only consume a 2-digit year.
ufo$dateoccurred <- as.Date(ufo$dateoccurred, format = "%Y%m%d")
ufo$datereported <- as.Date(ufo$datereported, format = "%Y%m%d")
# Split a raw "City, State" location string into its parts.
#
# Args:
#   l: a single location string.
#
# Returns:
#   The comma-separated pieces of `l`, each with one leading space removed.
#   c(NA, NA) is returned when the input has more than two pieces (i.e.
#   several commas); when strsplit() raises an error, the c(NA, NA)
#   fallback passes through gsub() and comes back as two character NAs.
get.location <- function(l) {
  # strsplit() throws on input it cannot split (e.g. non-character);
  # tryCatch() turns that error into a pair of missing values.
  split.location <- tryCatch(
    strsplit(l, ",")[[1]],
    error = function(e) c(NA, NA)
  )

  # Strip the space that follows the comma. The pattern must be "^ "
  # (start-of-string plus a space); a bare "^" would match the empty
  # string and remove nothing.
  clean.location <- gsub("^ ", "", split.location)

  # More than two pieces means the string had multiple commas, so it is
  # not a plain "City, State" pair.
  if (length(clean.location) > 2) {
    return(c(NA, NA))
  }
  clean.location
}
# lapply() ("list apply") calls the function on every element of a vector
# and returns the results as a list.
# The column is spelled `Location` (capital L) to match the name assigned
# via names(ufo); `$` is case-sensitive, so `ufo$location` would be NULL.
city.state <- lapply(ufo$Location, get.location)
# Convert the list of location pieces into a matrix.
# do.call() invokes a function with a list supplying its arguments, so
# this is rbind(city.state[[1]], city.state[[2]], ...).
location.matrix <- do.call(rbind, city.state)

# transform() adds two new columns to ufo. tolower() lower-cases the state
# so that all abbreviations share one format.
ufo <- transform(
  ufo,
  uscity = location.matrix[, 1],
  usstate = tolower(location.matrix[, 2]),
  stringsAsFactors = FALSE
)
# Identify non-US place names and set them to NA.
# The abbreviations must be lowercase because usstate was lower-cased with
# tolower(); match() is case-sensitive.
us.states <- c(
  "ak", "al", "ar", "az", "ca", "co", "ct", "de", "fl", "ga",
  "hi", "ia", "id", "il", "in", "ks", "ky", "la", "ma", "md",
  "me", "mi", "mn", "mo", "ms", "mt", "nc", "nd", "ne", "nh",
  "nj", "nm", "nv", "ny", "oh", "ok", "or", "pa", "ri", "sc",
  "sd", "tn", "tx", "ut", "va", "vt", "wa", "wi", "wv", "wy"
)

# match() returns NA for values not in us.states, so indexing with its
# result leaves NA wherever the state is not a US abbreviation.
ufo$usstate <- us.states[match(ufo$usstate, us.states)]

# A city without a recognised US state is blanked out too.
ufo$uscity[is.na(ufo$usstate)] <- NA
# Keep only the records located in the United States, i.e. rows whose
# usstate is not NA.
ufo.us <- subset(ufo, !is.na(usstate))
# Analyse the time dimension.
# Pre-processing: get an overview of the date range.
summary(ufo.us$dateoccurred)

# Histogram of sighting dates. The argument is `date_breaks` (lowercase)
# and the interval unit must be lowercase as well.
# NOTE(review): one break per year may crowd the axis if the date range is
# wide -- consider a coarser interval such as "50 years".
quick.hist <- ggplot(ufo.us, aes(x = dateoccurred)) +
  geom_histogram() +
  scale_x_date(date_breaks = "years")
print(quick.hist)
# Keep only the sightings from 1990 onwards and plot them again.
ufo.us <- subset(ufo.us, dateoccurred >= as.Date("1990-01-01"))

quick.hist.new <- ggplot(ufo.us, aes(x = dateoccurred)) +
  geom_histogram() +
  scale_x_date(date_breaks = "years")
print(quick.hist.new)
# Count the number of sightings for every state and year-month.

# Reduce each date to month granularity. %Y (4-digit year) keeps the
# strings in chronological order when sorted as text; %y would sort
# "00-01" before "90-01".
ufo.us$yearmonth <- strftime(ufo.us$dateoccurred, format = "%Y-%m")

# One row per (state, year-month) holding the number of matching records.
sightings.counts <- ddply(ufo.us, .(usstate, yearmonth), nrow)

# Build a monthly sequence covering the whole observed date range, so
# months without any sighting are represented as well.
date.range <- seq.Date(
  from = as.Date(min(ufo.us$dateoccurred)),
  to = as.Date(max(ufo.us$dateoccurred)),
  by = "month"
)
date.strings <- strftime(date.range, "%Y-%m")

# Pair every state with every month. The lambda parameter must be named
# `s`, because cbind() uses it as the column name.
states.dates <- lapply(us.states, function(s) cbind(s, date.strings))
states.dates <- data.frame(
  do.call(rbind, states.dates),
  stringsAsFactors = FALSE
)
# Merge the two data frames. merge() joins them on the key columns named
# by by.x (first frame) and by.y (second frame).
# all = TRUE keeps unmatched rows and fills the missing side with NA.
all.sightings <- merge(
  states.dates, sightings.counts,
  by.x = c("s", "date.strings"),
  by.y = c("usstate", "yearmonth"),
  all = TRUE
)

# Tidy up all.sightings: readable column names, zero instead of NA for
# months without sightings, and proper column types.
names(all.sightings) <- c("state", "yearmonth", "sightings")
all.sightings$sightings[is.na(all.sightings$sightings)] <- 0

# Replace the year-month strings with real dates by repeating date.range
# once per state.
# NOTE(review): this assumes the merged rows are ordered by state and then
# chronologically by month -- confirm after any change to the key format.
all.sightings$yearmonth <- as.Date(rep(date.range, length(us.states)))
all.sightings$state <- as.factor(toupper(all.sightings$state))
# Analyse the data: one time-series panel per state.
# - geom_line() draws the line chart.
# - facet_wrap() splits the plot into panels by the categorical variable
#   `state`, laid out as 10 rows by 5 columns.
# - theme_bw() sets the plot theme.
# - scale_color_manual() maps the string "Darkblue" used in geom_line() to
#   the colour "Darkblue"; guide = "none" hides the legend.
state.plot <- ggplot(all.sightings, aes(x = yearmonth, y = sightings)) +
  geom_line(aes(color = "Darkblue")) +
  facet_wrap(~state, nrow = 10, ncol = 5) +
  theme_bw() +
  scale_color_manual(values = c("Darkblue" = "Darkblue"), guide = "none") +
  xlab("time") +
  ylab("Number of sightings") +
  ggtitle("Number of UFO sightings by Month-year and U.S. state (1990-2010)")
print(state.plot)
# [Reading notes] Machine Learning: Practical Case Analysis (1)