# Data set: UFO sightings (ufo_awesome.tsv).
#
# The file is tab-delimited, so it is read with read.delim() and sep = "\t".
#   - stringsAsFactors = FALSE keeps text columns as character vectors rather
#     than factors (factors are meant for categorical variables).
#   - header = FALSE because the file has no header row.
#   - na.strings = "" turns every empty field into NA.
ufo <- read.delim(
  "ufo_awesome.tsv",
  sep = "\t",
  stringsAsFactors = FALSE,
  header = FALSE,
  na.strings = ""
)

# head() shows the first six rows; tail() would show the last six.
head(ufo)

# The file carries no column names, so attach them here.
names(ufo) <- c(
  "dateoccurred", "datereported", "Location",
  "Shortdescription", "Duration", "longdescription"
)

# Dates are stored as YYYYMMDD strings. Converting them right away, e.g.
#   as.Date(ufo$dateoccurred, format = "%Y%m%d")
# stops with 'Error in strptime(x, format, tz = "GMT"): input string is too
# long', because some entries are longer than 8 characters. The call is
# therefore kept only as a comment and the malformed rows are removed first.

# A row is usable only when both date strings are exactly 8 characters long
# (nchar() returns the string length). %in% never yields NA, so a row with a
# missing date counts as malformed instead of leaking NA into the row index.
good.rows <- nchar(ufo$dateoccurred) %in% 8L &
  nchar(ufo$datereported) %in% 8L

# Peek at a few of the offending values in the first column.
head(ufo[!good.rows, 1])

# Number of malformed records. The original notes report 731 here (the
# Chinese edition of the book says 371, which is presumably a typo); these
# rows can simply be dropped from the data set.
sum(!good.rows)
ufo <- ufo[good.rows, ]

# Every remaining date string is well-formed: convert both columns to Date.
# %Y is the four-digit year (%y would only read two digits).
ufo$dateoccurred <- as.Date(ufo$dateoccurred, format = "%Y%m%d")
ufo$datereported <- as.Date(ufo$datereported, format = "%Y%m%d")
# Next, clean the sighting location. The raw value looks like "City, ST";
# it is split into a city part and a state part.

#' Split a "City, ST" location string into city and state.
#'
#' @param l A single location string. Anything else (NA, a non-character
#'   value, an empty vector) is treated as malformed.
#' @return A character vector of length 2, `c(city, state)`, with one leading
#'   space stripped from each piece. `c(NA, NA)` is returned whenever the
#'   input does not split into exactly two comma-separated pieces, so callers
#'   can always row-bind the results into a two-column matrix.
get.location <- function(l) {
  no.location <- c(NA_character_, NA_character_)

  # strsplit() raises an error for non-character input; treat that the same
  # way as a malformed location.
  split.location <- tryCatch(
    strsplit(l, ",")[[1]],
    error = function(e) no.location
  )

  # Reject every shape other than exactly two pieces. A location without a
  # comma would otherwise come back with length 1 and be silently recycled
  # into the state column when the results are row-bound.
  if (length(split.location) != 2) {
    return(no.location)
  }

  # Drop the single space that follows the comma ("Iowa City, IA" -> "IA").
  sub("^ ", "", split.location)
}
# Run get.location() over every Location value; the result is a list with
# one c(city, state) vector per record.
city.state <- lapply(ufo$Location, get.location)

# To add city and state as separate columns the list must become a matrix:
# do.call(rbind, ...) stacks the per-record vectors row by row.
location.matrix <- do.call(rbind, city.state)

# Add the two new columns. They are assigned directly rather than through
# transform(..., stringsAsFactors = FALSE): transform() treats every extra
# argument as a new column, so that call would create a bogus column named
# "stringsAsFactors" instead of controlling factor conversion.
ufo$uscity <- location.matrix[, 1]
ufo$usstate <- location.matrix[, 2]

# state.abb is R's built-in vector of two-letter US state abbreviations.
# Any value that is not one of them becomes NA; toupper() makes the lookup
# independent of how the abbreviation was typed.
ufo$usstate <- state.abb[match(toupper(ufo$usstate), state.abb)]

# Keep only the records located in a US state.
ufo.us <- subset(ufo, !is.na(usstate))
# ggplot2 supplies ggplot(), aes(), geom_histogram() and scale_x_date().
library(ggplot2)

# Look at the range of sighting dates (the original notes report values
# from 1400 to 2010).
summary(ufo.us$dateoccurred)

# Histogram of sighting dates to see how they are distributed over time.
# NOTE(review): the break width in the source was garbled to just "Years";
# 50 years is assumed here as a legible spacing for such a long span --
# adjust if a different interval was intended.
quick.hist <- ggplot(ufo.us, aes(x = dateoccurred)) +
  geom_histogram() +
  scale_x_date(date_breaks = "50 years")
print(quick.hist)

# Most sightings fall between 1960 and 2010; the analysis only looks at
# 1990-2010, so keep the records dated 1990-01-01 or later.
ufo.us <- subset(ufo.us, dateoccurred >= as.Date("1990-01-01"))

# (Re-draw the histogram at this point to inspect the restricted range.)
# plyr supplies ddply() and the .() quoting helper.
library(plyr)

# The goal is to find periodic patterns in the sightings, so the records are
# aggregated per state and per month.

# Add a yearmonth column holding the year and month of each sighting.
# strftime() formats a Date as a string; %Y keeps the four-digit year so
# that different centuries cannot collapse into the same key.
ufo.us$yearmonth <- strftime(ufo.us$dateoccurred, format = "%Y-%m")

# Count the rows in every (usstate, yearmonth) group.
sightings.counts <- ddply(ufo.us, .(usstate, yearmonth), nrow)

# Some state/month combinations have no sightings at all (the original notes
# mention February and April 1990), so they are missing from the counts and
# must be added back with a count of 0.

# Build the complete monthly sequence covering the data. Both ends are
# snapped to the first day of their month: seq.Date(by = "month") started on
# day 29-31 would roll over short months and skip them.
first.month <- as.Date(format(min(ufo.us$dateoccurred, na.rm = TRUE), "%Y-%m-01"))
last.month <- as.Date(format(max(ufo.us$dateoccurred, na.rm = TRUE), "%Y-%m-01"))
date.range <- seq.Date(from = first.month, to = last.month, by = "month")

# The same sequence in "YYYY-MM" form, matching the yearmonth keys.
date.strings <- strftime(date.range, "%Y-%m")

# Data frame with every combination of state and month.
states.dates <- expand.grid(
  state = state.abb,
  yearmonth = date.strings,
  stringsAsFactors = FALSE
)

# Merge the full grid with the observed counts. all = TRUE keeps the rows
# that have no match on the other side and fills their missing values with NA.
all.sightings <- merge(
  states.dates,
  sightings.counts,
  by.x = c("state", "yearmonth"),
  by.y = c("usstate", "yearmonth"),
  all = TRUE
)
names(all.sightings) <- c("state", "yearmonth", "sightings")

# A missing count means no sightings that month: replace NA with 0.
all.sightings$sightings[is.na(all.sightings$sightings)] <- 0

# Turn the "YYYY-MM" keys back into Date values (first day of the month).
# Parsing each row's own key does not depend on the row order or row count
# produced by merge().
all.sightings$yearmonth <- as.Date(
  paste0(all.sightings$yearmonth, "-01"),
  format = "%Y-%m-%d"
)

# The state is a categorical variable.
all.sightings$state <- as.factor(all.sightings$state)
# ggplot2 supplies every plotting function used below.
library(ggplot2)

# One line chart of monthly sightings per state.
#   - geom_line() draws the series; the colour is a fixed value, so it is
#     passed outside aes() and needs no manual colour scale or legend.
#   - facet_wrap(~state) draws one panel per state; with 50 states the
#     panels are laid out as 10 rows by 5 columns.
#   - theme_bw() uses a white background with dark grid lines.
#   - The x axis gets one break every 5 years, labelled with the year only
#     so the labels stay readable in the small panels.
state.plot <- ggplot(all.sightings, aes(x = yearmonth, y = sightings)) +
  geom_line(colour = "darkblue") +
  facet_wrap(~state, nrow = 10, ncol = 5) +
  theme_bw() +
  scale_x_date(date_breaks = "5 years", date_labels = "%Y") +
  xlab("Years") +
  ylab("Number of sightings") +
  ggtitle("Number of UFO sightings by Month-year and U.S. state (1990-2010)")

# Building the plot object does not draw it; print it like the histogram.
print(state.plot)
# "Machine Learning for Hackers" reading notes (1): using the R language