Machine learning for hackers reading notes (i) using R language

Last Update:2015-10-21 Source: Internet

Author: User

Tags ggplot

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on the Alibaba Cloud. Read more ＞

# Data used: UFO sightings data (ufo_awesome.tsv)

# ggplot2 supplies the plotting functions and plyr supplies ddply(); both are
# used further down in these notes.
library(ggplot2)
library(plyr)

# Read the data. The file is tab-delimited, so read.delim() is used with
# sep = "\t".
#
# In R versions before 4.0 the read.* functions turn strings into factors (the
# type used for categorical variables), so stringsAsFactors is set to FALSE.
#
# header = FALSE: the file has no header row.
#
# na.strings = "": every empty field is read in as R's missing value NA.
ufo <- read.delim(
  "ufo_awesome.tsv",
  sep = "\t",
  stringsAsFactors = FALSE,
  header = FALSE,
  na.strings = ""
)

# head(data) shows the first six rows; tail(data) shows the last six.
head(ufo)

# Add the column headers.
names(ufo) <- c(
  "DateOccurred", "DateReported", "Location",
  "ShortDescription", "Duration", "LongDescription"
)

# The output above shows the dates stored as YYYYMMDD strings; they need to be
# converted to R's Date type.
#
# Converting the raw column directly,
#   ufo$DateOccurred <- as.Date(ufo$DateOccurred, format = "%Y%m%d")
# fails with
#   Error in strptime(x, format, tz = "GMT"): input string is too long
# because some date strings are longer than 8 characters. The call is left
# commented out so the script can run through; the malformed rows are removed
# first and the conversion is done afterwards.

# Inspect the entries whose date strings are not exactly 8 characters long.
# nchar() returns the length of a string.
bad.length <- nchar(ufo$DateOccurred) != 8 | nchar(ufo$DateReported) != 8
head(ufo[which(bad.length), 1])

# A row is good only when both lengths are known to be 8. A missing date makes
# the comparison NA; treating that as "bad" avoids all-NA rows being injected
# when the data frame is subset with an NA index.
good.rows <- !is.na(bad.length) & !bad.length

# Number of malformed rows. The original notes report 731 records here (the
# Chinese edition of the book says 371, which appears to be a mistake). These
# rows can simply be dropped from the data set.
length(which(!good.rows))

ufo <- ufo[good.rows, ]

# Now convert both date columns to Date. %Y is the four-digit year; %y would
# read only two digits and mis-parse YYYYMMDD.
ufo$DateOccurred <- as.Date(ufo$DateOccurred, format = "%Y%m%d")

ufo$DateReported <- as.Date(ufo$DateReported, format = "%Y%m%d")

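# Next, clean up the sighting location data. The raw value has the form "City, State"; split it into two columns.

# Define a function that returns c(NA, NA) when the location is not in that format, and the c(city, state) vector when it is.

# gsub() is used for pattern replacement (here, to strip the leading space).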

# Split a "City, State" location string into its two parts.
#
# Args:
#   l: a single location string, e.g. "Iowa City, IA".
#
# Returns:
#   A character vector of length 2, c(city, state), with one leading space
#   removed from each part. If the input cannot be split, or does not split
#   into exactly two comma-separated parts, c(NA, NA) is returned so that the
#   results can always be row-bound into a two-column matrix.
get.location <- function(l) {
  # strsplit() raises an error for non-character input; treat that as an
  # unparseable location.
  split.location <- tryCatch(
    strsplit(l, ",")[[1]],
    error = function(e) c(NA, NA)
  )
  # Drop the single space that follows the comma ("City, ST" -> "ST").
  clean.location <- gsub("^ ", "", split.location)
  # Anything other than exactly two parts (no comma, several commas, empty
  # string) is not in "City, State" form.
  if (length(clean.location) != 2) {
    c(NA_character_, NA_character_)
  } else {
    clean.location
  }
}

# Apply get.location() to every Location entry of ufo with lapply(); the
# result is a list with one element per record, saved in city.state.
city.state <- lapply(ufo$Location, get.location)

# To add city and state to the data frame as two separate columns, the list
# has to be converted to a matrix. do.call(rbind, ...) binds the elements of
# city.state together row by row.
location.matrix <- do.call(rbind, city.state)

# transform() adds the two new columns to ufo.
ufo <- transform(
  ufo,
  USCity = location.matrix[, 1],
  USState = location.matrix[, 2],
  stringsAsFactors = FALSE
)

# state.abb is R's built-in vector of US state abbreviations. Anything that is
# not a US state abbreviation becomes NA. The comparison is made on the
# upper-cased value so that e.g. "ia" and "IA" are both recognised.
ufo$USState <- state.abb[match(toupper(ufo$USState), state.abb)]

# Keep only the records whose location is a US state; store them in ufo.us.
ufo.us <- subset(ufo, !is.na(USState))

# Look at the range of sighting dates; the notes report 1400 to 2010.
summary(ufo.us$DateOccurred)

# Draw a histogram to see how the sightings are distributed over time.
# NOTE(review): the break width was garbled to "Years" in these notes; with
# several centuries on the axis, "50 years" is assumed -- confirm.
quick.hist <- ggplot(ufo.us, aes(x = DateOccurred)) +
  geom_histogram() +
  scale_x_date(breaks = "50 years")

print(quick.hist)

# The histogram shows that most sightings occurred in 1960-2010; the analysis
# focuses on 1990-2010 only.

# Keep only the data from 1990 onwards.
ufo.us <- subset(ufo.us, DateOccurred >= as.Date("1990-01-01"))

# (At this point the histogram can be drawn again for the filtered data.)

# The goal is to find periodic patterns in the sightings, so they should be
# aggregated over a time period, here by month: count the number of UFO
# sightings per state for each month.

# Add a YearMonth column to ufo.us holding the year and month of the sighting.
# strftime() formats a date as a string; %Y is the four-digit year (with %y,
# 1990 would collapse to "90").
ufo.us$YearMonth <- strftime(ufo.us$DateOccurred, format = "%Y-%m")

# Use ddply() to count the rows of ufo.us grouped by USState and YearMonth.
sightings.counts <- ddply(ufo.us, .(USState, YearMonth), nrow)

# Some state/month combinations have no rows at all (e.g. February and April
# 1990 in the counts above), so months with 0 sightings must be added.

# First build the sequence of all months covered by the data in date.range.
# The sequence is anchored on the first day of the earliest month: stepping by
# month from a day such as the 31st would skip over shorter months.
first.month <- as.Date(
  format(min(ufo.us$DateOccurred, na.rm = TRUE), "%Y-%m-01")
)
date.range <- seq.Date(
  from = first.month,
  to = as.Date(max(ufo.us$DateOccurred, na.rm = TRUE)),
  by = "month"
)

# Then convert it to "year-month" strings in date.strings.
date.strings <- strftime(date.range, "%Y-%m")

# Next build a data frame containing every combination of state and month.
states.dates <- lapply(state.abb, function(s) cbind(s, date.strings))

states.dates <- data.frame(
  do.call(rbind, states.dates),
  stringsAsFactors = FALSE
)

# Merge the full grid with the counts. all = TRUE keeps the rows that have no
# match; their count is set to NA.
all.sightings <- merge(
  states.dates,
  sightings.counts,
  by.x = c("s", "date.strings"),
  by.y = c("USState", "YearMonth"),
  all = TRUE
)

names(all.sightings) <- c("State", "YearMonth", "Sightings")

# Set every NA count to 0.
all.sightings$Sightings[is.na(all.sightings$Sightings)] <- 0

# Convert the "year-month" strings to Date (first day of the month). Parsing
# each row's own string does not depend on the row order or row count that
# merge() produced.
all.sightings$YearMonth <- as.Date(
  paste0(all.sightings$YearMonth, "-01"),
  format = "%Y-%m-%d"
)

# Convert the state names to a categorical variable.
all.sightings$State <- as.factor(all.sightings$State)

# Plot. geom_line() draws a line graph and facet_wrap() splits the plot by the
# State factor, one panel per level; with 50 states the panels are laid out in
# 10 rows and 5 columns.

# theme_bw() selects ggplot2's black-and-white theme.
state.plot <- ggplot(all.sightings, aes(x = YearMonth, y = Sightings)) +
  geom_line(aes(color = "Darkblue")) +
  facet_wrap(~State, nrow = 10, ncol = 5) +
  theme_bw() +
  # Map the constant colour label to the actual colour and hide its legend.
  scale_color_manual(values = c("Darkblue" = "Darkblue"), guide = "none") +
  scale_x_date(breaks = "5 years") +
  xlab("Years") +
  ylab("Number of sightings") +
  ggtitle("Number of UFO sightings by Month-year and U.S. state (1990-2010)")

# Display the plot, as is done for quick.hist earlier in these notes.
print(state.plot)

Machine learning for hackers reading notes (i) using R language

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More