[Reading notes] machine learning: Practical Case Analysis (1)

Source: Internet
Author: User

Chapter 1: Using R

# Machine Learning for Hackers
# Chapter 1

# Packages used by this script: ggplot2 for plotting, plyr for ddply().
library(ggplot2)
library(plyr)

  

# .tsv files are tab-delimited, so the separator is "\t".
# Strings used to be read as factors by default (before R 4.0);
# stringsAsFactors = FALSE prevents that conversion.
# header = FALSE stops the first row from being treated as column names.
# na.strings = "" makes empty strings be read as NA.
# The path is relative to the working directory.

ufo <- read.delim("ML_for_Hackers/01-Introduction/data/ufo/ufo_awesome.tsv",
                  sep = "\t", stringsAsFactors = FALSE, header = FALSE,
                  na.strings = "")

  

head() shows the first 6 rows of a data set.

tail() shows the last 6 rows.

# names() can both read and assign column names.

names(ufo) <- c("DateOccurred", "DateReported", "Location",
                "ShortDescription", "Duration", "LongDescription")

  

# as.Date() converts a string to a Date object; the format can be specified
# (see ?as.Date).
# Converting the raw columns fails with "input too long", which points to
# malformed data.
# Inspect the malformed entries: a well-formed date string has exactly
# 8 characters.

head(ufo[which(nchar(ufo$DateOccurred) != 8 |
               nchar(ufo$DateReported) != 8), 1])

  

# Build a logical vector in which FALSE marks a row that does not meet the
# 8-character requirement.
# Count the rows that fail, then keep only the rows that pass.

good.rows <- ifelse(nchar(ufo$DateOccurred) != 8 |
                    nchar(ufo$DateReported) != 8, FALSE, TRUE)
length(which(!good.rows))
ufo <- ufo[good.rows, ]

Running this gives 731 malformed rows, whereas the book says 371; the number in the book is probably a typo.

# Convert the date strings to Date objects.
# The strings are 8 characters long, so the year has four digits: use %Y
# (four-digit year), not %y (two-digit year).

ufo$DateOccurred <- as.Date(ufo$DateOccurred, format = "%Y%m%d")
ufo$DateReported <- as.Date(ufo$DateReported, format = "%Y%m%d")

  

# Split a location string on commas and tidy up the pieces.
#
# l: a single location string, e.g. "Iowa City, IA".
# Returns the comma-separated parts (normally c(city, state)) with one
# leading space removed from each part, or c(NA, NA) when the input cannot
# be split or yields more than two parts.
get.location <- function(l) {
  # strsplit() raises an error on non-character input; tryCatch() turns that
  # error into a pair of missing values.
  split.location <- tryCatch(strsplit(l, ",")[[1]],
                             error = function(e) return(c(NA, NA)))
  # Remove the single space that follows the comma in the raw data.
  clean.location <- gsub("^ ", "", split.location)
  # More than two parts means extra commas, so the entry is rejected.
  if (length(clean.location) > 2) {
    return(c(NA, NA))
  } else {
    return(clean.location)
  }
}

  

# lapply() ("list apply") applies a function to every element of a vector
# and returns the results as a list.

city.state <- lapply(ufo$Location, get.location)

  

# Convert the list into a matrix: do.call() calls a function with the list
# elements as its arguments, so rbind() stacks them as rows.
# transform() adds two new columns to ufo; tolower() lower-cases the state
# abbreviations so that they all share one format.

location.matrix <- do.call(rbind, city.state)
ufo <- transform(ufo, USCity = location.matrix[, 1],
                 USState = tolower(location.matrix[, 2]),
                 stringsAsFactors = FALSE)

  

# Identify non-US place names and set them to NA.
# The abbreviations are lower case because USState was lower-cased with
# tolower(); match() is case-sensitive.

us.states <- c("ak", "al", "ar", "az", "ca", "co", "ct", "de", "fl", "ga",
               "hi", "ia", "id", "il", "in", "ks", "ky", "la", "ma", "md",
               "me", "mi", "mn", "mo", "ms", "mt", "nc", "nd", "ne", "nh",
               "nj", "nm", "nv", "ny", "oh", "ok", "or", "pa", "ri", "sc",
               "sd", "tn", "tx", "ut", "va", "vt", "wa", "wi", "wv", "wy")
# match() returns NA for values that are not in us.states, so indexing with
# the result turns every non-US state entry into NA.
ufo$USState <- us.states[match(ufo$USState, us.states)]
ufo$USCity[is.na(ufo$USState)] <- NA

  

# Keep only the records located in the United States.

ufo.us <- subset(ufo, !is.na(USState))

  

# Analyse the time dimension.
# Preprocessing: get an overview of the date range.

summary(ufo.us$DateOccurred)
# NOTE(review): date_breaks = "years" places one axis break per year; use a
# wider interval if the axis is too crowded for the full date range.
quick.hist <- ggplot(ufo.us, aes(x = DateOccurred)) +
  geom_histogram() +
  scale_x_date(date_breaks = "years")
print(quick.hist)

  

# Keep only the data from 1990 onwards and plot it again.

ufo.us <- subset(ufo.us, DateOccurred >= as.Date("1990-01-01"))
quick.hist.new <- ggplot(ufo.us, aes(x = DateOccurred)) +
  geom_histogram() +
  scale_x_date(date_breaks = "years")
print(quick.hist.new)

  

# Count the number of sightings for each year-month.
# Reduce the dates to year-month strings and build a data frame with the
# number of sightings per state per month.
# Then generate a monthly sequence covering every month in the range and
# combine it with every state into a data frame.

# %Y (four-digit year) keeps the year-month strings in chronological order
# when they are sorted as text.
ufo.us$YearMonth <- strftime(ufo.us$DateOccurred, format = "%Y-%m")
sightings.counts <- ddply(ufo.us, .(USState, YearMonth), nrow)
date.range <- seq.Date(from = as.Date(min(ufo.us$DateOccurred)),
                       to = as.Date(max(ufo.us$DateOccurred)), by = "month")
date.strings <- strftime(date.range, "%Y-%m")
states.dates <- lapply(us.states, function(s) cbind(s, date.strings))
states.dates <- data.frame(do.call(rbind, states.dates),
                           stringsAsFactors = FALSE)

  

# Merge the two data frames: merge() joins them on matching columns, with
# by.x and by.y naming the key columns in each data frame.
# all = TRUE keeps unmatched rows and fills the missing values with NA.
# Then tidy up all.sightings: replace missing counts with 0 and convert the
# variable types.

all.sightings <- merge(states.dates, sightings.counts,
                       by.x = c("s", "date.strings"),
                       by.y = c("USState", "YearMonth"), all = TRUE)
names(all.sightings) <- c("State", "YearMonth", "Sightings")
all.sightings$Sightings[is.na(all.sightings$Sightings)] <- 0
# Relies on the merged rows being ordered by state and then by month.
all.sightings$YearMonth <- as.Date(rep(date.range, length(us.states)))
all.sightings$State <- as.factor(toupper(all.sightings$State))

  

# Analyse the data.
# geom_line() draws a line chart; facet_wrap() creates one panel per level of
# the categorical variable State.
# theme_bw() sets the plot's background theme.
# scale_color_manual() defines the colour for the string "darkblue" used in
# geom_line(), mapping it to the colour "darkblue".

state.plot <- ggplot(all.sightings, aes(x = YearMonth, y = Sightings)) +
  geom_line(aes(color = "darkblue")) +
  facet_wrap(~State, nrow = 10, ncol = 5) +
  theme_bw() +
  scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
  xlab("time") +
  ylab("Number of sightings") +
  ggtitle("Number of UFO sightings by Month-year and U.S. state (1990-2010)")
print(state.plot)

  

[Reading notes] machine learning: Practical Case Analysis (1)

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.