R Language Learning Note (14): Cluster analysis

Last Update: 2017-11-05 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

# Cluster analysis ----
# NOTE(review): the arguments to c() were lost in the source; a single-panel
# layout (1 x 1) is assumed here -- confirm against the original article.
par(mfrow = c(1, 1))

# Compute distances ----
# Package names are case-sensitive; install only when the package is missing
# so re-running the script does not reinstall it every time.
if (!requireNamespace("flexclust", quietly = TRUE)) {
  install.packages("flexclust")
}
data(nutrient, package = "flexclust")
head(nutrient, 4)

             energy protein fat calcium iron
BEEF BRAISED    340      20  28       9  2.6
HAMBURGER       245      21  17       9  2.7
BEEF ROAST      420      15  39       7  2.0
BEEF STEAK      375      19  32       9  2.6

# Distances between all pairs of rows of `nutrient` (dist() defaults to the
# Euclidean metric), then the top-left 4 x 4 corner shown as a plain matrix.
d <- dist(nutrient)
as.matrix(d)[1:4, 1:4]

             BEEF BRAISED HAMBURGER BEEF ROAST BEEF STEAK
BEEF BRAISED      0.00000   95.6400   80.93429   35.24202
HAMBURGER        95.64000    0.0000  176.49218  130.87784
BEEF ROAST       80.93429  176.4922    0.00000   45.76418
BEEF STEAK       35.24202  130.8778   45.76418    0.00000

# Hierarchical cluster analysis ----
# NOTE(review): the arguments to c() were lost in the source; a single-panel
# layout (1 x 1) is assumed -- confirm against the original article.
par(mfrow = c(1, 1))
data(nutrient, package = "flexclust")
# Lower-case the row labels so the dendrogram leaves are easier to read.
row.names(nutrient) <- tolower(row.names(nutrient))
# Standardise every column (mean 0, sd 1) so no single variable dominates
# the distance computation.
nutrient.scaled <- scale(nutrient)

d <- dist(nutrient.scaled)

# Average-linkage agglomerative clustering on the scaled distances.
fit.average <- hclust(d, method = "average")
# hang = -1 draws all leaf labels at the same height.
plot(fit.average, hang = -1, cex = .8, main = "average Linkage Clustering")

# Choosing the number of clusters ----
# Package and function names are case-sensitive: the package is "NbClust" and
# its main function is NbClust().
if (!requireNamespace("NbClust", quietly = TRUE)) {
  install.packages("NbClust")
}
library(NbClust)
# Pause for a key press before each new plot page that NbClust() draws.
devAskNewPage(ask = TRUE)
nc <- NbClust(
  nutrient.scaled,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 15,
  method = "average"
)
# Row 1 of Best.nc holds the cluster count proposed by each index; tabulate
# how many indices vote for each count. The full component name is used
# instead of relying on `$` partial matching.
table(nc$Best.nc[1, ])

barplot(
  table(nc$Best.nc[1, ]),
  xlab = "Number of Clusters",
  ylab = "number of Criteria",
  main = "number of Clusters Chosen by 26 Criteria "
)

# Obtaining the final cluster solution ----
# NOTE(review): the arguments to c() were lost in the source; a single-panel
# layout (1 x 1) is assumed -- confirm against the original article.
par(mfrow = c(1, 1))

# Cut the average-linkage tree into 5 groups and count members per group.
clusters <- cutree(fit.average, k = 5)
table(clusters)

 1  2  3  4  5
 7 16  1  2  1

# Median of every variable within each cluster, in the original units.
aggregate(nutrient, by = list(cluster = clusters), median)

  cluster energy protein fat calcium iron
1       1  340.0      19  29       9 2.50
2       2  170.0      20   8      13 1.45
3       3  160.0      26   5      14 5.90
4       4   57.5       9   1      78 5.70
5       5  180.0      22   9     367 2.50

# Same per-cluster medians on the standardised data. scale() returns a
# matrix, so convert it to a data frame before aggregating.
aggregate(as.data.frame(nutrient.scaled), by = list(cluster = clusters), median)

  cluster     energy    protein        fat    calcium        iron
1       1  1.3101024  0.0000000  1.3785620 -0.4480464  0.08110456
2       2 -0.3696099  0.2352002 -0.4869384 -0.3967868 -0.63743114
3       3 -0.4684165  1.6464016 -0.7534384 -0.3839719  2.40779157
4       4 -1.4811842 -2.3520023 -1.1087718  0.4361807  2.27092763
5       5 -0.2708033  0.7056007 -0.3981050  4.1396825  0.08110456

# Redraw the dendrogram, then outline the 5-cluster solution on top of it.
plot(
  fit.average,
  hang = -1,
  cex = .8,
  main = "Average Linkage clustering Cluster solution"
)

# rect.hclust() draws on the current plot, so it must follow plot().
rect.hclust(fit.average, k = 5)

# Partitioning cluster analysis ----
# Package names are case-sensitive ("rattle", "RGtk2", "httr"); install each
# one only when it is missing.
if (!requireNamespace("rattle", quietly = TRUE)) {
  install.packages("rattle")
}
# install.packages("RGtk2")
if (!requireNamespace("RGtk2", quietly = TRUE)) {
  # Windows binary built for R 3.3; repos = NULL tells install.packages() to
  # treat the argument as a file/URL rather than a package name.
  install.packages(
    "https://cran.r-project.org/bin/windows/contrib/3.3/RGtk2_2.20.31.zip",
    repos = NULL
  )
}
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library("rattle")
library("RGtk2")
library("httr")

# Download the UCI wine data set; fail loudly on an HTTP error instead of
# trying to parse an error page as CSV.
a <- GET("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data")
stop_for_status(a)
# Parse the response body as text; `text =` avoids leaving an unclosed
# textConnection behind. The file has no header row.
wine <- read.csv(
  text = content(a, as = "text", encoding = "UTF-8"),
  header = FALSE
)

names(wine) <- c(
  "Type", "Alcohol", "Malic acid", "ash", "alcalinity of Ash", "magnesium",
  "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins",
  "Color intensity", "Hue", "od280/od315 of diluted Wines", "proline"
)
# data(wine, package = "rattle")
head(wine)
# Drop the first column (the known wine type) and standardise the rest.
df <- scale(wine[-1])

# NOTE(review): wssplot() is not defined anywhere in this file; presumably it
# is the within-groups sum-of-squares helper from the source tutorial and
# must be defined before this line runs -- confirm.
wssplot(df)
library(NbClust)
set.seed(1234)
# Pause for a key press before each new plot page that NbClust() draws.
devAskNewPage(ask = TRUE)
# NbClust() matches `method` case-sensitively, so it must be "kmeans".
nc <- NbClust(df, min.nc = 2, max.nc = 15, method = "kmeans")
# Row 1 of Best.nc holds the cluster count proposed by each index.
table(nc$Best.nc[1, ])

barplot(
  table(nc$Best.nc[1, ]),
  xlab = "Number of Clusters",
  ylab = "number of Criteria",
  main = "number of Clusters Chosen by 26 Criteria "
)

# Fix the RNG so the random starting centres are reproducible.
set.seed(1234)
# 3 clusters; nstart = 25 tries 25 random initialisations and keeps the best.
fit.km <- kmeans(df, 3, nstart = 25)
fit.km$size
fit.km$centers

# Cross-tabulate the known wine type against the k-means assignment. The
# column is named "Type" (capital T); `$` lookup is case-sensitive.
ct.km <- table(wine$Type, fit.km$cluster)
ct.km

1 2 3
1 59 0 0
2 3 65 3
3 0 0 48

# Agreement between wine type and the k-means partition, computed from the
# cross-tabulation; the printed result is labelled ARI.
library(flexclust)
randIndex(ct.km)

     ARI
0.897495

# Partitioning around medoids (PAM) ----
library(cluster)
set.seed(1234)
# stand = TRUE standardises the variables before dissimilarities are computed.
fit.pam <- pam(wine[-1], k = 3, stand = TRUE)
# NOTE(review): the source read `$method`, which is not a component of a pam
# object; `$medoids` (the representative observations) is assumed to be what
# was intended, matching the repeated block later in the file -- confirm.
fit.pam$medoids

clusplot(fit.pam, main = "bivariate Cluster Plot")

library(flexclust)
# Build the cross-tabulation before using it; the source called randIndex()
# on ct.pam before the object had been created.
ct.pam <- table(wine$Type, fit.pam$clustering)
randIndex(ct.pam)

      ARI
0.6994957

# Partitioning around medoids (PAM) ----
library(cluster)
set.seed(1234)
# stand = TRUE standardises the variables before dissimilarities are computed.
fit.pam <- pam(wine[-1], k = 3, stand = TRUE)
# The medoids are the actual observations chosen to represent each cluster.
fit.pam$medoids
clusplot(fit.pam, main = "bivariate Cluster Plot2")

# Cross-tabulate the known wine type against the PAM assignment. The column
# is named "Type" (capital T); `$` lookup is case-sensitive.
ct.pam <- table(wine$Type, fit.pam$clustering)

# randIndex() comes from the flexclust package, which must be attached.
randIndex(ct.pam)

      ARI
0.6994957

# Avoiding nonexistent clusters ----
# Package name is case-sensitive ("fMultivar"); install only when missing.
if (!requireNamespace("fMultivar", quietly = TRUE)) {
  install.packages("fMultivar")
}
library(fMultivar)
set.seed(1234)
# 1000 draws from a bivariate normal with correlation 0.5: a single cloud
# with no real cluster structure.
df <- rnorm2d(1000, rho = .5)
df <- as.data.frame(df)
plot(df, main = "binariate Normal distribution with rho=0.5")

# wssplot(df)
library(NbClust)
# NbClust() matches `method` case-sensitively, so it must be "kmeans".
nc <- NbClust(df, min.nc = 2, max.nc = 15, method = "kmeans")
# Open a fresh graphics device so the bar plot does not overwrite the
# diagnostic plots NbClust() has just drawn.
dev.new()
# Row 1 of Best.nc holds the cluster count proposed by each index.
barplot(
  table(nc$Best.nc[1, ]),
  xlab = "Number of Clusters",
  ylab = "number of Criteria",
  main = "number of Clusters Chosen by 26 Criteria "
)

library(ggplot2)
library(cluster)
# Force a 2-cluster PAM solution onto the data and store the assignment as a
# factor so ggplot maps it to discrete colours and shapes.
fit <- pam(df, k = 2)
df$clustering <- factor(fit$clustering)
# NOTE(review): V1/V2 are the default names as.data.frame() gives to unnamed
# matrix columns; this assumes rnorm2d() returned a matrix without column
# names -- confirm with names(df).
ggplot(
  data = df,
  aes(x = V1, y = V2, color = clustering, shape = clustering)
) +
  geom_point() +
  ggtitle("Clustering of Bivariate Normal Data ")

# Plot column 4 of the NbClust index matrix (labelled CCC on the y axis)
# against the candidate cluster counts. type = "o" overplots points and
# lines; the upper-case "O" in the source is not a valid plot type.
plot(
  nc$All.index[, 4],
  type = "o",
  ylab = "CCC",
  xlab = "number of clusters",
  col = "Blue"
)

R Language Learning Note (14): Cluster analysis

This article is an English version of an article which was originally written in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

R Language Learning Note (14): Cluster analysis

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

R Language Learning Note (14): Cluster analysis

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support