Introduction
This semester also opened a multivariate statistical analysis course, also took the opportunity to take the class after the computer problem to achieve again, to enhance understanding.
The textbook uses the Johnson's "Multivariate Statistical analysis" Sixth edition, the Chinese and English edition textbooks, the dataset, the handout to see
Also referred to Wang Bin teacher's "Multivariate statistical analysis and R language Modeling"
This article is mainly the 4th chapter of the multivariate normal distribution on the computer problem, Tullo.
[See RMD documents] (Http://pan.baidu.com/s/1ntkuXQT)
Can be opened directly with Rstudio (install KNITR package before) 4.28
Data_4.28<-read.table ("E:\\ graduate student \" Applies multivariate statistics \\JohnsonWichern data sets\\t1-5. DAT ")
#正态Q-Q figure
Qqnorm (data_4.28$v2)
#正态性检验
#原始数据排序
new_data<-sort (data_4.28$v2)
Length (new_data)
#对应概率值
prob<-function (i,n=42) {#构建一个概率值的函数 return
((i-0.5)/n)
}
All_pro <-sapply (1:42,prob) #所有概率值
#对应的标准正态分位数
all_q<-qnorm (all_pro)
#Q-Q graph correlation coefficient
Rq<-cor ( NEW_DATA,ALL_Q)
the correlation coefficient of the #由于Q-Q graph is RQ 0.9693258, which is less than the n=40 corresponding critical point in table 4-2, thus rejecting the normality hypothesis.
4.29
# (a)
#计算样本协方差矩阵
S<-cov (Data_4.28[,5:6])
#s的逆
s_solve<-solve (s)
x_bar<-apply (Data_ 4.28[,5:6],margin=2,mean) #两列平均数
X_bar<-matrix (As.vector (X_bar), 42,2,by=2)
two_col<-t (data_4.28[, 5:6]-x_bar) #两列x-x_bar
#计算所用统计距离dis
dis<-c () for
(i-1:length (two_col[1,])) {
dis[i]<-t ( Two_col[,i])%*%s_solve%*%two_col[,i]
}
####################################
# (b)
#
chisq_num<-qchisq (0.5,2)
#所占比例
pro<-length (which (Dis<chisq_num)) with a threshold of 2 probability density of 0.5 for the card-square distribution /length (DIS)
####################################
# (c)
#对广义平方距离dis进行排序
Sort_data<-sort ( DIS)
the All_pro
#对应的自由度为2的卡方分位数
all_chiisq<-sapply (all_pro,qchisq,df=2) in #概率密度为4.28 #所有概率值
#画出卡方图 that is (all_chiisq,sort_data) corresponding scatter graph
library (ggplot2)
qplot (ALL_CHIISQ, Sort_data, geom= ' point ')
4.30
#读入数据 data_4.30_x1<-c (1:9,11) data_4.30_x2<-c (18.95,19.00,17.95,15.54,14.00,12.95,8.94,7.49,6.00,3.99) #
Build power Change function # #幂类变化函数 (Box-cox) box_cox<-function (x,λ) {if (λ==0) {return (log (x))}else{return ((x^λ-1)/λ)} } l_value<-function (X,lamda) {x_new<-sapply (X,BOX_COX,Λ=LAMDA) X_bar<-mean (x_new) l_val<-log (Mean (x_n Ew-x_bar) ^2)) * (-length (x_new)/2) + (lamda-1) *sum (log (x)) return (L_val)} #生成多个λ, to make L_value the largest λ_hat value Λ<-seq (- 1,2,0.1) All_l<-c () for (n in 1:length (λ)) {All_l[n]<-l_value (data_4.30_x1,lamda=λ[n])} #取使变化后的l_value最大的λ值 max_ Λ<-λ[which (All_l==max (all_l))] #进行数据幂变化 new_data<-sapply (data_4.30_x1,box_cox,λ=max_λ) #变化后的Q-Q map Qqnorm (new_ Data) ################################### # (b) #基本同 (a) title Λ<-seq ( -1,2,0.1) all_l<-c () for (n in 1:length (λ)) {all_l[ N]<-l_value (Data_4.30_x2,lamda=λ[n])} #取使变化后的l_value最大的λ值 Max_λ<-λ[which (All_l==max (all_l))] #进行数据幂变化 New_ Data<-sapply (data_4.30_x2,box_cox,λ=max_λ) #变化后的Q-Q figure Qqnorm (new_Data) ################################# # (c) slightly #题4.31-4.38 are #考虑边缘正态性 in accordance with the idea of solving the problem of 4.28-4.30: Do the q-q diagram to make a rough understanding then calculate the correlation coefficient of the q-q graph and compared with Table 4.2 in the book to determine whether to reject the assumption of normality #考虑二维正态性 using the 4.29 method to do the card map #变换可以采用平方根变换 logarithmic transformation Z transformation, see the Book p147 page, you can also use the power of 4.30, and then the transformed data painting Q-q diagram to judge.
4.39
Data_4.39<-read.table ("E:\\ graduate student \" Applies multivariate statistics \\JohnsonWichern data sets\\t4-6. DAT ") [, 1:5] # (a) #正态性检验 the correlation coefficient of #计算Q-Q graph encapsulates the code for question 4.28 norm_test<-function (data) {#原始数据排序 new_data<-sort (data) Len _data<-length (New_data) prob<-function (i,n) {#构建一个概率值的函数 return ((i-0.5)/N)} #对应概率值 all_pro<-sapply (1: Len_data,prob,n=len_data) #所有概率值 the correlation coefficient of #对应的标准正态分位数 all_q<-qnorm (All_pro) #Q-Q graph Return (Cor (NEW_DATA,ALL_Q))} # #对于 Independence #Q-Q figure Qqnorm (DATA_4.39$V1) #大部分在一条直线上 norm_test (DATA_4.39$V1) #在显著性水平为0.05, when n=150,
0.988 is less than the 0.9913 reject normality assumption in table 4.2. #也可以采用shapiro-wilk inspection #使用在mvnormtest包里mshapiro. Test, you can use Mshapiro.test to view how to use the method # #对于支撑力 Qqnorm (DATA_4.39$V2) Most of the time in a line norm_test (DATA_4.39$V2) #在显著性水平为0.05, when n=150, 0.989 is less than 0.9913 reject normality in Table 4.2. #对于仁爱心 Qqnorm (DATA_4.39$V3) #大部分在一条直线上 norm_test (Data_4.39$v3) #在显著性水平为0.05, when n=150, 0.993 is greater than 0.9913 in Table 4.2 does not reject the normality assumption #对于顺从性 Qqnorm (data_4.39$v4 #大部分在一条直线上 norm_test (DATA_4.39$V4) #在显著性水平为0.05, when n=150, 0.993 is greater than 0.9913 in Table 4.2 does not reject the normal assumption #对于领导能力 Qqnorm (dATA_4.39$V5) #大部分在一条直线上 norm_test (DATA_4.39$V5) #在显著性水平为0.05, when n=150, 0.981 is less than 0.9913 reject normality assumption in table 4.2 ##############
##################### # (b) # #使用卡方图进行判定 #构造画卡方图的函数 method the same title 4.29 chis_chart<-function (x) {#计算样本协方差矩阵 S<-cov (x) #s的逆 S_solve<-solve (s) x_bar<-apply (X,margin=2,mean) #两列平均数 two_col<-t (x-x_bar) #两列x-x_bar #计算所用统计距离dis dis<- C () for (I in 1:length (Two_col[1,])) {dis[i]<-t (Two_col[,i])%*%s_solve%*%two_col[,i]} #对广义平方距离dis进行排序 Sort_ Data<-sort (DIS) #prob在题4.28 Constructs All_pro<-sapply (1:length (x[,1), prob,n=130) #所有概率值 #对应的自由度为5的卡方分位数 all_chiisq& Lt;-sapply (all_pro,qchisq,df=5) #所有概率值 #画出卡方图 that is (all_chiisq,sort_data) the corresponding Scatter chart library (Ggplot2) Qplot (ALL_CHIISQ, Sor T_data, geom= ' point ')} chis_chart (data_4.39) #很明显, the dot on the card square is not close to a straight line, a curve, so the multivariate normality is not satisfied, it is known that the edge of the normal state is not satisfied with the situation, multivariate normality is rarely satisfied # # ############################### # (c) #在 (a), the distribution of independence, support and leadership does not conform to normality # #幂变化函数构造见题4. # #对于独立性 #生成多个λ, to maximize the λ_hat value of the L_value lambda <-seq ( -1,2,0.1) all_l<-c () for (N in 1:length (λ)) {All_l[n]<-l_value (data_4.39$v1,lamda=λ[n])} #取使变化后的l_value最大的λ值 Max_λ<-λ[which All_l==max (all _l)] #进行数据幂变化 new_data<-sapply (data_4.39$v1,box_cox,λ=max_λ) #变化后的Q-Q map Qqnorm (new_data) # #对于支撑力 all_l<-c () for (N in 1:length (λ)) {All_l[n]<-l_value (data_4.39$v2,lamda=λ[n])} #取使变化后的l_value最大的λ值 Max_λ<-λ[which (All_l==max (all_l))] # Data power Change new_data<-sapply (data_4.39$v2,box_cox,λ=max_λ) #变化后的Q-Q figure Qqnorm (new_data) # #对于领导力 All_l<-c () for (N in 1: Length (λ)) {All_l[n]<-l_value (data_4.39$v5,lamda=λ[n])} #取使变化后的l_value最大的λ值 Max_λ<-λ[which (All_l==max (all_l
)] #进行数据幂变化 new_data<-sapply (data_4.39$v5,box_cox,λ=max_λ) #变化后的Q-Q figure Qqnorm (New_data)
4.40
Data_4.40<-read.table ("E:\\ graduate student \" Applies multivariate statistics \\JohnsonWichern data sets\\t1-11. DAT ")
library (ggplot2)
#散点图检查
qplot (data_4.40$v1, Data_4.40$v2, geom= ' point ')
# You can see from the scatter chart that there is an outlier value
#标准化值来检查
cen_data<-scale (data_4.40)
#每一列的最大离群值为
Apply (ABS (Cen_data) in the X and Y axes, 2,max)
#与取标准化数据比较, Row 13th of the first column, row 7th of the second column has a larger deviation from the other data
# (b) (c) slightly 4.40