R Language Learning Notes (6): OLS Regression

Simple linear regression
> fit <- lm(weight ~ height, data=women)
> summary(fit)

Call:
lm(formula = weight ~ height, data = women)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.7333 -1.1333 -0.3833  0.7417  3.1167 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -87.51667    5.93694  -14.74 1.71e-09 ***
height        3.45000    0.09114   37.85 1.09e-14 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.525 on 13 degrees of freedom
Multiple R-squared: 0.991,     Adjusted R-squared: 0.9903 
F-statistic: 1433 on 1 and 13 DF,  p-value: 1.091e-14

> fitted(fit)       # list the fitted model's predicted values
       1        2        3        4        5        6        7        8 
112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333 
       9       10       11       12       13       14       15 
140.1833 143.6333 147.0833 150.5333 153.9833 157.4333 160.8833 

> residuals(fit)    # list the fitted model's residuals
          1           2           3           4           5           6 
 2.41666667  0.96666667  0.51666667  0.06666667 -0.38333333 -0.83333333 
          7           8           9          10          11          12 
-1.28333333 -1.73333333 -1.18333333 -1.63333333 -1.08333333 -0.53333333 
         13          14          15 
 0.01666667  1.56666667  3.11666667 

> plot(women$height, women$weight, xlab="Height (in inches)", ylab="Weight (in pounds)")
> abline(fit)
This gives the predictive regression equation: Weight = -87.52 + 3.45 * height
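The equation can be checked directly with predict(). A minimal sketch, assuming the fit object from above (the two heights are arbitrary example values):

> predict(fit, data.frame(height=c(60, 70)))
       1        2 
119.4833 153.9833 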
Polynomial regression
> fit2 <- lm(weight ~ height + I(height^2), data=women)
> summary(fit2)

Call:
lm(formula = weight ~ height + I(height^2), data = women)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.50941 -0.29611 -0.00941  0.28615  0.59706 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 261.87818   25.19677  10.393 2.36e-07 ***
height       -7.34832    0.77769  -9.449 6.58e-07 ***
I(height^2)   0.08306    0.00598  13.891 9.32e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3841 on 12 degrees of freedom
Multiple R-squared: 0.9995,    Adjusted R-squared: 0.9994 
F-statistic: 1.139e+04 on 2 and 12 DF,  p-value: < 2.2e-16

> plot(women$height, women$weight, xlab="Height (in inches)", ylab="Weight (in lbs)")
> lines(women$height, fitted(fit2))

Regression equation: weight = 261.88 - 7.35 * height + 0.083 * height^2
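Whether the quadratic term is worth keeping can also be tested formally with a nested-model comparison; a minimal sketch, assuming fit (the simple linear model) and fit2 from above are still in the workspace:

> anova(fit, fit2)   # a significant F test favors keeping the I(height^2) term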
Cubic polynomial regression
> fit3 <- lm(weight ~ height + I(height^2) + I(height^3), data=women)
> summary(fit3)

Call:
lm(formula = weight ~ height + I(height^2) + I(height^3), data = women)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.40677 -0.17391  0.03091  0.12051  0.42191 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)   
(Intercept) -8.967e+02  2.946e+02  -3.044  0.01116 * 
height       4.641e+01  1.366e+01   3.399  0.00594 **
I(height^2) -7.462e-01  2.105e-01  -3.544  0.00460 **
I(height^3)  4.253e-03  1.079e-03   3.940  0.00231 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2583 on 11 degrees of freedom
Multiple R-squared: 0.9998,    Adjusted R-squared: 0.9997 
F-statistic: 1.679e+04 on 3 and 11 DF,  p-value: < 2.2e-16

> plot(women$height, women$weight, xlab="Height", ylab="Weight")
> lines(women$height, fitted(fit3))

Regression equation: weight = -8.967e+02 + 4.641e+01 * height - 7.462e-01 * height^2 + 4.253e-03 * height^3
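An equivalent way to write the cubic model is poly(); with raw=TRUE it reproduces the same coefficients as the I() terms. A sketch, not part of the original session:

> fit3b <- lm(weight ~ poly(height, 3, raw=TRUE), data=women)
> coef(fit3b)   # matches coef(fit3) apart from the term names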
Multiple linear regression
> states <- as.data.frame(state.x77[, c("Murder", "Population", "Illiteracy", "Income", "Frost")])
> fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
> summary(fit)

Call:
lm(formula = Murder ~ Population + Illiteracy + Income + Frost, data = states)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.7960 -1.6495 -0.0811  1.4815  7.6210 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.235e+00  3.866e+00   0.319   0.7510    
Population  2.237e-04  9.052e-05   2.471   0.0173 *  
Illiteracy  4.143e+00  8.744e-01   4.738 2.19e-05 ***
Income      6.442e-05  6.837e-04   0.094   0.9253    
Frost       5.813e-04  1.005e-02   0.058   0.9541    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.535 on 45 degrees of freedom
Multiple R-squared: 0.567,     Adjusted R-squared: 0.5285 
F-statistic: 14.73 on 4 and 45 DF,  p-value: 9.133e-08
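A companion step usually done before fitting is to eyeball the bivariate relationships among the variables; a sketch using car's scatterplotMatrix() (an assumed extra step, not part of the session above):

> library(car)
> scatterplotMatrix(states, main="Scatter Plot Matrix")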
Multiple linear regression with interaction terms
> fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
> summary(fit)

Call:
lm(formula = mpg ~ hp + wt + hp:wt, data = mtcars)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.0632 -1.6491 -0.7362  1.4211  4.5513 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 49.80842    3.60516  13.816 5.01e-14 ***
hp          -0.12010    0.02470  -4.863 4.04e-05 ***
wt          -8.21662    1.26971  -6.471 5.20e-07 ***
hp:wt        0.02785    0.00742   3.753 0.000811 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.153 on 28 degrees of freedom
Multiple R-squared: 0.8848,    Adjusted R-squared: 0.8724 
F-statistic: 71.66 on 3 and 28 DF,  p-value: 2.981e-13

> library(effects)
> plot(effect("hp:wt", fit, , list(wt=c(2.2, 3.2, 4.2))), multiline=TRUE)
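Because of the interaction, the effect of hp on mpg depends on wt: the slope for hp is -0.12010 + 0.02785 * wt. A quick sketch evaluating it at the three wt values used in the effect plot:

> b <- coef(fit)
> b["hp"] + b["hp:wt"] * c(2.2, 3.2, 4.2)   # roughly -0.059, -0.031, -0.003

So the mileage penalty of extra horsepower shrinks as car weight increases.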
Regression diagnostics
> fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
> confint(fit)
                    2.5 %       97.5 %
(Intercept) -6.552191e+00 9.0213182149
Population   4.136397e-05 0.0004059867
Illiteracy   2.381799e+00 5.9038743192
Income      -1.312611e-03 0.0014414600
Frost       -1.966781e-02 0.0208304170

For example, when Illiteracy changes by 1%, the murder rate changes within the interval (2.38, 5.90) with 95% confidence.

> fit <- lm(weight ~ height, data=women)
> par(mfrow=c(2, 2))
> plot(fit)

Reading the four diagnostic plots: Residuals vs Fitted checks linearity, Normal Q-Q checks normality, and Scale-Location checks homoscedasticity (constant variance).
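The same assumptions can be tested in one shot with the gvlma package's global validation; a sketch (gvlma is an extra package, not part of the original session):

> install.packages("gvlma")
> library(gvlma)
> gvmodel <- gvlma(fit)
> summary(gvmodel)   # global test plus checks of skewness, kurtosis, and heteroscedasticity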
> fit2 <- lm(weight ~ height + I(height^2), data=women)
> par(mfrow=c(2, 2))
> plot(fit2)
> newfit2 <- lm(weight ~ height + I(height^2), data=women[-c(13, 15), ])   # drop the outlying points 13 and 15
> par(mfrow=c(2, 2))
> plot(newfit2)
Detecting unusual observations
# Outliers
> library(car)
> outlierTest(fit)
       rstudent unadjusted p-value Bonferonni p
Nevada 3.542929         0.00095088     0.047544

# High-leverage points
hat.plot <- function(fit) {
  p <- length(coefficients(fit))
  n <- length(fitted(fit))
  plot(hatvalues(fit), main="Index Plot of Hat Values")
  abline(h=c(2, 3)*p/n, col="red", lty=2)
  identify(1:n, hatvalues(fit), names(hatvalues(fit)))
}
hat.plot(fit)
# Influential points: Cook's distance
cutoff <- 4/(nrow(states) - length(fit$coefficients) - 2)
plot(fit, which=4, cook.levels=cutoff)
abline(h=cutoff, lty=2, col="red")
# Added-variable plots
library(car)
avPlots(fit, ask=FALSE, id.method="identify")
# Combined view of outlyingness, leverage, and influence
library(car)
influencePlot(fit, id.method="identify", main="Influence Plot",
              sub="Circle size is proportional to Cook's distance")
Improvement measures
# Variable transformation
> library(car)
> summary(powerTransform(states$Murder))
bcPower Transformation to Normality 

              Est.Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
states$Murder    0.6055           1       0.0884       1.1227

Likelihood ratio tests about transformation parameters
                           LRT df       pval
LR test, lambda = (0) 5.665991  1 0.01729694
LR test, lambda = (1) 2.122763  1 0.14512456

The estimated power 0.6055 suggests normalizing the variable Murder with a power of about 0.6, e.g. Murder^0.6.
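If the transformation were adopted, the response would simply be raised to the suggested power and the model refit; a minimal sketch (the 0.6 exponent comes from the estimate above):

> tfit <- lm(I(Murder^0.6) ~ Population + Illiteracy + Income + Frost, data=states)
> summary(tfit)

Note, however, that the LR test for lambda = 1 is not significant (p = 0.145), so leaving Murder untransformed is also defensible.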
Choosing the best regression model
# Drop predictor variables that are not needed
> fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
> fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
> anova(fit2, fit1)
> AIC(fit1, fit2)    # the smaller the AIC, the better the model
     df      AIC
fit1  6 241.6429
fit2  4 237.6565

Stepwise regression searches for the best model by adding or deleting one variable at a time:

> library(MASS)
> fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
> stepAIC(fit, direction="backward")
Start:  AIC=97.75
Murder ~ Population + Illiteracy + Income + Frost

             Df Sum of Sq    RSS     AIC
- Frost       1     0.021 289.19  95.753
- Income      1     0.057 289.22  95.759
<none>                    289.17  97.749
- Population  1    39.238 328.41 102.111
- Illiteracy  1   144.264 433.43 115.986

Step:  AIC=95.75
Murder ~ Population + Illiteracy + Income

             Df Sum of Sq    RSS     AIC
- Income      1     0.057 289.25  93.763
<none>                    289.19  95.753
- Population  1    43.658 332.85 100.783
- Illiteracy  1   236.196 525.38 123.605

Step:  AIC=93.76
Murder ~ Population + Illiteracy

             Df Sum of Sq    RSS     AIC
<none>                    289.25  93.763
- Population  1    48.517 337.76  99.516
- Illiteracy  1   299.646 588.89 127.311

The best model's formula and coefficients:

Call:
lm(formula = Murder ~ Population + Illiteracy, data = states)

Coefficients:
(Intercept)   Population   Illiteracy  
  1.6515497    0.0002242    4.0807366  

Variable selection with all-subsets regression:

> install.packages("leaps")
> library(leaps)
> states <- as.data.frame(state.x77[, c("Murder", "Population", "Illiteracy", "Income", "Frost")])
> leaps <- regsubsets(Murder ~ Population + Illiteracy + Income + Frost, data=states, nbest=4)
> plot(leaps, scale="adjr2")
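The same search can be read off numerically; a sketch, assuming the leaps object from above:

> best <- summary(leaps)
> best$which    # which predictors enter each candidate model
> best$adjr2    # adjusted R-squared of each candidate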
> library(car)
> subsets(leaps, statistic="cp", main="Cp Plot for All Subsets Regression")
> abline(1, 1, lty=2, col="red")   # the closer a model lies to the line, the better
# K-fold cross-validation of R-squared
install.packages("bootstrap")
shrinkage <- function(fit, k=10) {
  require(bootstrap)
  theta.fit <- function(x, y) { lsfit(x, y) }
  theta.predict <- function(fit, x) { cbind(1, x) %*% fit$coef }
  x <- fit$model[, 2:ncol(fit$model)]
  y <- fit$model[, 1]
  results <- crossval(x, y, theta.fit, theta.predict, ngroup=k)
  r2 <- cor(y, fit$fitted.values)^2
  r2cv <- cor(y, results$cv.fit)^2
  cat("Original R-square =", r2, "\n")
  cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
  cat("Change =", r2 - r2cv, "\n")
}

> fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
> shrinkage(fit)
Original R-square = 0.5669502 
10 Fold Cross-Validated R-square = 0.4474345 
Change = 0.1195158 

> fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
> shrinkage(fit2)
Original R-square = 0.5668327 
10 Fold Cross-Validated R-square = 0.5182764 
Change = 0.0485563 

Relative importance: detecting how important each variable is in the model. First, standardized regression coefficients:

> zstates <- as.data.frame(scale(states))
> zfit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data=zstates)
> coef(zfit)
  (Intercept)    Population        Income    Illiteracy         Frost 
-2.054026e-16  2.705095e-01  1.072372e-02  6.840496e-01  8.185407e-03 

Illiteracy has the largest standardized coefficient, so it carries the greatest weight among the model's predictors.

# Relative weights of the predictor variables
relweights <- function(fit, ...) {
  R <- cor(fit$model)
  nvar <- ncol(R)
  rxx <- R[2:nvar, 2:nvar]
  rxy <- R[2:nvar, 1]
  svd <- eigen(rxx)
  evec <- svd$vectors
  ev <- svd$values
  delta <- diag(sqrt(ev))
  lambda <- evec %*% delta %*% t(evec)
  lambdasq <- lambda^2
  beta <- solve(lambda) %*% rxy
  rsquare <- colSums(beta^2)
  rawwgt <- lambdasq %*% beta^2
  import <- (rawwgt / rsquare) * 100
  import <- as.data.frame(import)
  row.names(import) <- names(fit$model[2:nvar])
  names(import) <- "Weights"
  import <- import[order(import), 1, drop=FALSE]
  dotchart(import$Weights, labels=row.names(import),
           xlab="% of R-Square", pch=19,
           main="Relative Importance of Predictor Variables",
           sub=paste("Total R-Square =", round(rsquare, digits=3)), ...)
  return(import)
}

fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
relweights(fit, col="blue")