Multiple regression with pr, spr, zero-order r
# multiple regression: a simple e.g.
#
rm(list=ls())
d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
d
colnames(d) <- c("y", "x1", "x2")
d
attach(d)   # attach so that y, x1, x2 can be used directly below

lm.y.x1 <- lm(y ~ x1, data=d)
summary(lm.y.x1)
lm.y.x2 <- lm(y ~ x2, data=d)
summary(lm.y.x2)
lm.y.x1x2 <- lm(y ~ x1 + x2, data=d)
summary(lm.y.x1x2)

lm.y.x1x2$coefficient
# y.hat = 6.399103 + (0.011841)*x1 + (-0.544727)*x2
a  <- lm.y.x1x2$coefficient[1]
b1 <- lm.y.x1x2$coefficient[2]
b2 <- lm.y.x1x2$coefficient[3]

y.pred <- a + (b1 * x1) + (b2 * x2)
y.pred
y.real <- y
y.real
y.mean <- mean(y)
y.mean

res <- y.real - y.pred   # residual part
reg <- y.pred - y.mean   # regression (explained) part
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)

ss.tot <- var(y) * (length(y)-1)
ss.tot
ss.res
ss.reg
ss.res + ss.reg   # should equal ss.tot

# slope test
summary(lm.y.x1x2)
# note on 2 t-tests

# beta coefficient (standardized b)
# beta <- b * (sd(x)/sd(y))
beta1 <- b1 * (sd(x1)/sd(y))
beta2 <- b2 * (sd(x2)/sd(y))
beta1
beta2

# install.packages("lm.beta")
library(lm.beta)
lm.beta(lm.y.x1x2)

#######################################################
# partial correlation coefficient and pr2
# x2's explanation?
lm.tmp.1 <- lm(x2 ~ x1, data=d)
res.x2.x1 <- lm.tmp.1$residuals

lm.tmp.2 <- lm(y ~ x1, data=d)
res.y.x1 <- lm.tmp.2$residuals

lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
summary(lm.tmp.3)

# install.packages("ppcor")
library(ppcor)
pcor(d)
spcor(d)
partial.r <- pcor.test(y, x2, x1)
partial.r
str(partial.r)
partial.r$estimate^2

# x1's own explanation?
lm.tmp.4 <- lm(x1 ~ x2, data=d)
res.x1.x2 <- lm.tmp.4$residuals

lm.tmp.5 <- lm(y ~ x2, data=d)
res.y.x2 <- lm.tmp.5$residuals

lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
summary(lm.tmp.6)

partial.r2 <- pcor.test(y, x1, x2)
str(partial.r2)
partial.r2$estimate^2

#######################################################
# semipartial correlation coefficient and spr2
#
spr.1 <- spcor.test(y, x2, x1)
spr.2 <- spcor.test(y, x1, x2)
spr.1
spr.2
spr.1$estimate^2
spr.2$estimate^2

lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
summary(lm.tmp.7)

#######################################################
# get the common area that explains the y variable
# 1.
summary(lm.y.x2)
all.x2 <- summary(lm.y.x2)$r.squared
sp.x2 <- spr.1$estimate^2
all.x2
sp.x2
cma.1 <- all.x2 - sp.x2
cma.1

# 2.
summary(lm.y.x1)
all.x1 <- summary(lm.y.x1)$r.squared
sp.x1 <- spr.2$estimate^2
all.x1
sp.x1
cma.2 <- all.x1 - sp.x1
cma.2

# OR 3.
summary(lm.y.x1x2)
r2.y.x1x2 <- summary(lm.y.x1x2)$r.squared
r2.y.x1x2
sp.x1
sp.x2
cma.3 <- r2.y.x1x2 - (sp.x1 + sp.x2)
cma.3

cma.1
cma.2
cma.3
# Note that sorting out the unique and common
# explanation areas is only possible with the
# semi-partial correlation determinants,
# NOT the partial correlation determinants,
# because only the semi-partial correlations
# share the same denominator (the total
# variance of y).
#############################################
Output
Multiple regression
> lm.y.x1x2 <- lm(y ~ x1+x2, data=d)
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.399103   1.516539   4.220  0.00394 **
x1           0.011841   0.003561   3.325  0.01268 * 
x2          -0.544727   0.226364  -2.406  0.04702 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9301 on 7 degrees of freedom
Multiple R-squared:  0.7981,	Adjusted R-squared:  0.7404 
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> lm.y.x1x2$coefficient
(Intercept)          x1          x2 
 6.39910298  0.01184145 -0.54472725 
> # y.hat = 6.399103 + (0.011841)*x1 + (-0.544727)*x2
> a <- lm.y.x1x2$coefficient[1]
> b1 <- lm.y.x1x2$coefficient[2]
> b2 <- lm.y.x1x2$coefficient[3]
> 
> y.pred <- a + (b1 * x1) + (b2 * x2)
> y.pred
 [1]  6.280586  5.380616  7.843699  6.588485  9.217328 10.022506
 [7]  7.251626  9.809401  9.643641  7.962113
> y.real <- y
> y.real
 [1]  6  5  7  7  8 10  8 11  9  9
> y.mean <- mean(y)
> y.mean
[1] 8
> 
> res <- y.real - y.pred
> reg <- y.pred - y.mean
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> 
> ss.tot <- var(y) * (length(y)-1)
> ss.tot
[1] 30
> ss.res
[1] 6.056235
> ss.reg
[1] 23.94376
> ss.res+ss.reg
[1] 30
> 
> # slope test
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.399103   1.516539   4.220  0.00394 **
x1           0.011841   0.003561   3.325  0.01268 * 
x2          -0.544727   0.226364  -2.406  0.04702 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9301 on 7 degrees of freedom
Multiple R-squared:  0.7981,	Adjusted R-squared:  0.7404 
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> # note on 2 t-tests
> 
> # beta coefficient (standardized b)
> # beta <- b * (sd(x)/sd(y))
> beta1 <- b1 * (sd(x1)/sd(y))
> beta2 <- b2 * (sd(x2)/sd(y))
> beta1
      x1 
0.616097 
> beta2
        x2 
-0.4458785 
> 
> # install.packages("lm.beta")
> library(lm.beta)
> lm.beta(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Standardized Coefficients::
(Intercept)          x1          x2 
         NA   0.6160970  -0.4458785 

> #######################################################
> # partial correlation coefficient and pr2
> # x2's explanation?
> lm.tmp.1 <- lm(x2~x1, data=d)
> res.x2.x1 <- lm.tmp.1$residuals
> 
> lm.tmp.2 <- lm(y~x1, data=d)
> res.y.x1 <- lm.tmp.2$residuals
> 
> lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept) 6.281e-18  2.751e-01   0.000    1.000  
res.x2.x1  -5.447e-01  2.117e-01  -2.573    0.033 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8701 on 8 degrees of freedom
Multiple R-squared:  0.4527,	Adjusted R-squared:  0.3843 
F-statistic: 6.618 on 1 and 8 DF,  p-value: 0.033

> # install.packages("ppcor")
> library(ppcor)
> pcor(d)
$estimate
            y        x1         x2
y   1.0000000 0.7825112 -0.6728560
x1  0.7825112 1.0000000  0.3422911
x2 -0.6728560 0.3422911  1.0000000

$p.value
            y         x1         x2
y  0.00000000 0.01267595 0.04702022
x1 0.01267595 0.00000000 0.36723388
x2 0.04702022 0.36723388 0.00000000

$statistic
           y        x1         x2
y   0.000000 3.3251023 -2.4064253
x1  3.325102 0.0000000  0.9638389
x2 -2.406425 0.9638389  0.0000000

$n
[1] 10

$gp
[1] 1

$method
[1] "pearson"

> spcor(d)
$estimate
            y        x1         x2
y   1.0000000 0.5646726 -0.4086619
x1  0.7171965 1.0000000  0.2078919
x2 -0.6166940 0.2470028  1.0000000

$p.value
            y       x1        x2
y  0.00000000 0.113182 0.2748117
x1 0.02964029 0.000000 0.5914441
x2 0.07691195 0.521696 0.0000000

$statistic
           y        x1         x2
y   0.000000 1.8101977 -1.1846548
x1  2.722920 0.0000000  0.5623159
x2 -2.072679 0.6744045  0.0000000

$n
[1] 10

$gp
[1] 1

$method
[1] "pearson"

> partial.r <- pcor.test(y, x2, x1)
> partial.r
   estimate    p.value statistic  n gp  Method
1 -0.672856 0.04702022 -2.406425 10  1 pearson
> str(partial.r)
'data.frame':	1 obs. of  6 variables:
 $ estimate : num -0.673
 $ p.value  : num 0.047
 $ statistic: num -2.41
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate^2
[1] 0.4527352
> 
> # x1's own explanation?
> lm.tmp.4 <- lm(x1~x2, data=d)
> res.x1.x2 <- lm.tmp.4$residuals
> 
> lm.tmp.5 <- lm(y~x2, data=d)
> res.y.x2 <- lm.tmp.5$residuals
> 
> lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
> summary(lm.tmp.6)

Call:
lm(formula = res.y.x2 ~ res.x1.x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) 1.330e-17  2.751e-01   0.000  1.00000   
res.x1.x2   1.184e-02  3.331e-03   3.555  0.00746 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8701 on 8 degrees of freedom
Multiple R-squared:  0.6123,	Adjusted R-squared:  0.5639 
F-statistic: 12.64 on 1 and 8 DF,  p-value: 0.007458

> partial.r2 <- pcor.test(y, x1, x2)
> str(partial.r2)
'data.frame':	1 obs. of  6 variables:
 $ estimate : num 0.783
 $ p.value  : num 0.0127
 $ statistic: num 3.33
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r2$estimate^2
[1] 0.6123238
> #######################################################
> 
> # semipartial correlation coefficient and spr2
> #
> spr.1 <- spcor.test(y,x2,x1)
> spr.2 <- spcor.test(y,x1,x2)
> spr.1
    estimate   p.value statistic  n gp  Method
1 -0.4086619 0.2748117 -1.184655 10  1 pearson
> spr.2
   estimate  p.value statistic  n gp  Method
1 0.5646726 0.113182  1.810198 10  1 pearson
> spr.1$estimate^2
[1] 0.1670045
> spr.2$estimate^2
[1] 0.3188552
> 
> lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
> summary(lm.tmp.7)

Call:
lm(formula = y ~ res.x2.x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8617 -1.1712 -0.4940  0.5488  3.0771 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   8.0000     0.5589  14.314 5.54e-07 ***
res.x2.x1    -0.5447     0.4301  -1.266    0.241    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.767 on 8 degrees of freedom
Multiple R-squared:  0.167,	Adjusted R-squared:  0.06288 
F-statistic: 1.604 on 1 and 8 DF,  p-value: 0.241

> #######################################################
> 
> # get the common area that explains the y variable
> # 1.
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2537 -0.8881 -0.4851  0.4963  2.5920 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  10.7910     1.1195   9.639 1.12e-05 ***
x2           -0.8458     0.3117  -2.713   0.0265 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.397 on 8 degrees of freedom
Multiple R-squared:  0.4793,	Adjusted R-squared:  0.4142 
F-statistic: 7.363 on 1 and 8 DF,  p-value: 0.02651

> all.x2 <- summary(lm.y.x2)$r.squared
> sp.x2 <- spr.1$estimate^2
> all.x2
[1] 0.4792703
> sp.x2
[1] 0.1670045
> cma.1 <- all.x2 - sp.x2
> cma.1
[1] 0.3122658
> 
> # 2.
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311,	Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046

> all.x1 <- summary(lm.y.x1)$r.squared
> sp.x1 <- spr.2$estimate^2
> all.x1
[1] 0.631121
> sp.x1
[1] 0.3188552
> cma.2 <- all.x1 - sp.x1
> cma.2
[1] 0.3122658
> 
> # OR 3.
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.399103   1.516539   4.220  0.00394 **
x1           0.011841   0.003561   3.325  0.01268 * 
x2          -0.544727   0.226364  -2.406  0.04702 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9301 on 7 degrees of freedom
Multiple R-squared:  0.7981,	Adjusted R-squared:  0.7404 
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
> r2.y.x1x2
[1] 0.7981255
> sp.x1
[1] 0.3188552
> sp.x2
[1] 0.1670045
> cma.3 <- r2.y.x1x2 - (sp.x1 + sp.x2)
> cma.3
[1] 0.3122658
> 
> cma.1
[1] 0.3122658
> cma.2
[1] 0.3122658
> cma.3
[1] 0.3122658
> # Note that sorting out unique and common
> # explanation area is only possible with
> # semi-partial correlation determinant
> # NOT partial correlation determinant
> # because only semi-partial correlation
> # shares the same denominator (as total
> # y).
> #############################################
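A quick way to see how the two determinants relate is to note that they differ only in their denominators: the squared partial correlation divides the uniquely explained area by the variance of y left over after the other predictor, while the squared semipartial divides by the total variance of y. The following is a minimal sketch of that identity, assuming the objects from the session above (spr.1, spr.2, partial.r, partial.r2, all.x1, all.x2) are still in the workspace:

# pr2 = spr2 / (1 - r2 of the other predictor alone)
spr.1$estimate^2 / (1 - all.x1)   # 0.1670045 / (1 - 0.631121)  = 0.4527352
partial.r$estimate^2              # 0.4527352, the pr2 for x2
spr.2$estimate^2 / (1 - all.x2)   # 0.3188552 / (1 - 0.4792703) = 0.6123238
partial.r2$estimate^2             # 0.6123238, the pr2 for x1

This is exactly why only the semipartial r2 values can be added, together with the common area, to recover the overall R-squared.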
Simple regression
> # multiple regression: a simple e.g.
> rm(list=ls())
> d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
> d
   bankaccount income famnum
1            6    220      5
2            5    190      6
3            7    260      3
4            7    200      4
5            8    330      2
6           10    490      4
7            8    210      3
8           11    380      2
9            9    320      1
10           9    270      3
> colnames(d) <- c("y", "x1", "x2")
> d
    y  x1 x2
1   6 220  5
2   5 190  6
3   7 260  3
4   7 200  4
5   8 330  2
6  10 490  4
7   8 210  3
8  11 380  2
9   9 320  1
10  9 270  3
> attach(d)
> lm.y.x1 <- lm(y ~ x1, data=d)
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311,	Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046
In simple regression, the F-test and the t-test (the slope test) are essentially the same test, because only one independent variable contributes to the F-test, and that same variable is the one examined in the slope (t) test. This also shows in the relationship between the t value of the t-test and the F value of the F-test:
$$ t^2 = F $$
> t.cal <- 3.7
> t.cal^2
[1] 13.69
> F.cal <- 13.69
> F.cal
[1] 13.69
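The same check can be made without retyping the rounded values. The sketch below, assuming lm.y.x1 from the session above is available (t.val and f.val are just illustrative names), pulls t and F directly out of the summary object:

t.val <- summary(lm.y.x1)$coefficients["x1", "t value"]
f.val <- summary(lm.y.x1)$fstatistic["value"]
t.val^2   # 13.68733
f.val     # 13.68733, identical apart from rounding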
As explained under Simple regression, the t-test on the slope tests whether the slope contributes to explaining the variability of y (its fluctuation around the mean). If the slope is 0, this is equivalent to using the mean alone (the mean line has a slope of 0), which means the slope has no effect. Therefore, we t-test whether the difference between b and zero is statistically significant:
$$ \text{t calculated value} = \frac {b - 0}{se} $$
As mentioned above, $se$ is obtained as follows:
\begin{eqnarray*}
se & = & \sqrt{\frac{1}{n-2} \cdot \frac{\text{SSE}}{\text{SSx}}} \\
   & = & \sqrt{\frac{\text{MSE}}{\text{SSx}}} \\
\text{note that MSE} & = & \text{mean square error} \\
   & = & \text{ms.res}
\end{eqnarray*}
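As a sanity check, this formula should reproduce the Std. Error that summary() reports for x1. A brief sketch, assuming lm.y.x1 and the attached x1 from above (se.check is an illustrative name):

se.check <- sqrt(sum(lm.y.x1$residuals^2) / (length(x1)-2) / sum((x1-mean(x1))^2))
se.check                                            # 0.004127...
summary(lm.y.x1)$coefficients["x1", "Std. Error"]   # the same value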
The p-value for the t value obtained above can be computed in R:
summary(lm.y.x1)
n <- length(y)
k <- 1   # num of predictor variables
sse <- sum(lm.y.x1$residuals^2)   # ss.res
ssx1 <- sum((x1-mean(x1))^2)
b <- lm.y.x1$coefficient[2]
se <- sqrt((1/(n-2))*(sse/ssx1))
t.b.cal <- (b - 0) / se
t.b.cal
p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)
p.value

# check
t.b.cal
f.cal <- t.b.cal^2
f.cal
p.value
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311,	Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046

> n <- length(y)
> k <- 1   # num of predictor variables
> sse <- sum(lm.y.x1$residuals^2)
> ssx1 <- sum((x1-mean(x1))^2)
> b <- lm.y.x1$coefficient[2]
> se <- sqrt((1/(n-2))*(sse/ssx1))
> t.b.cal <- (b - 0) / se
> t.b.cal
      x1 
3.699639 
> p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)
> 
> # check
> t.b.cal
      x1 
3.699639 
> t.b.cal^2
      x1 
13.68733 
> p.value
         x1 
0.006045749 
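Since the t-test compares b against 0, the matching 95% confidence interval for the slope should exclude 0. A sketch, assuming b, se, n, and k from the code above (t.crit is an illustrative name); confint() is the built-in equivalent:

t.crit <- qt(0.975, n - k - 1)        # critical t with n-k-1 = 8 df
c(b - t.crit * se, b + t.crit * se)   # manual 95% CI for the slope
confint(lm.y.x1, "x1", level=0.95)    # R's built-in CI, same interval

Both intervals exclude 0, which agrees with the significant slope test above.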
> lm.y.x2 <- lm(y ~ x2, data=d)
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2537 -0.8881 -0.4851  0.4963  2.5920 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  10.7910     1.1195   9.639 1.12e-05 ***
x2           -0.8458     0.3117  -2.713   0.0265 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.397 on 8 degrees of freedom
Multiple R-squared:  0.4793,	Adjusted R-squared:  0.4142 
F-statistic: 7.363 on 1 and 8 DF,  p-value: 0.02651
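For reference, the zero-order correlations named in the section title are the plain bivariate correlations, with nothing partialled out. A short sketch showing that each squared zero-order r reproduces the R-squared of the corresponding simple regression above:

cor(d)             # zero-order correlation matrix of y, x1, x2
cor(d$y, d$x1)^2   # 0.631121  = Multiple R-squared of lm.y.x1
cor(d$y, d$x2)^2   # 0.4792703 = Multiple R-squared of lm.y.x2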