Multiple regression with pr, spr, zero-order r
# multiple regression: a simple example
#
#
rm(list=ls())
d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
d
colnames(d) <- c("y", "x1", "x2")
d
# attach(d)
lm.y.x1 <- lm(y ~ x1, data=d)
summary(lm.y.x1)
lm.y.x2 <- lm(y ~ x2, data=d)
summary(lm.y.x2)
lm.y.x1x2 <- lm(y ~ x1+x2, data=d)
summary(lm.y.x1x2)
lm.y.x1x2$coefficients
# y.hat = 6.399103 + (0.011841)*x1 + (-0.544727)*x2
a <- lm.y.x1x2$coefficients[1]
b1 <- lm.y.x1x2$coefficients[2]
b2 <- lm.y.x1x2$coefficients[3]
y.pred <- a + (b1 * d$x1) + (b2 * d$x2)
y.pred
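# Sanity check (an added sketch, not part of the original listing): the
# hand-computed predictions should match the fitted values that lm() stores.
lm.y.x1x2$fitted.values
all.equal(as.numeric(y.pred), as.numeric(lm.y.x1x2$fitted.values))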
y.real <- d$y
y.real
y.mean <- mean(d$y)
y.mean
res <- y.real - y.pred
reg <- y.pred - y.mean
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.tot <- var(d$y) * (length(d$y)-1)
ss.tot
ss.res
ss.reg
ss.res+ss.reg
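# Check (an added sketch): R-squared can be recovered from these sums of
# squares as ss.reg / ss.tot; it should match the value reported by summary().
ss.reg / ss.tot
summary(lm.y.x1x2)$r.squared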
# slope test
summary(lm.y.x1x2)
# note: there are now 2 slope t-tests, one for each predictor
# beta coefficient (standardized b)
# beta <- b * (sd(x)/sd(y))
beta1 <- b1 * (sd(d$x1)/sd(d$y))
beta2 <- b2 * (sd(d$x2)/sd(d$y))
beta1
beta2
# install.packages("lm.beta")
library(lm.beta)
lm.beta(lm.y.x1x2)
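# Cross-check (an added sketch): the same standardized coefficients can be
# obtained by refitting the model on z-scored variables with scale().
lm.std <- lm(scale(y) ~ scale(x1) + scale(x2), data=d)
coef(lm.std) # the slopes should equal beta1 and beta2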
#######################################################
# partial correlation coefficient and pr2
# x2's unique explanation of y (controlling for x1)?
lm.tmp.1 <- lm(x2~x1, data=d)
res.x2.x1 <- lm.tmp.1$residuals
lm.tmp.2 <- lm(y~x1, data=d)
res.y.x1 <- lm.tmp.2$residuals
lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1)
summary(lm.tmp.3)
# install.packages("ppcor")
library(ppcor)
pcor(d)
spcor(d)
partial.r <- pcor.test(d$y, d$x2, d$x1)
partial.r
str(partial.r)
partial.r$estimate^2
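# Cross-check (an added sketch): the partial correlation is just the
# zero-order correlation between the two residual series computed above.
cor(res.y.x1, res.x2.x1) # should equal partial.r$estimate
cor(res.y.x1, res.x2.x1)^2 # and its square is pr2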
# x1's unique explanation of y (controlling for x2)?
lm.tmp.4 <- lm(x1~x2, data=d)
res.x1.x2 <- lm.tmp.4$residuals
lm.tmp.5 <- lm(y~x2, data=d)
res.y.x2 <- lm.tmp.5$residuals
lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2)
summary(lm.tmp.6)
partial.r2 <- pcor.test(d$y, d$x1, d$x2)
str(partial.r2)
partial.r2$estimate^2
#######################################################
# semipartial correlation coefficient and spr2
#
spr.1 <- spcor.test(d$y, d$x2, d$x1)
spr.2 <- spcor.test(d$y, d$x1, d$x2)
spr.1
spr.2
spr.1$estimate^2
spr.2$estimate^2
lm.tmp.7 <- lm(d$y ~ res.x2.x1)
summary(lm.tmp.7)
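# Cross-check (an added sketch): the semipartial correlation is the plain
# correlation between y itself and the residualized predictor, so its
# square is the R-squared of lm.tmp.7 above.
cor(d$y, res.x2.x1) # should equal spr.1$estimate
cor(d$y, res.x2.x1)^2 # spr2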
#######################################################
# get the common area that explains the y variable
# 1.
summary(lm.y.x2)
all.x2 <- summary(lm.y.x2)$r.squared
sp.x2 <- spr.1$estimate^2
all.x2
sp.x2
cma.1 <- all.x2 - sp.x2
cma.1
# 2.
summary(lm.y.x1)
all.x1 <- summary(lm.y.x1)$r.squared
sp.x1 <- spr.2$estimate^2
all.x1
sp.x1
cma.2 <- all.x1 - sp.x1
cma.2
# OR 3.
summary(lm.y.x1x2)
r2.y.x1x2 <- summary(lm.y.x1x2)$r.squared
r2.y.x1x2
sp.x1
sp.x2
cma.3 <- r2.y.x1x2 - (sp.x1 + sp.x2)
cma.3
cma.1
cma.2
cma.3
# Note that sorting out unique and common
# explanation areas is only possible with
# the squared semipartial correlation,
# NOT the squared partial correlation,
# because only the semipartial correlation
# shares the same denominator (the total
# variance of y).
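# Demonstration (an added sketch): the two unique areas plus the common
# area obtained independently in cma.1 should reassemble the full model
# R-squared.
sp.x1 + sp.x2 + cma.1 # should equal r2.y.x1x2
r2.y.x1x2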
#############################################
Output
Simple regression
> # multiple regression: a simple e.g.
> #
> #
> rm(list=ls())
> d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
> d
bankaccount income famnum
1 6 220 5
2 5 190 6
3 7 260 3
4 7 200 4
5 8 330 2
6 10 490 4
7 8 210 3
8 11 380 2
9 9 320 1
10 9 270 3
>
> colnames(d) <- c("y", "x1", "x2")
> d
y x1 x2
1 6 220 5
2 5 190 6
3 7 260 3
4 7 200 4
5 8 330 2
6 10 490 4
7 8 210 3
8 11 380 2
9 9 320 1
10 9 270 3
> # attach(d)
> lm.y.x1 <- lm(y ~ x1, data=d)
> summary(lm.y.x1)
Call:
lm(formula = y ~ x1, data = d)
Residuals:
Min 1Q Median 3Q Max
-1.5189 -0.8969 -0.1297 1.0058 1.5800
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.617781 1.241518 2.914 0.01947 *
x1 0.015269 0.004127 3.700 0.00605 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared: 0.6311, Adjusted R-squared: 0.585
F-statistic: 13.69 on 1 and 8 DF, p-value: 0.006046
In simple regression, the F-test and the slope t-test are essentially the same test, because only one independent variable contributes to the F-test and that same variable is the one examined in the slope (t-) test. This also shows up in the relationship between the t value and the F value:
$$ t^2 = F $$
> t.cal <- 3.7
> t.cal^2
[1] 13.69
> F.cal <- 13.69
> F.cal
[1] 13.69
As explained in Simple regression, the t-test on the slope checks whether the slope contributes to explaining the variability of y (its fluctuation around the mean). If the slope were 0, using the regression line would be the same as using the mean (the mean line has a slope of 0), so the slope would add nothing. Hence we t-test whether the difference between b and zero is statistically significant.
$$ \text{t calculated value} = \frac {b - 0}{se} $$
As mentioned above, $se$ is obtained as follows:
\begin{eqnarray*}
se & = & \sqrt{\frac{1}{n-2} * \frac{\text{SSE}}{\text{SSx}}} \\
& = & \sqrt{\frac{\text{MSE}}{\text{SSx}}} \\
\text{note that MSE } & = & \text{mean square error} \\
& = & \text{ms.res}
\end{eqnarray*}
The p value for this t value can be obtained in R as follows:
summary(lm.y.x1)
n <- length(d$y)
k <- 1 # num of predictor variables
sse <- sum(lm.y.x1$residuals^2) # ss.res
ssx1 <- sum((d$x1 - mean(d$x1))^2)
b <- lm.y.x1$coefficients[2]
se <- sqrt((1/(n-2))*(sse/ssx1))
t.b.cal <- (b - 0) / se
t.b.cal
p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)
p.value
# check
t.b.cal
f.cal <- t.b.cal^2
f.cal
p.value
> summary(lm.y.x1)
Call:
lm(formula = y ~ x1, data = d)
Residuals:
Min 1Q Median 3Q Max
-1.5189 -0.8969 -0.1297 1.0058 1.5800
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.617781 1.241518 2.914 0.01947 *
x1 0.015269 0.004127 3.700 0.00605 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared: 0.6311, Adjusted R-squared: 0.585
F-statistic: 13.69 on 1 and 8 DF, p-value: 0.006046
> n <- length(d$y)
> k <- 1 # num of predictor variables
> sse <- sum(lm.y.x1$residuals^2)
> ssx1 <- sum((d$x1 - mean(d$x1))^2)
> b <- lm.y.x1$coefficients[2]
> se <- sqrt((1/(n-2))*(sse/ssx1))
> t.b.cal <- (b - 0) / se
> t.b.cal
x1
3.699639
> p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)
>
> # check
> t.b.cal
x1
3.699639
> t.b.cal^2
x1
13.68733
> p.value
x1
0.006045749
>
>
>
> lm.y.x2 <- lm(y ~ x2, data=d)
> summary(lm.y.x2)
Call:
lm(formula = y ~ x2, data = d)
Residuals:
Min 1Q Median 3Q Max
-1.2537 -0.8881 -0.4851 0.4963 2.5920
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.7910 1.1195 9.639 1.12e-05 ***
x2 -0.8458 0.3117 -2.713 0.0265 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.397 on 8 degrees of freedom
Multiple R-squared: 0.4793, Adjusted R-squared: 0.4142
F-statistic: 7.363 on 1 and 8 DF, p-value: 0.02651