<code>
# multiple regression: a simple e.g.
#
#
rm(list=ls())
df <- read.csv("
df
colnames(df) <- c("y", "x1", "x2")
df
# y = number of bank accounts
# x1 = income
# x2 = number of dependents
lm.y.x1 <- lm(y ~ x1, data=df)
summary(lm.y.x1)
anova(lm.y.x1)
cor(df$x1, df$y)^2
summary(lm.y.x1)$r.squared

lm.y.x2 <- lm(y ~ x2, data=df)
summary(lm.y.x2)
anova(lm.y.x2)
cor(df$x2, df$y)^2
summary(lm.y.x2)$r.squared

lm.y.x1x2 <- lm(y ~ x1+x2, data=df)
summary(lm.y.x1x2)
anova(lm.y.x1x2)
bcd <- summary(lm.y.x1x2)$r.squared
bcd
bc.cd <- summary(lm.y.x1)$r.squared + summary(lm.y.x2)$r.squared
bc.cd
bc.cd - bcd # note that this is the amount shared by the two IVs

lm.y.x1x2$coefficient
# y.hat = 6.399103 + (0.01184145)*x1 + (-0.54472725)*x2
a <- lm.y.x1x2$coefficient[1]
b1 <- lm.y.x1x2$coefficient[2]
b2 <- lm.y.x1x2$coefficient[3]
a
b1
b2

y.pred <- a + (b1 * df$x1) + (b2 * df$x2)
y.pred
# or
y.pred2 <- predict(lm.y.x1x2)
head(y.pred == y.pred2)

y.real <- df$y
y.real
y.mean <- mean(df$y)
y.mean

deviation.score <- df$y - y.mean
ds <- deviation.score
res <- y.real - y.pred
reg <- y.pred - y.mean
y.mean
# remember y is sum of res + reg + y.mean
y2 <- res + reg + y.mean
df$y==y2
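
# side note (added): == on doubles can fail from floating-point
# rounding; all.equal() is the safer comparison here
isTRUE(all.equal(df$y, y2))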
ss.tot <- sum(ds^2)
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)

ss.tot2 <- var(df$y) * (length(df$y)-1)
ss.tot
ss.tot2
ss.res
ss.reg
ss.res+ss.reg

k <- 3 # number of parameters: a, b1, b2
df.tot <- length(df$y)-1
df.reg <- k - 1
df.res <- df.tot - df.reg

ms.reg <- ss.reg/df.reg
ms.res <- ss.res/df.res
ms.reg
ms.res
f.val <- ms.reg/ms.res
f.val
p.val <- pf(f.val, df.reg, df.res, lower.tail = F)
p.val

# double check
summary(lm.y.x1x2)
anova(lm.y.x1x2)
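
# cross-check (added sketch): the same F can be derived from R2
# alone, F = (R2/df.reg) / ((1-R2)/df.res); r2 is a temporary name
r2 <- summary(lm.y.x1x2)$r.squared
(r2/df.reg) / ((1-r2)/df.res)   # should equal f.val above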

summary(lm(y~x2+x1, data=df))
anova(lm(y~x2+x1, data=df))

# note on 2 t-tests in summary
# Note that the tests of x1 and x2 in anova() differ from
# the t-tests of x1 and x2 in summary(lm) (the Pr values
# for x1 and x2 differ). This is because the t-tests test
# the partial r (pr; not the spr or the zero-order r),
# while anova() tests x1 as a whole first and then tests
# x2 only with what remains after x1.
# Also understand that, in the anova test, the F value of
# the second IV is the square of that IV's t value in
# summary(lm): the part expressing the second IV's
# explanatory power is the same in both lm and anova.
# For the first IV, however, lm judges it by its unique
# explanatory power only (semi-partial cor^2), while anova
# also includes the power shared with x2, so the squared
# t value does not equal the F value.

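# numeric check of the point above (added sketch; indexes the
# usual summary()/anova() tables): for the IV entered last,
# t^2 from summary(lm) matches its F value in anova()
summary(lm.y.x1x2)$coefficients["x2", "t value"]^2
anova(lm.y.x1x2)["x2", "F value"]
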
# a note on beta
# beta coefficient (standardized b)
# beta <- b * (sd(x)/sd(y))
beta1 <- b1 * (sd(df$x1)/sd(df$y))
beta2 <- b2 * (sd(df$x2)/sd(df$y))
beta1
beta2
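
# cross-check (added sketch): regressing the z-scored variables
# reproduces beta1 and beta2 as the slope coefficients
lm(scale(y) ~ scale(x1) + scale(x2), data=df)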

# install.packages("lm.beta")
library(lm.beta)
lm.beta(lm.y.x1x2)

#######################################################
# partial correlation coefficient and pr2
# x2's explanation?
# understand with diagrams first
# then calculate with r
lm.tmp.1 <- lm(x2~x1, data=df)
res.x2.x1 <- lm.tmp.1$residuals

lm.tmp.2 <- lm(y~x1, data=df)
res.y.x1 <- lm.tmp.2$residuals

lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=df)
summary(lm.tmp.3)
summary(lm.tmp.3)$r.squared
sqrt(summary(lm.tmp.3)$r.squared)
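
# note (added): the partial correlation is just the plain
# correlation between the two residual vectors
cor(res.y.x1, res.x2.x1)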
# install.packages("ppcor")
library(ppcor)
partial.r <- pcor.test(df$y, df$x2, df$x1)
str(partial.r)
partial.r$estimate
summary(lm.tmp.3)
summary(lm.tmp.3)$r.square
partial.r$estimate^2

# x1's own explanation?
lm.tmp.4 <- lm(x1~x2, data=df)
res.x1.x2 <- lm.tmp.4$residuals

lm.tmp.5 <- lm(y~x2, data=df)
res.y.x2 <- lm.tmp.5$residuals

lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=df)
summary(lm.tmp.6)

partial.r <- pcor.test(df$y, df$x1, df$x2)
str(partial.r)
partial.r$estimate # this is partial correlation, not pr2
# in order to get pr2, you should ^2
partial.r$estimate^2
summary(lm.tmp.6)$r.squared

#######################################################
# semipartial correlation coefficient and spr2
#
spr.y.x2.x1 <- spcor.test(df$y, df$x2, df$x1)
spr.y.x1.x2 <- spcor.test(df$y, df$x1, df$x2)
spr.y.x2.x1
spr.y.x1.x2
spr2.y.x2.x1 <- spr.y.x2.x1$estimate^2
spr2.y.x1.x2 <- spr.y.x1.x2$estimate^2
spr2.y.x2.x1
spr2.y.x1.x2
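
# cross-check (added sketch): a semipartial correlation is the
# plain correlation between y itself and the residualized IV
cor(df$y, res.x2.x1)   # should match spr.y.x2.x1$estimate
cor(df$y, res.x1.x2)   # should match spr.y.x1.x2$estimate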

lm.tmp.7 <- lm(y ~ res.x2.x1, data=df)
summary(lm.tmp.7)
spr2.y.x2.x1

lm.tmp.8 <- lm(y~res.x1.x2, data=df)
summary(lm.tmp.8)
spr2.y.x1.x2

bcd # remember bcd in the above?
bd <- spr2.y.x2.x1 + spr2.y.x1.x2
bd
bcd - bd

#######################################################
# get the common area that explains the y variable
# 1.
summary(lm.y.x2)
all.x2 <- summary(lm.y.x2)$r.squared
all.x2
spr2.y.x2.x1
cma.1 <- all.x2 - spr2.y.x2.x1
cma.1

# 2.
summary(lm.y.x1)
all.x1 <- summary(lm.y.x1)$r.squared
all.x1
spr2.y.x1.x2
cma.2 <- all.x1 - spr2.y.x1.x2
cma.2

# OR 3.
summary(lm.y.x1x2)
r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
r2.y.x1x2
spr2.y.x1.x2
spr2.y.x2.x1
cma.3 <- r2.y.x1x2 - (spr2.y.x1.x2 + spr2.y.x2.x1)
bcd - bd
cma.3

cma.1
cma.2
cma.3

# OR: we could have obtained the common explanatory power
# from the R2s of the simple and the multiple regressions
# in the first place.
r2.y.x1 <- summary(lm.y.x1)$r.square
r2.y.x2 <- summary(lm.y.x2)$r.square
r2.y.x1
r2.y.x2
r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
r2.y.x1x2
cma.4 <- r2.y.x1 + r2.y.x2 - r2.y.x1x2
cma.4
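
# sanity check (added): the four routes to the common area agree
all.equal(cma.1, cma.2)
all.equal(cma.2, cma.3)
all.equal(cma.3, cma.4)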

# Note that separating the unique and the common explained
# areas is possible only with the squared semi-partial
# correlation, NOT the squared partial correlation,
# because only the semi-partial correlation keeps the
# same denominator (the total variance of y).
#############################################
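
# contrast (added sketch): squared partial correlations do NOT
# decompose R2 this way, because each pr2 has its own denominator
# (the variance left after the other IV, not total y variance);
# the spr2s, which share y's total variance, do decompose it
pr2.y.x1.x2 <- pcor.test(df$y, df$x1, df$x2)$estimate^2
pr2.y.x2.x1 <- pcor.test(df$y, df$x2, df$x1)$estimate^2
pr2.y.x1.x2 + pr2.y.x2.x1             # not a partition of bcd
spr2.y.x1.x2 + spr2.y.x2.x1 + cma.4   # equals bcd (the model R2)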
</code>
====== output ======
<code>
> # multiple regression: a simple e.g.
> #
> #
> rm(list=ls())
> df <- read.csv("
> df

1   6 220 5
2   5 190 6
3   7 260 3
4   7 200 4
5   8 330 2
6  10 490 4
7   8 210 3
8  11 380 2
9   9 320 1
10  9 270 3
>
> colnames(df) <- c("y", "x1", "x2")
> df
    y  x1 x2
1   6 220  5
2   5 190  6
3   7 260  3
4   7 200  4
5   8 330  2
6  10 490  4
7   8 210  3
8  11 380  2
9   9 320  1
10  9 270  3
> # y = number of bank accounts
> # x1 = income
> # x2 = number of dependents
> lm.y.x1 <- lm(y ~ x1, data=df)
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = df)

Residuals:

-1.519 -0.897 -0.130

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x1
---
Signif. codes:

Residual standard error: 1.18 on 8 degrees of freedom
Multiple R-squared:  0.6311,
F-statistic:

> anova(lm.y.x1)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)
x1
Residuals
---
Signif. codes:
> cor(df$x1, df$y)^2
[1] 0.6311
> summary(lm.y.x1)$r.squared
[1] 0.6311
>
>
> lm.y.x2 <- lm(y ~ x2, data=df)
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = df)

Residuals:

-1.254 -0.888 -0.485

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x2            -0.846
---
Signif. codes:

Residual standard error: 1.4 on 8 degrees of freedom
Multiple R-squared:  0.4793,
F-statistic:

> anova(lm.y.x2)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)
x2
Residuals
---
Signif. codes:
> cor(df$x2, df$y)^2
[1] 0.4793
> summary(lm.y.x2)$r.squared
[1] 0.4793
>
>
> lm.y.x1x2 <- lm(y ~ x1+x2, data=df)
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = df)

Residuals:

-1.217 -0.578 -0.151

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x1
x2          -0.54473
---
Signif. codes:

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.7981,
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696
> anova(lm.y.x1x2)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)
x1
x2
Residuals
---
Signif. codes:
> bcd <- summary(lm.y.x1x2)$r.squared
> bcd
[1] 0.7981
> bc.cd <- summary(lm.y.x1)$r.squared + summary(lm.y.x2)$r.squared
> bc.cd
[1] 1.11
> bc.cd - bcd # note that this is the amount shared by the two IVs
[1] 0.3123
>
>
> lm.y.x1x2$coefficient
(Intercept)
6.39910
> # y.hat = 6.399103 + (0.01184145)*x1 + (-0.54472725)*x2
> a <- lm.y.x1x2$coefficient[1]
> b1 <- lm.y.x1x2$coefficient[2]
> b2 <- lm.y.x1x2$coefficient[3]
> a
(Intercept)
      6.399
> b1
         x1
    0.01184
> b2
         x2
    -0.5447
>
> y.pred <- a + (b1 * df$x1) + (b2 * df$x2)
> y.pred

> # or
> y.pred2 <- predict(lm.y.x1x2)
> head(y.pred == y.pred2)
   1    2    3    4    5    6
TRUE TRUE TRUE TRUE TRUE TRUE
>
> y.real <- df$y
> y.real
 [1]  6  5  7  7  8 10  8 11  9  9
> y.mean <- mean(df$y)
> y.mean
[1] 8
>
> deviation.score <- df$y - y.mean
> ds <- deviation.score
> res <- y.real - y.pred
> reg <- y.pred - y.mean
> y.mean
[1] 8
> # remember y is sum of res + reg + y.mean
> y2 <- res + reg + y.mean
> df$y==y2
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
>
> ss.tot <- sum(ds^2)
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
>
> ss.tot2 <- var(df$y) * (length(df$y)-1)
> ss.tot
[1] 30
> ss.tot2
[1] 30
> ss.res
[1] 6.056
> ss.reg
[1] 23.94
> ss.res+ss.reg
[1] 30
>
> k <- 3 # number of parameters: a, b1, b2
> df.tot <- length(df$y)-1
> df.reg <- k - 1
> df.res <- df.tot - df.reg
>
> ms.reg <- ss.reg/df.reg
> ms.res <- ss.res/df.res
> ms.reg
[1] 11.97
> ms.res
[1] 0.8652
> f.val <- ms.reg/ms.res
> f.val
[1] 13.84
> p.val <- pf(f.val, df.reg, df.res, lower.tail = F)
> p.val
[1] 0.003696
>
> # double check
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = df)

Residuals:

-1.217 -0.578 -0.151

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x1
x2          -0.54473
---
Signif. codes:

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.7981,
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> anova(lm.y.x1x2)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)
x1
x2
Residuals
---
Signif. codes:
>
> summary(lm(y~x2+x1, data=df))

Call:
lm(formula = y ~ x2 + x1, data = df)

Residuals:

-1.217 -0.578 -0.151

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x2          -0.54473
x1
---
Signif. codes:

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.7981,
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> anova(lm(y~x2+x1, data=df))
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)
x2
x1
Residuals
---
Signif. codes:
>
> # note on 2 t-tests in summary
> # Note that the tests of x1 and x2 in anova() differ from
> # the t-tests of x1 and x2 in summary(lm) (the Pr values
> # for x1 and x2 differ). This is because the t-tests test
> # the partial r (pr; not the spr or the zero-order r),
> # while anova() tests x1 as a whole first and then tests
> # x2 only with what remains after x1.
>
> # Also understand that, in the anova test, the F value of
> # the second IV is the square of that IV's t value in
> # summary(lm): the part expressing the second IV's
> # explanatory power is the same in both lm and anova.
> # For the first IV, however, lm judges it by its unique
> # explanatory power only (semi-partial cor^2), while anova
> # also includes the power shared with x2, so the squared
> # t value does not equal the F value.
>
> # a note on beta
> # beta coefficient (standardized b)
> # beta <- b * (sd(x)/sd(y))
> beta1 <- b1 * (sd(df$x1)/sd(df$y))
> beta2 <- b2 * (sd(df$x2)/sd(df$y))
> beta1
     x1
 0.6161
> beta2
     x2
-0.4459
>
> # install.packages("lm.beta")
> library(lm.beta)
> lm.beta(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = df)

Standardized Coefficients::
(Intercept)

>
> #######################################################
> # partial correlation coefficient and pr2
> # x2's explanation?
> # understand with diagrams first
> # then calculate with r
> lm.tmp.1 <- lm(x2~x1, data=df)
> res.x2.x1 <- lm.tmp.1$residuals
>
> lm.tmp.2 <- lm(y~x1, data=df)
> res.y.x1 <- lm.tmp.2$residuals
>
> lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=df)
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = df)

Residuals:

-1.217 -0.578 -0.151

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
res.x2.x1
---
Signif. codes:

Residual standard error: 0.87 on 8 degrees of freedom
Multiple R-squared:  0.4527,
F-statistic:

> summary(lm.tmp.3)$r.squared
[1] 0.4527
> sqrt(summary(lm.tmp.3)$r.squared)
[1] 0.6729
> # install.packages("ppcor")
> library(ppcor)
> partial.r <- pcor.test(df$y, df$x2, df$x1)
> str(partial.r)
'data.frame':	1 obs. of  6 variables:
 $ estimate : num -0.673
 $ p.value
 $ statistic: num -2.41
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate
[1] -0.6729
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = df)

Residuals:

-1.217 -0.578 -0.151

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
res.x2.x1
---
Signif. codes:

Residual standard error: 0.87 on 8 degrees of freedom
Multiple R-squared:  0.4527,
F-statistic:

> summary(lm.tmp.3)$r.square
[1] 0.4527
> partial.r$estimate^2
[1] 0.4527
>
>
> # x1's own explanation?
> lm.tmp.4 <- lm(x1~x2, data=df)
> res.x1.x2 <- lm.tmp.4$residuals
>
> lm.tmp.5 <- lm(y~x2, data=df)
> res.y.x2 <- lm.tmp.5$residuals
>
> lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=df)
> summary(lm.tmp.6)

Call:
lm(formula = res.y.x2 ~ res.x1.x2, data = df)

Residuals:

-1.217 -0.578 -0.151

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.33e-17
res.x1.x2
---
Signif. codes:

Residual standard error: 0.87 on 8 degrees of freedom
Multiple R-squared:  0.6123,
F-statistic:

>
> partial.r <- pcor.test(df$y, df$x1, df$x2)
> str(partial.r)
'data.frame':	1 obs. of  6 variables:
 $ estimate : num 0.783
 $ p.value
 $ statistic: num 3.33
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate # this is partial correlation, not pr2
[1] 0.7825
> # in order to get pr2, you should ^2
> partial.r$estimate^2
[1] 0.6123
> summary(lm.tmp.6)$r.squared
[1] 0.6123
>
> #######################################################
> # semipartial correlation coefficient and spr2
> #
> spr.y.x2.x1 <- spcor.test(df$y, df$x2, df$x1)
> spr.y.x1.x2 <- spcor.test(df$y, df$x1, df$x2)
> spr.y.x2.x1
  estimate p.value statistic
1  -0.4087
> spr.y.x1.x2
  estimate p.value statistic
1
> spr2.y.x2.x1 <- spr.y.x2.x1$estimate^2
> spr2.y.x1.x2 <- spr.y.x1.x2$estimate^2
> spr2.y.x2.x1
[1] 0.167
> spr2.y.x1.x2
[1] 0.3189
>
> lm.tmp.7 <- lm(y ~ res.x2.x1, data=df)
> summary(lm.tmp.7)

Call:
lm(formula = y ~ res.x2.x1, data = df)

Residuals:

-1.862 -1.171 -0.494

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
res.x2.x1
---
Signif. codes:

Residual standard error: 1.77 on 8 degrees of freedom
Multiple R-squared:  0.167,
F-statistic:

> spr2.y.x2.x1
[1] 0.167
>
> lm.tmp.8 <- lm(y~res.x1.x2, data=df)
> summary(lm.tmp.8)

Call:
lm(formula = y ~ res.x1.x2, data = df)

Residuals:

-2.664 -0.608 -0.149

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
res.x1.x2
---
Signif. codes:

Residual standard error: 1.6 on 8 degrees of freedom
Multiple R-squared:  0.3189,
F-statistic:

> spr2.y.x1.x2
[1] 0.3189
>
> bcd # remember bcd in the above?
[1] 0.7981
> bd <- spr2.y.x2.x1 + spr2.y.x1.x2
> bd
[1] 0.4859
> bcd - bd
[1] 0.3123
>
> #######################################################
> # get the common area that explains the y variable
> # 1.
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = df)

Residuals:

-1.254 -0.888 -0.485

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x2            -0.846
---
Signif. codes:

Residual standard error: 1.4 on 8 degrees of freedom
Multiple R-squared:  0.4793,
F-statistic:

> all.x2 <- summary(lm.y.x2)$r.squared
> all.x2
[1] 0.4793
> spr2.y.x2.x1
[1] 0.167
> cma.1 <- all.x2 - spr2.y.x2.x1
> cma.1
[1] 0.3123
>
> # 2.
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = df)

Residuals:

-1.519 -0.897 -0.130

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x1
---
Signif. codes:

Residual standard error: 1.18 on 8 degrees of freedom
Multiple R-squared:  0.6311,
F-statistic:

> all.x1 <- summary(lm.y.x1)$r.squared
> all.x1
[1] 0.6311
> spr2.y.x1.x2
[1] 0.3189
> cma.2 <- all.x1 - spr2.y.x1.x2
> cma.2
[1] 0.3123
>
> # OR 3.
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = df)

Residuals:

-1.217 -0.578 -0.151

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x1
x2          -0.54473
---
Signif. codes:

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.7981,
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
> r2.y.x1x2
[1] 0.7981
> spr2.y.x1.x2
[1] 0.3189
> spr2.y.x2.x1
[1] 0.167
> cma.3 <- r2.y.x1x2 - (spr2.y.x1.x2 + spr2.y.x2.x1)
> bcd - bd
[1] 0.3123
> cma.3
[1] 0.3123
>
> cma.1
[1] 0.3123
> cma.2
[1] 0.3123
> cma.3
[1] 0.3123
>
> # OR: we could have obtained the common explanatory power
> # from the R2s of the simple and the multiple regressions
> # in the first place.
> r2.y.x1 <- summary(lm.y.x1)$r.square
> r2.y.x2 <- summary(lm.y.x2)$r.square
> r2.y.x1
[1] 0.6311
> r2.y.x2
[1] 0.4793
> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
> r2.y.x1x2
[1] 0.7981
> cma.4 <- r2.y.x1 + r2.y.x2 - r2.y.x1x2
> cma.4
[1] 0.3123
>
> # Note that separating the unique and the common explained
> # areas is possible only with the squared semi-partial
> # correlation, NOT the squared partial correlation,
> # because only the semi-partial correlation keeps the
> # same denominator (the total variance of y).
> #############################################
>
>
</code>
====== explanation (added) ======
{{:
<code>
# ex.
# resid(lm(y~x1, data=df))
# resid(lm(y~x2, data=df))
# resid(lm(y~x1+x2, data=df))
# b / delta.y = ?
# ce / delta.x2 = ?
</code>

<code>
# exp.added
spcor.test(df$y, df$x1, df$x2)
spcor.test(df$y, df$x2, df$x1)
spcor.test(df$y, df$x1, df$x2)$estimate^2
spcor.test(df$y, df$x2, df$x1)$estimate^2
summary(lm(y~x1+x2, data=df))$r.squared

b <- spcor.test(df$y, df$x1, df$x2)$estimate^2
d <- spcor.test(df$y, df$x2, df$x1)$estimate^2
bcd <- summary(lm(y~x1+x2, data=df))$r.squared

summary(lm(df$y~df$x1+df$x2))$r.squared -
  (spcor.test(df$y, df$x1, df$x2)$estimate^2 +
     spcor.test(df$y, df$x2, df$x1)$estimate^2)
bcd - (b + d)
</code>
====== Another one ======
[[:partial and semipartial correlation]]