MR (Multiple Regression)
# multiple regression: a simple e.g.
rm(list=ls())
d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
d
colnames(d) <- c("y", "x1", "x2")
d
# y  = number of bank accounts
# x1 = income
# x2 = number of dependents (family members to support)

lm.y.x1 <- lm(y ~ x1, data=d)
summary(lm.y.x1)
anova(lm.y.x1)
cor(d$x1, d$y)^2
summary(lm.y.x1)$r.squared

lm.y.x2 <- lm(y ~ x2, data=d)
summary(lm.y.x2)
anova(lm.y.x2)
cor(d$x2, d$y)^2
summary(lm.y.x2)$r.squared

lm.y.x1x2 <- lm(y ~ x1 + x2, data=d)
summary(lm.y.x1x2)
anova(lm.y.x1x2)

bcd <- summary(lm.y.x1x2)$r.squared
bcd
bc.cd <- summary(lm.y.x1)$r.squared + summary(lm.y.x2)$r.squared
bc.cd
bc.cd - bcd   # note that this is the amount shared by the two IVs

lm.y.x1x2$coefficient
# y.hat = 6.399103 + (0.01184145)*x1 + (-0.54472725)*x2
a  <- lm.y.x1x2$coefficient[1]
b1 <- lm.y.x1x2$coefficient[2]
b2 <- lm.y.x1x2$coefficient[3]
a
b1
b2

y.pred <- a + (b1 * d$x1) + (b2 * d$x2)
y.pred
# or
y.pred2 <- predict(lm.y.x1x2)
head(y.pred == y.pred2)

y.real <- d$y
y.real
y.mean <- mean(d$y)
y.mean

deviation.score <- d$y - y.mean
ds  <- deviation.score
res <- y.real - y.pred
reg <- y.pred - y.mean
y.mean
# remember y is the sum of res + reg + y.mean
y2 <- res + reg + y.mean
d$y == y2

ss.tot <- sum(ds^2)
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.tot2 <- var(d$y) * (length(d$y)-1)
ss.tot
ss.tot2
ss.res
ss.reg
ss.res + ss.reg

k <- 3   # number of parameters: a, b1, b2
df.tot <- length(d$y) - 1
df.reg <- k - 1
df.res <- df.tot - df.reg

ms.reg <- ss.reg / df.reg
ms.res <- ss.res / df.res
ms.reg
ms.res
f.val <- ms.reg / ms.res
f.val
p.val <- pf(f.val, df.reg, df.res, lower.tail = F)
p.val

# double check
summary(lm.y.x1x2)
anova(lm.y.x1x2)
summary(lm(y ~ x2 + x1, data=d))
anova(lm(y ~ x2 + x1, data=d))

# Note on the two t-tests in summary():
# the anova() tests for x1 and x2 differ from the t-tests for x1 and x2 in
# summary(lm) -- the Pr values differ. The t-tests test the partial
# correlation (pr), not the semi-partial (spr) or the zero-order r, whereas
# the sequential anova tests x1 with everything it can explain and then
# tests x2 only with what is left over after x1.

# Also, in the anova table the F value of the second IV is the square of the
# second IV's t value in summary(lm), because the explained portion credited
# to the second IV is the same in both. For the first IV this does not hold:
# lm judges it by its unique contribution only (semi-partial cor^2), while
# anova also includes the variance shared with x2, so t^2 != F.

# beta: the standardized regression coefficient (standardized b)
# beta <- b * (sd(x)/sd(y))
beta1 <- b1 * (sd(d$x1)/sd(d$y))
beta2 <- b2 * (sd(d$x2)/sd(d$y))
beta1
beta2
# install.packages("lm.beta")
library(lm.beta)
lm.beta(lm.y.x1x2)

#######################################################
# partial correlation coefficient and pr2
# x2's explanation?
# understand with diagrams first
# then calculate with r
lm.tmp.1 <- lm(x2 ~ x1, data=d)
res.x2.x1 <- lm.tmp.1$residuals

lm.tmp.2 <- lm(y ~ x1, data=d)
res.y.x1 <- lm.tmp.2$residuals

lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
summary(lm.tmp.3)
summary(lm.tmp.3)$r.squared
sqrt(summary(lm.tmp.3)$r.squared)

# install.packages("ppcor")
library(ppcor)
partial.r <- pcor.test(d$y, d$x2, d$x1)
str(partial.r)
partial.r$estimate
summary(lm.tmp.3)
summary(lm.tmp.3)$r.square
partial.r$estimate^2
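# A quick cross-check, as a minimal sketch: the same partial correlation can
# be computed directly from the zero-order correlations with the textbook
# formula r_y2.1 = (r_y2 - r_y1*r_12) / sqrt((1 - r_y1^2)*(1 - r_12^2)).
# The names r.y1, r.y2, r.12, pr.y.x2.x1 are illustrative, not from the note.
r.y1 <- cor(d$y, d$x1)
r.y2 <- cor(d$y, d$x2)
r.12 <- cor(d$x1, d$x2)
pr.y.x2.x1 <- (r.y2 - r.y1 * r.12) / sqrt((1 - r.y1^2) * (1 - r.12^2))
pr.y.x2.x1     # should match partial.r$estimate above
pr.y.x2.x1^2   # and its square should match summary(lm.tmp.3)$r.squared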
# x1's own explanation?
lm.tmp.4 <- lm(x1 ~ x2, data=d)
res.x1.x2 <- lm.tmp.4$residuals

lm.tmp.5 <- lm(y ~ x2, data=d)
res.y.x2 <- lm.tmp.5$residuals

lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
summary(lm.tmp.6)

partial.r <- pcor.test(d$y, d$x1, d$x2)
str(partial.r)
partial.r$estimate   # this is the partial correlation, not pr^2
# in order to get pr2, square it
partial.r$estimate^2
summary(lm.tmp.6)$r.squared

#######################################################
# semipartial correlation coefficient and spr2
#
spr.y.x2.x1 <- spcor.test(d$y, d$x2, d$x1)
spr.y.x1.x2 <- spcor.test(d$y, d$x1, d$x2)
spr.y.x2.x1
spr.y.x1.x2
spr2.y.x2.x1 <- spr.y.x2.x1$estimate^2
spr2.y.x1.x2 <- spr.y.x1.x2$estimate^2
spr2.y.x2.x1
spr2.y.x1.x2

lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
summary(lm.tmp.7)
spr2.y.x2.x1

lm.tmp.8 <- lm(y ~ res.x1.x2, data = d)
summary(lm.tmp.8)
spr2.y.x1.x2

bcd   # remember bcd in the above?
bd <- spr2.y.x2.x1 + spr2.y.x1.x2
bd
bcd - bd

#######################################################
# get the common area that explains the y variable
# 1.
summary(lm.y.x2)
all.x2 <- summary(lm.y.x2)$r.squared
all.x2
spr2.y.x2.x1
cma.1 <- all.x2 - spr2.y.x2.x1
cma.1

# 2.
summary(lm.y.x1)
all.x1 <- summary(lm.y.x1)$r.squared
all.x1
spr2.y.x1.x2
cma.2 <- all.x1 - spr2.y.x1.x2
cma.2

# OR 3.
summary(lm.y.x1x2)
r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
r2.y.x1x2
spr2.y.x1.x2
spr2.y.x2.x1
cma.3 <- r2.y.x1x2 - (spr2.y.x1.x2 + spr2.y.x2.x1)
bcd - bd
cma.3

cma.1
cma.2
cma.3

# OR, from the start, the common explained area could have been obtained
# from the R2 values of the simple and the multiple regressions.
r2.y.x1 <- summary(lm.y.x1)$r.square
r2.y.x2 <- summary(lm.y.x2)$r.square
r2.y.x1
r2.y.x2
r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
r2.y.x1x2
cma.4 <- r2.y.x1 + r2.y.x2 - r2.y.x1x2
cma.4

# Note that sorting out the unique and common explanation areas is only
# possible with the semi-partial correlation determinant (spr^2), NOT the
# partial correlation determinant, because only the semi-partial correlation
# shares the same denominator (the total variance of y).
#############################################
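One more cross-check that ties the pieces together. A minimal sketch (its output is not part of the transcript below): each squared semi-partial correlation equals the drop in R-squared when that IV is removed from the full model, which is why spr2 can be read as an IV's unique area.

# squared semi-partial correlations as R-squared drops
sr2.x1 <- summary(lm.y.x1x2)$r.squared - summary(lm.y.x2)$r.squared  # x1's unique area
sr2.x2 <- summary(lm.y.x1x2)$r.squared - summary(lm.y.x1)$r.squared  # x2's unique area
sr2.x1   # compare with spr2.y.x1.x2 (about 0.319)
sr2.x2   # compare with spr2.y.x2.x1 (about 0.167)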
output
> # multiple regression: a simple e.g.
> #
> #
> rm(list=ls())
> d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
> d
   bankaccount income famnum
1            6    220      5
2            5    190      6
3            7    260      3
4            7    200      4
5            8    330      2
6           10    490      4
7            8    210      3
8           11    380      2
9            9    320      1
10           9    270      3
> 
> colnames(d) <- c("y", "x1", "x2")
> d
    y  x1 x2
1   6 220  5
2   5 190  6
3   7 260  3
4   7 200  4
5   8 330  2
6  10 490  4
7   8 210  3
8  11 380  2
9   9 320  1
10  9 270  3
> # y = number of bank accounts
> # x1 = income
> # x2 = number of dependents
> lm.y.x1 <- lm(y ~ x1, data=d)
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.519 -0.897 -0.130  1.006  1.580 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  3.61778    1.24152    2.91    0.019 * 
x1           0.01527    0.00413    3.70    0.006 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.18 on 8 degrees of freedom
Multiple R-squared:  0.631,  Adjusted R-squared:  0.585 
F-statistic: 13.7 on 1 and 8 DF,  p-value: 0.00605

> anova(lm.y.x1)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)   
x1         1   18.9   18.93    13.7  0.006 **
Residuals  8   11.1    1.38                  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> cor(d$x1, d$y)^2
[1] 0.6311
> summary(lm.y.x1)$r.squared
[1] 0.6311
> 
> lm.y.x2 <- lm(y ~ x2, data=d)
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.254 -0.888 -0.485  0.496  2.592 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   10.791      1.119    9.64  1.1e-05 ***
x2            -0.846      0.312   -2.71    0.027 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.4 on 8 degrees of freedom
Multiple R-squared:  0.479,  Adjusted R-squared:  0.414 
F-statistic: 7.36 on 1 and 8 DF,  p-value: 0.0265

> anova(lm.y.x2)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)  
x2         1   14.4   14.38    7.36  0.027 *
Residuals  8   15.6    1.95                 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> cor(d$x2, d$y)^2
[1] 0.4793
> summary(lm.y.x2)$r.squared
[1] 0.4793
> 
> lm.y.x1x2 <- lm(y ~ x1+x2, data=d)
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.39910    1.51654    4.22   0.0039 **
x1           0.01184    0.00356    3.33   0.0127 * 
x2          -0.54473    0.22636   -2.41   0.0470 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.798,  Adjusted R-squared:  0.74 
F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037

> anova(lm.y.x1x2)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)   
x1         1  18.93   18.93   21.88 0.0023 **
x2         1   5.01    5.01    5.79 0.0470 * 
Residuals  7   6.06    0.87                  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> bcd <- summary(lm.y.x1x2)$r.squared
> bcd
[1] 0.7981
> bc.cd <- summary(lm.y.x1)$r.squared + summary(lm.y.x2)$r.squared
> bc.cd
[1] 1.11
> bc.cd - bcd   # note that this is the amount shared by the two IVs
[1] 0.3123
> 
> lm.y.x1x2$coefficient
(Intercept)          x1          x2 
    6.39910     0.01184    -0.54473 
> # y.hat = 6.399103 + (0.01184145)*x1 + (-0.54472725)*x2
> a <- lm.y.x1x2$coefficient[1]
> b1 <- lm.y.x1x2$coefficient[2]
> b2 <- lm.y.x1x2$coefficient[3]
> a
(Intercept) 
      6.399 
> b1
     x1 
0.01184 
> b2
     x2 
-0.5447 
> 
> y.pred <- a + (b1 * d$x1) + (b2 * d$x2)
> y.pred
 [1]  6.281  5.381  7.844  6.588  9.217 10.023  7.252  9.809  9.644  7.962
> # or
> y.pred2 <- predict(lm.y.x1x2)
> head(y.pred == y.pred2)
   1    2    3    4    5    6 
TRUE TRUE TRUE TRUE TRUE TRUE 
> 
> y.real <- d$y
> y.real
 [1]  6  5  7  7  8 10  8 11  9  9
> y.mean <- mean(d$y)
> y.mean
[1] 8
> 
> deviation.score <- d$y - y.mean
> ds <- deviation.score
> res <- y.real - y.pred
> reg <- y.pred - y.mean
> y.mean
[1] 8
> # remember y is sum of res + reg + y.mean
> y2 <- res + reg + y.mean
> d$y==y2
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
> 
> ss.tot <- sum(ds^2)
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> 
> ss.tot2 <- var(d$y) * (length(d$y)-1)
> ss.tot
[1] 30
> ss.tot2
[1] 30
> ss.res
[1] 6.056
> ss.reg
[1] 23.94
> ss.res+ss.reg
[1] 30
> 
> k <- 3   # # of parameters a, b1, b2
> df.tot <- length(d$y)-1
> df.reg <- k - 1
> df.res <- df.tot - df.reg
> 
> ms.reg <- ss.reg/df.reg
> ms.res <- ss.res/df.res
> ms.reg
[1] 11.97
> ms.res
[1] 0.8652
> f.val <- ms.reg/ms.res
> f.val
[1] 13.84
> p.val <- pf(f.val, df.reg, df.res, lower.tail = F)
> p.val
[1] 0.003696
> 
> # double check
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.39910    1.51654    4.22   0.0039 **
x1           0.01184    0.00356    3.33   0.0127 * 
x2          -0.54473    0.22636   -2.41   0.0470 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.798,  Adjusted R-squared:  0.74 
F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037

> anova(lm.y.x1x2)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)   
x1         1  18.93   18.93   21.88 0.0023 **
x2         1   5.01    5.01    5.79 0.0470 * 
Residuals  7   6.06    0.87                  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 
> summary(lm(y~x2+x1, data=d))

Call:
lm(formula = y ~ x2 + x1, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.39910    1.51654    4.22   0.0039 **
x2          -0.54473    0.22636   -2.41   0.0470 * 
x1           0.01184    0.00356    3.33   0.0127 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.798,  Adjusted R-squared:  0.74 
F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037

> anova(lm(y~x2+x1, data=d))
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)   
x2         1  14.38   14.38    16.6 0.0047 **
x1         1   9.57    9.57    11.1 0.0127 * 
Residuals  7   6.06    0.87                  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 
> # note on 2 t-tests in summary
> # The anova() tests for x1 and x2 differ from the t-tests for x1 and x2 in
> # summary(lm) -- the Pr values differ. The t-tests test the partial
> # correlation (pr), not the semi-partial (spr) or the zero-order r, whereas
> # the sequential anova tests x1 with everything it can explain and then
> # tests x2 only with what is left over after x1.
> 
> # Also, in the anova table the F value of the second IV is the square of the
> # second IV's t value in summary(lm), because the explained portion credited
> # to the second IV is the same in both. For the first IV this does not hold:
> # lm judges it by its unique contribution only (semi-partial cor^2), while
> # anova also includes the variance shared with x2, so t^2 != F.
> 
> # beta: the standardized regression coefficient (standardized b)
> # beta <- b * (sd(x)/sd(y))
> beta1 <- b1 * (sd(d$x1)/sd(d$y))
> beta2 <- b2 * (sd(d$x2)/sd(d$y))
> beta1
    x1 
0.6161 
> beta2
     x2 
-0.4459 
> 
> # install.packages("lm.beta")
> library(lm.beta)
Warning message:
package ‘lm.beta’ was built under R version 4.3.3 
> lm.beta(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Standardized Coefficients::
(Intercept)          x1          x2 
         NA      0.6161     -0.4459 

> 
> #######################################################
> # partial correlation coefficient and pr2
> # x2's explanation?
> # understand with diagrams first
> # then calculate with r
> lm.tmp.1 <- lm(x2~x1, data=d)
> res.x2.x1 <- lm.tmp.1$residuals
> 
> lm.tmp.2 <- lm(y~x1, data=d)
> res.y.x1 <- lm.tmp.2$residuals
> 
> lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept)  6.28e-18   2.75e-01    0.00    1.000  
res.x2.x1   -5.45e-01   2.12e-01   -2.57    0.033 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.87 on 8 degrees of freedom
Multiple R-squared:  0.453,  Adjusted R-squared:  0.384 
F-statistic: 6.62 on 1 and 8 DF,  p-value: 0.033

> summary(lm.tmp.3)$r.squared
[1] 0.4527
> sqrt(summary(lm.tmp.3)$r.squared)
[1] 0.6729
> # install.packages("ppcor")
> library(ppcor)
> partial.r <- pcor.test(d$y, d$x2, d$x1)
> str(partial.r)
'data.frame':	1 obs. of  6 variables:
 $ estimate : num -0.673
 $ p.value  : num 0.047
 $ statistic: num -2.41
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate
[1] -0.6729
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept)  6.28e-18   2.75e-01    0.00    1.000  
res.x2.x1   -5.45e-01   2.12e-01   -2.57    0.033 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.87 on 8 degrees of freedom
Multiple R-squared:  0.453,  Adjusted R-squared:  0.384 
F-statistic: 6.62 on 1 and 8 DF,  p-value: 0.033

> summary(lm.tmp.3)$r.square
[1] 0.4527
> partial.r$estimate^2
[1] 0.4527
> 
> # x1's own explanation?
> lm.tmp.4 <- lm(x1~x2, data=d)
> res.x1.x2 <- lm.tmp.4$residuals
> 
> lm.tmp.5 <- lm(y~x2, data=d)
> res.y.x2 <- lm.tmp.5$residuals
> 
> lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
> summary(lm.tmp.6)

Call:
lm(formula = res.y.x2 ~ res.x1.x2, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 1.33e-17   2.75e-01    0.00   1.0000   
res.x1.x2   1.18e-02   3.33e-03    3.55   0.0075 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.87 on 8 degrees of freedom
Multiple R-squared:  0.612,  Adjusted R-squared:  0.564 
F-statistic: 12.6 on 1 and 8 DF,  p-value: 0.00746

> 
> partial.r <- pcor.test(d$y, d$x1, d$x2)
> str(partial.r)
'data.frame':	1 obs. of  6 variables:
 $ estimate : num 0.783
 $ p.value  : num 0.0127
 $ statistic: num 3.33
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate   # this is partial correlation, not pr^2
[1] 0.7825
> # in order to get pr2, you should ^2
> partial.r$estimate^2
[1] 0.6123
> summary(lm.tmp.6)$r.squared
[1] 0.6123
> 
> #######################################################
> # semipartial correlation coefficient and spr2
> #
> spr.y.x2.x1 <- spcor.test(d$y,d$x2,d$x1)
> spr.y.x1.x2 <- spcor.test(d$y,d$x1,d$x2)
> spr.y.x2.x1
  estimate p.value statistic  n gp  Method
1  -0.4087  0.2748    -1.185 10  1 pearson
> spr.y.x1.x2
  estimate p.value statistic  n gp  Method
1   0.5647  0.1132      1.81 10  1 pearson
> spr2.y.x2.x1 <- spr.y.x2.x1$estimate^2
> spr2.y.x1.x2 <- spr.y.x1.x2$estimate^2
> spr2.y.x2.x1
[1] 0.167
> spr2.y.x1.x2
[1] 0.3189
> 
> lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
> summary(lm.tmp.7)

Call:
lm(formula = y ~ res.x2.x1, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.862 -1.171 -0.494  0.549  3.077 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)    8.000      0.559   14.31  5.5e-07 ***
res.x2.x1     -0.545      0.430   -1.27     0.24    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.77 on 8 degrees of freedom
Multiple R-squared:  0.167,  Adjusted R-squared:  0.0629 
F-statistic: 1.6 on 1 and 8 DF,  p-value: 0.241

> spr2.y.x2.x1
[1] 0.167
> 
> lm.tmp.8 <- lm(y~res.x1.x2, data = d)
> summary(lm.tmp.8)

Call:
lm(formula = y ~ res.x1.x2, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-2.664 -0.608 -0.149  1.219  2.290 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  8.00000    0.50540   15.83  2.5e-07 ***
res.x1.x2    0.01184    0.00612    1.94    0.089 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.6 on 8 degrees of freedom
Multiple R-squared:  0.319,  Adjusted R-squared:  0.234 
F-statistic: 3.74 on 1 and 8 DF,  p-value: 0.089

> spr2.y.x1.x2
[1] 0.3189
> 
> bcd   # remember bcd in the above?
[1] 0.7981
> bd <- spr2.y.x2.x1 + spr2.y.x1.x2
> bd
[1] 0.4859
> bcd - bd
[1] 0.3123
> 
> #######################################################
> # get the common area that explains the y variable
> # 1.
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.254 -0.888 -0.485  0.496  2.592 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   10.791      1.119    9.64  1.1e-05 ***
x2            -0.846      0.312   -2.71    0.027 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.4 on 8 degrees of freedom
Multiple R-squared:  0.479,  Adjusted R-squared:  0.414 
F-statistic: 7.36 on 1 and 8 DF,  p-value: 0.0265

> all.x2 <- summary(lm.y.x2)$r.squared
> all.x2
[1] 0.4793
> spr2.y.x2.x1
[1] 0.167
> cma.1 <- all.x2 - spr2.y.x2.x1
> cma.1
[1] 0.3123
> 
> # 2.
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.519 -0.897 -0.130  1.006  1.580 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  3.61778    1.24152    2.91    0.019 * 
x1           0.01527    0.00413    3.70    0.006 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.18 on 8 degrees of freedom
Multiple R-squared:  0.631,  Adjusted R-squared:  0.585 
F-statistic: 13.7 on 1 and 8 DF,  p-value: 0.00605

> all.x1 <- summary(lm.y.x1)$r.squared
> all.x1
[1] 0.6311
> spr2.y.x1.x2
[1] 0.3189
> cma.2 <- all.x1 - spr2.y.x1.x2
> cma.2
[1] 0.3123
> 
> # OR 3.
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.39910    1.51654    4.22   0.0039 **
x1           0.01184    0.00356    3.33   0.0127 * 
x2          -0.54473    0.22636   -2.41   0.0470 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.798,  Adjusted R-squared:  0.74 
F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037

> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
> r2.y.x1x2
[1] 0.7981
> spr2.y.x1.x2
[1] 0.3189
> spr2.y.x2.x1
[1] 0.167
> cma.3 <- r2.y.x1x2 - (spr2.y.x1.x2 + spr2.y.x2.x1)
> bcd - bd
[1] 0.3123
> cma.3
[1] 0.3123
> 
> cma.1
[1] 0.3123
> cma.2
[1] 0.3123
> cma.3
[1] 0.3123
> 
> # OR, from the start, the common explained area could have been obtained
> # from the R2 values of the simple and the multiple regressions.
> r2.y.x1 <- summary(lm.y.x1)$r.square
> r2.y.x2 <- summary(lm.y.x2)$r.square
> r2.y.x1
[1] 0.6311
> r2.y.x2
[1] 0.4793
> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
> r2.y.x1x2
[1] 0.7981
> cma.4 <- r2.y.x1 + r2.y.x2 - r2.y.x1x2
> cma.4
[1] 0.3123
> 
> # Note that sorting out unique and common
> # explanation area is only possible with
> # semi-partial correlation determinant
> # NOT partial correlation determinant
> # because only semi-partial correlation
> # shares the same denominator (as total
> # y).
> #############################################
> 
Explanation added
# ex.
# resid(lm(y~x1, data=df))    = bc  / delta.y
# resid(lm(y~x2, data=df))    = cd  / delta.y
# resid(lm(y~x1+x2, data=df)) = bcd / delta.y
# b / delta.y = ?
# ce / delta.x2 = ?
# exp.added
# (assumes the ppcor package is loaded and that df holds the same data as d above)
library(ppcor)
df <- d
spcor.test(df$y, df$x1, df$x2)
spcor.test(df$y, df$x1, df$x2)$estimate
spcor.test(df$y, df$x1, df$x2)$estimate^2
spcor.test(df$y, df$x2, df$x1)$estimate^2
summary(lm(y ~ x1 + x2, data=df))$r.square

b <- spcor.test(df$y, df$x1, df$x2)$estimate^2
d <- spcor.test(df$y, df$x2, df$x1)$estimate^2
bcd <- summary(lm(y ~ x1 + x2, data=df))$r.square
summary(lm(df$y ~ df$x1 + df$x2, data=df))$r.square -
  (spcor.test(df$y, df$x1, df$x2)$estimate^2 + spcor.test(df$y, df$x2, df$x1)$estimate^2)
bcd - (b + d)
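The same areas can be recovered from R-squared values alone, which answers the "b / delta.y = ?" question in the comment block above. A minimal sketch, assuming the diagram labels b (x1's unique area), d (x2's unique area), and c (the shared area), all as proportions of y's total variance:

r2.x1   <- summary(lm(y ~ x1, data=df))$r.squared        # b + c
r2.x2   <- summary(lm(y ~ x2, data=df))$r.squared        # c + d
r2.both <- summary(lm(y ~ x1 + x2, data=df))$r.squared   # b + c + d
r2.both - r2.x2              # b / delta.y (x1's unique share)
r2.both - r2.x1              # d / delta.y (x2's unique share)
r2.x1 + r2.x2 - r2.both      # c / delta.y (shared share)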
Another one