MR (Multiple Regression)
<code r>
# multiple regression: a simple e.g.
rm(list=ls())
d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
d
colnames(d) <- c("y", "x1", "x2")
d
# y  = number of bank accounts
# x1 = income
# x2 = number of dependents
attach(d)  # so that y, x1, x2 can be referenced directly below

lm.y.x1 <- lm(y ~ x1, data=d)
summary(lm.y.x1)
anova(lm.y.x1)

lm.y.x2 <- lm(y ~ x2, data=d)
summary(lm.y.x2)
anova(lm.y.x2)

lm.y.x1x2 <- lm(y ~ x1 + x2, data=d)
summary(lm.y.x1x2)
anova(lm.y.x1x2)

lm.y.x1x2$coefficients
# y.hat = 6.399103 + (0.01184145)*x1 + (-0.54472725)*x2
a  <- lm.y.x1x2$coefficients[1]
b1 <- lm.y.x1x2$coefficients[2]
b2 <- lm.y.x1x2$coefficients[3]
a
b1
b2

y.pred <- a + (b1 * x1) + (b2 * x2)
y.pred
# or
y.pred2 <- predict(lm.y.x1x2)
head(y.pred == y.pred2)

y.real <- y
y.real
y.mean <- mean(y)
y.mean

res <- y.real - y.pred
reg <- y.pred - y.mean

# remember: y is the sum of res + reg + y.mean
y2 <- res + reg + y.mean
y == y2

ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.tot <- var(y) * (length(y)-1)
ss.tot
ss.res
ss.reg
ss.res + ss.reg

k <- 3  # number of parameters: a, b1, b2
df.tot <- length(y) - 1
df.reg <- k - 1
df.res <- df.tot - df.reg

ms.reg <- ss.reg / df.reg
ms.res <- ss.res / df.res
ms.reg
ms.res
f.val <- ms.reg / ms.res
f.val
p.val <- pf(f.val, df.reg, df.res, lower.tail = F)
p.val

# double check
summary(lm.y.x1x2)
anova(lm.y.x1x2)

# note on the two t-tests in summary:
# the tests for x1 and x2 in anova() and the t-tests for
# x1 and x2 in summary(lm()) differ (the Pr values for x1
# and x2 are not the same). The reason: the t-tests test
# pr (the partial correlation; not spr, and not the
# zero-order r), whereas anova() tests x1 in its entirety
# first, and then tests x2 only with what remains after
# x1 has been accounted for.

# beta coefficient (standardized b)
# beta <- b * (sd(x)/sd(y))
beta1 <- b1 * (sd(x1)/sd(y))
beta2 <- b2 * (sd(x2)/sd(y))
beta1
beta2

# install.packages("lm.beta")
library(lm.beta)
lm.beta(lm.y.x1x2)

#######################################################
# partial correlation coefficient and pr2
# x2's explanation?
# understand with diagrams first,
# then calculate with r
lm.tmp.1 <- lm(x2 ~ x1, data=d)
res.x2.x1 <- lm.tmp.1$residuals

lm.tmp.2 <- lm(y ~ x1, data=d)
res.y.x1 <- lm.tmp.2$residuals

lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1)
summary(lm.tmp.3)

# install.packages("ppcor")
library(ppcor)
partial.r <- pcor.test(y, x2, x1)
partial.r
str(partial.r)
summary(lm.tmp.3)
summary(lm.tmp.3)$r.square
partial.r$estimate^2

# x1's own explanation?
lm.tmp.4 <- lm(x1 ~ x2, data=d)
res.x1.x2 <- lm.tmp.4$residuals

lm.tmp.5 <- lm(y ~ x2, data=d)
res.y.x2 <- lm.tmp.5$residuals

lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2)
summary(lm.tmp.6)

partial.r <- pcor.test(y, x1, x2)
str(partial.r)
partial.r$estimate
# this is the partial correlation, not pr^2;
# to get pr2, square it
partial.r$estimate^2

#######################################################
# semipartial correlation coefficient and spr2
spr.1 <- spcor.test(y, x2, x1)
spr.2 <- spcor.test(y, x1, x2)
spr.1
spr.2
spr.1$estimate^2
spr.2$estimate^2

lm.tmp.7 <- lm(y ~ res.x2.x1)
summary(lm.tmp.7)
spr.1$estimate^2

lm.tmp.8 <- lm(y ~ res.x1.x2)
summary(lm.tmp.8)
spr.2$estimate^2
</code>
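As a cross-check on the pairwise pcor.test() / spcor.test() calls above, the ppcor package also has whole-matrix versions, pcor() and spcor(). A minimal sketch, assuming d still holds the y, x1, x2 columns:

<code r>
# whole-matrix versions of the pairwise calls above
library(ppcor)

pcor(d)$estimate   # partial correlation matrix;
                   # ["y","x2"] should match pcor.test(y, x2, x1)$estimate
spcor(d)$estimate  # semipartial correlation matrix;
                   # ["y","x2"]^2 should match spr.1$estimate^2
spcor(d)$estimate["y","x2"]^2  # x2's unique r2 (sp.x2 below)
spcor(d)$estimate["y","x1"]^2  # x1's unique r2 (sp.x1 below)
</code>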
<code r>
#######################################################
# get the common area that explains the y variable
# 1.
summary(lm.y.x2)
all.x2 <- summary(lm.y.x2)$r.squared
sp.x2 <- spr.1$estimate^2
all.x2
sp.x2
cma.1 <- all.x2 - sp.x2
cma.1

# 2.
summary(lm.y.x1)
all.x1 <- summary(lm.y.x1)$r.squared
sp.x1 <- spr.2$estimate^2
all.x1
sp.x1
cma.2 <- all.x1 - sp.x1
cma.2

# OR 3.
summary(lm.y.x1x2)
r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
r2.y.x1x2
sp.x1
sp.x2
cma.3 <- r2.y.x1x2 - (sp.x1 + sp.x2)
cma.3

cma.1
cma.2
cma.3

# OR, from the start, we could have found the common
# explanatory power with the R2 values obtained from the
# simple regressions and the multiple regression.
r2.x1 <- summary(lm.y.x1)$r.square
r2.x2 <- summary(lm.y.x2)$r.square
r2.x1x2 <- summary(lm.y.x1x2)$r.square
r2.x1 + r2.x2 - r2.x1x2

# Note that sorting out the unique and common explanation
# areas is only possible with the semipartial correlation
# determinant, NOT the partial correlation determinant,
# because only the semipartial correlation shares the same
# denominator (the total variance of y).

#############################################
# scratch
#############################################
lm.x1x2 <- lm(x1 ~ x2, data = d)
lm.x2x1 <- lm(x2 ~ x1, data = d)
summary(lm.y.x1)
summary(lm.y.x2)
summary(lm.y.x1x2)

resx1 <- lm.x1x2$residuals
regx1 <- lm.x1x2$fitted.values - mean(x1)

lm.y.resx1 <- lm(y ~ resx1, data = d)
summary(lm.y.resx1)  # 0.3189, not sig.
lm.y.regx1 <- lm(y ~ regx1, data = d)
summary(lm.y.regx1)  # 0.4793, p = .027;
# but this is the part 100% correlated with x2
summary(lm.y.x2)
cor(regx1, x2)  # 1
cor(resx1, x2)  # 0

resx2 <- lm.x2x1$residuals
regx2 <- lm.x2x1$fitted.values - mean(x2)
lm.y.resx2 <- lm(y ~ resx2, data = d)
summary(lm.y.resx2)
lm.y.regx2 <- lm(y ~ regx2, data = d)
summary(lm.y.regx2)
summary(lm.y.x1)
cor(regx2, x1)  # 1 because of this
cor(resx2, x1)  # 0

# the below two are the same
summary(lm(y ~ x1, data = d))
summary(lm(y ~ regx2, data = d))

summary(lm(y ~ x2, data = d))     # 0.4793
summary(lm(y ~ x1, data = d))     # 0.6311
summary(lm(y ~ resx1, data = d))  # 0.3189
summary(lm(y ~ x1 + x2, data = d))

summary(lm(y ~ x1, data = d))$r.square
summary(lm(y ~ resx1, data = d))$r.square
summary(lm(y ~ x1, data = d))$r.square - summary(lm(y ~ resx1, data = d))$r.square

sqrt(summary(lm(y ~ resx1, data = d))$r.square)
spcor.test(y, x1, x2)
sqrt(summary(lm(y ~ x1, data = d))$r.square - summary(lm(y ~ resx1, data = d))$r.square)
(summary(lm(x1 ~ x2, data = d))$r.square)
(summary(lm(y ~ x1, data = d))$r.square - summary(lm(y ~ resx1, data = d))$r.square)

a <- summary(lm(y ~ resx1, data = d))$r.square
c <- summary(lm(y ~ x1, data = d))$r.square
b <- summary(lm(y ~ regx1, data = d))$r.square

summary(lm(y ~ resx1, data = d))
summary(lm(y ~ resx2, data = d))
summary(lm(y ~ x1, data = d))
summary(lm(y ~ x2, data = d))
cor(regx2, x1)
cor(regx1, x2)

ab <- summary(lm(y ~ x1, data = d))$r.square
a <- summary(lm(y ~ resx1, data = d))$r.square
ab
a
ab - a

bc <- summary(lm(y ~ x2, data = d))$r.square
c <- summary(lm(y ~ resx2, data = d))$r.square
bc
c
bc - c
ab - a

abc <- summary(lm(y ~ x1 + x2, data = d))$r.square
abc
a
c
bc
a + (bc - c) + c
a + bc
ab + c

summary(lm(y ~ regx2 + regx1, data = d))
summary(lm(y ~ x1 + x2, data = d))
summary(lm(y ~ resx1 + resx2, data = d))
summary(lm(y ~ resx1, data = d))
summary(lm(y ~ resx2, data = d))
cor(resx1, resx2)
cor(x1, x2)

spcor.test(y, resx1, resx2)$estimate^2
spcor.test(y, resx2, resx1)$estimate^2
cor(y, x1)^2
cor(y, x2)^2
cor(y, resx1)^2
cor(y, resx2)^2
spcor.test(y, x1, x2)$estimate^2
spcor.test(y, x2, x1)$estimate^2

##############################################
# install.packages("lavaan")
library(lavaan)
specmod <- "
  # path c: identifying path c (prime) by putting c*
  y ~ c*x2
  # path a
  x1 ~ a*x2
  # path b
  y ~ b*x1
  # indirect effect (a*b): Sobel test (Delta Method)
  # obtained by multiplying the a path and the b path;
  # this is called the Sobel test
  ab := a*b
"
# fit / estimate the model
fitmod <- sem(specmod, data=d)
# summarize the model
summary(fitmod, fit.measures=TRUE, rsquare=T)

summary(lm(x2 ~ x1, data = d))
summary(lm(y ~ resx2, data = d))
summary(lm(y ~ x1, data = d))
summary(lm(y ~ x2, data = d))
</code>
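The ab := a*b line in the lavaan model is the Sobel (delta-method) indirect effect; the same statistic can be computed by hand from two lm() fits. A minimal sketch (fit.a, fit.b, sobel.z are illustrative names; lavaan's ML standard errors may differ slightly from the OLS ones used here):

<code r>
# Sobel test by hand for the x2 -> x1 -> y model above:
# a = effect of x2 on the mediator x1;
# b = effect of x1 on y, controlling for x2
# (these mirror the lavaan labels a and b)
fit.a <- lm(x1 ~ x2, data = d)
fit.b <- lm(y ~ x1 + x2, data = d)

a.est <- coef(fit.a)["x2"]
b.est <- coef(fit.b)["x1"]
a.se  <- summary(fit.a)$coefficients["x2", "Std. Error"]
b.se  <- summary(fit.b)$coefficients["x1", "Std. Error"]

a.est * b.est  # indirect effect; should match lavaan's ab
sobel.z <- (a.est * b.est) / sqrt(b.est^2 * a.se^2 + a.est^2 * b.se^2)
sobel.z
2 * pnorm(abs(sobel.z), lower.tail = FALSE)  # two-tailed p-value
</code>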
<code r>
#############################################
# e.g. 2
#############################################
d.yyk <- read.csv("http://commres.net/wiki/_media/d.yyk.csv")
d.yyk
d.yyk <- subset(d.yyk, select = -c(1))
d.yyk
attach(d.yyk)

lm.happiness.bmistress <- lm(happiness ~ bmi + stress, data=d.yyk)
summary(lm.happiness.bmistress)
anova(lm.happiness.bmistress)
plot(d.yyk)

lm.happiness.bmi <- lm(happiness ~ bmi, data=d.yyk)
summary(lm.happiness.bmi)
anova(lm.happiness.bmi)

lm.happiness.stress <- lm(happiness ~ stress, data = d.yyk)
summary(lm.happiness.stress)
anova(lm.happiness.stress)

# find the common part
# method 1
ab <- summary(lm.happiness.bmi)$r.square
bc <- summary(lm.happiness.stress)$r.square
abc <- summary(lm.happiness.bmistress)$r.square
b <- ab + bc - abc
b

# spcor method
spcor.test(happiness, bmi, stress)
a <- spcor.test(happiness, bmi, stress)$estimate^2
spcor.test(happiness, stress, bmi)
c <- spcor.test(happiness, stress, bmi)$estimate^2
abc <- summary(lm.happiness.bmistress)$r.square
abc - a - c

# the tedious method (some steps omitted)
# bmi --> stress --> happiness
summary(lm.happiness.bmi)
summary(lm.happiness.stress)
summary(lm.happiness.bmistress)

lm.bmi.stress <- lm(bmi ~ stress, data = d.yyk)
res.lm.bmi.stress <- lm.bmi.stress$residuals
lm.happiness.res.lm.bmi.stress <- lm(happiness ~ res.lm.bmi.stress)
summary(lm.happiness.res.lm.bmi.stress)

lm.stress.bmi <- lm(stress ~ bmi, data = d.yyk)
reg.lm.stress.bmi <- lm.stress.bmi$fitted.values - mean(stress)
lm.happiness.reg.lm.stress.bmi <- lm(happiness ~ reg.lm.stress.bmi)
summary(lm.happiness.reg.lm.stress.bmi)
summary(lm.happiness.stress)

a <- summary(lm.happiness.res.lm.bmi.stress)$r.square
b <- summary(lm.happiness.reg.lm.stress.bmi)$r.square
c <- summary(lm.happiness.stress)$r.square
a
b
c
a + b

lm.stress.bmi <- lm(stress ~ bmi, data = d.yyk)
lm.bmi.stress <- lm(bmi ~ stress, data = d.yyk)
summary(lm.stress.bmi)
# stress.hat = a + b * bmi
stress.hat <- -1.40167 + (0.16787 * bmi)
stress.hat
summary(lm.bmi.stress)

reg.stress <- lm.stress.bmi$fitted.values
reg.bmi <- lm.bmi.stress$fitted.values
reg.stress
reg.bmi
d.yyk

lm.tmp <- lm(reg.stress ~ reg.bmi)
lm.tmp2 <- lm(reg.bmi ~ reg.stress)
summary(lm.tmp)
summary(lm.tmp2)
sqrt(summary(lm.tmp)$r.square)
summary(lm.stress.bmi)
summary(lm.bmi.stress)

summary(lm(happiness ~ stress, data = d.yyk))
summary(lm(happiness ~ reg.bmi, data = d.yyk))
summary(lm(happiness ~ bmi, data = d.yyk))
summary(lm(happiness ~ reg.stress, data = d.yyk))

# tmp.a and tmp.b were never defined in this note;
# the two plot() calls below would error as written
# plot(tmp.a, tmp.b)
# plot(tmp.b, tmp.a)

head(d.yyk)
str(d.yyk)
d.yyk
#########################
</code>
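The bmi --> stress --> happiness comment in the scratch work above suggests the same mediation structure as in e.g. 1. A sketch of the corresponding lavaan model, assuming d.yyk is loaded as above (specmod.yyk and fitmod.yyk are illustrative names):

<code r>
# same mediation template as in e.g. 1, applied to d.yyk:
# bmi -> stress -> happiness
library(lavaan)
specmod.yyk <- "
  # direct path (c prime)
  happiness ~ c*bmi
  # path a: bmi -> stress
  stress ~ a*bmi
  # path b: stress -> happiness
  happiness ~ b*stress
  # indirect effect (Sobel / delta method)
  ab := a*b
"
fitmod.yyk <- sem(specmod.yyk, data = d.yyk)
summary(fitmod.yyk, fit.measures = TRUE, rsquare = TRUE)
</code>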