User Tools

Site Tools


c:ms:2025:schedule:w13.lecture.note

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
c:ms:2025:schedule:w13.lecture.note [2025/06/02 09:09] hkimscilc:ms:2025:schedule:w13.lecture.note [2025/06/09 08:51] (current) – [output] hkimscil
Line 5: Line 5:
 # #
 rm(list=ls()) rm(list=ls())
-<- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")  +df <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")  
-d+df
  
-colnames(d) <- c("y", "x1", "x2"+colnames(df) <- c("y", "x1", "x2"
-d+df
 # y = 통장갯수 # y = 통장갯수
 # x1 = 인컴 # x1 = 인컴
 # x2 = 부양가족수  # x2 = 부양가족수 
-lm.y.x1 <- lm(y ~ x1, data=d)+lm.y.x1 <- lm(y ~ x1, data=df)
 summary(lm.y.x1) summary(lm.y.x1)
 anova(lm.y.x1) anova(lm.y.x1)
 +cor(df$x1, df$y)^2
 +summary(lm.y.x1)$r.squared
  
-lm.y.x2 <- lm(y ~ x2, data=d)+ 
 +lm.y.x2 <- lm(y ~ x2, data=df)
 summary(lm.y.x2) summary(lm.y.x2)
 anova(lm.y.x2) anova(lm.y.x2)
 +cor(df$x2, df$y)^2
 +summary(lm.y.x2)$r.squared
  
-lm.y.x1x2 <- lm(y ~ x1+x2, data=d)+ 
 +lm.y.x1x2 <- lm(y ~ x1+x2, data=df)
 summary(lm.y.x1x2) summary(lm.y.x1x2)
 anova(lm.y.x1x2) anova(lm.y.x1x2)
 +bcd <- summary(lm.y.x1x2)$r.squared
 +bcd
 +bc.cd <- summary(lm.y.x1)$r.squared + summary(lm.y.x2)$r.squared
 +bc.cd
 +bc.cd - bcd  # note that this is the amount that is shared by the two IVs
 +
  
 lm.y.x1x2$coefficient lm.y.x1x2$coefficient
-# y.hat = 6.399103 + (0.01184145)*x1 + (0.54472725)*x2 +# y.hat = 6.399103 + (0.01184145)*x1 + (-0.54472725)*x2 
 a <- lm.y.x1x2$coefficient[1] a <- lm.y.x1x2$coefficient[1]
 b1 <- lm.y.x1x2$coefficient[2] b1 <- lm.y.x1x2$coefficient[2]
Line 34: Line 46:
 b2  b2 
  
-y.pred <- a + (b1 * x1) + (b2 * x2)+y.pred <- a + (b1 * df$x1) + (b2 * df$x2)
 y.pred  y.pred 
 # or  # or 
Line 40: Line 52:
 head(y.pred == y.pred2) head(y.pred == y.pred2)
  
-y.real <- y+y.real <- df$y
 y.real y.real
-y.mean <- mean(y)+y.mean <- mean(df$y)
 y.mean  y.mean 
  
 +deviation.score <- df$y - y.mean
 +ds <- deviation.score
 res <- y.real - y.pred res <- y.real - y.pred
 reg <- y.pred - y.mean reg <- y.pred - y.mean
Line 50: Line 64:
 # remember y is sum of res + reg + y.mean # remember y is sum of res + reg + y.mean
 y2 <- res + reg + y.mean y2 <- res + reg + y.mean
-y==y2+df$y==y2
  
 +ss.tot <- sum(ds^2)
 ss.res <- sum(res^2) ss.res <- sum(res^2)
 ss.reg <- sum(reg^2) ss.reg <- sum(reg^2)
  
-ss.tot <- var(y) * (length(y)-1)+ss.tot2 <- var(df$y) * (length(df$y)-1)
 ss.tot ss.tot
 +ss.tot2
 ss.res ss.res
 ss.reg ss.reg
Line 62: Line 78:
  
 k <- 3 # # of parameters a, b1, b2 k <- 3 # # of parameters a, b1, b2
-df.tot <- length(y)-1+df.tot <- length(df$y)-1
 df.reg <- k - 1 df.reg <- k - 1
 df.res <- df.tot - df.reg df.res <- df.tot - df.reg
Line 78: Line 94:
 summary(lm.y.x1x2) summary(lm.y.x1x2)
 anova(lm.y.x1x2) anova(lm.y.x1x2)
 +
 +summary(lm(y~x2+x1, data=df))
 +anova(lm(y~x2+x1, data=df))
 +
 # note on 2 t-tests in summary # note on 2 t-tests in summary
 # anova에서의 x1, x2에 대한 테스트와  # anova에서의 x1, x2에 대한 테스트와 
Line 89: Line 109:
 # 테스트하기 때문에 그러함 # 테스트하기 때문에 그러함
  
 +# 또한 anova test에서 두번째 IV의 F값은 
 +# summary(lm)에서 두번 째 IV의 t값의 제곱값
 +# 임을 이해. 이는 두번 째 IV의 설명력을 나
 +# 타내는 부분이 lm과 anova 모두 같기 때문
 +# 반면에 첫번째 IV의 경우에는 lm 분석 때에는
 +# 고유의 설명력만을 가지고 (semi-partial cor^2)
 +# 판단을 하는 반면에, anova는 x2와 공유하는 
 +# 설명력도 포함해서 분석하기 때문에 t값의
 +# 제곱이 F값이 되지 못함
 +
 +# beta에 대한 설명
 # beta coefficient (standardized b) # beta coefficient (standardized b)
 # beta <- b * (sd(x)/sd(y)) # beta <- b * (sd(x)/sd(y))
-beta1 <- b1 * (sd(x1)/sd(y)) +beta1 <- b1 * (sd(df$x1)/sd(df$y)) 
-beta2 <- b2 * (sd(x2)/sd(y))+beta2 <- b2 * (sd(df$x2)/sd(df$y))
 beta1 beta1
 beta2 beta2
Line 105: Line 136:
 # understand with diagrams first # understand with diagrams first
 # then calculate with r # then calculate with r
-lm.tmp.1 <- lm(x2~x1, data=d)+lm.tmp.1 <- lm(x2~x1, data=df)
 res.x2.x1 <- lm.tmp.1$residuals res.x2.x1 <- lm.tmp.1$residuals
  
-lm.tmp.2 <- lm(y~x1, data=d)+lm.tmp.2 <- lm(y~x1, data=df)
 res.y.x1 <- lm.tmp.2$residuals res.y.x1 <- lm.tmp.2$residuals
  
-lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)+lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=df)
 summary(lm.tmp.3) summary(lm.tmp.3)
 +summary(lm.tmp.3)$r.squared 
 +sqrt(summary(lm.tmp.3)$r.squared)
 # install.packages("ppcor") # install.packages("ppcor")
 library(ppcor) library(ppcor)
-partial.r <- pcor.test(y, x2, x1) +partial.r <- pcor.test(df$y, df$x2, df$x1)
-partial.r+
 str(partial.r) str(partial.r)
 +partial.r$estimate
 summary(lm.tmp.3) summary(lm.tmp.3)
 summary(lm.tmp.3)$r.square summary(lm.tmp.3)$r.square
Line 125: Line 157:
  
 # x1's own explanation? # x1's own explanation?
-lm.tmp.4 <- lm(x1~x2, data=d)+lm.tmp.4 <- lm(x1~x2, data=df)
 res.x1.x2 <- lm.tmp.4$residuals res.x1.x2 <- lm.tmp.4$residuals
  
-lm.tmp.5 <- lm(y~x2, data=d)+lm.tmp.5 <- lm(y~x2, data=df)
 res.y.x2 <- lm.tmp.5$residuals res.y.x2 <- lm.tmp.5$residuals
  
-lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)+lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=df)
 summary(lm.tmp.6) summary(lm.tmp.6)
  
-partial.r <- pcor.test(y, x1, x2)+partial.r <- pcor.test(df$y, df$x1, df$x2)
 str(partial.r) str(partial.r)
 partial.r$estimate # this is partial correlation, not pr^2 partial.r$estimate # this is partial correlation, not pr^2
 # in order to get pr2, you should ^2 # in order to get pr2, you should ^2
 partial.r$estimate^2 partial.r$estimate^2
 +summary(lm.tmp.6)$r.squared
  
 ####################################################### #######################################################
-# 
 # semipartial correlation coefficient and spr2 # semipartial correlation coefficient and spr2
 # #
-spr.<- spcor.test(y,x2,x1) +spr.y.x2.x1 <- spcor.test(df$y,df$x2,df$x1) 
-spr.<- spcor.test(y,x1,x2) +spr.y.x1.x2 <- spcor.test(df$y,df$x1,df$x2) 
-spr.1 +spr.y.x2.x1 
-spr.2 +spr.y.x1.x2 
-spr.1$estimate^2 +spr2.y.x2.x1 <- spr.y.x2.x1$estimate^2 
-spr.2$estimate^2+spr2.y.x1.x2 <- spr.y.x1.x2$estimate^2 
 +spr2.y.x2.x1  
 +spr2.y.x1.x2 
  
-lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)+lm.tmp.7 <- lm(y ~ res.x2.x1, data=df)
 summary(lm.tmp.7) summary(lm.tmp.7)
-spr.1$estimate^2+spr2.y.x2.x1 
  
-lm.tmp.8 <- lm(y~res.x1.x2, data = d)+lm.tmp.8 <- lm(y~res.x1.x2, data=df)
 summary(lm.tmp.8) summary(lm.tmp.8)
-spr.2$estimate^2+spr2.y.x1.x2  
 + 
 +bcd # remember bcd in the above? 
 +bd <- spr2.y.x2.x1 + spr2.y.x1.x2  
 +bd 
 +bcd - bd
  
 ####################################################### #######################################################
Line 164: Line 203:
 summary(lm.y.x2) summary(lm.y.x2)
 all.x2 <- summary(lm.y.x2)$r.squared all.x2 <- summary(lm.y.x2)$r.squared
-sp.x2 <- spr.1$estimate^2 
 all.x2 all.x2
-sp.x2 +spr2.y.x2.x1 
-cma.1 <- all.x2 - sp.x2+cma.1 <- all.x2 - spr2.y.x2.x1
 cma.1 cma.1
  
Line 173: Line 211:
 summary(lm.y.x1) summary(lm.y.x1)
 all.x1 <- summary(lm.y.x1)$r.squared all.x1 <- summary(lm.y.x1)$r.squared
-sp.x1 <- spr.2$estimate^2 
 all.x1 all.x1
-sp.x1 +spr2.y.x1.x2 
-cma.2 <- all.x1 - sp.x1+cma.2 <- all.x1 - spr2.y.x1.x2
 cma.2 cma.2
  
Line 183: Line 220:
 r2.y.x1x2 <- summary(lm.y.x1x2)$r.square r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
 r2.y.x1x2 r2.y.x1x2
-sp.x1 +spr2.y.x1.x2 
-sp.x2 +spr2.y.x2.x1 
-cma.3 <- r2.y.x1x2 - (sp.x1 + sp.x2)+cma.3 <- r2.y.x1x2 - (spr2.y.x1.x2 spr2.y.x2.x1) 
 +bcd - bd
 cma.3 cma.3
  
Line 195: Line 233:
 # regression에서 얻은 R2을 가지고  # regression에서 얻은 R2을 가지고 
 # 공통설명력을 알아볼 수도 있었다. # 공통설명력을 알아볼 수도 있었다.
-r2.x1 <- summary(lm.y.x1)$r.square +r2.y.x1 <- summary(lm.y.x1)$r.square 
-r2.x2 <- summary(lm.y.x2)$r.square +r2.y.x2 <- summary(lm.y.x2)$r.square 
-r2.x1x2 <- summary(lm.y.x1x2)$r.square +r2.y.x1 
-r2.x1 + r2.x2 - r2.x1x2+r2.y.x2 
 +r2.y.x1x2 <- summary(lm.y.x1x2)$r.square 
 +r2.y.x1x2 
 +cma.4 <- r2.y.x1 + r2.y.x2 - r2.y.x1x2 
 +cma.4 
 + 
 +# Note that sorting out unique and common 
 +# explanation area is only possible with  
 +# semi-partial correlation determinant 
 +# NOT partial correlation determinant 
 +# because only semi-partial correlation 
 +# shares the same denominator (as total  
 +# y). 
 +############################################# 
  
 </code> </code>
 ====== output ====== ====== output ======
 <code> <code>
 +> # multiple regression: a simple e.g.
 +> #
 +> #
 +> rm(list=ls())
 +> df <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
 +> df
 +   bankaccount income famnum
 +1            6    220      5
 +2            5    190      6
 +3            7    260      3
 +4            7    200      4
 +5            8    330      2
 +6           10    490      4
 +7            8    210      3
 +8           11    380      2
 +9            9    320      1
 +10           9    270      3
 +
 +> colnames(df) <- c("y", "x1", "x2")
 +> df
 +    y  x1 x2
 +1   6 220  5
 +2   5 190  6
 +3   7 260  3
 +4   7 200  4
 +5   8 330  2
 +6  10 490  4
 +7   8 210  3
 +8  11 380  2
 +9   9 320  1
 +10  9 270  3
 +> # y = 통장갯수
 +> # x1 = 인컴
 +> # x2 = 부양가족수 
 +> lm.y.x1 <- lm(y ~ x1, data=df)
 +> summary(lm.y.x1)
 +
 +Call:
 +lm(formula = y ~ x1, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.519 -0.897 -0.130  1.006  1.580 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)   
 +(Intercept)  3.61778    1.24152    2.91    0.019 * 
 +x1           0.01527    0.00413    3.70    0.006 **
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 1.18 on 8 degrees of freedom
 +Multiple R-squared:  0.631, Adjusted R-squared:  0.585 
 +F-statistic: 13.7 on 1 and 8 DF,  p-value: 0.00605
 +
 +> anova(lm.y.x1)
 +Analysis of Variance Table
 +
 +Response: y
 +          Df Sum Sq Mean Sq F value Pr(>F)   
 +x1           18.9   18.93    13.7  0.006 **
 +Residuals  8   11.1    1.38                  
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +> cor(df$x1, df$y)^2
 +[1] 0.6311
 +> summary(lm.y.x1)$r.squared
 +[1] 0.6311
 +
 +
 +> lm.y.x2 <- lm(y ~ x2, data=df)
 +> summary(lm.y.x2)
 +
 +Call:
 +lm(formula = y ~ x2, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.254 -0.888 -0.485  0.496  2.592 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)    
 +(Intercept)   10.791      1.119    9.64  1.1e-05 ***
 +x2            -0.846      0.312   -2.71    0.027 *  
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 1.4 on 8 degrees of freedom
 +Multiple R-squared:  0.479, Adjusted R-squared:  0.414 
 +F-statistic: 7.36 on 1 and 8 DF,  p-value: 0.0265
 +
 +> anova(lm.y.x2)
 +Analysis of Variance Table
 +
 +Response: y
 +          Df Sum Sq Mean Sq F value Pr(>F)  
 +x2           14.4   14.38    7.36  0.027 *
 +Residuals  8   15.6    1.95                 
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +> cor(df$x2, df$y)^2
 +[1] 0.4793
 +> summary(lm.y.x2)$r.squared
 +[1] 0.4793
 +
 +
 +> lm.y.x1x2 <- lm(y ~ x1+x2, data=df)
 +> summary(lm.y.x1x2)
 +
 +Call:
 +lm(formula = y ~ x1 + x2, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.217 -0.578 -0.151  0.664  1.191 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)   
 +(Intercept)  6.39910    1.51654    4.22   0.0039 **
 +x1           0.01184    0.00356    3.33   0.0127 * 
 +x2          -0.54473    0.22636   -2.41   0.0470 * 
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 0.93 on 7 degrees of freedom
 +Multiple R-squared:  0.798, Adjusted R-squared:  0.74 
 +F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037
 +
 +> anova(lm.y.x1x2)
 +Analysis of Variance Table
 +
 +Response: y
 +          Df Sum Sq Mean Sq F value Pr(>F)   
 +x1          18.93   18.93   21.88 0.0023 **
 +x2           5.01    5.01    5.79 0.0470 * 
 +Residuals  7   6.06    0.87                  
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +> bcd <- summary(lm.y.x1x2)$r.squared
 +> bcd
 +[1] 0.7981
 +> bc.cd <- summary(lm.y.x1)$r.squared + summary(lm.y.x2)$r.squared
 +> bc.cd
 +[1] 1.11
 +> bc.cd - bcd  # note that this is the amount that is shared by the two IVs
 +[1] 0.3123
 +
 +
 +> lm.y.x1x2$coefficient
 +(Intercept)          x1          x2 
 +    6.39910     0.01184    -0.54473 
 +> # y.hat = 6.399103 + (0.01184145)*x1 + (-0.54472725)*x2 
 +> a <- lm.y.x1x2$coefficient[1]
 +> b1 <- lm.y.x1x2$coefficient[2]
 +> b2 <- lm.y.x1x2$coefficient[3]
 +> a
 +(Intercept) 
 +      6.399 
 +> b1
 +     x1 
 +0.01184 
 +> b2 
 +     x2 
 +-0.5447 
 +
 +> y.pred <- a + (b1 * df$x1) + (b2 * df$x2)
 +> y.pred 
 + [1]  6.281  5.381  7.844  6.588  9.217 10.023  7.252  9.809  9.644  7.962
 +> # or 
 +> y.pred2 <- predict(lm.y.x1x2)
 +> head(y.pred == y.pred2)
 +   1    2    3    4    5    6 
 +TRUE TRUE TRUE TRUE TRUE TRUE 
 +
 +> y.real <- df$y
 +> y.real
 + [1]  6  5  7  7  8 10  8 11  9  9
 +> y.mean <- mean(df$y)
 +> y.mean 
 +[1] 8
 +
 +> deviation.score <- df$y - y.mean
 +> ds <- deviation.score
 +> res <- y.real - y.pred
 +> reg <- y.pred - y.mean
 +> y.mean 
 +[1] 8
 +> # remember y is sum of res + reg + y.mean
 +> y2 <- res + reg + y.mean
 +> df$y==y2
 + [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 +
 +> ss.tot <- sum(ds^2)
 +> ss.res <- sum(res^2)
 +> ss.reg <- sum(reg^2)
 +
 +> ss.tot2 <- var(df$y) * (length(df$y)-1)
 +> ss.tot
 +[1] 30
 +> ss.tot2
 +[1] 30
 +> ss.res
 +[1] 6.056
 +> ss.reg
 +[1] 23.94
 +> ss.res+ss.reg
 +[1] 30
 +
 +> k <- 3 # # of parameters a, b1, b2
 +> df.tot <- length(df$y)-1
 +> df.reg <- k - 1
 +> df.res <- df.tot - df.reg
 +
 +> ms.reg <- ss.reg/df.reg
 +> ms.res <- ss.res/df.res
 +> ms.reg
 +[1] 11.97
 +> ms.res
 +[1] 0.8652
 +> f.val <- ms.reg/ms.res
 +> f.val
 +[1] 13.84
 +> p.val <- pf(f.val, df.reg, df.res, lower.tail = F)
 +> p.val
 +[1] 0.003696
 +
 +> # double check 
 +> summary(lm.y.x1x2)
 +
 +Call:
 +lm(formula = y ~ x1 + x2, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.217 -0.578 -0.151  0.664  1.191 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)   
 +(Intercept)  6.39910    1.51654    4.22   0.0039 **
 +x1           0.01184    0.00356    3.33   0.0127 * 
 +x2          -0.54473    0.22636   -2.41   0.0470 * 
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 0.93 on 7 degrees of freedom
 +Multiple R-squared:  0.798, Adjusted R-squared:  0.74 
 +F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037
 +
 +> anova(lm.y.x1x2)
 +Analysis of Variance Table
 +
 +Response: y
 +          Df Sum Sq Mean Sq F value Pr(>F)   
 +x1          18.93   18.93   21.88 0.0023 **
 +x2           5.01    5.01    5.79 0.0470 * 
 +Residuals  7   6.06    0.87                  
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +> summary(lm(y~x2+x1, data=df))
 +
 +Call:
 +lm(formula = y ~ x2 + x1, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.217 -0.578 -0.151  0.664  1.191 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)   
 +(Intercept)  6.39910    1.51654    4.22   0.0039 **
 +x2          -0.54473    0.22636   -2.41   0.0470 * 
 +x1           0.01184    0.00356    3.33   0.0127 * 
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 0.93 on 7 degrees of freedom
 +Multiple R-squared:  0.798, Adjusted R-squared:  0.74 
 +F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037
 +
 +> anova(lm(y~x2+x1, data=df))
 +Analysis of Variance Table
 +
 +Response: y
 +          Df Sum Sq Mean Sq F value Pr(>F)   
 +x2          14.38   14.38    16.6 0.0047 **
 +x1           9.57    9.57    11.1 0.0127 * 
 +Residuals  7   6.06    0.87                  
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +> # note on 2 t-tests in summary
 +> # anova에서의 x1, x2에 대한 테스트와 
 +> # lm에서의 x1, x2에 대한 테스트 (t-test) 간에
 +> # 차이가 있음에 주의 (x1, x2에 대한 Pr 값이 
 +> # 다름). 그 이유는 
 +> # t-tests는 __pr__ 테스트로 테스트를 
 +> # (spr, zero_order_r 테스트가 아님) 하고
 +> # anova test는 x1 전체에 대한 테스트 하고
 +> # x2는 x1에 대한 테스트 외에 나머지를 가지고
 +> # 테스트하기 때문에 그러함
 +
 +> # 또한 anova test에서 두번째 IV의 F값은 
 +> # summary(lm)에서 두번 째 IV의 t값의 제곱값
 +> # 임을 이해. 이는 두번 째 IV의 설명력을 나
 +> # 타내는 부분이 lm과 anova 모두 같기 때문
 +> # 반면에 첫번째 IV의 경우에는 lm 분석 때에는
 +> # 고유의 설명력만을 가지고 (semi-partial cor^2)
 +> # 판단을 하는 반면에, anova는 x2와 공유하는 
 +> # 설명력도 포함해서 분석하기 때문에 t값의
 +> # 제곱이 F값이 되지 못함
 +
 +> # beta에 대한 설명
 +> # beta coefficient (standardized b)
 +> # beta <- b * (sd(x)/sd(y))
 +> beta1 <- b1 * (sd(df$x1)/sd(df$y))
 +> beta2 <- b2 * (sd(df$x2)/sd(df$y))
 +> beta1
 +    x1 
 +0.6161 
 +> beta2
 +     x2 
 +-0.4459 
 +
 +> # install.packages("lm.beta")
 +> library(lm.beta)
 +> lm.beta(lm.y.x1x2)
 +
 +Call:
 +lm(formula = y ~ x1 + x2, data = df)
 +
 +Standardized Coefficients::
 +(Intercept)          x1          x2 
 +         NA      0.6161     -0.4459 
 +
 +
 +> #######################################################
 +> # partial correlation coefficient and pr2
 +> # x2's explanation? 
 +> # understand with diagrams first
 +> # then calculate with r
 +> lm.tmp.1 <- lm(x2~x1, data=df)
 +> res.x2.x1 <- lm.tmp.1$residuals
 +
 +> lm.tmp.2 <- lm(y~x1, data=df)
 +> res.y.x1 <- lm.tmp.2$residuals
 +
 +> lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=df)
 +> summary(lm.tmp.3)
 +
 +Call:
 +lm(formula = res.y.x1 ~ res.x2.x1, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.217 -0.578 -0.151  0.664  1.191 
 +
 +Coefficients:
 +             Estimate Std. Error t value Pr(>|t|)  
 +(Intercept)  6.28e-18   2.75e-01    0.00    1.000  
 +res.x2.x1   -5.45e-01   2.12e-01   -2.57    0.033 *
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 0.87 on 8 degrees of freedom
 +Multiple R-squared:  0.453, Adjusted R-squared:  0.384 
 +F-statistic: 6.62 on 1 and 8 DF,  p-value: 0.033
 +
 +> summary(lm.tmp.3)$r.squared
 +[1] 0.4527
 +> sqrt(summary(lm.tmp.3)$r.squared)
 +[1] 0.6729
 +> # install.packages("ppcor")
 +> library(ppcor)
 +> partial.r <- pcor.test(df$y, df$x2, df$x1)
 +> str(partial.r)
 +'data.frame': 1 obs. of  6 variables:
 + $ estimate : num -0.673
 + $ p.value  : num 0.047
 + $ statistic: num -2.41
 + $ n        : int 10
 + $ gp       : num 1
 + $ Method   : chr "pearson"
 +> partial.r$estimate
 +[1] -0.6729
 +> summary(lm.tmp.3)
 +
 +Call:
 +lm(formula = res.y.x1 ~ res.x2.x1, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.217 -0.578 -0.151  0.664  1.191 
 +
 +Coefficients:
 +             Estimate Std. Error t value Pr(>|t|)  
 +(Intercept)  6.28e-18   2.75e-01    0.00    1.000  
 +res.x2.x1   -5.45e-01   2.12e-01   -2.57    0.033 *
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 0.87 on 8 degrees of freedom
 +Multiple R-squared:  0.453, Adjusted R-squared:  0.384 
 +F-statistic: 6.62 on 1 and 8 DF,  p-value: 0.033
 +
 +> summary(lm.tmp.3)$r.square
 +[1] 0.4527
 +> partial.r$estimate^2
 +[1] 0.4527
 +
 +
 +> # x1's own explanation?
 +> lm.tmp.4 <- lm(x1~x2, data=df)
 +> res.x1.x2 <- lm.tmp.4$residuals
 +
 +> lm.tmp.5 <- lm(y~x2, data=df)
 +> res.y.x2 <- lm.tmp.5$residuals
 +
 +> lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=df)
 +> summary(lm.tmp.6)
 +
 +Call:
 +lm(formula = res.y.x2 ~ res.x1.x2, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.217 -0.578 -0.151  0.664  1.191 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)   
 +(Intercept) 1.33e-17   2.75e-01    0.00   1.0000   
 +res.x1.x2   1.18e-02   3.33e-03    3.55   0.0075 **
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 0.87 on 8 degrees of freedom
 +Multiple R-squared:  0.612, Adjusted R-squared:  0.564 
 +F-statistic: 12.6 on 1 and 8 DF,  p-value: 0.00746
 +
 +
 +> partial.r <- pcor.test(df$y, df$x1, df$x2)
 +> str(partial.r)
 +'data.frame': 1 obs. of  6 variables:
 + $ estimate : num 0.783
 + $ p.value  : num 0.0127
 + $ statistic: num 3.33
 + $ n        : int 10
 + $ gp       : num 1
 + $ Method   : chr "pearson"
 +> partial.r$estimate # this is partial correlation, not pr^2
 +[1] 0.7825
 +> # in order to get pr2, you should ^2
 +> partial.r$estimate^2
 +[1] 0.6123
 +> summary(lm.tmp.6)$r.squared
 +[1] 0.6123
 +
 +> #######################################################
 +> # semipartial correlation coefficient and spr2
 +> #
 +> spr.y.x2.x1 <- spcor.test(df$y,df$x2,df$x1)
 +> spr.y.x1.x2 <- spcor.test(df$y,df$x1,df$x2)
 +> spr.y.x2.x1
 +  estimate p.value statistic  n gp  Method
 +1  -0.4087  0.2748    -1.185 10  1 pearson
 +> spr.y.x1.x2
 +  estimate p.value statistic  n gp  Method
 +1   0.5647  0.1132      1.81 10  1 pearson
 +> spr2.y.x2.x1 <- spr.y.x2.x1$estimate^2
 +> spr2.y.x1.x2 <- spr.y.x1.x2$estimate^2
 +> spr2.y.x2.x1 
 +[1] 0.167
 +> spr2.y.x1.x2 
 +[1] 0.3189
 +
 +> lm.tmp.7 <- lm(y ~ res.x2.x1, data=df)
 +> summary(lm.tmp.7)
 +
 +Call:
 +lm(formula = y ~ res.x2.x1, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.862 -1.171 -0.494  0.549  3.077 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)    
 +(Intercept)    8.000      0.559   14.31  5.5e-07 ***
 +res.x2.x1     -0.545      0.430   -1.27     0.24    
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 1.77 on 8 degrees of freedom
 +Multiple R-squared:  0.167, Adjusted R-squared:  0.0629 
 +F-statistic:  1.6 on 1 and 8 DF,  p-value: 0.241
 +
 +> spr2.y.x2.x1 
 +[1] 0.167
 +
 +> lm.tmp.8 <- lm(y~res.x1.x2, data=df)
 +> summary(lm.tmp.8)
 +
 +Call:
 +lm(formula = y ~ res.x1.x2, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-2.664 -0.608 -0.149  1.219  2.290 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)    
 +(Intercept)  8.00000    0.50540   15.83  2.5e-07 ***
 +res.x1.x2    0.01184    0.00612    1.94    0.089 .  
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 1.6 on 8 degrees of freedom
 +Multiple R-squared:  0.319, Adjusted R-squared:  0.234 
 +F-statistic: 3.74 on 1 and 8 DF,  p-value: 0.089
 +
 +> spr2.y.x1.x2 
 +[1] 0.3189
 +
 +> bcd # remember bcd in the above?
 +[1] 0.7981
 +> bd <- spr2.y.x2.x1 + spr2.y.x1.x2 
 +> bd
 +[1] 0.4859
 +> bcd - bd
 +[1] 0.3123
 +
 +> #######################################################
 +> # get the common area that explain the y variable
 +> # 1.
 +> summary(lm.y.x2)
 +
 +Call:
 +lm(formula = y ~ x2, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.254 -0.888 -0.485  0.496  2.592 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)    
 +(Intercept)   10.791      1.119    9.64  1.1e-05 ***
 +x2            -0.846      0.312   -2.71    0.027 *  
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 1.4 on 8 degrees of freedom
 +Multiple R-squared:  0.479, Adjusted R-squared:  0.414 
 +F-statistic: 7.36 on 1 and 8 DF,  p-value: 0.0265
 +
 +> all.x2 <- summary(lm.y.x2)$r.squared
 +> all.x2
 +[1] 0.4793
 +> spr2.y.x2.x1
 +[1] 0.167
 +> cma.1 <- all.x2 - spr2.y.x2.x1
 +> cma.1
 +[1] 0.3123
 +
 +> # 2.
 +> summary(lm.y.x1)
 +
 +Call:
 +lm(formula = y ~ x1, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.519 -0.897 -0.130  1.006  1.580 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)   
 +(Intercept)  3.61778    1.24152    2.91    0.019 * 
 +x1           0.01527    0.00413    3.70    0.006 **
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 1.18 on 8 degrees of freedom
 +Multiple R-squared:  0.631, Adjusted R-squared:  0.585 
 +F-statistic: 13.7 on 1 and 8 DF,  p-value: 0.00605
 +
 +> all.x1 <- summary(lm.y.x1)$r.squared
 +> all.x1
 +[1] 0.6311
 +> spr2.y.x1.x2
 +[1] 0.3189
 +> cma.2 <- all.x1 - spr2.y.x1.x2
 +> cma.2
 +[1] 0.3123
 +
 +> # OR 3.
 +> summary(lm.y.x1x2)
 +
 +Call:
 +lm(formula = y ~ x1 + x2, data = df)
 +
 +Residuals:
 +   Min     1Q Median     3Q    Max 
 +-1.217 -0.578 -0.151  0.664  1.191 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)   
 +(Intercept)  6.39910    1.51654    4.22   0.0039 **
 +x1           0.01184    0.00356    3.33   0.0127 * 
 +x2          -0.54473    0.22636   -2.41   0.0470 * 
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 0.93 on 7 degrees of freedom
 +Multiple R-squared:  0.798, Adjusted R-squared:  0.74 
 +F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037
 +
 +> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
 +> r2.y.x1x2
 +[1] 0.7981
 +> spr2.y.x1.x2
 +[1] 0.3189
 +> spr2.y.x2.x1
 +[1] 0.167
 +> cma.3 <- r2.y.x1x2 - (spr2.y.x1.x2 + spr2.y.x2.x1)
 +> bcd - bd
 +[1] 0.3123
 +> cma.3
 +[1] 0.3123
 +
 +> cma.1 
 +[1] 0.3123
 +> cma.2
 +[1] 0.3123
 +> cma.3
 +[1] 0.3123
 +
 +> # OR 애초에 simple regression과 multiple 
 +> # regression에서 얻은 R2을 가지고 
 +> # 공통설명력을 알아볼 수도 있었다.
 +> r2.y.x1 <- summary(lm.y.x1)$r.square
 +> r2.y.x2 <- summary(lm.y.x2)$r.square
 +> r2.y.x1
 +[1] 0.6311
 +> r2.y.x2
 +[1] 0.4793
 +> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
 +> r2.y.x1x2
 +[1] 0.7981
 +> cma.4 <- r2.y.x1 + r2.y.x2 - r2.y.x1x2
 +> cma.4
 +[1] 0.3123
 +
 +> # Note that sorting out unique and common
 +> # explanation area is only possible with 
 +> # semi-partial correlation determinant
 +> # NOT partial correlation determinant
 +> # because only semi-partial correlation
 +> # shares the same denominator (as total 
 +> # y).
 +> #############################################
 +
 +
 </code> </code>
  
 +====== explanation. added ======
 +{{:c:ms:2025:schedule:pasted:20250609-074413.png?400}}
 +<code>
 +# ex.
 +# resid(lm(y~x1, data=df)) = bc / delta.y
 +# resid(lm(y~x2, data=df)) = cd / delta.y
 +# resid(lm(y~x1+x2, data=df)) = bcd / delta.y
 +# b / delta.y = ?
 +# ce / delta.x2 = ?
 +</code>
 +
 +<code>
 +# exp.added
 +spcor.test(df$y, df$x1, df$x2)
 +spcor.test(df$y, df$x1, df$x2)$estimate
 +spcor.test(df$y, df$x1, df$x2)$estimate^2
 +spcor.test(df$y, df$x2, df$x1)$estimate^2
 +summary(lm(y~x1+x2, data=df))$r.square
 +
 +b <- spcor.test(df$y, df$x1, df$x2)$estimate^2
 +d <- spcor.test(df$y, df$x2, df$x1)$estimate^2
 +bcd <- summary(lm(y~x1+x2, data=df))$r.square
 +
 +summary(lm(df$y~df$x1+df$x2, data=df))$r.square - 
 +  (spcor.test(df$y, df$x1, df$x2)$estimate^2 + 
 +     spcor.test(df$y, df$x2, df$x1)$estimate^2)
 +bcd - (b + d)
 +
 +</code>
  
 +====== Another one ======
 +[[:partial and semipartial correlation]]
  
c/ms/2025/schedule/w13.lecture.note.1748822961.txt.gz · Last modified: 2025/06/02 09:09 by hkimscil

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki