====== MR ======


# multiple regression: a simple e.g.
#
#
rm(list=ls())
df <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv") 
df

colnames(df) <- c("y", "x1", "x2")
df
# y = 통장갯수
# x1 = 인컴
# x2 = 부양가족수 
lm.y.x1 <- lm(y ~ x1, data=df)
summary(lm.y.x1)
anova(lm.y.x1)
cor(df$x1, df$y)^2
summary(lm.y.x1)$r.squared


lm.y.x2 <- lm(y ~ x2, data=df)
summary(lm.y.x2)
anova(lm.y.x2)
cor(df$x2, df$y)^2
summary(lm.y.x2)$r.squared


lm.y.x1x2 <- lm(y ~ x1+x2, data=df)
summary(lm.y.x1x2)
anova(lm.y.x1x2)
bcd <- summary(lm.y.x1x2)$r.squared
bcd
bc.cd <- summary(lm.y.x1)$r.squared + summary(lm.y.x2)$r.squared
bc.cd
bc.cd - bcd  # note that this is the amount that shared by the two IVs


lm.y.x1x2$coefficient
# y.hat = 6.399103 + (0.01184145)*x1 + (?0.54472725)*x2 
a <- lm.y.x1x2$coefficient[1]
b1 <- lm.y.x1x2$coefficient[2]
b2 <- lm.y.x1x2$coefficient[3]
a
b1
b2 

y.pred <- a + (b1 * df$x1) + (b2 * df$x2)
y.pred 
# or 
y.pred2 <- predict(lm.y.x1x2)
head(y.pred == y.pred2)

y.real <- df$y
y.real
y.mean <- mean(df$y)
y.mean 

deviation.score <- df$y - y.mean
ds <- deviation.score
res <- y.real - y.pred
reg <- y.pred - y.mean
y.mean 
# remember y is sum of res + reg + y.mean
y2 <- res + reg + y.mean
df$y==y2

ss.tot <- sum(ds^2)
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)

ss.tot2 <- var(df$y) * (length(df$y)-1)
ss.tot
ss.tot2
ss.res
ss.reg
ss.res+ss.reg

k <- 3 # # of parameters a, b1, b2
df.tot <- length(df$y)-1
df.reg <- k - 1
df.res <- df.tot - df.reg

ms.reg <- ss.reg/df.reg
ms.res <- ss.res/df.res
ms.reg
ms.res
f.val <- ms.reg/ms.res
f.val
p.val <- pf(f.val, df.reg, df.res, lower.tail = F)
p.val

# double check 
summary(lm.y.x1x2)
anova(lm.y.x1x2)

summary(lm(y~x2+x1, data=df))
anova(lm(y~x2+x1, data=df))

# note on 2 t-tests in summary
# anova에서의 x1, x2에 대한 테스트와 
# lm에서의 x1, x2에 대한 테스트 (t-test) 간에
# 차이가 있음에 주의 (x1, x2에 대한 Pr 값이 
# 다름). 그 이유는 
# t-tests는 __pr__ 테스트로 테스트를 
# (spr, zero_order_r 테스트가 아님) 하고
# anova test는 x1 전체에 대한 테스트 하고
# x2는 x1에 대한 테스트 외에 나머지를 가지고
# 테스트하기 때문에 그러함

# 또한 anova test에서 두번째 IV의 F값은 
# summary(lm)에서 두번 째 IV의 t값의 제곱값
# 임을 이해. 이는 두번 째 IV의 설명력을 나
# 타내는 부분이 lm과 anova 모두 같기 때문
# 반면에 첫번째 IV의 경우에는 lm 분석 때에는
# 고유의 설명력만을 가지고 (semi-partial cor^2)
# 판단을 하는 반면에, anova는 x2와 공유하는 
# 설명력도 포함해서 분석하기 때문에 t값의
# 제곱이 F값이 되지 못함

# beta에 대한 설명
# beta coefficient (standardized b)
# beta <- b * (sd(x)/sd(y))
beta1 <- b1 * (sd(df$x1)/sd(df$y))
beta2 <- b2 * (sd(df$x2)/sd(df$y))
beta1
beta2

# install.packages("lm.beta")
library(lm.beta)
lm.beta(lm.y.x1x2)

#######################################################
# partial correlation coefficient and pr2
# x2's explanation? 
# understand with diagrams first
# then calculate with r
lm.tmp.1 <- lm(x2~x1, data=df)
res.x2.x1 <- lm.tmp.1$residuals

lm.tmp.2 <- lm(y~x1, data=df)
res.y.x1 <- lm.tmp.2$residuals

lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=df)
summary(lm.tmp.3)
summary(lm.tmp.3)$r.squared
sqrt(summary(lm.tmp.3)$r.squared)
# install.packages("ppcor")
library(ppcor)
partial.r <- pcor.test(df$y, df$x2, df$x1)
str(partial.r)
partial.r$estimate
summary(lm.tmp.3)
summary(lm.tmp.3)$r.square
partial.r$estimate^2


# x1's own explanation?
lm.tmp.4 <- lm(x1~x2, data=df)
res.x1.x2 <- lm.tmp.4$residuals

lm.tmp.5 <- lm(y~x2, data=df)
res.y.x2 <- lm.tmp.5$residuals

lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=df)
summary(lm.tmp.6)

partial.r <- pcor.test(df$y, df$x1, df$x2)
str(partial.r)
partial.r$estimate # this is partial correlation, not pr^2
# in order to get pr2, you should ^2
partial.r$estimate^2
summary(lm.tmp.6)$r.squared

#######################################################
# semipartial correlation coefficient and spr2
#
spr.y.x2.x1 <- spcor.test(df$y,df$x2,df$x1)
spr.y.x1.x2 <- spcor.test(df$y,df$x1,df$x2)
spr.y.x2.x1
spr.y.x1.x2
spr2.y.x2.x1 <- spr.y.x2.x1$estimate^2
spr2.y.x1.x2 <- spr.y.x1.x2$estimate^2
spr2.y.x2.x1 
spr2.y.x1.x2 

lm.tmp.7 <- lm(y ~ res.x2.x1, data=df)
summary(lm.tmp.7)
spr2.y.x2.x1 

lm.tmp.8 <- lm(y~res.x1.x2, data=df)
summary(lm.tmp.8)
spr2.y.x1.x2 

bcd # remember bcd in the above?
bd <- spr2.y.x2.x1 + spr2.y.x1.x2 
bd
bcd - bd

#######################################################
# get the common area that explain the y variable
# 1.
summary(lm.y.x2)
all.x2 <- summary(lm.y.x2)$r.squared
all.x2
spr2.y.x2.x1
cma.1 <- all.x2 - spr2.y.x2.x1
cma.1

# 2.
summary(lm.y.x1)
all.x1 <- summary(lm.y.x1)$r.squared
all.x1
spr2.y.x1.x2
cma.2 <- all.x1 - spr2.y.x1.x2
cma.2

# OR 3.
summary(lm.y.x1x2)
r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
r2.y.x1x2
spr2.y.x1.x2
spr2.y.x2.x1
cma.3 <- r2.y.x1x2 - (spr2.y.x1.x2 + spr2.y.x2.x1)
bcd - bd
cma.3

cma.1 
cma.2
cma.3

# OR 애초에 simple regression과 multiple 
# regression에서 얻은 R2을 가지고 
# 공통설명력을 알아볼 수도 있었다.
r2.y.x1 <- summary(lm.y.x1)$r.square
r2.y.x2 <- summary(lm.y.x2)$r.square
r2.y.x1
r2.y.x2
r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
r2.y.x1x2
cma.4 <- r2.y.x1 + r2.y.x2 - r2.y.x1x2
cma.4

# Note that sorting out unique and common
# explanation area is only possible with 
# semi-partial correlation determinant
# NOT partial correlation determinant
# because only semi-partial correlation
# shares the same denominator (as total 
# y).
#############################################

====== output ======


> # multiple regression: a simple e.g.
> #
> #
> rm(list=ls())
> df <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv") 
> df
   bankaccount income famnum
1            6    220      5
2            5    190      6
3            7    260      3
4            7    200      4
5            8    330      2
6           10    490      4
7            8    210      3
8           11    380      2
9            9    320      1
10           9    270      3
> 
> colnames(df) <- c("y", "x1", "x2")
> df
    y  x1 x2
1   6 220  5
2   5 190  6
3   7 260  3
4   7 200  4
5   8 330  2
6  10 490  4
7   8 210  3
8  11 380  2
9   9 320  1
10  9 270  3
> # y = 통장갯수
> # x1 = 인컴
> # x2 = 부양가족수 
> lm.y.x1 <- lm(y ~ x1, data=df)
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.519 -0.897 -0.130  1.006  1.580 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  3.61778    1.24152    2.91    0.019 * 
x1           0.01527    0.00413    3.70    0.006 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.18 on 8 degrees of freedom
Multiple R-squared:  0.631,	Adjusted R-squared:  0.585 
F-statistic: 13.7 on 1 and 8 DF,  p-value: 0.00605

> anova(lm.y.x1)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)   
x1         1   18.9   18.93    13.7  0.006 **
Residuals  8   11.1    1.38                  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> cor(df$x1, df$y)^2
[1] 0.6311
> summary(lm.y.x1)$r.squared
[1] 0.6311
> 
> 
> lm.y.x2 <- lm(y ~ x2, data=df)
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.254 -0.888 -0.485  0.496  2.592 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   10.791      1.119    9.64  1.1e-05 ***
x2            -0.846      0.312   -2.71    0.027 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.4 on 8 degrees of freedom
Multiple R-squared:  0.479,	Adjusted R-squared:  0.414 
F-statistic: 7.36 on 1 and 8 DF,  p-value: 0.0265

> anova(lm.y.x2)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)  
x2         1   14.4   14.38    7.36  0.027 *
Residuals  8   15.6    1.95                 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> cor(df$x2, df$y)^2
[1] 0.4793
> summary(lm.y.x2)$r.squared
[1] 0.4793
> 
> 
> lm.y.x1x2 <- lm(y ~ x1+x2, data=df)
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.39910    1.51654    4.22   0.0039 **
x1           0.01184    0.00356    3.33   0.0127 * 
x2          -0.54473    0.22636   -2.41   0.0470 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.798,	Adjusted R-squared:  0.74 
F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037

> anova(lm.y.x1x2)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)   
x1         1  18.93   18.93   21.88 0.0023 **
x2         1   5.01    5.01    5.79 0.0470 * 
Residuals  7   6.06    0.87                  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> bcd <- summary(lm.y.x1x2)$r.squared
> bcd
[1] 0.7981
> bc.cd <- summary(lm.y.x1)$r.squared + summary(lm.y.x2)$r.squared
> bc.cd
[1] 1.11
> bc.cd - bcd  # note that this is the amount that shared by the two IVs
[1] 0.3123
> 
> 
> lm.y.x1x2$coefficient
(Intercept)          x1          x2 
    6.39910     0.01184    -0.54473 
> # y.hat = 6.399103 + (0.01184145)*x1 + (?0.54472725)*x2 
> a <- lm.y.x1x2$coefficient[1]
> b1 <- lm.y.x1x2$coefficient[2]
> b2 <- lm.y.x1x2$coefficient[3]
> a
(Intercept) 
      6.399 
> b1
     x1 
0.01184 
> b2 
     x2 
-0.5447 
> 
> y.pred <- a + (b1 * df$x1) + (b2 * df$x2)
> y.pred 
 [1]  6.281  5.381  7.844  6.588  9.217 10.023  7.252  9.809  9.644  7.962
> # or 
> y.pred2 <- predict(lm.y.x1x2)
> head(y.pred == y.pred2)
   1    2    3    4    5    6 
TRUE TRUE TRUE TRUE TRUE TRUE 
> 
> y.real <- df$y
> y.real
 [1]  6  5  7  7  8 10  8 11  9  9
> y.mean <- mean(df$y)
> y.mean 
[1] 8
> 
> deviation.score <- df$y - y.mean
> ds <- deviation.score
> res <- y.real - y.pred
> reg <- y.pred - y.mean
> y.mean 
[1] 8
> # remember y is sum of res + reg + y.mean
> y2 <- res + reg + y.mean
> df$y==y2
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
> 
> ss.tot <- sum(ds^2)
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> 
> ss.tot2 <- var(df$y) * (length(df$y)-1)
> ss.tot
[1] 30
> ss.tot2
[1] 30
> ss.res
[1] 6.056
> ss.reg
[1] 23.94
> ss.res+ss.reg
[1] 30
> 
> k <- 3 # # of parameters a, b1, b2
> df.tot <- length(df$y)-1
> df.reg <- k - 1
> df.res <- df.tot - df.reg
> 
> ms.reg <- ss.reg/df.reg
> ms.res <- ss.res/df.res
> ms.reg
[1] 11.97
> ms.res
[1] 0.8652
> f.val <- ms.reg/ms.res
> f.val
[1] 13.84
> p.val <- pf(f.val, df.reg, df.res, lower.tail = F)
> p.val
[1] 0.003696
> 
> # double check 
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.39910    1.51654    4.22   0.0039 **
x1           0.01184    0.00356    3.33   0.0127 * 
x2          -0.54473    0.22636   -2.41   0.0470 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.798,	Adjusted R-squared:  0.74 
F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037

> anova(lm.y.x1x2)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)   
x1         1  18.93   18.93   21.88 0.0023 **
x2         1   5.01    5.01    5.79 0.0470 * 
Residuals  7   6.06    0.87                  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 
> summary(lm(y~x2+x1, data=df))

Call:
lm(formula = y ~ x2 + x1, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.39910    1.51654    4.22   0.0039 **
x2          -0.54473    0.22636   -2.41   0.0470 * 
x1           0.01184    0.00356    3.33   0.0127 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.798,	Adjusted R-squared:  0.74 
F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037

> anova(lm(y~x2+x1, data=df))
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value Pr(>F)   
x2         1  14.38   14.38    16.6 0.0047 **
x1         1   9.57    9.57    11.1 0.0127 * 
Residuals  7   6.06    0.87                  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 
> # note on 2 t-tests in summary
> # anova에서의 x1, x2에 대한 테스트와 
> # lm에서의 x1, x2에 대한 테스트 (t-test) 간에
> # 차이가 있음에 주의 (x1, x2에 대한 Pr 값이 
> # 다름). 그 이유는 
> # t-tests는 __pr__ 테스트로 테스트를 
> # (spr, zero_order_r 테스트가 아님) 하고
> # anova test는 x1 전체에 대한 테스트 하고
> # x2는 x1에 대한 테스트 외에 나머지를 가지고
> # 테스트하기 때문에 그러함
> 
> # 또한 anova test에서 두번째 IV의 F값은 
> # summary(lm)에서 두번 째 IV의 t값의 제곱값
> # 임을 이해. 이는 두번 째 IV의 설명력을 나
> # 타내는 부분이 lm과 anova 모두 같기 때문
> # 반면에 첫번째 IV의 경우에는 lm 분석 때에는
> # 고유의 설명력만을 가지고 (semi-partial cor^2)
> # 판단을 하는 반면에, anova는 x2와 공유하는 
> # 설명력도 포함해서 분석하기 때문에 t값의
> # 제곱이 F값이 되지 못함
> 
> # beta에 대한 설명
> # beta coefficient (standardized b)
> # beta <- b * (sd(x)/sd(y))
> beta1 <- b1 * (sd(df$x1)/sd(df$y))
> beta2 <- b2 * (sd(df$x2)/sd(df$y))
> beta1
    x1 
0.6161 
> beta2
     x2 
-0.4459 
> 
> # install.packages("lm.beta")
> library(lm.beta)
> lm.beta(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = df)

Standardized Coefficients::
(Intercept)          x1          x2 
         NA      0.6161     -0.4459 

> 
> #######################################################
> # partial correlation coefficient and pr2
> # x2's explanation? 
> # understand with diagrams first
> # then calculate with r
> lm.tmp.1 <- lm(x2~x1, data=df)
> res.x2.x1 <- lm.tmp.1$residuals
> 
> lm.tmp.2 <- lm(y~x1, data=df)
> res.y.x1 <- lm.tmp.2$residuals
> 
> lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=df)
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept)  6.28e-18   2.75e-01    0.00    1.000  
res.x2.x1   -5.45e-01   2.12e-01   -2.57    0.033 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.87 on 8 degrees of freedom
Multiple R-squared:  0.453,	Adjusted R-squared:  0.384 
F-statistic: 6.62 on 1 and 8 DF,  p-value: 0.033

> summary(lm.tmp.3)$r.squared
[1] 0.4527
> sqrt(summary(lm.tmp.3)$r.squared)
[1] 0.6729
> # install.packages("ppcor")
> library(ppcor)
> partial.r <- pcor.test(df$y, df$x2, df$x1)
> str(partial.r)
'data.frame':	1 obs. of  6 variables:
 $ estimate : num -0.673
 $ p.value  : num 0.047
 $ statistic: num -2.41
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate
[1] -0.6729
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept)  6.28e-18   2.75e-01    0.00    1.000  
res.x2.x1   -5.45e-01   2.12e-01   -2.57    0.033 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.87 on 8 degrees of freedom
Multiple R-squared:  0.453,	Adjusted R-squared:  0.384 
F-statistic: 6.62 on 1 and 8 DF,  p-value: 0.033

> summary(lm.tmp.3)$r.square
[1] 0.4527
> partial.r$estimate^2
[1] 0.4527
> 
> 
> # x1's own explanation?
> lm.tmp.4 <- lm(x1~x2, data=df)
> res.x1.x2 <- lm.tmp.4$residuals
> 
> lm.tmp.5 <- lm(y~x2, data=df)
> res.y.x2 <- lm.tmp.5$residuals
> 
> lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=df)
> summary(lm.tmp.6)

Call:
lm(formula = res.y.x2 ~ res.x1.x2, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 1.33e-17   2.75e-01    0.00   1.0000   
res.x1.x2   1.18e-02   3.33e-03    3.55   0.0075 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.87 on 8 degrees of freedom
Multiple R-squared:  0.612,	Adjusted R-squared:  0.564 
F-statistic: 12.6 on 1 and 8 DF,  p-value: 0.00746

> 
> partial.r <- pcor.test(df$y, df$x1, df$x2)
> str(partial.r)
'data.frame':	1 obs. of  6 variables:
 $ estimate : num 0.783
 $ p.value  : num 0.0127
 $ statistic: num 3.33
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate # this is partial correlation, not pr^2
[1] 0.7825
> # in order to get pr2, you should ^2
> partial.r$estimate^2
[1] 0.6123
> summary(lm.tmp.6)$r.squared
[1] 0.6123
> 
> #######################################################
> # semipartial correlation coefficient and spr2
> #
> spr.y.x2.x1 <- spcor.test(df$y,df$x2,df$x1)
> spr.y.x1.x2 <- spcor.test(df$y,df$x1,df$x2)
> spr.y.x2.x1
  estimate p.value statistic  n gp  Method
1  -0.4087  0.2748    -1.185 10  1 pearson
> spr.y.x1.x2
  estimate p.value statistic  n gp  Method
1   0.5647  0.1132      1.81 10  1 pearson
> spr2.y.x2.x1 <- spr.y.x2.x1$estimate^2
> spr2.y.x1.x2 <- spr.y.x1.x2$estimate^2
> spr2.y.x2.x1 
[1] 0.167
> spr2.y.x1.x2 
[1] 0.3189
> 
> lm.tmp.7 <- lm(y ~ res.x2.x1, data=df)
> summary(lm.tmp.7)

Call:
lm(formula = y ~ res.x2.x1, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.862 -1.171 -0.494  0.549  3.077 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)    8.000      0.559   14.31  5.5e-07 ***
res.x2.x1     -0.545      0.430   -1.27     0.24    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.77 on 8 degrees of freedom
Multiple R-squared:  0.167,	Adjusted R-squared:  0.0629 
F-statistic:  1.6 on 1 and 8 DF,  p-value: 0.241

> spr2.y.x2.x1 
[1] 0.167
> 
> lm.tmp.8 <- lm(y~res.x1.x2, data=df)
> summary(lm.tmp.8)

Call:
lm(formula = y ~ res.x1.x2, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-2.664 -0.608 -0.149  1.219  2.290 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  8.00000    0.50540   15.83  2.5e-07 ***
res.x1.x2    0.01184    0.00612    1.94    0.089 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.6 on 8 degrees of freedom
Multiple R-squared:  0.319,	Adjusted R-squared:  0.234 
F-statistic: 3.74 on 1 and 8 DF,  p-value: 0.089

> spr2.y.x1.x2 
[1] 0.3189
> 
> bcd # remember bcd in the above?
[1] 0.7981
> bd <- spr2.y.x2.x1 + spr2.y.x1.x2 
> bd
[1] 0.4859
> bcd - bd
[1] 0.3123
> 
> #######################################################
> # get the common area that explain the y variable
> # 1.
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.254 -0.888 -0.485  0.496  2.592 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   10.791      1.119    9.64  1.1e-05 ***
x2            -0.846      0.312   -2.71    0.027 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.4 on 8 degrees of freedom
Multiple R-squared:  0.479,	Adjusted R-squared:  0.414 
F-statistic: 7.36 on 1 and 8 DF,  p-value: 0.0265

> all.x2 <- summary(lm.y.x2)$r.squared
> all.x2
[1] 0.4793
> spr2.y.x2.x1
[1] 0.167
> cma.1 <- all.x2 - spr2.y.x2.x1
> cma.1
[1] 0.3123
> 
> # 2.
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.519 -0.897 -0.130  1.006  1.580 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  3.61778    1.24152    2.91    0.019 * 
x1           0.01527    0.00413    3.70    0.006 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.18 on 8 degrees of freedom
Multiple R-squared:  0.631,	Adjusted R-squared:  0.585 
F-statistic: 13.7 on 1 and 8 DF,  p-value: 0.00605

> all.x1 <- summary(lm.y.x1)$r.squared
> all.x1
[1] 0.6311
> spr2.y.x1.x2
[1] 0.3189
> cma.2 <- all.x1 - spr2.y.x1.x2
> cma.2
[1] 0.3123
> 
> # OR 3.
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-1.217 -0.578 -0.151  0.664  1.191 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.39910    1.51654    4.22   0.0039 **
x1           0.01184    0.00356    3.33   0.0127 * 
x2          -0.54473    0.22636   -2.41   0.0470 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.93 on 7 degrees of freedom
Multiple R-squared:  0.798,	Adjusted R-squared:  0.74 
F-statistic: 13.8 on 2 and 7 DF,  p-value: 0.0037

> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
> r2.y.x1x2
[1] 0.7981
> spr2.y.x1.x2
[1] 0.3189
> spr2.y.x2.x1
[1] 0.167
> cma.3 <- r2.y.x1x2 - (spr2.y.x1.x2 + spr2.y.x2.x1)
> bcd - bd
[1] 0.3123
> cma.3
[1] 0.3123
> 
> cma.1 
[1] 0.3123
> cma.2
[1] 0.3123
> cma.3
[1] 0.3123
> 
> # OR 애초에 simple regression과 multiple 
> # regression에서 얻은 R2을 가지고 
> # 공통설명력을 알아볼 수도 있었다.
> r2.y.x1 <- summary(lm.y.x1)$r.square
> r2.y.x2 <- summary(lm.y.x2)$r.square
> r2.y.x1
[1] 0.6311
> r2.y.x2
[1] 0.4793
> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
> r2.y.x1x2
[1] 0.7981
> cma.4 <- r2.y.x1 + r2.y.x2 - r2.y.x1x2
> cma.4
[1] 0.3123
> 
> # Note that sorting out unique and common
> # explanation area is only possible with 
> # semi-partial correlation determinant
> # NOT partial correlation determinant
> # because only semi-partial correlation
> # shares the same denominator (as total 
> # y).
> #############################################
> 
>

====== explanation. added ====== {{:c:ms:2025:schedule:pasted:20250609-074413.png?400}}


# ex.
# resid(lm(y~x1, data=df)) = bc / delta.y
# resid(lm(y~x2, data=df)) = cd / delta.y
# resid(lm(y~x1+x2, data=df)) = bcd / delta.y
# b / delta.y = ?
# ce / delta.x2 = ?


# exp.added
spcor.test(df$y, df$x1, df$x2)
spcor.test(df$y, df$x1, df$x2)$estimate
spcor.test(df$y, df$x1, df$x2)$estimate^2
spcor.test(df$y, df$x2, df$x1)$estimate^2
summary(lm(y~x1+x2, data=df))$r.square

b <- spcor.test(df$y, df$x1, df$x2)$estimate^2
d <- spcor.test(df$y, df$x2, df$x1)$estimate^2
bcd <- summary(lm(y~x1+x2, data=df))$r.square

summary(lm(df$y~df$x1+df$x2, data=df))$r.square - 
  (spcor.test(df$y, df$x1, df$x2)$estimate^2 + 
     spcor.test(df$y, df$x2, df$x1)$estimate^2)
bcd - (b + d)

====== Another one ====== [[:partial and semipartial correlation]]