This is an old revision of the document!

Regression

regression (회귀분석)
아래에서 다른 것은 이해하기 쉽지만 t.r 값과 t.b값의 (상관관계계수값과 ®, 리그레션의 계수값 (b's coefficient값)) significance level을 테스트할 때 쓰이는 se.res, se.b값을 구하는 것이 조금 복잡할 수 있다. 개념적으로

t.r값은 샘플에서 구한 r값이 significant한지 보기 위해서 이를 se값으로 나눠준 것을 말하며, 이는 모집단의 r 값을 가늠하는데 기여하는지에 대한 테스트이다. t-test의 형태를 띄기 때문에 t값으로 probability를 구한다.
- pt(t.res, df.res, lower.tail=F)*2
t.b값은 샘플로 구한 (추정한) b 값이 모집단의 b 값을 예측하는데 쓰이는 것의 significance함을 테스트 하는 것이다.
- 가령 아래의 regression 라인의 회색부분을 우리는 confidence band라고 부르는데 이는 regression line을 중심으로 (우리가 그 성격을 모르는) 모집단의 regression line을 예측하기 위해서 se.b를 활용한 것이다 (95% CI).

위의 둘은 모두 x변인 즉, 독립변인의 기여를 테스트하는 것이므로 같은 결과를 갖게 된다.

regression.rs

##############
# 배운 것 확인
##############
sam <- read.csv2("http://commres.net/_media/r/regression.sample.csv")
sam <- sam[,-1]
head(sam)
tail(sam)
mo.xy <- lm(y~x, data=sam)
summary(mo.xy)
plot(x,y)
abline(mo.xy)
abline(v=mean(x),col="red")
abline(h=mean(y),col="blue")

n <- length(y)
n
cov(x,y)
sp(x,y)/(n-1)
r.cal <- cor(x,y)
r.cal
cov(x,y)/(sd(x)*sd(y))

b.cal <- sp(x,y)/ss(x)
a.cal <- mean(y)-b.cal*mean(x)
cat(a.cal, b.cal)

ss.tot <- ss(y)
y.hat <- a.cal + b.cal*x
mo.xy$fitted.values
y.hat
data.frame(mo.xy$fitted.values, y.hat)
res <- y - y.hat
reg <- y.hat - mean(y)
res
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.tot
ss.res
ss.reg
ss.res+ss.reg

r.sq <- ss.reg/ss.tot
r.sq
1-r.sq
ss.res/ss.tot

df.tot <- n - 1
df.reg <- 2 - 1
df.res <- df.tot - df.reg
df.tot
df.reg
df.res

ms.tot <- ss.tot / df.tot 
ms.tot
var(y)
ms.reg <- ss.reg / df.reg
ms.res <- ss.res / df.res
f.cal <- ms.reg / ms.res
p.f <- pf(f.cal, df.reg, df.res, lower.tail = F)
data.frame(f.cal, df.reg, df.res, p.f)

summary(mo.xy)
data.frame(head(res), head(summary(mo.xy)$residuals))
data.frame(min(res), max(res))

# residual의 분산값은 무엇인가? 
# residual의 집합인 res 변인의 ss(res) 값을 
# ss.res 라고 하면 이를 n-2로 나눠준 값을 말한다.
var.res <- ss.res / (n-2) # variance of residuals
sd.res <- sqrt(var.res) # standard deviation of residuals
# 위의 값을 standard deviation of residual이라고 부를 수 있다
data.frame(var.res, sd.res) 
sigma(mo.xy)
summary(mo.xy)$sigma
summary(mo.xy)

# 위의 var.res = ms.res 이라고도 부른다
ms.res <- var.res
# 위는 ss.res 값을 n-2로 나눠준 것을 말한다
# 그런데 ss.res 대신에 아래처럼
# ss.res 이 ss.total에서 차지하는 비율로 
# 대체해서 생각해볼 수도 있다. 
ss.res/ss.tot 

# 즉, sd.res 값은 아래처럼 구한다고 했는데
sd.res <- sqrt(ss.res/(n-2))
# 위의 ss.res 대신에 ss.res/ss.tot 을 쓴다면 
# ss.res의 proportion값을 (전체 ss 값에 대한 비율값을)
# 사용한 것이 된다. 즉, 일종의 표준화된 퍼센티지로
# 계산을 하는 것이 된다.
sqrt((ss.res/ss.tot)/(n-2))
# 이것을 standard error of the residual 이라고 부른다
se.res <- sqrt((ss.res/ss.tot)/(n-2)) # se for residual
se.res

# 그리고 이 se.res은 r 값의 (correlation coefficient)
# significance 정도를 가늠하는데 쓰인다. 
# 즉, (r - 0)/ se.res =  
# x변인이 있으므로 생긴 상관관계 / 상관관계의 랜덤한 부분 .
# 이를 t 값이라고 하고, 이는 t distribution을 따르기에 
t.r <- r.cal/se.res
p.r <- pt(t.r, n-2, lower.tail = F)*2
cat(r.cal, se.res, t.r, p.r)
cor.test(x,y)

# 그렇다면 b1의 significance한 정도는 어떻게 가늠하는가?
# standard error for b1 (regression slope)

# 위에서 우리는 ms.res <- ss.res/(n-2)
ms.res
sqrt(ms.res)
sd.res <- sqrt(ms.res) # residual sd (sd for regression)
sd.res

# 그리고 ss(x) 값은
ss.x <- ss(x)
ss.x
# b (계수값의) standard error 값은 
se.b <- sd.res/sqrt(ss.x)
se.b
# 위는 아래와 같은 식이다
sqrt(ms.res)/sqrt(ss.x)
# 위의 값이 standard error of b 값이므로 .. .. 
# http://commres.net/regression#standard_error_of_b
b.cal
t.b <- b.cal/se.b
p.b <- pt(t.b, n-2, lower.tail = F)*2
data.frame(b.cal, se.b, t.b,  p.b)
# 위의 결과로 b 값이 (기울기 값이, x가 y평균 대신에 기여하는 정도가)
# significant한지 테스트 한다. 그리고, 이것은 위에서 구했던 
# r.cal 값의 테스트 한 것과 같은 결과를 갖는다
data.frame(r.cal, se.res, t.r, p.r )
# 왜 t.b와 t.r 값이 같은가?
# t.b 값은 b값을 se.b값으로 
# t.r 값은 r값을 se.res값으로 나누어서 구한것인데 
# 두 값이 같다. 그 이유는 각 식의 분자 부분인
# b값과 r값이 모두 x 변인에 의해서(만) 구해진 것이기 때문에 이다.
# t.r 값은 sample에서 구한 r값을 모집단의 r값을 예측하는 것이고
# t.b 값은 sample에서 구한 기울기 값이 (x변인의 기여정도가)
# 모집단이라면 어디일까를 (얼마일까를) 알려주는 것이기 때문이다. 
# 둘 다가 모두 x 변인으로 생긴 결과의 significant 정도를 가늠하는 
# 것이다.
summary(mo.xy)

data.frame(ss.reg, ss.res, ss.tot, r.sq, ss.res/ss.tot)
data.frame(f.cal, df.reg, df.res, p.f)
anova(mo.xy)
# remember ss.res?
ms.res
ms.reg

regression.rout

> ##############
> # 배운 것 확인
> ##############
> sam <- read.csv2("http://commres.net/_media/r/regression.sample.csv")
> sam <- sam[,-1]
> head(sam)
         x       x2        y
1 100.7858 41.32553 3.192923
2 122.7128 51.70909 3.606455
3 115.7513 27.59380 3.299693
4 115.4523 36.08332 2.811801
5 113.1708 42.87865 3.247758
6 111.3423 54.87082 3.329180
> tail(sam)
            x       x2        y
95  110.09457 41.37388 3.252026
96  122.42522 35.62619 3.131744
97  119.04052 51.68809 3.281162
98  106.59454 40.61295 3.122031
99   97.07284 51.14235 2.578662
100 104.02543 41.19795 3.539551
> mo.xy <- lm(y~x, data=sam)
> summary(mo.xy)

Call:
lm(formula = y ~ x, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.92770 -0.20715  0.03368  0.20038  0.90512 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.614428   0.344622   4.685 9.04e-06 ***
x           0.014476   0.003212   4.508 1.82e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared:  0.1717,	Adjusted R-squared:  0.1633 
F-statistic: 20.32 on 1 and 98 DF,  p-value: 1.818e-05

> plot(x,y)
> abline(mo.xy)
> abline(v=mean(x),col="red")
> abline(h=mean(y),col="blue")
> 
> n <- length(y)
> n
[1] 100
> cov(x,y)
[1] 1.587297
> sp(x,y)/(n-1)
[1] 1.587297
> r.cal <- cor(x,y)
> r.cal
[1] 0.414393
> cov(x,y)/(sd(x)*sd(y))
[1] 0.414393
> 
> b.cal <- sp(x,y)/ss(x)
> a.cal <- mean(y)-b.cal*mean(x)
> cat(a.cal, b.cal)
1.614428 0.01447618> 
> ss.tot <- ss(y)
> y.hat <- a.cal + b.cal*x
> mo.xy$fitted.values
       1        2        3        4        5        6        7        8        9       10       11       12       13       14       15       16 
3.073421 3.390841 3.290064 3.285737 3.252709 3.226239 3.077796 3.100430 3.167429 3.145222 3.186680 3.220170 3.058232 3.039748 3.360385 2.923754 
      17       18       19       20       21       22       23       24       25       26       27       28       29       30       31       32 
3.263701 3.098318 3.041836 3.103114 3.212438 3.156691 3.278437 3.060121 3.405566 3.399474 3.424280 3.038663 3.150517 3.112629 3.136295 3.218636 
      33       34       35       36       37       38       39       40       41       42       43       44       45       46       47       48 
2.968320 2.953293 2.920798 2.974089 3.133239 3.110435 3.327778 3.343117 3.013600 3.042025 3.349640 3.106477 3.242224 2.983875 3.391751 3.291115 
      49       50       51       52       53       54       55       56       57       58       59       60       61       62       63       64 
3.244502 3.318585 3.252179 3.406932 2.894919 3.176693 3.402063 3.044182 3.100588 3.016279 3.032833 2.977549 3.050346 3.170772 3.486053 2.972923 
      65       66       67       68       69       70       71       72       73       74       75       76       77       78       79       80 
3.108503 3.264175 3.355081 3.544545 3.291476 3.214155 3.325748 3.176509 3.134742 3.272043 3.016544 2.962409 3.155069 3.233770 2.952458 3.224576 
      81       82       83       84       85       86       87       88       89       90       91       92       93       94       95       96 
2.991338 3.170813 2.875599 2.972474 3.304561 2.936843 3.083969 3.009700 3.055152 2.973885 3.254658 2.972157 3.019914 3.366299 3.208177 3.386677 
      97       98       99      100 
3.337680 3.157510 3.019672 3.120319 
> y.hat
  [1] 3.073421 3.390841 3.290064 3.285737 3.252709 3.226239 3.077796 3.100430 3.167429 3.145222 3.186680 3.220170 3.058232 3.039748 3.360385 2.923754
 [17] 3.263701 3.098318 3.041836 3.103114 3.212438 3.156691 3.278437 3.060121 3.405566 3.399474 3.424280 3.038663 3.150517 3.112629 3.136295 3.218636
 [33] 2.968320 2.953293 2.920798 2.974089 3.133239 3.110435 3.327778 3.343117 3.013600 3.042025 3.349640 3.106477 3.242224 2.983875 3.391751 3.291115
 [49] 3.244502 3.318585 3.252179 3.406932 2.894919 3.176693 3.402063 3.044182 3.100588 3.016279 3.032833 2.977549 3.050346 3.170772 3.486053 2.972923
 [65] 3.108503 3.264175 3.355081 3.544545 3.291476 3.214155 3.325748 3.176509 3.134742 3.272043 3.016544 2.962409 3.155069 3.233770 2.952458 3.224576
 [81] 2.991338 3.170813 2.875599 2.972474 3.304561 2.936843 3.083969 3.009700 3.055152 2.973885 3.254658 2.972157 3.019914 3.366299 3.208177 3.386677
 [97] 3.337680 3.157510 3.019672 3.120319
> data.frame(mo.xy$fitted.values, y.hat)
    mo.xy.fitted.values    y.hat
1              3.073421 3.073421
2              3.390841 3.390841
3              3.290064 3.290064
4              3.285737 3.285737
5              3.252709 3.252709
6              3.226239 3.226239
7              3.077796 3.077796
8              3.100430 3.100430
9              3.167429 3.167429
10             3.145222 3.145222
11             3.186680 3.186680
12             3.220170 3.220170
13             3.058232 3.058232
14             3.039748 3.039748
15             3.360385 3.360385
16             2.923754 2.923754
17             3.263701 3.263701
18             3.098318 3.098318
19             3.041836 3.041836
20             3.103114 3.103114
21             3.212438 3.212438
22             3.156691 3.156691
23             3.278437 3.278437
24             3.060121 3.060121
25             3.405566 3.405566
26             3.399474 3.399474
27             3.424280 3.424280
28             3.038663 3.038663
29             3.150517 3.150517
30             3.112629 3.112629
31             3.136295 3.136295
32             3.218636 3.218636
33             2.968320 2.968320
34             2.953293 2.953293
35             2.920798 2.920798
36             2.974089 2.974089
37             3.133239 3.133239
38             3.110435 3.110435
39             3.327778 3.327778
40             3.343117 3.343117
41             3.013600 3.013600
42             3.042025 3.042025
43             3.349640 3.349640
44             3.106477 3.106477
45             3.242224 3.242224
46             2.983875 2.983875
47             3.391751 3.391751
48             3.291115 3.291115
49             3.244502 3.244502
50             3.318585 3.318585
51             3.252179 3.252179
52             3.406932 3.406932
53             2.894919 2.894919
54             3.176693 3.176693
55             3.402063 3.402063
56             3.044182 3.044182
57             3.100588 3.100588
58             3.016279 3.016279
59             3.032833 3.032833
60             2.977549 2.977549
61             3.050346 3.050346
62             3.170772 3.170772
63             3.486053 3.486053
64             2.972923 2.972923
65             3.108503 3.108503
66             3.264175 3.264175
67             3.355081 3.355081
68             3.544545 3.544545
69             3.291476 3.291476
70             3.214155 3.214155
71             3.325748 3.325748
72             3.176509 3.176509
73             3.134742 3.134742
74             3.272043 3.272043
75             3.016544 3.016544
76             2.962409 2.962409
77             3.155069 3.155069
78             3.233770 3.233770
79             2.952458 2.952458
80             3.224576 3.224576
81             2.991338 2.991338
82             3.170813 3.170813
83             2.875599 2.875599
84             2.972474 2.972474
85             3.304561 3.304561
86             2.936843 2.936843
87             3.083969 3.083969
88             3.009700 3.009700
89             3.055152 3.055152
90             2.973885 2.973885
91             3.254658 3.254658
92             2.972157 2.972157
93             3.019914 3.019914
94             3.366299 3.366299
95             3.208177 3.208177
96             3.386677 3.386677
97             3.337680 3.337680
98             3.157510 3.157510
99             3.019672 3.019672
100            3.120319 3.120319
> res <- y - y.hat
> reg <- y.hat - mean(y)
> res
  [1]  0.119501750  0.215614401  0.009628661 -0.473936002 -0.004950905  0.102940633  0.377076682  0.223718332  0.013320795  0.061104050 -0.113819228
 [12]  0.527301849  0.119440719 -0.341981719 -0.277581268 -0.205981554  0.068641920  0.181848917 -0.116193350  0.464727911  0.471205885 -0.483644257
 [23]  0.195296811 -0.318037185  0.300474294 -0.043325441  0.442614340 -0.129151361 -0.013514069 -0.124067245 -0.194342819  0.483475449  0.445159444
 [34] -0.176297907 -0.057008592  0.035786736 -0.258975183  0.094323098 -0.206267000 -0.180409275  0.319716564  0.419244478  0.422463721 -0.335463489
 [45]  0.057786803 -0.318994939  0.032800867 -0.472268677  0.184486306  0.067978885 -0.145421543  0.113863386  0.041423444 -0.336210600  0.374876833
 [56]  0.153315855 -0.532059178  0.134657823 -0.221707791  0.184621469  0.093782431  0.378645082 -0.633723960  0.905124191  0.104136650 -0.362907837
 [67]  0.435518142 -0.170270360 -0.154292676 -0.749099980  0.345233347 -0.073221533  0.338293956 -0.147846335 -0.346320934  0.846546713 -0.209780903
 [78]  0.361603783 -0.314116393  0.038372131  0.341075152 -0.313400633  0.032494822 -0.167953398  0.292811611  0.034550659  0.382131991 -0.537251620
 [89] -0.787231802  0.047401643  0.035616050  0.097938356 -0.927698307 -0.270130357  0.043849604 -0.254933796 -0.056518191 -0.035478308 -0.441009966
[100]  0.419232444
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> ss.tot
[1] 13.24715
> ss.res
[1] 10.97233
> ss.reg
[1] 2.274822
> ss.res+ss.reg
[1] 13.24715
> 
> r.sq <- ss.reg/ss.tot
> r.sq
[1] 0.1717215
> 1-r.sq
[1] 0.8282785
> ss.res/ss.tot
[1] 0.8282785
> 
> df.tot <- n - 1
> df.reg <- 2 - 1
> df.res <- df.tot - df.reg
> df.tot
[1] 99
> df.reg
[1] 1
> df.res
[1] 98
> 
> ms.tot <- ss.tot / df.tot 
> ms.tot
[1] 0.1338096
> var(y)
[1] 0.1338096
> ms.reg <- ss.reg / df.reg
> ms.res <- ss.res / df.res
> f.cal <- ms.reg / ms.res
> p.f <- pf(f.cal, df.reg, df.res, lower.tail = F)
> data.frame(f.cal, df.reg, df.res, p.f)
    f.cal df.reg df.res          p.f
1 20.3177      1     98 1.817763e-05
> 
> summary(mo.xy)

Call:
lm(formula = y ~ x, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.92770 -0.20715  0.03368  0.20038  0.90512 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.614428   0.344622   4.685 9.04e-06 ***
x           0.014476   0.003212   4.508 1.82e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared:  0.1717,	Adjusted R-squared:  0.1633 
F-statistic: 20.32 on 1 and 98 DF,  p-value: 1.818e-05

> data.frame(head(res), head(summary(mo.xy)$residuals))
     head.res. head.summary.mo.xy..residuals.
1  0.119501750                    0.119501750
2  0.215614401                    0.215614401
3  0.009628661                    0.009628661
4 -0.473936002                   -0.473936002
5 -0.004950905                   -0.004950905
6  0.102940633                    0.102940633
> data.frame(min(res), max(res))
    min.res.  max.res.
1 -0.9276983 0.9051242
> 
> # residual의 분산값은 무엇인가? 
> # residual의 집합인 res 변인의 ss(res) 값을 
> # ss.res 라고 하면 이를 n-2로 나눠준 값을 말한다.
> var.res <- ss.res / (n-2) # variance of residuals
> sd.res <- sqrt(var.res) # standard deviation of residuals
> # 위의 값을 standard deviation of residual이라고 부를 수 있다
> data.frame(var.res, sd.res) 
    var.res    sd.res
1 0.1119626 0.3346081
> sigma(mo.xy)
[1] 0.3346081
> summary(mo.xy)$sigma
[1] 0.3346081
> summary(mo.xy)

Call:
lm(formula = y ~ x, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.92770 -0.20715  0.03368  0.20038  0.90512 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.614428   0.344622   4.685 9.04e-06 ***
x           0.014476   0.003212   4.508 1.82e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared:  0.1717,	Adjusted R-squared:  0.1633 
F-statistic: 20.32 on 1 and 98 DF,  p-value: 1.818e-05

> 
> # 위의 var.res = ms.res 이라고도 부른다
> ms.res <- var.res
> # 위는 ss.res 값을 n-2로 나눠준 것을 말한다
> # 그런데 ss.res 대신에 아래처럼
> # ss.res 이 ss.total에서 차지하는 비율로 
> # 대체해서 생각해볼 수도 있다. 
> ss.res/ss.tot 
[1] 0.8282785
> 
> # 즉, sd.res 값은 아래처럼 구한다고 했는데
> sd.res <- sqrt(ss.res/(n-2))
> # 위의 ss.res 대신에 ss.res/ss.tot 을 쓴다면 
> # ss.res의 proportion값을 (전체 ss 값에 대한 비율값을)
> # 사용한 것이 된다. 즉, 일종의 표준화된 퍼센티지로
> # 계산을 하는 것이 된다.
> sqrt((ss.res/ss.tot)/(n-2))
[1] 0.09193379
> # 이것을 standard error of the residual 이라고 부른다
> se.res <- sqrt((ss.res/ss.tot)/(n-2)) # se for residual
> se.res
[1] 0.09193379
> 
> # 그리고 이 se.res은 r 값의 (correlation coefficient)
> # significance 정도를 가늠하는데 쓰인다. 
> # 즉, (r - 0)/ se.res =  
> # x변인이 있으므로 생긴 상관관계 / 상관관계의 랜덤한 부분 .
> # 이를 t 값이라고 하고, 이는 t distribution을 따르기에 
> t.r <- r.cal/se.res
> p.r <- pt(t.r, n-2, lower.tail = F)*2
> cat(r.cal, se.res, t.r, p.r)
0.414393 0.09193379 4.507515 1.817763e-05> cor.test(x,y)

	Pearson's product-moment correlation

data:  x and y
t = 4.5075, df = 98, p-value = 1.818e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.2372888 0.5648366
sample estimates:
     cor 
0.414393 

> 
> # 그렇다면 b1의 significance한 정도는 어떻게 가늠하는가?
> # standard error for b1 (regression slope)
> 
> # 위에서 우리는 ms.res <- ss.res/(n-2)
> ms.res
[1] 0.1119626
> sqrt(ms.res)
[1] 0.3346081
> sd.res <- sqrt(ms.res) # residual sd (sd for regression)
> sd.res
[1] 0.3346081
> 
> # 그리고 ss(x) 값은
> ss.x <- ss(x)
> ss.x
[1] 10855.25
> # b (계수값의) standard error 값은 
> se.b <- sd.res/sqrt(ss.x)
> se.b
[1] 0.003211564
> # 위는 아래와 같은 식이다
> sqrt(ms.res)/sqrt(ss.x)
[1] 0.003211564
> # 위의 값이 standard error of b 값이므로 .. .. 
> # http://commres.net/regression#standard_error_of_b
> b.cal
[1] 0.01447618
> t.b <- b.cal/se.b
> p.b <- pt(t.b, n-2, lower.tail = F)*2
> data.frame(b.cal, se.b, t.b,  p.b)
       b.cal        se.b      t.b          p.b
1 0.01447618 0.003211564 4.507515 1.817763e-05
> # 위의 결과로 b 값이 (기울기 값이, x가 y평균 대신에 기여하는 정도가)
> # significant한지 테스트 한다. 그리고, 이것은 위에서 구했던 
> # r.cal 값의 테스트 한 것과 같은 결과를 갖는다
> data.frame(r.cal, se.res, t.r, p.r )
     r.cal     se.res      t.r          p.r
1 0.414393 0.09193379 4.507515 1.817763e-05
> # 왜 t.b와 t.r 값이 같은가?
> # t.b 값은 b값을 se.b값으로 
> # t.r 값은 r값을 se.res값으로 나누어서 구한것인데 
> # 두 값이 같다. 그 이유는 각 식의 분자 부분인
> # b값과 r값이 모두 x 변인에 의해서(만) 구해진 것이기 때문에 이다.
> # t.r 값은 sample에서 구한 r값을 모집단의 r값을 예측하는 것이고
> # t.b 값은 sample에서 구한 기울기 값이 (x변인의 기여정도가)
> # 모집단이라면 어디일까를 (얼마일까를) 알려주는 것이기 때문이다. 
> # 둘 다가 모두 x 변인으로 생긴 결과의 significant 정도를 가늠하는 
> # 것이다.
> summary(mo.xy)

Call:
lm(formula = y ~ x, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.92770 -0.20715  0.03368  0.20038  0.90512 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.614428   0.344622   4.685 9.04e-06 ***
x           0.014476   0.003212   4.508 1.82e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared:  0.1717,	Adjusted R-squared:  0.1633 
F-statistic: 20.32 on 1 and 98 DF,  p-value: 1.818e-05

> 
> data.frame(ss.reg, ss.res, ss.tot, r.sq, ss.res/ss.tot)
    ss.reg   ss.res   ss.tot      r.sq ss.res.ss.tot
1 2.274822 10.97233 13.24715 0.1717215     0.8282785
> data.frame(f.cal, df.reg, df.res, p.f)
    f.cal df.reg df.res          p.f
1 20.3177      1     98 1.817763e-05
> anova(mo.xy)
Analysis of Variance Table

Response: y
          Df  Sum Sq Mean Sq F value    Pr(>F)    
x          1  2.2748 2.27482  20.318 1.818e-05 ***
Residuals 98 10.9723 0.11196                      
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> # remember ss.res?
> ms.res
[1] 0.1119626
> ms.reg
[1] 2.274822
>

residual errors and regression errors in Regression

regression.sample.00.rs

#####################################################
# residual errors and regression errors in Regression
# 
#####################################################
rm(list=ls())
ss <- function(x) {
  sum((x-mean(x))^2)
}
sp <- function(x, y) {
  sum((x-mean(x))*(y-mean(y)))
}

sam <- read.csv2("http://commres.net/_media/r/regression.sample.csv")
sam <- sam[,-1]
head(sam)
tail(sam)
cor(sam)

y <- sam$y
x <- sam$x
x2 <- sam$x2

cor(x,y)
m.y.x <- lm(y~x, data=sam)
summary(m.y.x)
plot(x,y)
abline(m.y.x, col='red')
abline(h=mean(y), col='blue')

# for visual understanding see
# http://commres.net/c/ms/2026/schedule/w12.lecture.note#types_of_correlations

res <- resid(m.y.x)
y.hat <- m.y.x$fitted.values
reg <- y.hat - mean(y)
ss(y)
ss(res)+ss(reg)

m.reg.x <- lm(reg~x, data=sam)
m.res.x <- lm(res~x, data=sam)
summary(m.reg.x)
summary(m.res.x)

cor(x, reg)
cor(x, res)

# Then!
cor(x, x2)
m.x2.x <- lm(x2~x, data=sam)
res.x2.x <- resid(m.x2.x)

m.y.resx2 <- lm(y~res.x2.x, data=sam)
summary(m.y.resx2)
d <- summary(m.y.resx2)$r.square
d

summary(m.y.x)
summary(m.y.x)$r.square
bc <- summary(m.y.x)$r.square

bcd <- bc + d 
bcd

# Then
m.y.xx2 <- lm(y~x+x2, data=sam)
summary(m.y.xx2)
# 나중에 더 설명

# Then!
# what is the portion (r square value) of x1's own contribution?
#
m.x.x2 <- lm(x~x2, data=sam)
res.be <- resid(m.x.x2)
m.b.abcd <- lm(y~res.be, data=sam)
summary(m.b.abcd)
b.abcd <- summary(m.b.abcd)$r.square
b.abcd
spcor.test(y,x,x2)$estimate^2

# also we can do the below
m.y.x2 <- lm(y~x2, data=sam)
res.ab <- resid(m.y.x2)
m.resab.resbe <- lm(res.ab~res.be, data=sam)
b.ab <- summary(m.resab.resbe)$r.square
b.ab
pcor.test(y,x,x2)$estimate^2

bd.abcd <- summary(lm(y~x, data=sam))$r.square
bd.abcd
cor.test(y,x)$estimate
cor.test(y,x)$estimate^2

regression.sample.00.ro

> #####################################################
> # residual errors and regression errors in Regression
> # 
> #####################################################
> rm(list=ls())
> ss <- function(x) {
+   sum((x-mean(x))^2)
+ }
> sp <- function(x, y) {
+   sum((x-mean(x))*(y-mean(y)))
+ }
> 
> sam <- read.csv2("http://commres.net/_media/r/regression.sample.csv")
> sam <- sam[,-1]
> head(sam)
         x       x2        y
1 100.7858 41.32553 3.192923
2 122.7128 51.70909 3.606455
3 115.7513 27.59380 3.299693
4 115.4523 36.08332 2.811801
5 113.1708 42.87865 3.247758
6 111.3423 54.87082 3.329180
> tail(sam)
            x       x2        y
95  110.09457 41.37388 3.252026
96  122.42522 35.62619 3.131744
97  119.04052 51.68809 3.281162
98  106.59454 40.61295 3.122031
99   97.07284 51.14235 2.578662
100 104.02543 41.19795 3.539551
> cor(sam)
           x        x2         y
x  1.0000000 0.1880415 0.4143930
x2 0.1880415 1.0000000 0.3373768
y  0.4143930 0.3373768 1.0000000
> 
> y <- sam$y
> x <- sam$x
> x2 <- sam$x2
> 
> cor(x,y)
[1] 0.414393
> m.y.x <- lm(y~x, data=sam)
> summary(m.y.x)

Call:
lm(formula = y ~ x, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.92770 -0.20715  0.03368  0.20038  0.90512 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.614428   0.344622   4.685 9.04e-06 ***
x           0.014476   0.003212   4.508 1.82e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared:  0.1717,	Adjusted R-squared:  0.1633 
F-statistic: 20.32 on 1 and 98 DF,  p-value: 1.818e-05

> plot(x,y)
> abline(m.y.x, col='red')
> abline(h=mean(y), col='blue')
> 
> # for visual understanding see
> # http://commres.net/c/ms/2026/schedule/w12.lecture.note#types_of_correlations
> 
> res <- resid(m.y.x)
> y.hat <- m.y.x$fitted.values
> reg <- y.hat - mean(y)
> ss(y)
[1] 13.24715
> ss(res)+ss(reg)
[1] 13.24715
> 
> m.reg.x <- lm(reg~x, data=sam)
> m.res.x <- lm(res~x, data=sam)
> summary(m.reg.x)

Call:
lm(formula = reg ~ x, data = sam)

Residuals:
       Min         1Q     Median         3Q        Max 
-1.119e-15 -1.269e-16 -1.860e-17  7.610e-17  3.316e-15 

Coefficients:
              Estimate Std. Error    t value Pr(>|t|)    
(Intercept) -1.546e+00  3.910e-16 -3.954e+15   <2e-16 ***
x            1.448e-02  3.644e-18  3.973e+15   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.796e-16 on 98 degrees of freedom
Multiple R-squared:      1,	Adjusted R-squared:      1 
F-statistic: 1.578e+31 on 1 and 98 DF,  p-value: < 2.2e-16

> summary(m.res.x)

Call:
lm(formula = res ~ x, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.92770 -0.20715  0.03368  0.20038  0.90512 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.388e-17  3.446e-01       0        1
x           0.000e+00  3.212e-03       0        1

Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared:  1.254e-33,	Adjusted R-squared:  -0.0102 
F-statistic: 1.229e-31 on 1 and 98 DF,  p-value: 1

> 
> cor(x, reg)
[1] 1
> cor(x, res)
[1] -2.061785e-17
> 
> # Then!
> cor(x, x2)
[1] 0.1880415
> m.x2.x <- lm(x2~x, data=sam)
> res.x2.x <- resid(m.x2.x)
> 
> m.y.resx2 <- lm(y~res.x2.x, data=sam)
> summary(m.y.resx2)

Call:
lm(formula = y ~ res.x2.x, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.01305 -0.26905  0.00798  0.28017  0.67040 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.160479   0.035460  89.128  < 2e-16 ***
res.x2.x    0.013132   0.004843   2.711  0.00791 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3546 on 98 degrees of freedom
Multiple R-squared:  0.06978,	Adjusted R-squared:  0.06029 
F-statistic: 7.352 on 1 and 98 DF,  p-value: 0.007912

> d <- summary(m.y.resx2)$r.square
> d
[1] 0.06978376
> 
> summary(m.y.x)

Call:
lm(formula = y ~ x, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.92770 -0.20715  0.03368  0.20038  0.90512 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.614428   0.344622   4.685 9.04e-06 ***
x           0.014476   0.003212   4.508 1.82e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared:  0.1717,	Adjusted R-squared:  0.1633 
F-statistic: 20.32 on 1 and 98 DF,  p-value: 1.818e-05

> summary(m.y.x)$r.square
[1] 0.1717215
> bc <- summary(m.y.x)$r.square
> 
> bcd <- bc + d 
> bcd
[1] 0.2415053
> 
> # Then
> m.y.xx2 <- lm(y~x+x2, data=sam)
> summary(m.y.xx2)

Call:
lm(formula = y ~ x + x2, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.87249 -0.20913  0.00269  0.20047  0.82037 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.267554   0.351230   3.609 0.000489 ***
x           0.012709   0.003145   4.041 0.000107 ***
x2          0.013132   0.004396   2.987 0.003564 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3218 on 97 degrees of freedom
Multiple R-squared:  0.2415,	Adjusted R-squared:  0.2259 
F-statistic: 15.44 on 2 and 97 DF,  p-value: 1.506e-06

> # 나중에 더 설명
> 
> # Then!
> # what is the portion (r square value) of x1's own contribution?
> #
> m.x.x2 <- lm(x~x2, data=sam)
> res.be <- resid(m.x.x2)
> m.b.abcd <- lm(y~res.be, data=sam)
> summary(m.b.abcd)

Call:
lm(formula = y ~ res.be, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.96326 -0.22562  0.01725  0.22139  0.89797 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.160479   0.034339  92.038  < 2e-16 ***
res.be      0.012709   0.003356   3.787 0.000263 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3434 on 98 degrees of freedom
Multiple R-squared:  0.1277,	Adjusted R-squared:  0.1188 
F-statistic: 14.34 on 1 and 98 DF,  p-value: 0.0002626

> b.abcd <- summary(m.b.abcd)$r.square
> b.abcd
[1] 0.1276822
> spcor.test(y,x,x2)$estimate^2
[1] 0.1276822
> 
> # also we can do the below
> m.y.x2 <- lm(y~x2, data=sam)
> res.ab <- resid(m.y.x2)
> m.resab.resbe <- lm(res.ab~res.be, data=sam)
> b.ab <- summary(m.resab.resbe)$r.square
> b.ab
[1] 0.1440821
> pcor.test(y,x,x2)$estimate^2
[1] 0.1440821
> 
> bd.abcd <- summary(lm(y~x, data=sam))$r.square
> bd.abcd
[1] 0.1717215
> cor.test(y,x)$estimate
     cor 
0.414393 
> cor.test(y,x)$estimate^2
      cor 
0.1717215 
>

Multiple regression with IVs no correlations

multiple.regression.01.rs

############################
# multiple regression
############################
rm(list=ls())
sam <- read.csv2("http://commres.net/_media/r/regression.sample.02.csv")
sam <- sam[,-1]
head(sam)
tail(sam)
str(sam)
cor(sam)

y <- sam$y
x <- sam$x
x2 <- sam$x2

# y regression with x and x2
m.y.x1 <- lm(y~x, data=sam)
m.y.x2 <- lm(y~x2, data=sam)
# summary(m.y.x1)
# summary(m.y.x2)
explained.by.x1 <- summary(m.y.x1)$r.square
explained.by.x2 <- summary(m.y.x2)$r.square
round(explained.by.x1 + explained.by.x2,3)
rs.each.sum <- explained.by.x1 + explained.by.x2

m.y.x1x2 <- lm(y~x+x2, data=sam)
summary(m.y.x1x2)
rs.both <- summary(m.y.x1x2)$r.square
round(rs.both,3)

round(data.frame(rs.both, rs.each.sum, rs.both-rs.each.sum),3)

m.x1.x2 <- lm(x~x2, data=sam)
# R-squared = 0
# x2 = 0
# p-value = .7945
summary(m.x1.x2)
summary(m.x1.x2)$r.square
sqrt(summary(m.x1.x2)$r.square)
cor(x,x2)
cor.test(x,x2)
#################
# 그림으로 보기 
#################

multiple.regression.01.ro

> 
> sam <- read.csv2("http://commres.net/_media/r/regression.sample.02.csv")
> sam <- sam[,-1]
> 
> head(sam)
         x       x2        y
1 109.9413 46.09681 3.277817
2 115.9064 42.91232 2.940628
3 104.5726 37.00947 2.854055
4 122.9020 44.69320 3.831361
5 120.1931 37.18179 3.102626
6 119.3134 38.54365 3.135997
> tail(sam)
               x       x2        y
999995  109.0402 39.47883 2.645672
999996  122.6468 49.57322 3.478578
999997  121.5819 41.26768 3.386559
999998  106.8277 40.39737 3.434375
999999  113.8535 36.00922 3.039189
1000000 114.4945 46.07892 3.439678
> str(sam)
'data.frame':	1000000 obs. of  3 variables:
 $ x : num  110 116 105 123 120 ...
 $ x2: num  46.1 42.9 37 44.7 37.2 ...
 $ y : num  3.28 2.94 2.85 3.83 3.1 ...
> cor(sam)
               x            x2         y
x   1.0000000000 -0.0002605277 0.3001789
x2 -0.0002605277  1.0000000000 0.1993134
y   0.3001788706  0.1993133956 1.0000000
> 
> y <- sam$y
> x <- sam$x
> x2 <- sam$x2
> 
> # y regression with x and x2
> m.y.x1 <- lm(y~x, data=sam)
> m.y.x2 <- lm(y~x2, data=sam)
> # summary(m.y.x1)
> # summary(m.y.x2)
> explained.by.x1 <- summary(m.y.x1)$r.square
> explained.by.x2 <- summary(m.y.x2)$r.square
> round(explained.by.x1 + explained.by.x2,3)
[1] 0.13
> rs.each.sum <- explained.by.x1 + explained.by.x2
> 
> m.y.x1x2 <- lm(y~x+x2, data=sam)
> summary(m.y.x1x2)

Call:
lm(formula = y ~ x + x2, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.77065 -0.25170  0.00031  0.25145  1.83133 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.604e+00  4.168e-03   384.8   <2e-16 ***
x           1.091e-02  3.389e-05   321.9   <2e-16 ***
x2          9.956e-03  4.658e-05   213.8   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3732 on 999997 degrees of freedom
Multiple R-squared:  0.1299,	Adjusted R-squared:  0.1299 
F-statistic: 7.462e+04 on 2 and 999997 DF,  p-value: < 2.2e-16

> rs.both <- summary(m.y.x1x2)$r.square
> round(rs.both,3)
[1] 0.13
> 
> round(data.frame(rs.both, rs.each.sum, rs.both-rs.each.sum),3)
  rs.both rs.each.sum rs.both...rs.each.sum
1    0.13        0.13                     0
> 
> m.x1.x2 <- lm(x~x2, data=sam)
> # R-squared = 0
> # x2 = 0
> # p-value = .7945
> summary(m.x1.x2)

Call:
lm(formula = x ~ x2, data = sam)

Residuals:
    Min      1Q  Median      3Q     Max 
-51.050  -7.432  -0.005   7.432  54.696 

Coefficients:
              Estimate Std. Error  t value Pr(>|t|)    
(Intercept)  1.080e+02  5.877e-02 1838.108   <2e-16 ***
x2          -3.581e-04  1.374e-03   -0.261    0.794    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 11.01 on 999998 degrees of freedom
Multiple R-squared:  6.787e-08,	Adjusted R-squared:  -9.321e-07 
F-statistic: 0.06787 on 1 and 999998 DF,  p-value: 0.7945

> summary(m.x1.x2)$r.square
[1] 6.787467e-08
> sqrt(summary(m.x1.x2)$r.square)
[1] 0.0002605277
> cor(x,x2)
[1] -0.0002605277
> cor.test(x,x2)

	Pearson's product-moment correlation

data:  x and x2
t = -0.26053, df = 999998, p-value = 0.7945
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.002220491  0.001699438
sample estimates:
          cor 
-0.0002605277 

> #################
> # 그림으로 보기 
> #################
>

Multiple regression with IVs correlated

multiple.regression.03.rs

#########################
# multiple regression 2
#########################
rm(list=ls())
# 1. mean of three variables 
# iq, allowance, gpa
# x, x2, y
means <- c(108, 42, 3.2) 

# allowance, iq, gpa
sds <- c(11, 8, 0.4)    # standard deviations
# correlation matrix. note that correlation 
# between v1 and v2 is near none (0.05)
corr_matrix <- matrix(c(1, 0.30, 0.4,
                        0.30, 1, 0.2,
                        0.4, 0.2, 1),
                      nrow = 3) # Target correlation matrix
# 2. covariance matrix
# diagonal matrix of SDs
sd_diag <- diag(sds)
sd_diag

# Calculate covariance matrix
cov_matrix <- sd_diag %*% corr_matrix %*% sd_diag

# 3. population data
# set.seed(32) 
# set.seed(4321)
set.seed(1230)
n_sam <- 100
# n_sam <- 300
sam <- mvrnorm(n = n_sam, mu = means, Sigma = cov_matrix)
sam <- as.data.frame(sam)
colnames(sam) <- c("x", "x2", "y")
cor(sam)
write.csv2(sam, "regression.sample.03.csv")

sam <- read.csv2("http://commres.net/_media/r/regression.sample.03.csv")
sam <- sam[, -1]
head(sam)
tail(sam)
cor(sam)

y <- sam$y
x <- sam$x
x2 <- sam$x2

lm.y.x1 <- lm(y~x, data=sam)
lm.y.x2 <- lm(y~x2, data=sam)
summary(lm.y.x1)
summary(lm.y.x2)
bc <- summary(lm.y.x1)$r.square
cd <- summary(lm.y.x2)$r.square
bccd <- bc+cd
data.frame(bc, cd, bccd)

lm.y.x1x2 <- lm(y~x+x2, data=sam)
summary(lm.y.x1x2)
bcd <- summary(lm.y.x1x2)$r.square
# where does this diff come from?
c <- bccd - bcd
c

# b 와 d만 추리기 
res.be <- resid(lm(x~x2, data=sam))
m.b <- lm(y~res.be, data=sam)
summary(m.b)
b <- summary(m.b)$r.square
b

res.dg <- resid(lm(x2~x, data=sam))
m.d <- lm(y~res.dg, data=sam)
summary(m.d)
d <- summary(m.d)$r.square
d

b
d
bcd
bccd
c <- bccd-bcd
c2 <- bcd-(b+d)
c
c2

# ppcor package 
# spcor.test (y,x1,x2)
rs.b.sp <- spcor.test(y,x,x2)$estimate^2
rs.d.sp <- spcor.test(y,x2,x)$estimate^2
data.frame(b, rs.b.sp)
data.frame(d, rs.d.sp)
# remember b.sp = r square value
b.sp <- sqrt(rs.b.sp)
d.sp <- sqrt(rs.d.sp)
data.frame(b.sp, b, rs.b.sp)
data.frame(d.sp, d, rs.d.sp)

multiple.regression.03.ro

> #########################
> # multiple regression 2
> #########################
> rm(list=ls())
> # 1. mean of three variables 
> # iq, allowance, gpa
> # x, x2, y
> means <- c(108, 42, 3.2) 
> 
> # allowance, iq, gpa
> sds <- c(11, 8, 0.4)    # standard deviations
> # correlation matrix. note that correlation 
> # between v1 and v2 is near none (0.05)
> corr_matrix <- matrix(c(1, 0.30, 0.4,
+                         0.30, 1, 0.2,
+                         0.4, 0.2, 1),
+                       nrow = 3) # Target correlation matrix
> # 2. covariance matrix
> # diagonal matrix of SDs
> sd_diag <- diag(sds)
> sd_diag
     [,1] [,2] [,3]
[1,]   11    0  0.0
[2,]    0    8  0.0
[3,]    0    0  0.4
> 
> # Calculate covariance matrix
> cov_matrix <- sd_diag %*% corr_matrix %*% sd_diag
> 
> # 3. population data
> # set.seed(32) 
> # set.seed(4321)
> set.seed(1230)
> n_sam <- 100
> # n_sam <- 300
> sam <- mvrnorm(n = n_sam, mu = means, Sigma = cov_matrix)
> sam <- as.data.frame(sam)
> colnames(sam) <- c("x", "x2", "y")
> cor(sam)
           x        x2         y
x  1.0000000 0.4219157 0.3535740
x2 0.4219157 1.0000000 0.2700868
y  0.3535740 0.2700868 1.0000000
> write.csv2(sam, "regression.sample.03.csv")
> 
> sam <- read.csv2("http://commres.net/_media/r/regression.sample.03.csv")
> sam <- sam[, -1]
> head(sam)
         x       x2        y
1 108.1988 47.05544 2.791294
2 118.1307 38.71573 3.338987
3 105.2902 39.08865 3.867919
4 120.7237 52.10261 3.341301
5 120.1957 45.69276 3.267319
6 118.6366 47.16571 3.126480
> tail(sam)
            x       x2        y
95  114.68980 34.28183 2.629331
96  115.67318 40.27048 3.316995
97  113.66832 34.68306 3.134684
98   99.68422 30.69797 2.670601
99  108.31653 42.74074 2.850961
100 102.19879 40.88916 3.231628
> cor(sam)
           x        x2         y
x  1.0000000 0.4219157 0.3535740
x2 0.4219157 1.0000000 0.2700868
y  0.3535740 0.2700868 1.0000000
> 
> y <- sam$y
> x <- sam$x
> x2 <- sam$x2
> 
> lm.y.x1 <- lm(y~x, data=sam)
> lm.y.x2 <- lm(y~x2, data=sam)
> summary(lm.y.x1)

Call:
lm(formula = y ~ x, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.03671 -0.26187  0.03189  0.21676  0.71688 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.943438   0.347683   5.590 2.05e-07 ***
x           0.011705   0.003128   3.742 0.000308 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3753 on 98 degrees of freedom
Multiple R-squared:  0.125,	Adjusted R-squared:  0.1161 
F-statistic:    14 on 1 and 98 DF,  p-value: 0.0003079

> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.89560 -0.22360 -0.02491  0.25421  0.86636 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 2.695389   0.198771  13.560  < 2e-16 ***
x2          0.013381   0.004819   2.777  0.00658 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3863 on 98 degrees of freedom
Multiple R-squared:  0.07295,	Adjusted R-squared:  0.06349 
F-statistic: 7.711 on 1 and 98 DF,  p-value: 0.006575

> bc <- summary(lm.y.x1)$r.square
> cd <- summary(lm.y.x2)$r.square
> bccd <- bc+cd
> data.frame(bc, cd, bccd)
         bc        cd      bccd
1 0.1250146 0.0729469 0.1979615
> 
> lm.y.x1x2 <- lm(y~x+x2, data=sam)
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x + x2, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.91766 -0.23439  0.01484  0.21300  0.72503 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.875581   0.349193   5.371 5.36e-07 ***
x           0.009650   0.003432   2.811  0.00597 ** 
x2          0.007287   0.005137   1.419  0.15921    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3734 on 97 degrees of freedom
Multiple R-squared:  0.1428,	Adjusted R-squared:  0.1251 
F-statistic:  8.08 on 2 and 97 DF,  p-value: 0.0005682

> bcd <- summary(lm.y.x1x2)$r.square
> # where does this diff come from?
> c <- bccd - bcd
> c
[1] 0.05516214
> 
> # b 와 d만 추리기 
> res.be <- resid(lm(x~x2, data=sam))
> m.b <- lm(y~res.be, data=sam)
> summary(m.b)

Call:
lm(formula = y ~ res.be, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.09179 -0.25700  0.01967  0.20519  0.77413 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.236835   0.038696  83.649  < 2e-16 ***
res.be      0.009650   0.003557   2.713  0.00788 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.387 on 98 degrees of freedom
Multiple R-squared:  0.06985,	Adjusted R-squared:  0.06036 
F-statistic:  7.36 on 1 and 98 DF,  p-value: 0.00788

> b <- summary(m.b)$r.square
> b
[1] 0.06985246
> 
> res.dg <- resid(lm(x2~x, data=sam))
> m.d <- lm(y~res.dg, data=sam)
> summary(m.d)

Call:
lm(formula = y ~ res.dg, data = sam)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.94974 -0.24497 -0.01842  0.26781  0.92351 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.236835   0.039764  81.401   <2e-16 ***
res.dg      0.007287   0.005471   1.332    0.186    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3976 on 98 degrees of freedom
Multiple R-squared:  0.01778,	Adjusted R-squared:  0.007762 
F-statistic: 1.774 on 1 and 98 DF,  p-value: 0.1859

> d <- summary(m.d)$r.square
> d
[1] 0.01778476
> 
> b
[1] 0.06985246
> d
[1] 0.01778476
> bcd
[1] 0.1427994
> bccd
[1] 0.1979615
> c <- bccd-bcd
> c2 <- bcd-(b+d)
> c
[1] 0.05516214
> c2
[1] 0.05516214
>

types.of.correlations.pptx

ppcor package in r

ppcor.sr

> # ppcor package 
> # spcor.test (y,x1,x2)
> rs.b.sp <- spcor.test(y,x,x2)$estimate^2
> rs.d.sp <- spcor.test(y,x2,x)$estimate^2
> data.frame(b, rs.b.sp)
           b    rs.b.sp
1 0.06985246 0.06985246
> data.frame(d, rs.d.sp)
           d    rs.d.sp
1 0.01778476 0.01778476
> # remember b.sp = r square value
> b.sp <- sqrt(rs.b.sp)
> d.sp <- sqrt(rs.d.sp)
> data.frame(b.sp, b, rs.b.sp)
       b.sp          b    rs.b.sp
1 0.2642962 0.06985246 0.06985246
> data.frame(d.sp, d, rs.d.sp)
       d.sp          d    rs.d.sp
1 0.1333595 0.01778476 0.01778476
>

ppcor.ro

> # ppcor package 
> # spcor.test (y,x1,x2)
> rs.b.sp <- spcor.test(y,x,x2)$estimate^2
> rs.d.sp <- spcor.test(y,x2,x)$estimate^2
> data.frame(b, rs.b.sp)
           b    rs.b.sp
1 0.06985246 0.06985246
> data.frame(d, rs.d.sp)
           d    rs.d.sp
1 0.01778476 0.01778476
> # remember b.sp = r square value
> b.sp <- sqrt(rs.b.sp)
> d.sp <- sqrt(rs.d.sp)
> data.frame(b.sp, b, rs.b.sp)
       b.sp          b    rs.b.sp
1 0.2642962 0.06985246 0.06985246
> data.frame(d.sp, d, rs.d.sp)
       d.sp          d    rs.d.sp
1 0.1333595 0.01778476 0.01778476
>

COMMunication
RESearch.NET

Table of Contents