c:ms:2026:schedule:w12.lecture.note
This is an old revision of the document!
Table of Contents
Regression
regression (회귀분석)
아래에서 다른 것은 이해하기 쉽지만 t.r 값과 t.b값의 (상관관계계수값과 ®, 리그레션의 계수값 (b's coefficient값)) significance level을 테스트할 때 쓰이는 se.res, se.b값을 구하는 것이 조금 복잡할 수 있다. 개념적으로
- t.r값은 샘플에서 구한 r값이 significant한지 보기 위해서 이를 se값으로 나눠준 것을 말하며, 이는 모집단의 r 값을 가늠하는데 기여하는지에 대한 테스트이다. t-test의 형태를 띄기 때문에 t값으로 probability를 구한다.
pt(t.res, df.res, lower.tail=F)*2
- t.b값은 샘플로 구한 (추정한) b 값이 모집단의 b 값을 예측하는데 쓰이는 것의 significance함을 테스트 하는 것이다.
- 가령 아래의 regression 라인의 회색부분을 우리는 confidence band라고 부르는데 이는 regression line을 중심으로 (우리가 그 성격을 모르는) 모집단의 regression line을 예측하기 위해서 se.b를 활용한 것이다 (95% CI).
- 위의 둘은 모두 x변인 즉, 독립변인의 기여를 테스트하는 것이므로 같은 결과를 갖게 된다.
regression.rs
##############
# 배운 것 확인
##############
sam <- read.csv2("http://commres.net/_media/r/regression.sample.csv")
sam <- sam[,-1]
head(sam)
tail(sam)
mo.xy <- lm(y~x, data=sam)
summary(mo.xy)
plot(x,y)
abline(mo.xy)
abline(v=mean(x),col="red")
abline(h=mean(y),col="blue")
n <- length(y)
n
cov(x,y)
sp(x,y)/(n-1)
r.cal <- cor(x,y)
r.cal
cov(x,y)/(sd(x)*sd(y))
b.cal <- sp(x,y)/ss(x)
a.cal <- mean(y)-b.cal*mean(x)
cat(a.cal, b.cal)
ss.tot <- ss(y)
y.hat <- a.cal + b.cal*x
mo.xy$fitted.values
y.hat
data.frame(mo.xy$fitted.values, y.hat)
res <- y - y.hat
reg <- y.hat - mean(y)
res
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.tot
ss.res
ss.reg
ss.res+ss.reg
r.sq <- ss.reg/ss.tot
r.sq
1-r.sq
ss.res/ss.tot
df.tot <- n - 1
df.reg <- 2 - 1
df.res <- df.tot - df.reg
df.tot
df.reg
df.res
ms.tot <- ss.tot / df.tot
ms.tot
var(y)
ms.reg <- ss.reg / df.reg
ms.res <- ss.res / df.res
f.cal <- ms.reg / ms.res
p.f <- pf(f.cal, df.reg, df.res, lower.tail = F)
data.frame(f.cal, df.reg, df.res, p.f)
summary(mo.xy)
data.frame(head(res), head(summary(mo.xy)$residuals))
data.frame(min(res), max(res))
# residual의 분산값은 무엇인가?
# residual의 집합인 res 변인의 ss(res) 값을
# ss.res 라고 하면 이를 n-2로 나눠준 값을 말한다.
var.res <- ss.res / (n-2) # variance of residuals
sd.res <- sqrt(var.res) # standard deviation of residuals
# 위의 값을 standard deviation of residual이라고 부를 수 있다
data.frame(var.res, sd.res)
sigma(mo.xy)
summary(mo.xy)$sigma
summary(mo.xy)
# 위의 var.res = ms.res 이라고도 부른다
ms.res <- var.res
# 위는 ss.res 값을 n-2로 나눠준 것을 말한다
# 그런데 ss.res 대신에 아래처럼
# ss.res 이 ss.total에서 차지하는 비율로
# 대체해서 생각해볼 수도 있다.
ss.res/ss.tot
# 즉, sd.res 값은 아래처럼 구한다고 했는데
sd.res <- sqrt(ss.res/(n-2))
# 위의 ss.res 대신에 ss.res/ss.tot 을 쓴다면
# ss.res의 proportion값을 (전체 ss 값에 대한 비율값을)
# 사용한 것이 된다. 즉, 일종의 표준화된 퍼센티지로
# 계산을 하는 것이 된다.
sqrt((ss.res/ss.tot)/(n-2))
# 이것을 standard error of the residual 이라고 부른다
se.res <- sqrt((ss.res/ss.tot)/(n-2)) # se for residual
se.res
# 그리고 이 se.res은 r 값의 (correlation coefficient)
# significance 정도를 가늠하는데 쓰인다.
# 즉, (r - 0)/ se.res =
# x변인이 있으므로 생긴 상관관계 / 상관관계의 랜덤한 부분 .
# 이를 t 값이라고 하고, 이는 t distribution을 따르기에
t.r <- r.cal/se.res
p.r <- pt(t.r, n-2, lower.tail = F)*2
cat(r.cal, se.res, t.r, p.r)
cor.test(x,y)
# 그렇다면 b1의 significance한 정도는 어떻게 가늠하는가?
# standard error for b1 (regression slope)
# 위에서 우리는 ms.res <- ss.res/(n-2)
ms.res
sqrt(ms.res)
sd.res <- sqrt(ms.res) # residual sd (sd for regression)
sd.res
# 그리고 ss(x) 값은
ss.x <- ss(x)
ss.x
# b (계수값의) standard error 값은
se.b <- sd.res/sqrt(ss.x)
se.b
# 위는 아래와 같은 식이다
sqrt(ms.res)/sqrt(ss.x)
# 위의 값이 standard error of b 값이므로 .. ..
# http://commres.net/regression#standard_error_of_b
b.cal
t.b <- b.cal/se.b
p.b <- pt(t.b, n-2, lower.tail = F)*2
data.frame(b.cal, se.b, t.b, p.b)
# 위의 결과로 b 값이 (기울기 값이, x가 y평균 대신에 기여하는 정도가)
# significant한지 테스트 한다. 그리고, 이것은 위에서 구했던
# r.cal 값의 테스트 한 것과 같은 결과를 갖는다
data.frame(r.cal, se.res, t.r, p.r )
# 왜 t.b와 t.r 값이 같은가?
# t.b 값은 b값을 se.b값으로
# t.r 값은 r값을 se.res값으로 나누어서 구한것인데
# 두 값이 같다. 그 이유는 각 식의 분자 부분인
# b값과 r값이 모두 x 변인에 의해서(만) 구해진 것이기 때문에 이다.
# t.r 값은 sample에서 구한 r값을 모집단의 r값을 예측하는 것이고
# t.b 값은 sample에서 구한 기울기 값이 (x변인의 기여정도가)
# 모집단이라면 어디일까를 (얼마일까를) 알려주는 것이기 때문이다.
# 둘 다가 모두 x 변인으로 생긴 결과의 significant 정도를 가늠하는
# 것이다.
summary(mo.xy)
data.frame(ss.reg, ss.res, ss.tot, r.sq, ss.res/ss.tot)
data.frame(f.cal, df.reg, df.res, p.f)
anova(mo.xy)
# remember ss.res?
ms.res
ms.reg
regression.rout
> ##############
> # 배운 것 확인
> ##############
> sam <- read.csv2("http://commres.net/_media/r/regression.sample.csv")
> sam <- sam[,-1]
> head(sam)
x x2 y
1 100.7858 41.32553 3.192923
2 122.7128 51.70909 3.606455
3 115.7513 27.59380 3.299693
4 115.4523 36.08332 2.811801
5 113.1708 42.87865 3.247758
6 111.3423 54.87082 3.329180
> tail(sam)
x x2 y
95 110.09457 41.37388 3.252026
96 122.42522 35.62619 3.131744
97 119.04052 51.68809 3.281162
98 106.59454 40.61295 3.122031
99 97.07284 51.14235 2.578662
100 104.02543 41.19795 3.539551
> mo.xy <- lm(y~x, data=sam)
> summary(mo.xy)
Call:
lm(formula = y ~ x, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.92770 -0.20715 0.03368 0.20038 0.90512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.614428 0.344622 4.685 9.04e-06 ***
x 0.014476 0.003212 4.508 1.82e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared: 0.1717, Adjusted R-squared: 0.1633
F-statistic: 20.32 on 1 and 98 DF, p-value: 1.818e-05
> plot(x,y)
> abline(mo.xy)
> abline(v=mean(x),col="red")
> abline(h=mean(y),col="blue")
>
> n <- length(y)
> n
[1] 100
> cov(x,y)
[1] 1.587297
> sp(x,y)/(n-1)
[1] 1.587297
> r.cal <- cor(x,y)
> r.cal
[1] 0.414393
> cov(x,y)/(sd(x)*sd(y))
[1] 0.414393
>
> b.cal <- sp(x,y)/ss(x)
> a.cal <- mean(y)-b.cal*mean(x)
> cat(a.cal, b.cal)
1.614428 0.01447618>
> ss.tot <- ss(y)
> y.hat <- a.cal + b.cal*x
> mo.xy$fitted.values
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
3.073421 3.390841 3.290064 3.285737 3.252709 3.226239 3.077796 3.100430 3.167429 3.145222 3.186680 3.220170 3.058232 3.039748 3.360385 2.923754
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
3.263701 3.098318 3.041836 3.103114 3.212438 3.156691 3.278437 3.060121 3.405566 3.399474 3.424280 3.038663 3.150517 3.112629 3.136295 3.218636
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
2.968320 2.953293 2.920798 2.974089 3.133239 3.110435 3.327778 3.343117 3.013600 3.042025 3.349640 3.106477 3.242224 2.983875 3.391751 3.291115
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
3.244502 3.318585 3.252179 3.406932 2.894919 3.176693 3.402063 3.044182 3.100588 3.016279 3.032833 2.977549 3.050346 3.170772 3.486053 2.972923
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
3.108503 3.264175 3.355081 3.544545 3.291476 3.214155 3.325748 3.176509 3.134742 3.272043 3.016544 2.962409 3.155069 3.233770 2.952458 3.224576
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
2.991338 3.170813 2.875599 2.972474 3.304561 2.936843 3.083969 3.009700 3.055152 2.973885 3.254658 2.972157 3.019914 3.366299 3.208177 3.386677
97 98 99 100
3.337680 3.157510 3.019672 3.120319
> y.hat
[1] 3.073421 3.390841 3.290064 3.285737 3.252709 3.226239 3.077796 3.100430 3.167429 3.145222 3.186680 3.220170 3.058232 3.039748 3.360385 2.923754
[17] 3.263701 3.098318 3.041836 3.103114 3.212438 3.156691 3.278437 3.060121 3.405566 3.399474 3.424280 3.038663 3.150517 3.112629 3.136295 3.218636
[33] 2.968320 2.953293 2.920798 2.974089 3.133239 3.110435 3.327778 3.343117 3.013600 3.042025 3.349640 3.106477 3.242224 2.983875 3.391751 3.291115
[49] 3.244502 3.318585 3.252179 3.406932 2.894919 3.176693 3.402063 3.044182 3.100588 3.016279 3.032833 2.977549 3.050346 3.170772 3.486053 2.972923
[65] 3.108503 3.264175 3.355081 3.544545 3.291476 3.214155 3.325748 3.176509 3.134742 3.272043 3.016544 2.962409 3.155069 3.233770 2.952458 3.224576
[81] 2.991338 3.170813 2.875599 2.972474 3.304561 2.936843 3.083969 3.009700 3.055152 2.973885 3.254658 2.972157 3.019914 3.366299 3.208177 3.386677
[97] 3.337680 3.157510 3.019672 3.120319
> data.frame(mo.xy$fitted.values, y.hat)
mo.xy.fitted.values y.hat
1 3.073421 3.073421
2 3.390841 3.390841
3 3.290064 3.290064
4 3.285737 3.285737
5 3.252709 3.252709
6 3.226239 3.226239
7 3.077796 3.077796
8 3.100430 3.100430
9 3.167429 3.167429
10 3.145222 3.145222
11 3.186680 3.186680
12 3.220170 3.220170
13 3.058232 3.058232
14 3.039748 3.039748
15 3.360385 3.360385
16 2.923754 2.923754
17 3.263701 3.263701
18 3.098318 3.098318
19 3.041836 3.041836
20 3.103114 3.103114
21 3.212438 3.212438
22 3.156691 3.156691
23 3.278437 3.278437
24 3.060121 3.060121
25 3.405566 3.405566
26 3.399474 3.399474
27 3.424280 3.424280
28 3.038663 3.038663
29 3.150517 3.150517
30 3.112629 3.112629
31 3.136295 3.136295
32 3.218636 3.218636
33 2.968320 2.968320
34 2.953293 2.953293
35 2.920798 2.920798
36 2.974089 2.974089
37 3.133239 3.133239
38 3.110435 3.110435
39 3.327778 3.327778
40 3.343117 3.343117
41 3.013600 3.013600
42 3.042025 3.042025
43 3.349640 3.349640
44 3.106477 3.106477
45 3.242224 3.242224
46 2.983875 2.983875
47 3.391751 3.391751
48 3.291115 3.291115
49 3.244502 3.244502
50 3.318585 3.318585
51 3.252179 3.252179
52 3.406932 3.406932
53 2.894919 2.894919
54 3.176693 3.176693
55 3.402063 3.402063
56 3.044182 3.044182
57 3.100588 3.100588
58 3.016279 3.016279
59 3.032833 3.032833
60 2.977549 2.977549
61 3.050346 3.050346
62 3.170772 3.170772
63 3.486053 3.486053
64 2.972923 2.972923
65 3.108503 3.108503
66 3.264175 3.264175
67 3.355081 3.355081
68 3.544545 3.544545
69 3.291476 3.291476
70 3.214155 3.214155
71 3.325748 3.325748
72 3.176509 3.176509
73 3.134742 3.134742
74 3.272043 3.272043
75 3.016544 3.016544
76 2.962409 2.962409
77 3.155069 3.155069
78 3.233770 3.233770
79 2.952458 2.952458
80 3.224576 3.224576
81 2.991338 2.991338
82 3.170813 3.170813
83 2.875599 2.875599
84 2.972474 2.972474
85 3.304561 3.304561
86 2.936843 2.936843
87 3.083969 3.083969
88 3.009700 3.009700
89 3.055152 3.055152
90 2.973885 2.973885
91 3.254658 3.254658
92 2.972157 2.972157
93 3.019914 3.019914
94 3.366299 3.366299
95 3.208177 3.208177
96 3.386677 3.386677
97 3.337680 3.337680
98 3.157510 3.157510
99 3.019672 3.019672
100 3.120319 3.120319
> res <- y - y.hat
> reg <- y.hat - mean(y)
> res
[1] 0.119501750 0.215614401 0.009628661 -0.473936002 -0.004950905 0.102940633 0.377076682 0.223718332 0.013320795 0.061104050 -0.113819228
[12] 0.527301849 0.119440719 -0.341981719 -0.277581268 -0.205981554 0.068641920 0.181848917 -0.116193350 0.464727911 0.471205885 -0.483644257
[23] 0.195296811 -0.318037185 0.300474294 -0.043325441 0.442614340 -0.129151361 -0.013514069 -0.124067245 -0.194342819 0.483475449 0.445159444
[34] -0.176297907 -0.057008592 0.035786736 -0.258975183 0.094323098 -0.206267000 -0.180409275 0.319716564 0.419244478 0.422463721 -0.335463489
[45] 0.057786803 -0.318994939 0.032800867 -0.472268677 0.184486306 0.067978885 -0.145421543 0.113863386 0.041423444 -0.336210600 0.374876833
[56] 0.153315855 -0.532059178 0.134657823 -0.221707791 0.184621469 0.093782431 0.378645082 -0.633723960 0.905124191 0.104136650 -0.362907837
[67] 0.435518142 -0.170270360 -0.154292676 -0.749099980 0.345233347 -0.073221533 0.338293956 -0.147846335 -0.346320934 0.846546713 -0.209780903
[78] 0.361603783 -0.314116393 0.038372131 0.341075152 -0.313400633 0.032494822 -0.167953398 0.292811611 0.034550659 0.382131991 -0.537251620
[89] -0.787231802 0.047401643 0.035616050 0.097938356 -0.927698307 -0.270130357 0.043849604 -0.254933796 -0.056518191 -0.035478308 -0.441009966
[100] 0.419232444
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> ss.tot
[1] 13.24715
> ss.res
[1] 10.97233
> ss.reg
[1] 2.274822
> ss.res+ss.reg
[1] 13.24715
>
> r.sq <- ss.reg/ss.tot
> r.sq
[1] 0.1717215
> 1-r.sq
[1] 0.8282785
> ss.res/ss.tot
[1] 0.8282785
>
> df.tot <- n - 1
> df.reg <- 2 - 1
> df.res <- df.tot - df.reg
> df.tot
[1] 99
> df.reg
[1] 1
> df.res
[1] 98
>
> ms.tot <- ss.tot / df.tot
> ms.tot
[1] 0.1338096
> var(y)
[1] 0.1338096
> ms.reg <- ss.reg / df.reg
> ms.res <- ss.res / df.res
> f.cal <- ms.reg / ms.res
> p.f <- pf(f.cal, df.reg, df.res, lower.tail = F)
> data.frame(f.cal, df.reg, df.res, p.f)
f.cal df.reg df.res p.f
1 20.3177 1 98 1.817763e-05
>
> summary(mo.xy)
Call:
lm(formula = y ~ x, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.92770 -0.20715 0.03368 0.20038 0.90512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.614428 0.344622 4.685 9.04e-06 ***
x 0.014476 0.003212 4.508 1.82e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared: 0.1717, Adjusted R-squared: 0.1633
F-statistic: 20.32 on 1 and 98 DF, p-value: 1.818e-05
> data.frame(head(res), head(summary(mo.xy)$residuals))
head.res. head.summary.mo.xy..residuals.
1 0.119501750 0.119501750
2 0.215614401 0.215614401
3 0.009628661 0.009628661
4 -0.473936002 -0.473936002
5 -0.004950905 -0.004950905
6 0.102940633 0.102940633
> data.frame(min(res), max(res))
min.res. max.res.
1 -0.9276983 0.9051242
>
> # residual의 분산값은 무엇인가?
> # residual의 집합인 res 변인의 ss(res) 값을
> # ss.res 라고 하면 이를 n-2로 나눠준 값을 말한다.
> var.res <- ss.res / (n-2) # variance of residuals
> sd.res <- sqrt(var.res) # standard deviation of residuals
> # 위의 값을 standard deviation of residual이라고 부를 수 있다
> data.frame(var.res, sd.res)
var.res sd.res
1 0.1119626 0.3346081
> sigma(mo.xy)
[1] 0.3346081
> summary(mo.xy)$sigma
[1] 0.3346081
> summary(mo.xy)
Call:
lm(formula = y ~ x, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.92770 -0.20715 0.03368 0.20038 0.90512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.614428 0.344622 4.685 9.04e-06 ***
x 0.014476 0.003212 4.508 1.82e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared: 0.1717, Adjusted R-squared: 0.1633
F-statistic: 20.32 on 1 and 98 DF, p-value: 1.818e-05
>
> # 위의 var.res = ms.res 이라고도 부른다
> ms.res <- var.res
> # 위는 ss.res 값을 n-2로 나눠준 것을 말한다
> # 그런데 ss.res 대신에 아래처럼
> # ss.res 이 ss.total에서 차지하는 비율로
> # 대체해서 생각해볼 수도 있다.
> ss.res/ss.tot
[1] 0.8282785
>
> # 즉, sd.res 값은 아래처럼 구한다고 했는데
> sd.res <- sqrt(ss.res/(n-2))
> # 위의 ss.res 대신에 ss.res/ss.tot 을 쓴다면
> # ss.res의 proportion값을 (전체 ss 값에 대한 비율값을)
> # 사용한 것이 된다. 즉, 일종의 표준화된 퍼센티지로
> # 계산을 하는 것이 된다.
> sqrt((ss.res/ss.tot)/(n-2))
[1] 0.09193379
> # 이것을 standard error of the residual 이라고 부른다
> se.res <- sqrt((ss.res/ss.tot)/(n-2)) # se for residual
> se.res
[1] 0.09193379
>
> # 그리고 이 se.res은 r 값의 (correlation coefficient)
> # significance 정도를 가늠하는데 쓰인다.
> # 즉, (r - 0)/ se.res =
> # x변인이 있으므로 생긴 상관관계 / 상관관계의 랜덤한 부분 .
> # 이를 t 값이라고 하고, 이는 t distribution을 따르기에
> t.r <- r.cal/se.res
> p.r <- pt(t.r, n-2, lower.tail = F)*2
> cat(r.cal, se.res, t.r, p.r)
0.414393 0.09193379 4.507515 1.817763e-05> cor.test(x,y)
Pearson's product-moment correlation
data: x and y
t = 4.5075, df = 98, p-value = 1.818e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2372888 0.5648366
sample estimates:
cor
0.414393
>
> # 그렇다면 b1의 significance한 정도는 어떻게 가늠하는가?
> # standard error for b1 (regression slope)
>
> # 위에서 우리는 ms.res <- ss.res/(n-2)
> ms.res
[1] 0.1119626
> sqrt(ms.res)
[1] 0.3346081
> sd.res <- sqrt(ms.res) # residual sd (sd for regression)
> sd.res
[1] 0.3346081
>
> # 그리고 ss(x) 값은
> ss.x <- ss(x)
> ss.x
[1] 10855.25
> # b (계수값의) standard error 값은
> se.b <- sd.res/sqrt(ss.x)
> se.b
[1] 0.003211564
> # 위는 아래와 같은 식이다
> sqrt(ms.res)/sqrt(ss.x)
[1] 0.003211564
> # 위의 값이 standard error of b 값이므로 .. ..
> # http://commres.net/regression#standard_error_of_b
> b.cal
[1] 0.01447618
> t.b <- b.cal/se.b
> p.b <- pt(t.b, n-2, lower.tail = F)*2
> data.frame(b.cal, se.b, t.b, p.b)
b.cal se.b t.b p.b
1 0.01447618 0.003211564 4.507515 1.817763e-05
> # 위의 결과로 b 값이 (기울기 값이, x가 y평균 대신에 기여하는 정도가)
> # significant한지 테스트 한다. 그리고, 이것은 위에서 구했던
> # r.cal 값의 테스트 한 것과 같은 결과를 갖는다
> data.frame(r.cal, se.res, t.r, p.r )
r.cal se.res t.r p.r
1 0.414393 0.09193379 4.507515 1.817763e-05
> # 왜 t.b와 t.r 값이 같은가?
> # t.b 값은 b값을 se.b값으로
> # t.r 값은 r값을 se.res값으로 나누어서 구한것인데
> # 두 값이 같다. 그 이유는 각 식의 분자 부분인
> # b값과 r값이 모두 x 변인에 의해서(만) 구해진 것이기 때문에 이다.
> # t.r 값은 sample에서 구한 r값을 모집단의 r값을 예측하는 것이고
> # t.b 값은 sample에서 구한 기울기 값이 (x변인의 기여정도가)
> # 모집단이라면 어디일까를 (얼마일까를) 알려주는 것이기 때문이다.
> # 둘 다가 모두 x 변인으로 생긴 결과의 significant 정도를 가늠하는
> # 것이다.
> summary(mo.xy)
Call:
lm(formula = y ~ x, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.92770 -0.20715 0.03368 0.20038 0.90512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.614428 0.344622 4.685 9.04e-06 ***
x 0.014476 0.003212 4.508 1.82e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared: 0.1717, Adjusted R-squared: 0.1633
F-statistic: 20.32 on 1 and 98 DF, p-value: 1.818e-05
>
> data.frame(ss.reg, ss.res, ss.tot, r.sq, ss.res/ss.tot)
ss.reg ss.res ss.tot r.sq ss.res.ss.tot
1 2.274822 10.97233 13.24715 0.1717215 0.8282785
> data.frame(f.cal, df.reg, df.res, p.f)
f.cal df.reg df.res p.f
1 20.3177 1 98 1.817763e-05
> anova(mo.xy)
Analysis of Variance Table
Response: y
Df Sum Sq Mean Sq F value Pr(>F)
x 1 2.2748 2.27482 20.318 1.818e-05 ***
Residuals 98 10.9723 0.11196
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> # remember ss.res?
> ms.res
[1] 0.1119626
> ms.reg
[1] 2.274822
>
residual errors and regression errors in Regression
regression.sample.00.rs
#####################################################
# residual errors and regression errors in Regression
#
#####################################################
rm(list=ls())
ss <- function(x) {
sum((x-mean(x))^2)
}
sp <- function(x, y) {
sum((x-mean(x))*(y-mean(y)))
}
sam <- read.csv2("http://commres.net/_media/r/regression.sample.csv")
sam <- sam[,-1]
head(sam)
tail(sam)
cor(sam)
y <- sam$y
x <- sam$x
x2 <- sam$x2
cor(x,y)
m.y.x <- lm(y~x, data=sam)
summary(m.y.x)
plot(x,y)
abline(m.y.x, col='red')
abline(h=mean(y), col='blue')
# for visual understanding see
# http://commres.net/c/ms/2026/schedule/w12.lecture.note#types_of_correlations
res <- resid(m.y.x)
y.hat <- m.y.x$fitted.values
reg <- y.hat - mean(y)
ss(y)
ss(res)+ss(reg)
m.reg.x <- lm(reg~x, data=sam)
m.res.x <- lm(res~x, data=sam)
summary(m.reg.x)
summary(m.res.x)
cor(x, reg)
cor(x, res)
# Then!
cor(x, x2)
m.x2.x <- lm(x2~x, data=sam)
res.x2.x <- resid(m.x2.x)
m.y.resx2 <- lm(y~res.x2.x, data=sam)
summary(m.y.resx2)
d <- summary(m.y.resx2)$r.square
d
summary(m.y.x)
summary(m.y.x)$r.square
bc <- summary(m.y.x)$r.square
bcd <- bc + d
bcd
# Then
m.y.xx2 <- lm(y~x+x2, data=sam)
summary(m.y.xx2)
# 나중에 더 설명
# Then!
# what is the portion (r square value) of x1's own contribution?
#
m.x.x2 <- lm(x~x2, data=sam)
res.be <- resid(m.x.x2)
m.b.abcd <- lm(y~res.be, data=sam)
summary(m.b.abcd)
b.abcd <- summary(m.b.abcd)$r.square
b.abcd
spcor.test(y,x,x2)$estimate^2
# also we can do the below
m.y.x2 <- lm(y~x2, data=sam)
res.ab <- resid(m.y.x2)
m.resab.resbe <- lm(res.ab~res.be, data=sam)
b.ab <- summary(m.resab.resbe)$r.square
b.ab
pcor.test(y,x,x2)$estimate^2
bd.abcd <- summary(lm(y~x, data=sam))$r.square
bd.abcd
cor.test(y,x)$estimate
cor.test(y,x)$estimate^2
regression.sample.00.ro
> #####################################################
> # residual errors and regression errors in Regression
> #
> #####################################################
> rm(list=ls())
> ss <- function(x) {
+ sum((x-mean(x))^2)
+ }
> sp <- function(x, y) {
+ sum((x-mean(x))*(y-mean(y)))
+ }
>
> sam <- read.csv2("http://commres.net/_media/r/regression.sample.csv")
> sam <- sam[,-1]
> head(sam)
x x2 y
1 100.7858 41.32553 3.192923
2 122.7128 51.70909 3.606455
3 115.7513 27.59380 3.299693
4 115.4523 36.08332 2.811801
5 113.1708 42.87865 3.247758
6 111.3423 54.87082 3.329180
> tail(sam)
x x2 y
95 110.09457 41.37388 3.252026
96 122.42522 35.62619 3.131744
97 119.04052 51.68809 3.281162
98 106.59454 40.61295 3.122031
99 97.07284 51.14235 2.578662
100 104.02543 41.19795 3.539551
> cor(sam)
x x2 y
x 1.0000000 0.1880415 0.4143930
x2 0.1880415 1.0000000 0.3373768
y 0.4143930 0.3373768 1.0000000
>
> y <- sam$y
> x <- sam$x
> x2 <- sam$x2
>
> cor(x,y)
[1] 0.414393
> m.y.x <- lm(y~x, data=sam)
> summary(m.y.x)
Call:
lm(formula = y ~ x, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.92770 -0.20715 0.03368 0.20038 0.90512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.614428 0.344622 4.685 9.04e-06 ***
x 0.014476 0.003212 4.508 1.82e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared: 0.1717, Adjusted R-squared: 0.1633
F-statistic: 20.32 on 1 and 98 DF, p-value: 1.818e-05
> plot(x,y)
> abline(m.y.x, col='red')
> abline(h=mean(y), col='blue')
>
> # for visual understanding see
> # http://commres.net/c/ms/2026/schedule/w12.lecture.note#types_of_correlations
>
> res <- resid(m.y.x)
> y.hat <- m.y.x$fitted.values
> reg <- y.hat - mean(y)
> ss(y)
[1] 13.24715
> ss(res)+ss(reg)
[1] 13.24715
>
> m.reg.x <- lm(reg~x, data=sam)
> m.res.x <- lm(res~x, data=sam)
> summary(m.reg.x)
Call:
lm(formula = reg ~ x, data = sam)
Residuals:
Min 1Q Median 3Q Max
-1.119e-15 -1.269e-16 -1.860e-17 7.610e-17 3.316e-15
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.546e+00 3.910e-16 -3.954e+15 <2e-16 ***
x 1.448e-02 3.644e-18 3.973e+15 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.796e-16 on 98 degrees of freedom
Multiple R-squared: 1, Adjusted R-squared: 1
F-statistic: 1.578e+31 on 1 and 98 DF, p-value: < 2.2e-16
> summary(m.res.x)
Call:
lm(formula = res ~ x, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.92770 -0.20715 0.03368 0.20038 0.90512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.388e-17 3.446e-01 0 1
x 0.000e+00 3.212e-03 0 1
Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared: 1.254e-33, Adjusted R-squared: -0.0102
F-statistic: 1.229e-31 on 1 and 98 DF, p-value: 1
>
> cor(x, reg)
[1] 1
> cor(x, res)
[1] -2.061785e-17
>
> # Then!
> cor(x, x2)
[1] 0.1880415
> m.x2.x <- lm(x2~x, data=sam)
> res.x2.x <- resid(m.x2.x)
>
> m.y.resx2 <- lm(y~res.x2.x, data=sam)
> summary(m.y.resx2)
Call:
lm(formula = y ~ res.x2.x, data = sam)
Residuals:
Min 1Q Median 3Q Max
-1.01305 -0.26905 0.00798 0.28017 0.67040
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.160479 0.035460 89.128 < 2e-16 ***
res.x2.x 0.013132 0.004843 2.711 0.00791 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3546 on 98 degrees of freedom
Multiple R-squared: 0.06978, Adjusted R-squared: 0.06029
F-statistic: 7.352 on 1 and 98 DF, p-value: 0.007912
> d <- summary(m.y.resx2)$r.square
> d
[1] 0.06978376
>
> summary(m.y.x)
Call:
lm(formula = y ~ x, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.92770 -0.20715 0.03368 0.20038 0.90512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.614428 0.344622 4.685 9.04e-06 ***
x 0.014476 0.003212 4.508 1.82e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3346 on 98 degrees of freedom
Multiple R-squared: 0.1717, Adjusted R-squared: 0.1633
F-statistic: 20.32 on 1 and 98 DF, p-value: 1.818e-05
> summary(m.y.x)$r.square
[1] 0.1717215
> bc <- summary(m.y.x)$r.square
>
> bcd <- bc + d
> bcd
[1] 0.2415053
>
> # Then
> m.y.xx2 <- lm(y~x+x2, data=sam)
> summary(m.y.xx2)
Call:
lm(formula = y ~ x + x2, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.87249 -0.20913 0.00269 0.20047 0.82037
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.267554 0.351230 3.609 0.000489 ***
x 0.012709 0.003145 4.041 0.000107 ***
x2 0.013132 0.004396 2.987 0.003564 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3218 on 97 degrees of freedom
Multiple R-squared: 0.2415, Adjusted R-squared: 0.2259
F-statistic: 15.44 on 2 and 97 DF, p-value: 1.506e-06
> # 나중에 더 설명
>
> # Then!
> # what is the portion (r square value) of x1's own contribution?
> #
> m.x.x2 <- lm(x~x2, data=sam)
> res.be <- resid(m.x.x2)
> m.b.abcd <- lm(y~res.be, data=sam)
> summary(m.b.abcd)
Call:
lm(formula = y ~ res.be, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.96326 -0.22562 0.01725 0.22139 0.89797
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.160479 0.034339 92.038 < 2e-16 ***
res.be 0.012709 0.003356 3.787 0.000263 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3434 on 98 degrees of freedom
Multiple R-squared: 0.1277, Adjusted R-squared: 0.1188
F-statistic: 14.34 on 1 and 98 DF, p-value: 0.0002626
> b.abcd <- summary(m.b.abcd)$r.square
> b.abcd
[1] 0.1276822
> spcor.test(y,x,x2)$estimate^2
[1] 0.1276822
>
> # also we can do the below
> m.y.x2 <- lm(y~x2, data=sam)
> res.ab <- resid(m.y.x2)
> m.resab.resbe <- lm(res.ab~res.be, data=sam)
> b.ab <- summary(m.resab.resbe)$r.square
> b.ab
[1] 0.1440821
> pcor.test(y,x,x2)$estimate^2
[1] 0.1440821
>
> bd.abcd <- summary(lm(y~x, data=sam))$r.square
> bd.abcd
[1] 0.1717215
> cor.test(y,x)$estimate
cor
0.414393
> cor.test(y,x)$estimate^2
cor
0.1717215
>
Multiple regression with IVs no correlations
multiple.regression.01.rs
############################
# multiple regression
############################
rm(list=ls())
sam <- read.csv2("http://commres.net/_media/r/regression.sample.02.csv")
sam <- sam[,-1]
head(sam)
tail(sam)
str(sam)
cor(sam)
y <- sam$y
x <- sam$x
x2 <- sam$x2
# y regression with x and x2
m.y.x1 <- lm(y~x, data=sam)
m.y.x2 <- lm(y~x2, data=sam)
# summary(m.y.x1)
# summary(m.y.x2)
explained.by.x1 <- summary(m.y.x1)$r.square
explained.by.x2 <- summary(m.y.x2)$r.square
round(explained.by.x1 + explained.by.x2,3)
rs.each.sum <- explained.by.x1 + explained.by.x2
m.y.x1x2 <- lm(y~x+x2, data=sam)
summary(m.y.x1x2)
rs.both <- summary(m.y.x1x2)$r.square
round(rs.both,3)
round(data.frame(rs.both, rs.each.sum, rs.both-rs.each.sum),3)
m.x1.x2 <- lm(x~x2, data=sam)
# R-squared = 0
# x2 = 0
# p-value = .7945
summary(m.x1.x2)
summary(m.x1.x2)$r.square
sqrt(summary(m.x1.x2)$r.square)
cor(x,x2)
cor.test(x,x2)
#################
# 그림으로 보기
#################
multiple.regression.01.ro
>
> sam <- read.csv2("http://commres.net/_media/r/regression.sample.02.csv")
> sam <- sam[,-1]
>
> head(sam)
x x2 y
1 109.9413 46.09681 3.277817
2 115.9064 42.91232 2.940628
3 104.5726 37.00947 2.854055
4 122.9020 44.69320 3.831361
5 120.1931 37.18179 3.102626
6 119.3134 38.54365 3.135997
> tail(sam)
x x2 y
999995 109.0402 39.47883 2.645672
999996 122.6468 49.57322 3.478578
999997 121.5819 41.26768 3.386559
999998 106.8277 40.39737 3.434375
999999 113.8535 36.00922 3.039189
1000000 114.4945 46.07892 3.439678
> str(sam)
'data.frame': 1000000 obs. of 3 variables:
$ x : num 110 116 105 123 120 ...
$ x2: num 46.1 42.9 37 44.7 37.2 ...
$ y : num 3.28 2.94 2.85 3.83 3.1 ...
> cor(sam)
x x2 y
x 1.0000000000 -0.0002605277 0.3001789
x2 -0.0002605277 1.0000000000 0.1993134
y 0.3001788706 0.1993133956 1.0000000
>
> y <- sam$y
> x <- sam$x
> x2 <- sam$x2
>
> # y regression with x and x2
> m.y.x1 <- lm(y~x, data=sam)
> m.y.x2 <- lm(y~x2, data=sam)
> # summary(m.y.x1)
> # summary(m.y.x2)
> explained.by.x1 <- summary(m.y.x1)$r.square
> explained.by.x2 <- summary(m.y.x2)$r.square
> round(explained.by.x1 + explained.by.x2,3)
[1] 0.13
> rs.each.sum <- explained.by.x1 + explained.by.x2
>
> m.y.x1x2 <- lm(y~x+x2, data=sam)
> summary(m.y.x1x2)
Call:
lm(formula = y ~ x + x2, data = sam)
Residuals:
Min 1Q Median 3Q Max
-1.77065 -0.25170 0.00031 0.25145 1.83133
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.604e+00 4.168e-03 384.8 <2e-16 ***
x 1.091e-02 3.389e-05 321.9 <2e-16 ***
x2 9.956e-03 4.658e-05 213.8 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3732 on 999997 degrees of freedom
Multiple R-squared: 0.1299, Adjusted R-squared: 0.1299
F-statistic: 7.462e+04 on 2 and 999997 DF, p-value: < 2.2e-16
> rs.both <- summary(m.y.x1x2)$r.square
> round(rs.both,3)
[1] 0.13
>
> round(data.frame(rs.both, rs.each.sum, rs.both-rs.each.sum),3)
rs.both rs.each.sum rs.both...rs.each.sum
1 0.13 0.13 0
>
> m.x1.x2 <- lm(x~x2, data=sam)
> # R-squared = 0
> # x2 = 0
> # p-value = .7945
> summary(m.x1.x2)
Call:
lm(formula = x ~ x2, data = sam)
Residuals:
Min 1Q Median 3Q Max
-51.050 -7.432 -0.005 7.432 54.696
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.080e+02 5.877e-02 1838.108 <2e-16 ***
x2 -3.581e-04 1.374e-03 -0.261 0.794
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 11.01 on 999998 degrees of freedom
Multiple R-squared: 6.787e-08, Adjusted R-squared: -9.321e-07
F-statistic: 0.06787 on 1 and 999998 DF, p-value: 0.7945
> summary(m.x1.x2)$r.square
[1] 6.787467e-08
> sqrt(summary(m.x1.x2)$r.square)
[1] 0.0002605277
> cor(x,x2)
[1] -0.0002605277
> cor.test(x,x2)
Pearson's product-moment correlation
data: x and x2
t = -0.26053, df = 999998, p-value = 0.7945
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.002220491 0.001699438
sample estimates:
cor
-0.0002605277
> #################
> # 그림으로 보기
> #################
>
Multiple regression with IVs correlated
multiple.regression.03.rs
#########################
# multiple regression 2
#########################
rm(list=ls())
# 1. mean of three variables
# iq, allowance, gpa
# x, x2, y
means <- c(108, 42, 3.2)
# allowance, iq, gpa
sds <- c(11, 8, 0.4) # standard deviations
# correlation matrix. note that correlation
# between v1 and v2 is near none (0.05)
corr_matrix <- matrix(c(1, 0.30, 0.4,
0.30, 1, 0.2,
0.4, 0.2, 1),
nrow = 3) # Target correlation matrix
# 2. covariance matrix
# diagonal matrix of SDs
sd_diag <- diag(sds)
sd_diag
# Calculate covariance matrix
cov_matrix <- sd_diag %*% corr_matrix %*% sd_diag
# 3. population data
# set.seed(32)
# set.seed(4321)
set.seed(1230)
n_sam <- 100
# n_sam <- 300
sam <- mvrnorm(n = n_sam, mu = means, Sigma = cov_matrix)
sam <- as.data.frame(sam)
colnames(sam) <- c("x", "x2", "y")
cor(sam)
write.csv2(sam, "regression.sample.03.csv")
sam <- read.csv2("http://commres.net/_media/r/regression.sample.03.csv")
sam <- sam[, -1]
head(sam)
tail(sam)
cor(sam)
y <- sam$y
x <- sam$x
x2 <- sam$x2
lm.y.x1 <- lm(y~x, data=sam)
lm.y.x2 <- lm(y~x2, data=sam)
summary(lm.y.x1)
summary(lm.y.x2)
bc <- summary(lm.y.x1)$r.square
cd <- summary(lm.y.x2)$r.square
bccd <- bc+cd
data.frame(bc, cd, bccd)
lm.y.x1x2 <- lm(y~x+x2, data=sam)
summary(lm.y.x1x2)
bcd <- summary(lm.y.x1x2)$r.square
# where does this diff come from?
c <- bccd - bcd
c
# b 와 d만 추리기
res.be <- resid(lm(x~x2, data=sam))
m.b <- lm(y~res.be, data=sam)
summary(m.b)
b <- summary(m.b)$r.square
b
res.dg <- resid(lm(x2~x, data=sam))
m.d <- lm(y~res.dg, data=sam)
summary(m.d)
d <- summary(m.d)$r.square
d
b
d
bcd
bccd
c <- bccd-bcd
c2 <- bcd-(b+d)
c
c2
# ppcor package
# spcor.test (y,x1,x2)
rs.b.sp <- spcor.test(y,x,x2)$estimate^2
rs.d.sp <- spcor.test(y,x2,x)$estimate^2
data.frame(b, rs.b.sp)
data.frame(d, rs.d.sp)
# remember b.sp = r square value
b.sp <- sqrt(rs.b.sp)
d.sp <- sqrt(rs.d.sp)
data.frame(b.sp, b, rs.b.sp)
data.frame(d.sp, d, rs.d.sp)
multiple.regression.03.ro
> #########################
> # multiple regression 2
> #########################
> rm(list=ls())
> # 1. mean of three variables
> # iq, allowance, gpa
> # x, x2, y
> means <- c(108, 42, 3.2)
>
> # allowance, iq, gpa
> sds <- c(11, 8, 0.4) # standard deviations
> # correlation matrix. note that correlation
> # between v1 and v2 is near none (0.05)
> corr_matrix <- matrix(c(1, 0.30, 0.4,
+ 0.30, 1, 0.2,
+ 0.4, 0.2, 1),
+ nrow = 3) # Target correlation matrix
> # 2. covariance matrix
> # diagonal matrix of SDs
> sd_diag <- diag(sds)
> sd_diag
[,1] [,2] [,3]
[1,] 11 0 0.0
[2,] 0 8 0.0
[3,] 0 0 0.4
>
> # Calculate covariance matrix
> cov_matrix <- sd_diag %*% corr_matrix %*% sd_diag
>
> # 3. population data
> # set.seed(32)
> # set.seed(4321)
> set.seed(1230)
> n_sam <- 100
> # n_sam <- 300
> sam <- mvrnorm(n = n_sam, mu = means, Sigma = cov_matrix)
> sam <- as.data.frame(sam)
> colnames(sam) <- c("x", "x2", "y")
> cor(sam)
x x2 y
x 1.0000000 0.4219157 0.3535740
x2 0.4219157 1.0000000 0.2700868
y 0.3535740 0.2700868 1.0000000
> write.csv2(sam, "regression.sample.03.csv")
>
> sam <- read.csv2("http://commres.net/_media/r/regression.sample.03.csv")
> sam <- sam[, -1]
> head(sam)
x x2 y
1 108.1988 47.05544 2.791294
2 118.1307 38.71573 3.338987
3 105.2902 39.08865 3.867919
4 120.7237 52.10261 3.341301
5 120.1957 45.69276 3.267319
6 118.6366 47.16571 3.126480
> tail(sam)
x x2 y
95 114.68980 34.28183 2.629331
96 115.67318 40.27048 3.316995
97 113.66832 34.68306 3.134684
98 99.68422 30.69797 2.670601
99 108.31653 42.74074 2.850961
100 102.19879 40.88916 3.231628
> cor(sam)
x x2 y
x 1.0000000 0.4219157 0.3535740
x2 0.4219157 1.0000000 0.2700868
y 0.3535740 0.2700868 1.0000000
>
> y <- sam$y
> x <- sam$x
> x2 <- sam$x2
>
> lm.y.x1 <- lm(y~x, data=sam)
> lm.y.x2 <- lm(y~x2, data=sam)
> summary(lm.y.x1)
Call:
lm(formula = y ~ x, data = sam)
Residuals:
Min 1Q Median 3Q Max
-1.03671 -0.26187 0.03189 0.21676 0.71688
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.943438 0.347683 5.590 2.05e-07 ***
x 0.011705 0.003128 3.742 0.000308 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3753 on 98 degrees of freedom
Multiple R-squared: 0.125, Adjusted R-squared: 0.1161
F-statistic: 14 on 1 and 98 DF, p-value: 0.0003079
> summary(lm.y.x2)
Call:
lm(formula = y ~ x2, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.89560 -0.22360 -0.02491 0.25421 0.86636
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.695389 0.198771 13.560 < 2e-16 ***
x2 0.013381 0.004819 2.777 0.00658 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3863 on 98 degrees of freedom
Multiple R-squared: 0.07295, Adjusted R-squared: 0.06349
F-statistic: 7.711 on 1 and 98 DF, p-value: 0.006575
> bc <- summary(lm.y.x1)$r.square
> cd <- summary(lm.y.x2)$r.square
> bccd <- bc+cd
> data.frame(bc, cd, bccd)
bc cd bccd
1 0.1250146 0.0729469 0.1979615
>
> lm.y.x1x2 <- lm(y~x+x2, data=sam)
> summary(lm.y.x1x2)
Call:
lm(formula = y ~ x + x2, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.91766 -0.23439 0.01484 0.21300 0.72503
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.875581 0.349193 5.371 5.36e-07 ***
x 0.009650 0.003432 2.811 0.00597 **
x2 0.007287 0.005137 1.419 0.15921
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3734 on 97 degrees of freedom
Multiple R-squared: 0.1428, Adjusted R-squared: 0.1251
F-statistic: 8.08 on 2 and 97 DF, p-value: 0.0005682
> bcd <- summary(lm.y.x1x2)$r.square
> # where does this diff come from?
> c <- bccd - bcd
> c
[1] 0.05516214
>
> # b 와 d만 추리기
> res.be <- resid(lm(x~x2, data=sam))
> m.b <- lm(y~res.be, data=sam)
> summary(m.b)
Call:
lm(formula = y ~ res.be, data = sam)
Residuals:
Min 1Q Median 3Q Max
-1.09179 -0.25700 0.01967 0.20519 0.77413
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.236835 0.038696 83.649 < 2e-16 ***
res.be 0.009650 0.003557 2.713 0.00788 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.387 on 98 degrees of freedom
Multiple R-squared: 0.06985, Adjusted R-squared: 0.06036
F-statistic: 7.36 on 1 and 98 DF, p-value: 0.00788
> b <- summary(m.b)$r.square
> b
[1] 0.06985246
>
> res.dg <- resid(lm(x2~x, data=sam))
> m.d <- lm(y~res.dg, data=sam)
> summary(m.d)
Call:
lm(formula = y ~ res.dg, data = sam)
Residuals:
Min 1Q Median 3Q Max
-0.94974 -0.24497 -0.01842 0.26781 0.92351
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.236835 0.039764 81.401 <2e-16 ***
res.dg 0.007287 0.005471 1.332 0.186
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3976 on 98 degrees of freedom
Multiple R-squared: 0.01778, Adjusted R-squared: 0.007762
F-statistic: 1.774 on 1 and 98 DF, p-value: 0.1859
> d <- summary(m.d)$r.square
> d
[1] 0.01778476
>
> b
[1] 0.06985246
> d
[1] 0.01778476
> bcd
[1] 0.1427994
> bccd
[1] 0.1979615
> c <- bccd-bcd
> c2 <- bcd-(b+d)
> c
[1] 0.05516214
> c2
[1] 0.05516214
>
ppcor package in r
ppcor.sr
> # ppcor package
> # spcor.test (y,x1,x2)
> rs.b.sp <- spcor.test(y,x,x2)$estimate^2
> rs.d.sp <- spcor.test(y,x2,x)$estimate^2
> data.frame(b, rs.b.sp)
b rs.b.sp
1 0.06985246 0.06985246
> data.frame(d, rs.d.sp)
d rs.d.sp
1 0.01778476 0.01778476
> # remember b.sp = r square value
> b.sp <- sqrt(rs.b.sp)
> d.sp <- sqrt(rs.d.sp)
> data.frame(b.sp, b, rs.b.sp)
b.sp b rs.b.sp
1 0.2642962 0.06985246 0.06985246
> data.frame(d.sp, d, rs.d.sp)
d.sp d rs.d.sp
1 0.1333595 0.01778476 0.01778476
>
ppcor.ro
> # ppcor package
> # spcor.test (y,x1,x2)
> rs.b.sp <- spcor.test(y,x,x2)$estimate^2
> rs.d.sp <- spcor.test(y,x2,x)$estimate^2
> data.frame(b, rs.b.sp)
b rs.b.sp
1 0.06985246 0.06985246
> data.frame(d, rs.d.sp)
d rs.d.sp
1 0.01778476 0.01778476
> # remember b.sp = r square value
> b.sp <- sqrt(rs.b.sp)
> d.sp <- sqrt(rs.d.sp)
> data.frame(b.sp, b, rs.b.sp)
b.sp b rs.b.sp
1 0.2642962 0.06985246 0.06985246
> data.frame(d.sp, d, rs.d.sp)
d.sp d rs.d.sp
1 0.1333595 0.01778476 0.01778476
>
c/ms/2026/schedule/w12.lecture.note.1779678197.txt.gz · Last modified: by hkimscil



