Differences

This shows you the differences between two versions of the page.

--- c:ms:2025:schedule:w11.lecture.note [2025/05/19 15:46] – [code output] hkimscil
+++ c:ms:2025:schedule:w11.lecture.note [2025/05/21 08:59] (current) – [code] hkimscil
@@ Line 20: / Line 20: @@
 ====== Correlation ======
 \begin{eqnarray}
-  \text{correlation, r} & = & \frac {\text{Cov (X,Y)}} {sd(X) * sd(Y)}
+  \text{correlation, r} & = & \frac {\text{Cov (X,Y)}} {sd(X) * sd(Y)} \\
+  & = & \frac {\text{SP (X,Y)} }{ \sqrt{\text{SS.x}*\text{SS.y}}} \nonumber
 \end{eqnarray}
+====== F-test (anova) ======
+for reminding
+\begin{eqnarray*}
+  \text{F} & = & \frac {\text{MS}_\text{between group}} {\text{MS}_\text{within group}} \\
+  & = & \frac {\text{Difference due to the treatment (IV) } }{ \text{Difference due to random error} } \\
+  & = & \frac {\text{Difference explained by IV  } }{ \text{Difference remained random} } \\
+  \text{R}^\text{2} & = & \frac {\text{SS}_\text{between}} {\text{SS}_\text{total} } \hfill \\
+\end{eqnarray*}
+====== Check in R ======
 <code>
@@ Line 42: / Line 54: @@
 y <- (a*x) + b + re
 y
+# covariance x and y
 cov(x,y)
 sd(x)
 sd(y)
+# correlation coefficient
 cov(x,y)/(sd(x)*sd(y))
 cor(x,y)
@@ Line 51: / Line 66: @@
 df.x <- n.x-1
 df.y <- df.x
-sp.xy <- cov(x,y) * df.x
+sp.xy <- cov(x,y) * df.x  # 혹은 아래처럼 구할 수 있다
+sp.xy2 <- sum((x-m.x)*(y-m.y))
+sp.xy == sp.xy2
 ss.x <- var(x) * df.x
 ss.y <- var(y) * df.y
@@ Line 58: / Line 76: @@
 </code>
+====== Check in R: output ======
 <code>
 >
@@ Line 218: / Line 236: @@
 y <- (a*x) + b + re
 y
 m.y <- mean(y)
 df.y <- n.y - 1
@@ Line 224: / Line 243: @@
 var(y)
+#
 prod <- (x-m.x)*(y-m.y)
 sp <- sum(prod)
-sp/df.y
+sp/df.y # covariance 값
-cov(x,y)
+cov(x,y) # covariance 펑션
-s.x <- sd(x)
+s.x <- sd(x) # x변인에 대한 standard deviation 값
-s.y <- sd(y)
+s.y <- sd(y) # y
-(sp/df.y)/(s.x*s.y)
+(sp/df.y)/(s.x*s.y) # correlation 값
-cor(x,y)
+cor(x,y) # correlation 펑션
 df.y
 df.x
-sqrt(ss.x/df.x) # s.x 값이 이것
-sd(x)
-sqrt(ss.y/df.y) # s.y 값이 이것
-sd(y)
 cov(x,y)/(s.x*s.y)
-(sp/df.y)/sqrt((ss.x/df.x)*(ss.y/df.y))
+# covariance를 각각의 sd값을 곱한여 나눠 준 값이 r
-sp/sqrt(ss.x*ss.y) # 분모 분자에서 n-1을 제거하고 보면
+(sp/df.y)/sqrt((ss.x/df.x)*(ss.y/df.y))
+# 분모 분자에서 n-1을 제거하고 보면
+sp/sqrt(ss.x*ss.y)
 cor(x,y)
@@ Line 259: / Line 276: @@
 # note that the red line is y.hat = a + bx,
 # where error value y.hat - y is minimized
-# how to derive a and b
-# how to derive coefficient values
-# see wiki doc at
-# http://commres.net/wiki/deriviation_of_a_and_b_in_a_simple_regression
 lm.df <- lm(y~x, data = df)
 summary(lm.df)
@@ Line 269: / Line 283: @@
 a
 b
-# y = a + b*x 에서
+# how to derive a and b
+# how to derive coefficient values
+# see wiki doc at
+# http://commres.net/wiki/deriviation_of_a_and_b_in_a_simple_regression
 b2 <- sp/ss.x
-b3 <- cov(x,y)/var(x,x)
+b3 <- cov(x,y)/var(x,x) # b2 == b3
 # m.y = a + (b*m.x) 이므로
 a2 <- m.y - (b2*m.x)
@@ Line 285: / Line 302: @@
 # reg, res, total error
 #
-y.hat <- (a+(b*x))
+y.hat <- a+(b*x)
 y.hat2 <- predict(lm.df)
-y.hat == y.hat2
+head(y)
-y
+head(y.hat)
-y.hat
 m.y
-tmp <- data.frame(y, y.hat, m.y)
+tmp <- data.frame(x, y, y.hat)
-tmp
+head(tmp)
-plot(x,y) # 원래 데이터터
+plot(x,y) # 원래 데이터
-plot(x,y.hat) # 리그레션라인 예측선선
+plot(x,y.hat) # 리그레션라인 예측선
 plot(x,tmp$m.y) # 평균 예측선
-error <- y-y.hat
+error <- y - y.hat
-explained <- y.hat-m.y
+explained <- y.hat - m.y
-error
+tmp2 <- data.frame(tmp, explained, error)
+head(tmp2)
+head(y.hat == explained + m.y)
+head(y == explained + error + m.y)
 # note the below
 plot(error)
 hist(error)
+plot(x,error)
 cor(x,error)
 ################
-explained
+plot(x,explained)
 cor(x,explained)
+plot(x,(explained+m.y))
+cor(x,(explained+m.y))
+head(explained+error)
+plot(x,(explained+error))
+head(explained+error+m.y == y)
+plot(x,(explained+error+m.y))
+cor(x,(explained+error))
+cor(x,(explained+error+m.y))
+cor(x,y)
 # see this also
-tmp2 <- data.frame(x, explained, error, y)
 round(cor(tmp2),3)
 ###############
+# sum of square values
 ss.res <- sum(error^2)
 ss.reg <- sum(explained^2)
@@ Line 321: / Line 353: @@
 ss.res + ss.reg
-k <- 1 # number of IV
+# degrees of freedom values
-df.res <- n - k - 1
+k <- 2 # number of parameters
-df.reg <- 2 - 1 # number of parameter estimation a and b (slope)
+df.reg <- k - 1 # number of parameter estimation a and b (slope)
+df.res <- n - df.reg - 1
 ms.res <- ss.res/df.res
 ms.reg <- ss.reg/df.reg
@@ Line 335: / Line 368: @@
 ms.res
-f.cal <- ms.reg/ms.res
-f.cal
-pf(f.cal, 1, 38, lower.tail = F)
-anova(lm.df)
-summary(lm.df)
 # r square
 ss.y
@@ Line 345: / Line 373: @@
 r.sq <- ss.reg/ss.y
 r.sq
+# 위의 r.sq 값이 충분히 컸는지를 알아보는 것은
+# ss.reg가 충분히 컸는지를 알아보는 것
+# 이는 IV, x가 y를 설명하는데 충분히 기여했는지
+# 테스트하는 것. 이는 F-test를 이용해서 하게 된다
+# F = MS.bet / MS.within
+# regression의 경우는 F = MS.reg / MS.res
+f.cal <- ms.reg/ms.res
+f.cal
+pf(f.cal, df.reg, df.res, lower.tail = F)
+# check anova test
+anova(lm.df)
+summary(lm.df)
 # standard error for b coefficient
+# b coeffficient 값은 estimation
+# n.y 개의 데이터에서 구한 coefficient값 b는
+# 실제 x, y 모집단을 대표하는 b값은 위의
+# estimation에서 standard error값을 구간으로 갖는
+# 범위일 것.
 res <- resid(lm.df)
+data.frame(head(res))
+data.frame(head(error))
 se.b <- sqrt(ms.res/ss.x)
 se.b
-b + c(-2*se.b, 2*se.b) # as x increased by 1 unit, y would increase b +- 2se.b
+# as x increased by 1 unit, y would increase b +- 2se.b
-# to be exact, we use t.critical value instead of 2
+b + c(-2*se.b, 2*se.b)
-t.crit <- qt(.025, df.res, lower.tail = F)
+b
-b + c(-t.crit*se.b, t.crit*se.b)
+# to be exact
+t.crit <- qt(.975, df.res)
+t.crit
+b + c(-t.crit*se.b, t.crit*se.b)
+b
 t.cal <- lm.df$coefficients[2]/se.b
@@ Line 361: / Line 413: @@
 pt(t.cal, df.res, lower.tail = F)*2
-pf(f.cal, 1, 38, lower.tail = F)
+pf(f.cal, df.reg, df.res, lower.tail = F)
 # see also
+t.cal
 t.cal^2
 f.cal
@@ Line 482: / Line 535: @@
 attr(,"scaled:scale")
 [1] 1.039345
+>
 > m.y <- mean(y)
 > df.y <- n.y - 1
@@ Line 491: / Line 545: @@
 [1,] 804.2779
 >
+> #
 > prod <- (x-m.x)*(y-m.y)
 > sp <- sum(prod)
-> sp/df.y
+> sp/df.y # covariance 값
 [1] 222.3867
-> cov(x,y)
+> cov(x,y) # covariance 펑션
          [,1]
 [1,] 222.3867
 >
-> s.x <- sd(x)
+> s.x <- sd(x) # x변인에 대한 standard deviation 값
-> s.y <- sd(y)
+> s.y <- sd(y) # y
-> (sp/df.y)/(s.x*s.y)
+> (sp/df.y)/(s.x*s.y) # correlation 값
 [1] 0.7841619
-> cor(x,y)
+> cor(x,y) # correlation 펑션
           [,1]
 [1,] 0.7841619
@@ Line 511: / Line 566: @@
 > df.x
 [1] 39
->
-> sqrt(ss.x/df.x) # s.x 값이 이것
-[1] 10
-> sd(x)
-[1] 10
-> sqrt(ss.y/df.y) # s.y 값이 이것
-[1] 28.35979
-> sd(y)
-[1] 28.35979
 >
 > cov(x,y)/(s.x*s.y)
           [,1]
 [1,] 0.7841619
-> (sp/df.y)/sqrt((ss.x/df.x)*(ss.y/df.y))
+> # covariance를 각각의 sd값을 곱한여 나눠 준 값이 r
+> (sp/df.y)/sqrt((ss.x/df.x)*(ss.y/df.y))
 [1] 0.7841619
-> sp/sqrt(ss.x*ss.y) # 분모 분자에서 n-1을 제거하고 보면
+> # 분모 분자에서 n-1을 제거하고 보면
+> sp/sqrt(ss.x*ss.y)
 [1] 0.7841619
 > cor(x,y)
@@ Line 544: / Line 592: @@
 > # note that the red line is y.hat = a + bx,
 > # where error value y.hat - y is minimized
-> # how to derive a and b
+>
-> # how to derive coefficient values
-> # see wiki doc at
-> # http://commres.net/wiki/deriviation_of_a_and_b_in_a_simple_regression
 > lm.df <- lm(y~x, data = df)
 > summary(lm.df)
@@ Line 576: / Line 621: @@
 > b
 [1] 2.223867
-> # y = a + b*x 에서
+> # how to derive a and b
+> # how to derive coefficient values
+> # see wiki doc at
+> # http://commres.net/wiki/deriviation_of_a_and_b_in_a_simple_regression
 > b2 <- sp/ss.x
-> b3 <- cov(x,y)/var(x,x)
+> b3 <- cov(x,y)/var(x,x) # b2 == b3
 > # m.y = a + (b*m.x) 이므로
 > a2 <- m.y - (b2*m.x)
@@ Line 598: / Line 646: @@
 > # reg, res, total error
 > #
-> y.hat <- (a+(b*x))
+> y.hat <- a+(b*x)
 > y.hat2 <- predict(lm.df)
-> y.hat == y.hat2
+> head(y)
-      [,1]
+         [,1]
- [1,] TRUE
+[1,] 175.7367
- [2,] TRUE
+[2,] 200.0202
- [3,] TRUE
+[3,] 241.0439
- [4,] TRUE
+[4,] 166.0401
- [5,] TRUE
+[5,] 195.5682
- [6,] TRUE
+[6,] 210.8411
- [7,] TRUE
+> head(y.hat)
- [8,] TRUE
+         [,1]
- [9,] TRUE
+[1,] 186.9727
-[10,] TRUE
+[2,] 205.5720
-[11,] TRUE
+[3,] 212.9643
-[12,] TRUE
+[4,] 189.8671
-[13,] TRUE
+[5,] 196.9697
-[14,] TRUE
+[6,] 239.3554
-[15,] TRUE
-[16,] TRUE
-[17,] TRUE
-[18,] TRUE
-[19,] TRUE
-[20,] TRUE
-[21,] TRUE
-[22,] TRUE
-[23,] TRUE
-[24,] TRUE
-[25,] TRUE
-[26,] TRUE
-[27,] TRUE
-[28,] TRUE
-[29,] TRUE
-[30,] TRUE
-[31,] TRUE
-[32,] TRUE
-[33,] TRUE
-[34,] TRUE
-[35,] TRUE
-[36,] TRUE
-[37,] TRUE
-[38,] TRUE
-[39,] TRUE
-[40,] TRUE
-> y
-          [,1]
- [1,] 175.7367
- [2,] 200.0202
- [3,] 241.0439
- [4,] 166.0401
- [5,] 195.5682
- [6,] 210.8411
- [7,] 181.9459
- [8,] 193.8544
- [9,] 225.4417
-[10,] 182.5276
-[11,] 194.5656
-[12,] 232.4421
-[13,] 255.3070
-[14,] 209.0313
-[15,] 219.1602
-[16,] 181.7819
-[17,] 214.3124
-[18,] 187.2961
-[19,] 207.3581
-[20,] 235.0318
-[21,] 199.2244
-[22,] 208.1031
-[23,] 182.2575
-[24,] 201.4683
-[25,] 169.6124
-[26,] 173.7941
-[27,] 161.0332
-[28,] 152.1292
-[29,] 153.4874
-[30,] 147.7652
-[31,] 189.2685
-[32,] 188.5691
-[33,] 155.5620
-[34,] 172.8731
-[35,] 184.9119
-[36,] 183.3182
-[37,] 119.8266
-[38,] 162.6349
-[39,] 170.0872
-[40,] 150.7684
-attr(,"scaled:center")
-[1] -0.08680858
-attr(,"scaled:scale")
-[1] 1.039345
-> y.hat
-          [,1]
- [1,] 186.9727
- [2,] 205.5720
- [3,] 212.9643
- [4,] 189.8671
- [5,] 196.9697
- [6,] 239.3554
- [7,] 193.0240
- [8,] 181.5673
- [9,] 223.8973
-[10,] 175.6106
-[11,] 182.3963
-[12,] 211.1666
-[13,] 221.1593
-[14,] 193.4777
-[15,] 217.0723
-[16,] 179.7820
-[17,] 188.9258
-[18,] 162.1265
-[19,] 176.7310
-[20,] 212.2383
-[21,] 201.3295
-[22,] 188.1615
-[23,] 197.6116
-[24,] 196.8985
-[25,] 183.6318
-[26,] 197.8900
-[27,] 167.9229
-[28,] 158.6935
-[29,] 168.5770
-[30,] 138.9668
-[31,] 225.1485
-[32,] 187.3333
-[33,] 155.8128
-[34,] 179.8349
-[35,] 189.5992
-[36,] 178.2857
-[37,] 138.0425
-[38,] 188.5297
-[39,] 170.7527
-[40,] 172.1024
-attr(,"scaled:center")
-[1] -0.08680858
-attr(,"scaled:scale")
-[1] 1.039345
 > m.y
 [1] 188.4
 >
-> tmp <- data.frame(y, y.hat, m.y)
+> tmp <- data.frame(x, y, y.hat)
-> tmp
+> head(tmp)
-          y    y.hat   m.y
+          x        y    y.hat
-  175.7367 186.9727 188.4
+  99.35817 175.7367 186.9727
-  200.0202 205.5720 188.4
+ 107.72169 200.0202 205.5720
-  241.0439 212.9643 188.4
+ 111.04574 241.0439 212.9643
-  166.0401 189.8671 188.4
+ 100.65968 166.0401 189.8671
-  195.5682 196.9697 188.4
+ 103.85350 195.5682 196.9697
-  210.8411 239.3554 188.4
+ 122.91296 210.8411 239.3554
-  181.9459 193.0240 188.4
+> plot(x,y) # 원래 데이터
-  193.8544 181.5673 188.4
+> plot(x,y.hat) # 리그레션라인 예측선
-  225.4417 223.8973 188.4
-182.5276 175.6106 188.4
-194.5656 182.3963 188.4
-232.4421 211.1666 188.4
-255.3070 221.1593 188.4
-209.0313 193.4777 188.4
-219.1602 217.0723 188.4
-181.7819 179.7820 188.4
-214.3124 188.9258 188.4
-187.2961 162.1265 188.4
-207.3581 176.7310 188.4
-235.0318 212.2383 188.4
-199.2244 201.3295 188.4
-208.1031 188.1615 188.4
-182.2575 197.6116 188.4
-201.4683 196.8985 188.4
-169.6124 183.6318 188.4
-173.7941 197.8900 188.4
-161.0332 167.9229 188.4
-152.1292 158.6935 188.4
-153.4874 168.5770 188.4
-147.7652 138.9668 188.4
-189.2685 225.1485 188.4
-188.5691 187.3333 188.4
-155.5620 155.8128 188.4
-172.8731 179.8349 188.4
-184.9119 189.5992 188.4
-183.3182 178.2857 188.4
-119.8266 138.0425 188.4
-162.6349 188.5297 188.4
-170.0872 170.7527 188.4
-150.7684 172.1024 188.4
-> plot(x,y) # 원래 데이터터
-> plot(x,y.hat) # 리그레션라인 예측선선
 > plot(x,tmp$m.y) # 평균 예측선
 >
-> error <- y-y.hat
+> error <- y - y.hat
-> explained <- y.hat-m.y
+> explained <- y.hat - m.y
-> error
+> tmp2 <- data.frame(tmp, explained, error)
-             [,1]
+> head(tmp2)
- [1,] -11.2359564
+          x        y    y.hat explained      error
- [2,]  -5.5518187
+  99.35817 175.7367 186.9727 -1.427352 -11.235956
- [3,]  28.0796455
+ 107.72169 200.0202 205.5720 17.172003  -5.551819
- [4,] -23.8269825
+ 111.04574 241.0439 212.9643 24.564266  28.079646
- [5,]  -1.4014742
+ 100.65968 166.0401 189.8671  1.467035 -23.826982
- [6,] -28.5142499
+103.85350 195.5682 196.9697  8.569671  -1.401474
- [7,] -11.0781515
+ 122.91296 210.8411 239.3554 50.955365 -28.514250
- [8,]  12.2870686
+>
- [9,]   1.5443968
+> head(y.hat == explained + m.y)
-[10,]   6.9169740
+     [,1]
-[11,]  12.1692981
+[1,] TRUE
-[12,]  21.2754922
+[2,] TRUE
-[13,]  34.1477152
+[3,] TRUE
-[14,]  15.5535765
+[4,] TRUE
-[15,]   2.0879605
+[5,] TRUE
-[16,]   1.9999551
+[6,] TRUE
-[17,]  25.3866091
+> head(y == explained + error + m.y)
-[18,]  25.1695629
+     [,1]
-[19,]  30.6270426
+[1,] TRUE
-[20,]  22.7934996
+[2,] TRUE
-[21,]  -2.1051048
+[3,] TRUE
-[22,]  19.9415577
+[4,] TRUE
-[23,] -15.3540820
+[5,] TRUE
-[24,]   4.5697607
+[6,] TRUE
-[25,] -14.0193869
+>
-[26,] -24.0959095
-[27,]  -6.8896271
-[28,]  -6.5642357
-[29,] -15.0896661
-[30,]   8.7983528
-[31,] -35.8800136
-[32,]   1.2357770
-[33,]  -0.2508120
-[34,]  -6.9618190
-[35,]  -4.6873617
-[36,]   5.0325839
-[37,] -18.2159090
-[38,] -25.8947992
-[39,]  -0.6654576
-[40,] -21.3340113
-attr(,"scaled:center")
-[1] -0.08680858
-attr(,"scaled:scale")
-[1] 1.039345
 > # note the below
 > plot(error)
 > hist(error)
+> plot(x,error)
 > cor(x,error)
              [,1]
 [1,] 7.981058e-17
 > ################
-> explained
+> plot(x,explained)
-             [,1]
- [1,]  -1.4273520
- [2,]  17.1720033
- [3,]  24.5642656
- [4,]   1.4670349
- [5,]   8.5696709
- [6,]  50.9553654
- [7,]   4.6240241
- [8,]  -6.8327027
- [9,]  35.4972458
-[10,] -12.7894406
-[11,]  -6.0037071
-[12,]  22.7665939
-[13,]  32.7592720
-[14,]   5.0776653
-[15,]  28.6722546
-[16,]  -8.6180356
-[17,]   0.5257494
-[18,] -26.2734989
-[19,] -11.6690128
-[20,]  23.8382597
-[21,]  12.9294844
-[22,]  -0.2385288
-[23,]   9.2115978
-[24,]   8.4984744
-[25,]  -4.7682316
-[26,]   9.4899547
-[27,] -20.4771504
-[28,] -29.7065732
-[29,] -19.8229799
-[30,] -49.4332197
-[31,]  36.7485236
-[32,]  -1.0667066
-[33,] -32.5871819
-[34,]  -8.5651393
-[35,]   1.1992231
-[36,] -10.1143632
-[37,] -50.3574953
-[38,]   0.1296534
-[39,] -17.6473510
-[40,] -16.2976456
-attr(,"scaled:center")
-[1] -0.08680858
-attr(,"scaled:scale")
-[1] 1.039345
 > cor(x,explained)
      [,1]
 [1,]    1
+> plot(x,(explained+m.y))
+> cor(x,(explained+m.y))
+     [,1]
+[1,]    1
+>
+> head(explained+error)
+           [,1]
+[1,] -12.663308
+[2,]  11.620185
+[3,]  52.643911
+[4,] -22.359948
+[5,]   7.168197
+[6,]  22.441115
+> plot(x,(explained+error))
+> head(explained+error+m.y == y)
+     [,1]
+[1,] TRUE
+[2,] TRUE
+[3,] TRUE
+[4,] TRUE
+[5,] TRUE
+[6,] TRUE
+> plot(x,(explained+error+m.y))
 >
+> cor(x,(explained+error))
+          [,1]
+[1,] 0.7841619
+> cor(x,(explained+error+m.y))
+          [,1]
+[1,] 0.7841619
+> cor(x,y)
+          [,1]
+[1,] 0.7841619
 > # see this also
-> tmp2 <- data.frame(x, explained, error, y)
 > round(cor(tmp2),3)
-              x explained error     y
+              x     y y.hat explained error
-x         1.000     1.000 0.000 0.784
+x         1.000 0.784 1.000     1.000 0.000
-explained 1.000     1.000 0.000 0.784
+y         0.784 1.000 0.784     0.784 0.621
-error     0.000     0.000 1.000 0.621
+y.hat     1.000 0.784 1.000     1.000 0.000
-y         0.784     0.784 0.621 1.000
+explained 1.000 0.784 1.000     1.000 0.000
+error     0.000 0.621 0.000     0.000 1.000
 > ###############
 >
+> # sum of square values
 > ss.res <- sum(error^2)
 > ss.reg <- sum(explained^2)
@@ Line 910: / Line 776: @@
 [1] 31366.84
 >
-> k <- 1 # number of IV
+> # degrees of freedom values
-> df.res <- n - k - 1
+> k <- 2 # number of parameters
-> df.reg <- 2 - 1 # number of parameter estimation a and b (slope)
+> df.reg <- k - 1 # number of parameter estimation a and b (slope)
+> df.res <- n - df.reg - 1
 > ms.res <- ss.res/df.res
 > ms.reg <- ss.reg/df.reg
@@ Line 930: / Line 797: @@
 [1] 317.87
 >
+> # r square
+> ss.y
+[1] 31366.84
+> ss.reg
+[1] 19287.78
+> r.sq <- ss.reg/ss.y
+> r.sq
+[1] 0.6149098
+> # 위의 r.sq 값이 충분히 컸는지를 알아보는 것은
+> # ss.reg가 충분히 컸는지를 알아보는 것
+> # 이는 IV, x가 y를 설명하는데 충분히 기여했는지
+> # 테스트하는 것. 이는 F-test를 이용해서 하게 된다
+> # F = MS.bet / MS.within
+> # regression의 경우는 F = MS.reg / MS.res
 > f.cal <- ms.reg/ms.res
 > f.cal
 [1] 60.67819
-> pf(f.cal, 1, 38, lower.tail = F)
+> pf(f.cal, df.reg, df.res, lower.tail = F)
 [1] 2.157284e-09
+> # check anova test
 > anova(lm.df)
 Analysis of Variance Table
@@ Line 966: / Line 848: @@
 F-statistic: 60.68 on 1 and 38 DF,  p-value: 2.157e-09
-> # r square
-> ss.y
-[1] 31366.84
-> ss.reg
-[1] 19287.78
-> r.sq <- ss.reg/ss.y
-> r.sq
-[1] 0.6149098
 >
 > # standard error for b coefficient
+> # b coeffficient 값은 estimation
+> # n.y 개의 데이터에서 구한 coefficient값 b는
+> # 실제 x, y 모집단을 대표하는 b값은 위의
+> # estimation에서 standard error값을 구간으로 갖는
+> # 범위일 것.
+>
 > res <- resid(lm.df)
+> data.frame(head(res))
+   head.res.
+-11.235956
+  -5.551819
+  28.079646
+-23.826982
+  -1.401474
+-28.514250
+> data.frame(head(error))
+  head.error.
+  -11.235956
+   -5.551819
+   28.079646
+  -23.826982
+   -1.401474
+  -28.514250
 > se.b <- sqrt(ms.res/ss.x)
 > se.b
 [1] 0.285491
-> b + c(-2*se.b, 2*se.b) # as x increased by 1 unit, y would increase b +- 2se.b
+> # as x increased by 1 unit, y would increase b +- 2se.b
+> b + c(-2*se.b, 2*se.b)
 [1] 1.652885 2.794849
+> b
+[1] 2.223867
+>
 > # to be exact
-> t.crit <- qt(.025, df.res, lower.tail = F)
+> t.crit <- qt(.975, df.res)
-> # note that t.crit is equivalent to 2 in z-test
+> t.crit
+[1] 2.024394
 > b + c(-t.crit*se.b, t.crit*se.b)
 [1] 1.645921 2.801813
-> # see http://commres.net/wiki/c/ms/2023/schedule/w11.lecture.note#%EA%B8%B0%EC%9A%B8%EA%B8%B0_b%EC%97%90_%EB%8C%80%ED%95%9C_%ED%86%B5%EA%B3%84%ED%95%99%EC%A0%81%EC%9D%B8_%ED%85%8C%EC%8A%A4%ED%8A%B8
+> b
->
+[1] 2.223867
 >
 > t.cal <- lm.df$coefficients[2]/se.b
 > t.cal
@@ Line 999: / Line 900: @@
            x
 .157284e-09
-> pf(f.cal, 1, 38, lower.tail = F)
+> pf(f.cal, df.reg, df.res, lower.tail = F)
 [1] 2.157284e-09
 >
 > # see also
+> t.cal
+       x
+.789621
 > t.cal^2
        x
@@ Line 1008: / Line 912: @@
 > f.cal
 [1] 60.67819
->
 >
 >