User Tools

Site Tools


c:ms:2025:schedule:w11.lecture.note

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
c:ms:2025:schedule:w11.lecture.note [2025/05/19 15:46] – [code output] hkimscilc:ms:2025:schedule:w11.lecture.note [2025/05/21 08:59] (current) – [code] hkimscil
Line 20: Line 20:
 ====== Correlation ====== ====== Correlation ======
 \begin{eqnarray} \begin{eqnarray}
-  \text{correlation, r} & = & \frac {\text{Cov (X,Y)}} {sd(X) * sd(Y)}+  \text{correlation, r} & = & \frac {\text{Cov (X,Y)}} {sd(X) * sd(Y)} \\ 
 +  & = & \frac {\text{SP (X,Y)} }{ \sqrt{\text{SS.x}*\text{SS.y}}} \nonumber
 \end{eqnarray} \end{eqnarray}
 +====== F-test (anova) ======
 +for reminding
 +\begin{eqnarray*}
 +  \text{F} & = & \frac {\text{MS}_\text{between group}} {\text{MS}_\text{within group}} \\
 +  & = & \frac {\text{Difference due to the treatment (IV) } }{ \text{Difference due to random error} } \\
 +  & = & \frac {\text{Difference explained by IV  } }{ \text{Difference remained random} } \\
 +  \text{R}^\text{2} & = & \frac {\text{SS}_\text{between}} {\text{SS}_\text{total} } \hfill \\
 +\end{eqnarray*}
  
  
 +
 +
 +====== Check in R ======
  
 <code> <code>
Line 42: Line 54:
 y <- (a*x) + b + re y <- (a*x) + b + re
 y y
 +
 +# covariance x and y
 cov(x,y) cov(x,y)
 sd(x) sd(x)
 sd(y) sd(y)
 +# correlation coefficient 
 cov(x,y)/(sd(x)*sd(y)) cov(x,y)/(sd(x)*sd(y))
 cor(x,y) cor(x,y)
Line 51: Line 66:
 df.x <- n.x-1 df.x <- n.x-1
 df.y <- df.x df.y <- df.x
-sp.xy <- cov(x,y) * df.x+sp.xy <- cov(x,y) * df.x  # 혹은 아래처럼 구할 수 있다 
 +sp.xy2 <- sum((x-m.x)*(y-m.y)) 
 +sp.xy == sp.xy2 
 ss.x <- var(x) * df.x ss.x <- var(x) * df.x
 ss.y <- var(y) * df.y ss.y <- var(y) * df.y
Line 58: Line 76:
  
 </code> </code>
 +====== Check in R: output ======
 <code> <code>
  
Line 218: Line 236:
 y <- (a*x) + b + re y <- (a*x) + b + re
 y y
 +
 m.y <- mean(y) m.y <- mean(y)
 df.y <- n.y - 1 df.y <- n.y - 1
Line 224: Line 243:
 var(y) var(y)
  
 +#
 prod <- (x-m.x)*(y-m.y) prod <- (x-m.x)*(y-m.y)
 sp <- sum(prod) sp <- sum(prod)
-sp/df.y +sp/df.y # covariance 값  
-cov(x,y)+cov(x,y) # covariance 펑션
  
-s.x <- sd(x) +s.x <- sd(x) # x변인에 대한 standard deviation 값 
-s.y <- sd(y) +s.y <- sd(y) # y 
-(sp/df.y)/(s.x*s.y) +(sp/df.y)/(s.x*s.y) # correlation 값 
-cor(x,y)+cor(x,y) # correlation 펑션
  
 df.y  df.y 
 df.x df.x
- 
-sqrt(ss.x/df.x) # s.x 값이 이것 
-sd(x) 
-sqrt(ss.y/df.y) # s.y 값이 이것 
-sd(y) 
  
 cov(x,y)/(s.x*s.y) cov(x,y)/(s.x*s.y)
-(sp/df.y)/sqrt((ss.x/df.x)*(ss.y/df.y)) +# covariance를 각각의 sd값을 곱한여 나눠 준 값이 r 
-sp/sqrt(ss.x*ss.y) # 분모 분자에서 n-1을 제거하고 보면+(sp/df.y)/sqrt((ss.x/df.x)*(ss.y/df.y))  
 +# 분모 분자에서 n-1을 제거하고 보면 
 +sp/sqrt(ss.x*ss.y) 
 cor(x,y) cor(x,y)
  
Line 259: Line 276:
 # note that the red line is y.hat = a + bx,  # note that the red line is y.hat = a + bx, 
 # where error value y.hat - y is minimized # where error value y.hat - y is minimized
-# how to derive a and b  +
-# how to derive coefficient values  +
-# see wiki doc at  +
-# http://commres.net/wiki/deriviation_of_a_and_b_in_a_simple_regression+
 lm.df <- lm(y~x, data = df) lm.df <- lm(y~x, data = df)
 summary(lm.df) summary(lm.df)
Line 269: Line 283:
 a a
 b b
-y = b*x 에서+how to derive and  
 +# how to derive coefficient values  
 +# see wiki doc at  
 +# http://commres.net/wiki/deriviation_of_a_and_b_in_a_simple_regression
 b2 <- sp/ss.x b2 <- sp/ss.x
-b3 <- cov(x,y)/var(x,x)+b3 <- cov(x,y)/var(x,x) # b2 == b3
 # m.y = a + (b*m.x) 이므로 # m.y = a + (b*m.x) 이므로
 a2 <- m.y - (b2*m.x) a2 <- m.y - (b2*m.x)
Line 285: Line 302:
 # reg, res, total error  # reg, res, total error 
  
-y.hat <- (a+(b*x))+y.hat <- a+(b*x)
 y.hat2 <- predict(lm.df) y.hat2 <- predict(lm.df)
-y.hat == y.hat2  +head(y) 
-y +head(y.hat
-y.hat+
 m.y m.y
  
-tmp <- data.frame(y, y.hat, m.y+tmp <- data.frame(x, y, y.hat) 
-tmp +head(tmp) 
-plot(x,y) # 원래 데이터 +plot(x,y) # 원래 데이터 
-plot(x,y.hat) # 리그레션라인 예측+plot(x,y.hat) # 리그레션라인 예측선
 plot(x,tmp$m.y) # 평균 예측선 plot(x,tmp$m.y) # 평균 예측선
  
-error <- y-y.hat +error <- y - y.hat 
-explained <- y.hat-m.y +explained <- y.hat - m.y 
-error+tmp2 <- data.frame(tmp, explained, error
 +head(tmp2) 
 + 
 +head(y.hat == explained + m.y) 
 +head(y == explained + error + m.y) 
 # note the below # note the below
 plot(error) plot(error)
 hist(error) hist(error)
 +plot(x,error)
 cor(x,error) cor(x,error)
 ################ ################
-explained +plot(x,explained)
 cor(x,explained) cor(x,explained)
 +plot(x,(explained+m.y))
 +cor(x,(explained+m.y))
  
 +head(explained+error)
 +plot(x,(explained+error))
 +head(explained+error+m.y == y)
 +plot(x,(explained+error+m.y))
 +
 +cor(x,(explained+error))
 +cor(x,(explained+error+m.y))
 +cor(x,y)
 # see this also # see this also
-tmp2 <- data.frame(x, explained, error, y) 
 round(cor(tmp2),3) round(cor(tmp2),3)
 ############### ###############
  
 +# sum of square values 
 ss.res <- sum(error^2) ss.res <- sum(error^2)
 ss.reg <- sum(explained^2) ss.reg <- sum(explained^2)
Line 321: Line 353:
 ss.res + ss.reg ss.res + ss.reg
  
-k <- # number of IV +# degrees of freedom values 
-df.res <- n - k - 1 +k <- # number of parameters 
-df.reg <- 2 - 1 # number of parameter estimation a and b (slope)+df.reg <- k - 1 # number of parameter estimation a and b (slope) 
 +df.res <- n - df.reg - 1
 ms.res <- ss.res/df.res ms.res <- ss.res/df.res
 ms.reg <- ss.reg/df.reg ms.reg <- ss.reg/df.reg
Line 335: Line 368:
 ms.res ms.res
  
-f.cal <- ms.reg/ms.res 
-f.cal 
-pf(f.cal, 1, 38, lower.tail = F) 
-anova(lm.df) 
-summary(lm.df) 
 # r square  # r square 
 ss.y ss.y
Line 345: Line 373:
 r.sq <- ss.reg/ss.y r.sq <- ss.reg/ss.y
 r.sq r.sq
 +# 위의 r.sq 값이 충분히 컸는지를 알아보는 것은
 +# ss.reg가 충분히 컸는지를 알아보는 것
 +# 이는 IV, x가 y를 설명하는데 충분히 기여했는지
 +# 테스트하는 것. 이는 F-test를 이용해서 하게 된다
 +# F = MS.bet / MS.within
 +# regression의 경우는 F = MS.reg / MS.res
 +f.cal <- ms.reg/ms.res
 +f.cal
 +pf(f.cal, df.reg, df.res, lower.tail = F)
 +# check anova test 
 +anova(lm.df)
 +summary(lm.df)
  
 # standard error for b coefficient # standard error for b coefficient
 +# b coeffficient 값은 estimation 
 +# n.y 개의 데이터에서 구한 coefficient값 b는 
 +# 실제 x, y 모집단을 대표하는 b값은 위의
 +# estimation에서 standard error값을 구간으로 갖는
 +# 범위일 것. 
 +
 res <- resid(lm.df) res <- resid(lm.df)
 +data.frame(head(res))
 +data.frame(head(error))
 se.b <- sqrt(ms.res/ss.x) se.b <- sqrt(ms.res/ss.x)
 se.b se.b
-b + c(-2*se.b, 2*se.b) # as x increased by 1 unit, y would increase b +- 2se.b +# as x increased by 1 unit, y would increase b +- 2se.b 
-# to be exact, we use t.critical value instead of 2 +b + c(-2*se.b, 2*se.b)  
-t.crit <- qt(.025, df.res, lower.tail = F) +b
-b + c(-t.crit*se.b, t.crit*se.b) +
  
 +# to be exact 
 +t.crit <- qt(.975, df.res)
 +t.crit
 +b + c(-t.crit*se.b, t.crit*se.b)
 +b
  
 t.cal <- lm.df$coefficients[2]/se.b t.cal <- lm.df$coefficients[2]/se.b
Line 361: Line 413:
  
 pt(t.cal, df.res, lower.tail = F)*2 pt(t.cal, df.res, lower.tail = F)*2
-pf(f.cal, 138, lower.tail = F)+pf(f.cal, df.regdf.res, lower.tail = F)
  
 # see also # see also
 +t.cal
 t.cal^2 t.cal^2
 f.cal f.cal
Line 482: Line 535:
 attr(,"scaled:scale") attr(,"scaled:scale")
 [1] 1.039345 [1] 1.039345
 +
 > m.y <- mean(y) > m.y <- mean(y)
 > df.y <- n.y - 1 > df.y <- n.y - 1
Line 491: Line 545:
 [1,] 804.2779 [1,] 804.2779
  
 +> #
 > prod <- (x-m.x)*(y-m.y) > prod <- (x-m.x)*(y-m.y)
 > sp <- sum(prod) > sp <- sum(prod)
-> sp/df.y+> sp/df.y # covariance 값 
 [1] 222.3867 [1] 222.3867
-> cov(x,y)+> cov(x,y) # covariance 펑션
          [,1]          [,1]
 [1,] 222.3867 [1,] 222.3867
  
-> s.x <- sd(x) +> s.x <- sd(x) # x변인에 대한 standard deviation 값 
-> s.y <- sd(y) +> s.y <- sd(y) # y 
-> (sp/df.y)/(s.x*s.y)+> (sp/df.y)/(s.x*s.y) # correlation 값
 [1] 0.7841619 [1] 0.7841619
-> cor(x,y)+> cor(x,y) # correlation 펑션
           [,1]           [,1]
 [1,] 0.7841619 [1,] 0.7841619
Line 511: Line 566:
 > df.x > df.x
 [1] 39 [1] 39
- 
-> sqrt(ss.x/df.x) # s.x 값이 이것 
-[1] 10 
-> sd(x) 
-[1] 10 
-> sqrt(ss.y/df.y) # s.y 값이 이것 
-[1] 28.35979 
-> sd(y) 
-[1] 28.35979 
  
 > cov(x,y)/(s.x*s.y) > cov(x,y)/(s.x*s.y)
           [,1]           [,1]
 [1,] 0.7841619 [1,] 0.7841619
-> (sp/df.y)/sqrt((ss.x/df.x)*(ss.y/df.y))+> # covariance를 각각의 sd값을 곱한여 나눠 준 값이 r 
 +> (sp/df.y)/sqrt((ss.x/df.x)*(ss.y/df.y)) 
 [1] 0.7841619 [1] 0.7841619
-sp/sqrt(ss.x*ss.y) # 분모 분자에서 n-1을 제거하고 보면+> # 분모 분자에서 n-1을 제거하고 보면 
 +> sp/sqrt(ss.x*ss.y) 
 [1] 0.7841619 [1] 0.7841619
 > cor(x,y) > cor(x,y)
Line 544: Line 592:
 > # note that the red line is y.hat = a + bx,  > # note that the red line is y.hat = a + bx, 
 > # where error value y.hat - y is minimized > # where error value y.hat - y is minimized
-# how to derive a and b  +
-> # how to derive coefficient values  +
-> # see wiki doc at  +
-> # http://commres.net/wiki/deriviation_of_a_and_b_in_a_simple_regression+
 > lm.df <- lm(y~x, data = df) > lm.df <- lm(y~x, data = df)
 > summary(lm.df) > summary(lm.df)
Line 576: Line 621:
 > b > b
 [1] 2.223867 [1] 2.223867
-> # y = b*x 에서+> # how to derive and  
 +> # how to derive coefficient values  
 +> # see wiki doc at  
 +> # http://commres.net/wiki/deriviation_of_a_and_b_in_a_simple_regression
 > b2 <- sp/ss.x > b2 <- sp/ss.x
-> b3 <- cov(x,y)/var(x,x)+> b3 <- cov(x,y)/var(x,x) # b2 == b3
 > # m.y = a + (b*m.x) 이므로 > # m.y = a + (b*m.x) 이므로
 > a2 <- m.y - (b2*m.x) > a2 <- m.y - (b2*m.x)
Line 598: Line 646:
 > # reg, res, total error  > # reg, res, total error 
 > #  > # 
-> y.hat <- (a+(b*x))+> y.hat <- a+(b*x)
 > y.hat2 <- predict(lm.df) > y.hat2 <- predict(lm.df)
-> y.hat == y.hat2  +head(y) 
-      [,1] +         [,1] 
- [1,] TRUE +[1,] 175.7367 
- [2,] TRUE +[2,] 200.0202 
- [3,] TRUE +[3,] 241.0439 
- [4,] TRUE +[4,] 166.0401 
- [5,] TRUE +[5,] 195.5682 
- [6,] TRUE +[6,] 210.8411 
- [7,] TRUE +> head(y.hat 
- [8,] TRUE +         [,1] 
- [9,] TRUE +[1,] 186.9727 
-[10,] TRUE +[2,] 205.5720 
-[11,] TRUE +[3,] 212.9643 
-[12,] TRUE +[4,] 189.8671 
-[13,] TRUE +[5,] 196.9697 
-[14,] TRUE +[6,] 239.3554
-[15,] TRUE +
-[16,] TRUE +
-[17,] TRUE +
-[18,] TRUE +
-[19,] TRUE +
-[20,] TRUE +
-[21,] TRUE +
-[22,] TRUE +
-[23,] TRUE +
-[24,] TRUE +
-[25,] TRUE +
-[26,] TRUE +
-[27,] TRUE +
-[28,] TRUE +
-[29,] TRUE +
-[30,] TRUE +
-[31,] TRUE +
-[32,] TRUE +
-[33,] TRUE +
-[34,] TRUE +
-[35,] TRUE +
-[36,] TRUE +
-[37,] TRUE +
-[38,] TRUE +
-[39,] TRUE +
-[40,] TRUE +
-> y +
-          [,1] +
- [1,] 175.7367 +
- [2,] 200.0202 +
- [3,] 241.0439 +
- [4,] 166.0401 +
- [5,] 195.5682 +
- [6,] 210.8411 +
- [7,] 181.9459 +
- [8,] 193.8544 +
- [9,] 225.4417 +
-[10,] 182.5276 +
-[11,] 194.5656 +
-[12,] 232.4421 +
-[13,] 255.3070 +
-[14,] 209.0313 +
-[15,] 219.1602 +
-[16,] 181.7819 +
-[17,] 214.3124 +
-[18,] 187.2961 +
-[19,] 207.3581 +
-[20,] 235.0318 +
-[21,] 199.2244 +
-[22,] 208.1031 +
-[23,] 182.2575 +
-[24,] 201.4683 +
-[25,] 169.6124 +
-[26,] 173.7941 +
-[27,] 161.0332 +
-[28,] 152.1292 +
-[29,] 153.4874 +
-[30,] 147.7652 +
-[31,] 189.2685 +
-[32,] 188.5691 +
-[33,] 155.5620 +
-[34,] 172.8731 +
-[35,] 184.9119 +
-[36,] 183.3182 +
-[37,] 119.8266 +
-[38,] 162.6349 +
-[39,] 170.0872 +
-[40,] 150.7684 +
-attr(,"scaled:center"+
-[1] -0.08680858 +
-attr(,"scaled:scale"+
-[1] 1.039345 +
-y.hat +
-          [,1] +
- [1,] 186.9727 +
- [2,] 205.5720 +
- [3,] 212.9643 +
- [4,] 189.8671 +
- [5,] 196.9697 +
- [6,] 239.3554 +
- [7,] 193.0240 +
- [8,] 181.5673 +
- [9,] 223.8973 +
-[10,] 175.6106 +
-[11,] 182.3963 +
-[12,] 211.1666 +
-[13,] 221.1593 +
-[14,] 193.4777 +
-[15,] 217.0723 +
-[16,] 179.7820 +
-[17,] 188.9258 +
-[18,] 162.1265 +
-[19,] 176.7310 +
-[20,] 212.2383 +
-[21,] 201.3295 +
-[22,] 188.1615 +
-[23,] 197.6116 +
-[24,] 196.8985 +
-[25,] 183.6318 +
-[26,] 197.8900 +
-[27,] 167.9229 +
-[28,] 158.6935 +
-[29,] 168.5770 +
-[30,] 138.9668 +
-[31,] 225.1485 +
-[32,] 187.3333 +
-[33,] 155.8128 +
-[34,] 179.8349 +
-[35,] 189.5992 +
-[36,] 178.2857 +
-[37,] 138.0425 +
-[38,] 188.5297 +
-[39,] 170.7527 +
-[40,] 172.1024 +
-attr(,"scaled:center"+
-[1] -0.08680858 +
-attr(,"scaled:scale"+
-[1] 1.039345+
 > m.y > m.y
 [1] 188.4 [1] 188.4
  
-> tmp <- data.frame(y, y.hat, m.y+> tmp <- data.frame(x, y, y.hat) 
-> tmp +head(tmp) 
-          y    y.hat   m.y +          x        y    y.hat 
-1  175.7367 186.9727 188.4 +1  99.35817 175.7367 186.9727 
- 200.0202 205.5720 188.4 +107.72169 200.0202 205.5720 
- 241.0439 212.9643 188.4 +111.04574 241.0439 212.9643 
- 166.0401 189.8671 188.4 +100.65968 166.0401 189.8671 
- 195.5682 196.9697 188.4 +103.85350 195.5682 196.9697 
- 210.8411 239.3554 188.4 +122.91296 210.8411 239.3554 
-7  181.9459 193.0240 188.4 +> plot(x,y) # 원래 데이터 
-8  193.8544 181.5673 188.4 +> plot(x,y.hat) # 리그레션라인 예측선
-9  225.4417 223.8973 188.4 +
-10 182.5276 175.6106 188.4 +
-11 194.5656 182.3963 188.4 +
-12 232.4421 211.1666 188.4 +
-13 255.3070 221.1593 188.4 +
-14 209.0313 193.4777 188.4 +
-15 219.1602 217.0723 188.4 +
-16 181.7819 179.7820 188.4 +
-17 214.3124 188.9258 188.4 +
-18 187.2961 162.1265 188.4 +
-19 207.3581 176.7310 188.4 +
-20 235.0318 212.2383 188.4 +
-21 199.2244 201.3295 188.4 +
-22 208.1031 188.1615 188.4 +
-23 182.2575 197.6116 188.4 +
-24 201.4683 196.8985 188.4 +
-25 169.6124 183.6318 188.4 +
-26 173.7941 197.8900 188.4 +
-27 161.0332 167.9229 188.4 +
-28 152.1292 158.6935 188.4 +
-29 153.4874 168.5770 188.4 +
-30 147.7652 138.9668 188.4 +
-31 189.2685 225.1485 188.4 +
-32 188.5691 187.3333 188.4 +
-33 155.5620 155.8128 188.4 +
-34 172.8731 179.8349 188.4 +
-35 184.9119 189.5992 188.4 +
-36 183.3182 178.2857 188.4 +
-37 119.8266 138.0425 188.4 +
-38 162.6349 188.5297 188.4 +
-39 170.0872 170.7527 188.4 +
-40 150.7684 172.1024 188.4 +
-> plot(x,y) # 원래 데이터 +
-> plot(x,y.hat) # 리그레션라인 예측+
 > plot(x,tmp$m.y) # 평균 예측선 > plot(x,tmp$m.y) # 평균 예측선
  
-> error <- y-y.hat +> error <- y - y.hat 
-> explained <- y.hat-m.y +> explained <- y.hat - m.y 
-> error +tmp2 <- data.frame(tmp, explained, error) 
-             [,1] +> head(tmp2) 
- [1,] -11.2359564 +          x        y    y.hat explained      error 
- [2,]  -5.5518187 + 99.35817 175.7367 186.9727 -1.427352 -11.235956 
- [3,]  28.0796455 +107.72169 200.0202 205.5720 17.172003  -5.551819 
- [4,] -23.8269825 +111.04574 241.0439 212.9643 24.564266  28.079646 
- [5, -1.4014742 +100.65968 166.0401 189.8671  1.467035 -23.826982 
- [6,-28.5142499 +5 103.85350 195.5682 196.9697  8.569671  -1.401474 
- [7,] -11.0781515 +122.91296 210.8411 239.3554 50.955365 -28.514250 
- [8,]  12.2870686 + 
- [9,  1.5443968 +> head(y.hat == explained + m.y) 
-[10,]   6.9169740 +     [,1
-[11,]  12.1692981 +[1,] TRUE 
-[12,]  21.2754922 +[2,] TRUE 
-[13,]  34.1477152 +[3,] TRUE 
-[14,]  15.5535765 +[4,] TRUE 
-[15,]   2.0879605 +[5,] TRUE 
-[16,]   1.9999551 +[6,] TRUE 
-[17,]  25.3866091 +> head(y == explained + error + m.y) 
-[18,]  25.1695629 +     [,1
-[19,]  30.6270426 +[1,] TRUE 
-[20,]  22.7934996 +[2,] TRUE 
-[21,]  -2.1051048 +[3,] TRUE 
-[22,]  19.9415577 +[4,] TRUE 
-[23,] -15.3540820 +[5,] TRUE 
-[24,]   4.5697607 +[6,] TRUE 
-[25,] -14.0193869 +
-[26,] -24.0959095 +
-[27,]  -6.8896271 +
-[28,]  -6.5642357 +
-[29,] -15.0896661 +
-[30,]   8.7983528 +
-[31,] -35.8800136 +
-[32,]   1.2357770 +
-[33,]  -0.2508120 +
-[34,]  -6.9618190 +
-[35,]  -4.6873617 +
-[36,]   5.0325839 +
-[37,] -18.2159090 +
-[38,] -25.8947992 +
-[39,]  -0.6654576 +
-[40,] -21.3340113 +
-attr(,"scaled:center"+
-[1] -0.08680858 +
-attr(,"scaled:scale"+
-[1] 1.039345+
 > # note the below > # note the below
 > plot(error) > plot(error)
 > hist(error) > hist(error)
 +> plot(x,error)
 > cor(x,error) > cor(x,error)
              [,1]              [,1]
 [1,] 7.981058e-17 [1,] 7.981058e-17
 > ################ > ################
-explained  +plot(x,explained)
-             [,1] +
- [1, -1.4273520 +
- [2, 17.1720033 +
- [3, 24.5642656 +
- [4,  1.4670349 +
- [5,  8.5696709 +
- [6, 50.9553654 +
- [7,  4.6240241 +
- [8, -6.8327027 +
- [9, 35.4972458 +
-[10,] -12.7894406 +
-[11,]  -6.0037071 +
-[12,]  22.7665939 +
-[13,]  32.7592720 +
-[14,]   5.0776653 +
-[15,]  28.6722546 +
-[16,]  -8.6180356 +
-[17,]   0.5257494 +
-[18,] -26.2734989 +
-[19,] -11.6690128 +
-[20,]  23.8382597 +
-[21,]  12.9294844 +
-[22,]  -0.2385288 +
-[23,]   9.2115978 +
-[24,]   8.4984744 +
-[25,]  -4.7682316 +
-[26,]   9.4899547 +
-[27,] -20.4771504 +
-[28,] -29.7065732 +
-[29,] -19.8229799 +
-[30,] -49.4332197 +
-[31,]  36.7485236 +
-[32,]  -1.0667066 +
-[33,] -32.5871819 +
-[34,]  -8.5651393 +
-[35,]   1.1992231 +
-[36,] -10.1143632 +
-[37,] -50.3574953 +
-[38,]   0.1296534 +
-[39,] -17.6473510 +
-[40,] -16.2976456 +
-attr(,"scaled:center"+
-[1] -0.08680858 +
-attr(,"scaled:scale") +
-[1] 1.039345+
 > cor(x,explained) > cor(x,explained)
      [,1]      [,1]
 [1,]    1 [1,]    1
 +> plot(x,(explained+m.y))
 +> cor(x,(explained+m.y))
 +     [,1]
 +[1,]    1
 +
 +> head(explained+error)
 +           [,1]
 +[1,] -12.663308
 +[2,]  11.620185
 +[3,]  52.643911
 +[4,] -22.359948
 +[5,]   7.168197
 +[6,]  22.441115
 +> plot(x,(explained+error))
 +> head(explained+error+m.y == y)
 +     [,1]
 +[1,] TRUE
 +[2,] TRUE
 +[3,] TRUE
 +[4,] TRUE
 +[5,] TRUE
 +[6,] TRUE
 +> plot(x,(explained+error+m.y))
  
 +> cor(x,(explained+error))
 +          [,1]
 +[1,] 0.7841619
 +> cor(x,(explained+error+m.y))
 +          [,1]
 +[1,] 0.7841619
 +> cor(x,y)
 +          [,1]
 +[1,] 0.7841619
 > # see this also > # see this also
-> tmp2 <- data.frame(x, explained, error, y) 
 > round(cor(tmp2),3) > round(cor(tmp2),3)
-              x explained error     y +              x     y.hat explained error 
-x         1.000     1.000 0.000 0.784 +x         1.000 0.784 1.000     1.000 0.000 
-explained 1.000     1.000 0.000 0.784 +y         0.784 1.000 0.784     0.784 0.621 
-error     0.000     0.000 1.000 0.621 +y.hat     1.000 0.784 1.000     1.000 0.000 
-y         0.784     0.784 0.621 1.000+explained 1.000 0.784 1.000     1.000 0.000 
 +error     0.000 0.621 0.000     0.000 1.000
 > ############### > ###############
  
 +> # sum of square values 
 > ss.res <- sum(error^2) > ss.res <- sum(error^2)
 > ss.reg <- sum(explained^2) > ss.reg <- sum(explained^2)
Line 910: Line 776:
 [1] 31366.84 [1] 31366.84
  
-> k <- # number of IV +> # degrees of freedom values 
-> df.res <- n - k - 1 +> k <- # number of parameters 
-> df.reg <- 2 - 1 # number of parameter estimation a and b (slope)+> df.reg <- k - 1 # number of parameter estimation a and b (slope) 
 +> df.res <- n - df.reg - 1
 > ms.res <- ss.res/df.res > ms.res <- ss.res/df.res
 > ms.reg <- ss.reg/df.reg > ms.reg <- ss.reg/df.reg
Line 930: Line 797:
 [1] 317.87 [1] 317.87
  
 +> # r square 
 +> ss.y
 +[1] 31366.84
 +> ss.reg
 +[1] 19287.78
 +> r.sq <- ss.reg/ss.y
 +> r.sq
 +[1] 0.6149098
 +> # 위의 r.sq 값이 충분히 컸는지를 알아보는 것은
 +> # ss.reg가 충분히 컸는지를 알아보는 것
 +> # 이는 IV, x가 y를 설명하는데 충분히 기여했는지
 +> # 테스트하는 것. 이는 F-test를 이용해서 하게 된다
 +> # F = MS.bet / MS.within
 +> # regression의 경우는 F = MS.reg / MS.res
 > f.cal <- ms.reg/ms.res > f.cal <- ms.reg/ms.res
 > f.cal > f.cal
 [1] 60.67819 [1] 60.67819
-> pf(f.cal, 138, lower.tail = F)+> pf(f.cal, df.regdf.res, lower.tail = F)
 [1] 2.157284e-09 [1] 2.157284e-09
 +> # check anova test 
 > anova(lm.df) > anova(lm.df)
 Analysis of Variance Table Analysis of Variance Table
Line 966: Line 848:
 F-statistic: 60.68 on 1 and 38 DF,  p-value: 2.157e-09 F-statistic: 60.68 on 1 and 38 DF,  p-value: 2.157e-09
  
-> # r square  
-> ss.y 
-[1] 31366.84 
-> ss.reg 
-[1] 19287.78 
-> r.sq <- ss.reg/ss.y 
-> r.sq 
-[1] 0.6149098 
  
 > # standard error for b coefficient > # standard error for b coefficient
 +> # b coeffficient 값은 estimation 
 +> # n.y 개의 데이터에서 구한 coefficient값 b는 
 +> # 실제 x, y 모집단을 대표하는 b값은 위의
 +> # estimation에서 standard error값을 구간으로 갖는
 +> # 범위일 것. 
 +
 > res <- resid(lm.df) > res <- resid(lm.df)
 +> data.frame(head(res))
 +   head.res.
 +1 -11.235956
 +2  -5.551819
 +3  28.079646
 +4 -23.826982
 +5  -1.401474
 +6 -28.514250
 +> data.frame(head(error))
 +  head.error.
 +1  -11.235956
 +2   -5.551819
 +3   28.079646
 +4  -23.826982
 +5   -1.401474
 +6  -28.514250
 > se.b <- sqrt(ms.res/ss.x) > se.b <- sqrt(ms.res/ss.x)
 > se.b > se.b
 [1] 0.285491 [1] 0.285491
-b + c(-2*se.b, 2*se.b) # as x increased by 1 unit, y would increase b +- 2se.b+> # as x increased by 1 unit, y would increase b +- 2se.b 
 +> b + c(-2*se.b, 2*se.b) 
 [1] 1.652885 2.794849 [1] 1.652885 2.794849
 +> b
 +[1] 2.223867
 +
 > # to be exact  > # to be exact 
-> t.crit <- qt(.025, df.res, lower.tail = F+> t.crit <- qt(.975, df.res) 
-# note that t.crit is equivalent to in z-test+> t.crit 
 +[1] 2.024394
 > b + c(-t.crit*se.b, t.crit*se.b) > b + c(-t.crit*se.b, t.crit*se.b)
 [1] 1.645921 2.801813 [1] 1.645921 2.801813
-# see http://commres.net/wiki/c/ms/2023/schedule/w11.lecture.note#%EA%B8%B0%EC%9A%B8%EA%B8%B0_b%EC%97%90_%EB%8C%80%ED%95%9C_%ED%86%B5%EA%B3%84%ED%95%99%EC%A0%81%EC%9D%B8_%ED%85%8C%EC%8A%A4%ED%8A%B8 +
-> +[1] 2.223867 
->+
 > t.cal <- lm.df$coefficients[2]/se.b > t.cal <- lm.df$coefficients[2]/se.b
 > t.cal > t.cal
Line 999: Line 900:
                        
 2.157284e-09  2.157284e-09 
-> pf(f.cal, 138, lower.tail = F)+> pf(f.cal, df.regdf.res, lower.tail = F)
 [1] 2.157284e-09 [1] 2.157284e-09
  
 > # see also > # see also
 +> t.cal
 +       
 +7.789621 
 > t.cal^2 > t.cal^2
                
Line 1008: Line 912:
 > f.cal > f.cal
 [1] 60.67819 [1] 60.67819
- 
 > >
 > >
c/ms/2025/schedule/w11.lecture.note.1747637209.txt.gz · Last modified: 2025/05/19 15:46 by hkimscil

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki