c:ms:2025:schedule:w11.lecture.note
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| c:ms:2025:schedule:w11.lecture.note [2025/05/20 12:48] – [F-test (anova)] hkimscil | c:ms:2025:schedule:w11.lecture.note [2025/05/20 23:59] (current) – [code] hkimscil | ||
|---|---|---|---|
| Line 54: | Line 54: | ||
| y <- (a*x) + b + re | y <- (a*x) + b + re | ||
| y | y | ||
| + | |||
| + | # covariance x and y | ||
| cov(x,y) | cov(x,y) | ||
| sd(x) | sd(x) | ||
| sd(y) | sd(y) | ||
| + | # correlation coefficient | ||
| cov(x, | cov(x, | ||
| cor(x,y) | cor(x,y) | ||
| Line 63: | Line 66: | ||
| df.x <- n.x-1 | df.x <- n.x-1 | ||
| df.y <- df.x | df.y <- df.x | ||
| - | sp.xy <- cov(x,y) * df.x | + | sp.xy <- cov(x,y) * df.x # 혹은 아래처럼 구할 수 있다 |
| + | sp.xy2 <- sum((x-m.x)*(y-m.y)) | ||
| + | sp.xy == sp.xy2 | ||
| ss.x <- var(x) * df.x | ss.x <- var(x) * df.x | ||
| ss.y <- var(y) * df.y | ss.y <- var(y) * df.y | ||
| Line 70: | Line 76: | ||
| </ | </ | ||
| + | ====== Check in R: output ====== | ||
| < | < | ||
| > | > | ||
| Line 230: | Line 236: | ||
| y <- (a*x) + b + re | y <- (a*x) + b + re | ||
| y | y | ||
| + | |||
| m.y <- mean(y) | m.y <- mean(y) | ||
| df.y <- n.y - 1 | df.y <- n.y - 1 | ||
| Line 236: | Line 243: | ||
| var(y) | var(y) | ||
| + | # | ||
| prod <- (x-m.x)*(y-m.y) | prod <- (x-m.x)*(y-m.y) | ||
| sp <- sum(prod) | sp <- sum(prod) | ||
| - | sp/df.y | + | sp/ |
| - | cov(x,y) | + | cov(x, |
| - | s.x <- sd(x) | + | s.x <- sd(x) # x변인에 대한 standard deviation 값 |
| - | s.y <- sd(y) | + | s.y <- sd(y) # y |
| - | (sp/ | + | (sp/ |
| - | cor(x,y) | + | cor(x, |
| df.y | df.y | ||
| df.x | df.x | ||
| - | |||
| - | sqrt(ss.x/ | ||
| - | sd(x) | ||
| - | sqrt(ss.y/ | ||
| - | sd(y) | ||
| cov(x, | cov(x, | ||
| - | (sp/ | + | # covariance를 각각의 sd값을 곱한여 나눠 준 값이 r |
| - | sp/ | + | (sp/ |
| + | # 분모 분자에서 n-1을 제거하고 보면 | ||
| + | sp/ | ||
| cor(x,y) | cor(x,y) | ||
| Line 271: | Line 276: | ||
| # note that the red line is y.hat = a + bx, | # note that the red line is y.hat = a + bx, | ||
| # where error value y.hat - y is minimized | # where error value y.hat - y is minimized | ||
| - | # how to derive a and b | + | |
| - | # how to derive coefficient values | + | |
| - | # see wiki doc at | + | |
| - | # http:// | + | |
| lm.df <- lm(y~x, data = df) | lm.df <- lm(y~x, data = df) | ||
| summary(lm.df) | summary(lm.df) | ||
| Line 281: | Line 283: | ||
| a | a | ||
| b | b | ||
| - | # y = a + b*x 에서 | + | # how to derive |
| + | # how to derive coefficient values | ||
| + | # see wiki doc at | ||
| + | # http:// | ||
| b2 <- sp/ss.x | b2 <- sp/ss.x | ||
| - | b3 <- cov(x, | + | b3 <- cov(x, |
| # m.y = a + (b*m.x) 이므로 | # m.y = a + (b*m.x) 이므로 | ||
| a2 <- m.y - (b2*m.x) | a2 <- m.y - (b2*m.x) | ||
| Line 297: | Line 302: | ||
| # reg, res, total error | # reg, res, total error | ||
| # | # | ||
| - | y.hat <- (a+(b*x)) | + | y.hat <- a+(b*x) |
| y.hat2 <- predict(lm.df) | y.hat2 <- predict(lm.df) | ||
| - | y.hat == y.hat2 | + | head(y) |
| - | y | + | head(y.hat) |
| - | y.hat | + | |
| m.y | m.y | ||
| - | tmp <- data.frame(y, | + | tmp <- data.frame(x, y, y.hat) |
| - | tmp | + | head(tmp) |
| - | plot(x,y) # 원래 데이터터 | + | plot(x,y) # 원래 데이터 |
| - | plot(x, | + | plot(x, |
| plot(x, | plot(x, | ||
| - | error <- y-y.hat | + | error <- y - y.hat |
| - | explained <- y.hat-m.y | + | explained <- y.hat - m.y |
| - | error | + | tmp2 <- data.frame(tmp, |
| + | head(tmp2) | ||
| + | |||
| + | head(y.hat == explained + m.y) | ||
| + | head(y == explained + error + m.y) | ||
| # note the below | # note the below | ||
| plot(error) | plot(error) | ||
| hist(error) | hist(error) | ||
| + | plot(x, | ||
| cor(x, | cor(x, | ||
| ################ | ################ | ||
| - | explained | + | plot(x,explained) |
| cor(x, | cor(x, | ||
| + | plot(x, | ||
| + | cor(x, | ||
| + | head(explained+error) | ||
| + | plot(x, | ||
| + | head(explained+error+m.y == y) | ||
| + | plot(x, | ||
| + | |||
| + | cor(x, | ||
| + | cor(x, | ||
| + | cor(x,y) | ||
| # see this also | # see this also | ||
| - | tmp2 <- data.frame(x, | ||
| round(cor(tmp2), | round(cor(tmp2), | ||
| ############### | ############### | ||
| + | # sum of square values | ||
| ss.res <- sum(error^2) | ss.res <- sum(error^2) | ||
| ss.reg <- sum(explained^2) | ss.reg <- sum(explained^2) | ||
| Line 333: | Line 353: | ||
| ss.res + ss.reg | ss.res + ss.reg | ||
| - | k <- 1 # number of IV | + | # degrees of freedom values |
| - | df.res <- n - k - 1 | + | k <- 2 # number of parameters |
| - | df.reg <- 2 - 1 # number of parameter estimation a and b (slope) | + | df.reg <- k - 1 # number of parameter estimation a and b (slope) |
| + | df.res <- n - df.reg - 1 | ||
| ms.res <- ss.res/ | ms.res <- ss.res/ | ||
| ms.reg <- ss.reg/ | ms.reg <- ss.reg/ | ||
| Line 347: | Line 368: | ||
| ms.res | ms.res | ||
| - | f.cal <- ms.reg/ | ||
| - | f.cal | ||
| - | pf(f.cal, 1, 38, lower.tail = F) | ||
| - | anova(lm.df) | ||
| - | summary(lm.df) | ||
| # r square | # r square | ||
| ss.y | ss.y | ||
| Line 357: | Line 373: | ||
| r.sq <- ss.reg/ss.y | r.sq <- ss.reg/ss.y | ||
| r.sq | r.sq | ||
| + | # 위의 r.sq 값이 충분히 컸는지를 알아보는 것은 | ||
| + | # ss.reg가 충분히 컸는지를 알아보는 것 | ||
| + | # 이는 IV, x가 y를 설명하는데 충분히 기여했는지 | ||
| + | # 테스트하는 것. 이는 F-test를 이용해서 하게 된다 | ||
| + | # F = MS.bet / MS.within | ||
| + | # regression의 경우는 F = MS.reg / MS.res | ||
| + | f.cal <- ms.reg/ | ||
| + | f.cal | ||
| + | pf(f.cal, df.reg, df.res, lower.tail = F) | ||
| + | # check anova test | ||
| + | anova(lm.df) | ||
| + | summary(lm.df) | ||
| # standard error for b coefficient | # standard error for b coefficient | ||
| + | # b coeffficient 값은 estimation | ||
| + | # n.y 개의 데이터에서 구한 coefficient값 b는 | ||
| + | # 실제 x, y 모집단을 대표하는 b값은 위의 | ||
| + | # estimation에서 standard error값을 구간으로 갖는 | ||
| + | # 범위일 것. | ||
| + | |||
| res <- resid(lm.df) | res <- resid(lm.df) | ||
| + | data.frame(head(res)) | ||
| + | data.frame(head(error)) | ||
| se.b <- sqrt(ms.res/ | se.b <- sqrt(ms.res/ | ||
| se.b | se.b | ||
| - | b + c(-2*se.b, 2*se.b) | + | # as x increased by 1 unit, y would increase b +- 2se.b |
| - | # to be exact, we use t.critical value instead of 2 | + | b + c(-2*se.b, |
| - | t.crit <- qt(.025, df.res, lower.tail = F) | + | b |
| - | b + c(-t.crit*se.b, | + | |
| + | # to be exact | ||
| + | t.crit <- qt(.975, df.res) | ||
| + | t.crit | ||
| + | b + c(-t.crit*se.b, | ||
| + | b | ||
| t.cal <- lm.df$coefficients[2]/ | t.cal <- lm.df$coefficients[2]/ | ||
| Line 373: | Line 413: | ||
| pt(t.cal, df.res, lower.tail = F)*2 | pt(t.cal, df.res, lower.tail = F)*2 | ||
| - | pf(f.cal, | + | pf(f.cal, |
| # see also | # see also | ||
| + | t.cal | ||
| t.cal^2 | t.cal^2 | ||
| f.cal | f.cal | ||
| Line 494: | Line 535: | ||
| attr(," | attr(," | ||
| [1] 1.039345 | [1] 1.039345 | ||
| + | > | ||
| > m.y <- mean(y) | > m.y <- mean(y) | ||
| > df.y <- n.y - 1 | > df.y <- n.y - 1 | ||
| Line 503: | Line 545: | ||
| [1,] 804.2779 | [1,] 804.2779 | ||
| > | > | ||
| + | > # | ||
| > prod <- (x-m.x)*(y-m.y) | > prod <- (x-m.x)*(y-m.y) | ||
| > sp <- sum(prod) | > sp <- sum(prod) | ||
| - | > sp/df.y | + | > sp/ |
| [1] 222.3867 | [1] 222.3867 | ||
| - | > cov(x,y) | + | > cov(x, |
| [,1] | [,1] | ||
| [1,] 222.3867 | [1,] 222.3867 | ||
| > | > | ||
| - | > s.x <- sd(x) | + | > s.x <- sd(x) # x변인에 대한 standard deviation 값 |
| - | > s.y <- sd(y) | + | > s.y <- sd(y) # y |
| - | > (sp/ | + | > (sp/ |
| [1] 0.7841619 | [1] 0.7841619 | ||
| - | > cor(x,y) | + | > cor(x, |
| [,1] | [,1] | ||
| [1,] 0.7841619 | [1,] 0.7841619 | ||
| Line 523: | Line 566: | ||
| > df.x | > df.x | ||
| [1] 39 | [1] 39 | ||
| - | > | ||
| - | > sqrt(ss.x/ | ||
| - | [1] 10 | ||
| - | > sd(x) | ||
| - | [1] 10 | ||
| - | > sqrt(ss.y/ | ||
| - | [1] 28.35979 | ||
| - | > sd(y) | ||
| - | [1] 28.35979 | ||
| > | > | ||
| > cov(x, | > cov(x, | ||
| [,1] | [,1] | ||
| [1,] 0.7841619 | [1,] 0.7841619 | ||
| - | > (sp/ | + | > # covariance를 각각의 sd값을 곱한여 나눠 준 값이 r |
| + | > (sp/ | ||
| [1] 0.7841619 | [1] 0.7841619 | ||
| - | > sp/ | + | > # 분모 분자에서 n-1을 제거하고 보면 |
| + | > sp/ | ||
| [1] 0.7841619 | [1] 0.7841619 | ||
| > cor(x,y) | > cor(x,y) | ||
| Line 556: | Line 592: | ||
| > # note that the red line is y.hat = a + bx, | > # note that the red line is y.hat = a + bx, | ||
| > # where error value y.hat - y is minimized | > # where error value y.hat - y is minimized | ||
| - | > # how to derive a and b | + | > |
| - | > # how to derive coefficient values | + | |
| - | > # see wiki doc at | + | |
| - | > # http:// | + | |
| > lm.df <- lm(y~x, data = df) | > lm.df <- lm(y~x, data = df) | ||
| > summary(lm.df) | > summary(lm.df) | ||
| Line 588: | Line 621: | ||
| > b | > b | ||
| [1] 2.223867 | [1] 2.223867 | ||
| - | > # y = a + b*x 에서 | + | > # how to derive |
| + | > # how to derive coefficient values | ||
| + | > # see wiki doc at | ||
| + | > # http:// | ||
| > b2 <- sp/ss.x | > b2 <- sp/ss.x | ||
| - | > b3 <- cov(x, | + | > b3 <- cov(x, |
| > # m.y = a + (b*m.x) 이므로 | > # m.y = a + (b*m.x) 이므로 | ||
| > a2 <- m.y - (b2*m.x) | > a2 <- m.y - (b2*m.x) | ||
| Line 610: | Line 646: | ||
| > # reg, res, total error | > # reg, res, total error | ||
| > # | > # | ||
| - | > y.hat <- (a+(b*x)) | + | > y.hat <- a+(b*x) |
| > y.hat2 <- predict(lm.df) | > y.hat2 <- predict(lm.df) | ||
| - | > y.hat == y.hat2 | + | > head(y) |
| - | [,1] | + | |
| - | [1,] TRUE | + | [1,] 175.7367 |
| - | [2,] TRUE | + | [2,] 200.0202 |
| - | [3,] TRUE | + | [3,] 241.0439 |
| - | [4,] TRUE | + | [4,] 166.0401 |
| - | [5,] TRUE | + | [5,] 195.5682 |
| - | [6,] TRUE | + | [6,] 210.8411 |
| - | [7,] TRUE | + | > head(y.hat) |
| - | [8,] TRUE | + | |
| - | [9,] TRUE | + | [1,] 186.9727 |
| - | [10,] TRUE | + | [2,] 205.5720 |
| - | [11,] TRUE | + | [3,] 212.9643 |
| - | [12,] TRUE | + | [4,] 189.8671 |
| - | [13,] TRUE | + | [5,] 196.9697 |
| - | [14,] TRUE | + | [6,] 239.3554 |
| - | [15,] TRUE | + | |
| - | [16,] TRUE | + | |
| - | [17,] TRUE | + | |
| - | [18,] TRUE | + | |
| - | [19,] TRUE | + | |
| - | [20,] TRUE | + | |
| - | [21,] TRUE | + | |
| - | [22,] TRUE | + | |
| - | [23,] TRUE | + | |
| - | [24,] TRUE | + | |
| - | [25,] TRUE | + | |
| - | [26,] TRUE | + | |
| - | [27,] TRUE | + | |
| - | [28,] TRUE | + | |
| - | [29,] TRUE | + | |
| - | [30,] TRUE | + | |
| - | [31,] TRUE | + | |
| - | [32,] TRUE | + | |
| - | [33,] TRUE | + | |
| - | [34,] TRUE | + | |
| - | [35,] TRUE | + | |
| - | [36,] TRUE | + | |
| - | [37,] TRUE | + | |
| - | [38,] TRUE | + | |
| - | [39,] TRUE | + | |
| - | [40,] TRUE | + | |
| - | > y | + | |
| - | [,1] | + | |
| - | [1,] 175.7367 | + | |
| - | [2,] 200.0202 | + | |
| - | [3,] 241.0439 | + | |
| - | [4,] 166.0401 | + | |
| - | [5,] 195.5682 | + | |
| - | [6,] 210.8411 | + | |
| - | [7,] 181.9459 | + | |
| - | [8,] 193.8544 | + | |
| - | [9,] 225.4417 | + | |
| - | [10,] 182.5276 | + | |
| - | [11,] 194.5656 | + | |
| - | [12,] 232.4421 | + | |
| - | [13,] 255.3070 | + | |
| - | [14,] 209.0313 | + | |
| - | [15,] 219.1602 | + | |
| - | [16,] 181.7819 | + | |
| - | [17,] 214.3124 | + | |
| - | [18,] 187.2961 | + | |
| - | [19,] 207.3581 | + | |
| - | [20,] 235.0318 | + | |
| - | [21,] 199.2244 | + | |
| - | [22,] 208.1031 | + | |
| - | [23,] 182.2575 | + | |
| - | [24,] 201.4683 | + | |
| - | [25,] 169.6124 | + | |
| - | [26,] 173.7941 | + | |
| - | [27,] 161.0332 | + | |
| - | [28,] 152.1292 | + | |
| - | [29,] 153.4874 | + | |
| - | [30,] 147.7652 | + | |
| - | [31,] 189.2685 | + | |
| - | [32,] 188.5691 | + | |
| - | [33,] 155.5620 | + | |
| - | [34,] 172.8731 | + | |
| - | [35,] 184.9119 | + | |
| - | [36,] 183.3182 | + | |
| - | [37,] 119.8266 | + | |
| - | [38,] 162.6349 | + | |
| - | [39,] 170.0872 | + | |
| - | [40,] 150.7684 | + | |
| - | attr(," | + | |
| - | [1] -0.08680858 | + | |
| - | attr(," | + | |
| - | [1] 1.039345 | + | |
| - | > y.hat | + | |
| - | [,1] | + | |
| - | [1,] 186.9727 | + | |
| - | [2,] 205.5720 | + | |
| - | [3,] 212.9643 | + | |
| - | [4,] 189.8671 | + | |
| - | [5,] 196.9697 | + | |
| - | [6,] 239.3554 | + | |
| - | [7,] 193.0240 | + | |
| - | [8,] 181.5673 | + | |
| - | [9,] 223.8973 | + | |
| - | [10,] 175.6106 | + | |
| - | [11,] 182.3963 | + | |
| - | [12,] 211.1666 | + | |
| - | [13,] 221.1593 | + | |
| - | [14,] 193.4777 | + | |
| - | [15,] 217.0723 | + | |
| - | [16,] 179.7820 | + | |
| - | [17,] 188.9258 | + | |
| - | [18,] 162.1265 | + | |
| - | [19,] 176.7310 | + | |
| - | [20,] 212.2383 | + | |
| - | [21,] 201.3295 | + | |
| - | [22,] 188.1615 | + | |
| - | [23,] 197.6116 | + | |
| - | [24,] 196.8985 | + | |
| - | [25,] 183.6318 | + | |
| - | [26,] 197.8900 | + | |
| - | [27,] 167.9229 | + | |
| - | [28,] 158.6935 | + | |
| - | [29,] 168.5770 | + | |
| - | [30,] 138.9668 | + | |
| - | [31,] 225.1485 | + | |
| - | [32,] 187.3333 | + | |
| - | [33,] 155.8128 | + | |
| - | [34,] 179.8349 | + | |
| - | [35,] 189.5992 | + | |
| - | [36,] 178.2857 | + | |
| - | [37,] 138.0425 | + | |
| - | [38,] 188.5297 | + | |
| - | [39,] 170.7527 | + | |
| - | [40,] 172.1024 | + | |
| - | attr(," | + | |
| - | [1] -0.08680858 | + | |
| - | attr(," | + | |
| - | [1] 1.039345 | + | |
| > m.y | > m.y | ||
| [1] 188.4 | [1] 188.4 | ||
| > | > | ||
| - | > tmp <- data.frame(y, | + | > tmp <- data.frame(x, y, y.hat) |
| - | > tmp | + | > head(tmp) |
| - | y y.hat m.y | + | |
| - | 1 175.7367 186.9727 | + | 1 |
| - | 2 200.0202 205.5720 | + | 2 107.72169 |
| - | 3 241.0439 212.9643 | + | 3 111.04574 |
| - | 4 166.0401 189.8671 | + | 4 100.65968 |
| - | 5 195.5682 196.9697 | + | 5 103.85350 |
| - | 6 210.8411 239.3554 | + | 6 122.91296 |
| - | 7 181.9459 193.0240 188.4 | + | > plot(x,y) # 원래 데이터 |
| - | 8 193.8544 181.5673 188.4 | + | > plot(x, |
| - | 9 225.4417 223.8973 188.4 | + | |
| - | 10 182.5276 175.6106 188.4 | + | |
| - | 11 194.5656 182.3963 188.4 | + | |
| - | 12 232.4421 211.1666 188.4 | + | |
| - | 13 255.3070 221.1593 188.4 | + | |
| - | 14 209.0313 193.4777 188.4 | + | |
| - | 15 219.1602 217.0723 188.4 | + | |
| - | 16 181.7819 179.7820 188.4 | + | |
| - | 17 214.3124 188.9258 188.4 | + | |
| - | 18 187.2961 162.1265 188.4 | + | |
| - | 19 207.3581 176.7310 188.4 | + | |
| - | 20 235.0318 212.2383 188.4 | + | |
| - | 21 199.2244 201.3295 188.4 | + | |
| - | 22 208.1031 188.1615 188.4 | + | |
| - | 23 182.2575 197.6116 188.4 | + | |
| - | 24 201.4683 196.8985 188.4 | + | |
| - | 25 169.6124 183.6318 188.4 | + | |
| - | 26 173.7941 197.8900 188.4 | + | |
| - | 27 161.0332 167.9229 188.4 | + | |
| - | 28 152.1292 158.6935 188.4 | + | |
| - | 29 153.4874 168.5770 188.4 | + | |
| - | 30 147.7652 138.9668 188.4 | + | |
| - | 31 189.2685 225.1485 188.4 | + | |
| - | 32 188.5691 187.3333 188.4 | + | |
| - | 33 155.5620 155.8128 188.4 | + | |
| - | 34 172.8731 179.8349 188.4 | + | |
| - | 35 184.9119 189.5992 188.4 | + | |
| - | 36 183.3182 178.2857 188.4 | + | |
| - | 37 119.8266 138.0425 188.4 | + | |
| - | 38 162.6349 188.5297 188.4 | + | |
| - | 39 170.0872 170.7527 188.4 | + | |
| - | 40 150.7684 172.1024 188.4 | + | |
| - | > plot(x,y) # 원래 데이터터 | + | |
| - | > plot(x, | + | |
| > plot(x, | > plot(x, | ||
| > | > | ||
| - | > error <- y-y.hat | + | > error <- y - y.hat |
| - | > explained <- y.hat-m.y | + | > explained <- y.hat - m.y |
| - | > error | + | > tmp2 <- data.frame(tmp, |
| - | [,1] | + | > head(tmp2) |
| - | [1,] -11.2359564 | + | x y y.hat explained |
| - | [2,] -5.5518187 | + | 1 |
| - | [3,] 28.0796455 | + | 2 107.72169 200.0202 205.5720 17.172003 |
| - | [4,] -23.8269825 | + | 3 111.04574 241.0439 212.9643 24.564266 |
| - | | + | 4 100.65968 166.0401 189.8671 |
| - | | + | 5 103.85350 195.5682 196.9697 |
| - | [7,] -11.0781515 | + | 6 122.91296 210.8411 239.3554 50.955365 -28.514250 |
| - | [8,] 12.2870686 | + | > |
| - | | + | > head(y.hat == explained + m.y) |
| - | [10,] 6.9169740 | + | |
| - | [11,] 12.1692981 | + | [1,] TRUE |
| - | [12,] 21.2754922 | + | [2,] TRUE |
| - | [13,] 34.1477152 | + | [3,] TRUE |
| - | [14,] 15.5535765 | + | [4,] TRUE |
| - | [15,] | + | [5,] TRUE |
| - | [16,] 1.9999551 | + | [6,] TRUE |
| - | [17,] 25.3866091 | + | > head(y == explained + error + m.y) |
| - | [18,] | + | |
| - | [19,] | + | [1,] TRUE |
| - | [20,] 22.7934996 | + | [2,] TRUE |
| - | [21,] -2.1051048 | + | [3,] TRUE |
| - | [22,] | + | [4,] TRUE |
| - | [23,] -15.3540820 | + | [5,] TRUE |
| - | [24,] 4.5697607 | + | [6,] TRUE |
| - | [25,] -14.0193869 | + | > |
| - | [26,] -24.0959095 | + | |
| - | [27,] -6.8896271 | + | |
| - | [28,] | + | |
| - | [29,] -15.0896661 | + | |
| - | [30,] 8.7983528 | + | |
| - | [31,] -35.8800136 | + | |
| - | [32,] 1.2357770 | + | |
| - | [33,] | + | |
| - | [34,] | + | |
| - | [35,] -4.6873617 | + | |
| - | [36,] 5.0325839 | + | |
| - | [37,] -18.2159090 | + | |
| - | [38,] -25.8947992 | + | |
| - | [39,] -0.6654576 | + | |
| - | [40,] -21.3340113 | + | |
| - | attr(," | + | |
| - | [1] -0.08680858 | + | |
| - | attr(," | + | |
| - | [1] 1.039345 | + | |
| > # note the below | > # note the below | ||
| > plot(error) | > plot(error) | ||
| > hist(error) | > hist(error) | ||
| + | > plot(x, | ||
| > cor(x, | > cor(x, | ||
| [,1] | [,1] | ||
| [1,] 7.981058e-17 | [1,] 7.981058e-17 | ||
| > ################ | > ################ | ||
| - | > explained | + | > plot(x,explained) |
| - | | + | |
| - | | + | |
| - | | + | |
| - | | + | |
| - | | + | |
| - | | + | |
| - | | + | |
| - | | + | |
| - | | + | |
| - | | + | |
| - | [10,] -12.7894406 | + | |
| - | [11,] -6.0037071 | + | |
| - | [12,] 22.7665939 | + | |
| - | [13,] 32.7592720 | + | |
| - | [14,] | + | |
| - | [15,] 28.6722546 | + | |
| - | [16,] -8.6180356 | + | |
| - | [17,] | + | |
| - | [18,] -26.2734989 | + | |
| - | [19,] -11.6690128 | + | |
| - | [20,] 23.8382597 | + | |
| - | [21,] 12.9294844 | + | |
| - | [22,] -0.2385288 | + | |
| - | [23,] | + | |
| - | [24,] | + | |
| - | [25,] -4.7682316 | + | |
| - | [26,] | + | |
| - | [27,] -20.4771504 | + | |
| - | [28,] -29.7065732 | + | |
| - | [29,] -19.8229799 | + | |
| - | [30,] -49.4332197 | + | |
| - | [31,] 36.7485236 | + | |
| - | [32,] -1.0667066 | + | |
| - | [33,] -32.5871819 | + | |
| - | [34,] -8.5651393 | + | |
| - | [35,] | + | |
| - | [36,] -10.1143632 | + | |
| - | [37,] -50.3574953 | + | |
| - | [38,] | + | |
| - | [39,] -17.6473510 | + | |
| - | [40,] -16.2976456 | + | |
| - | attr(," | + | |
| - | [1] -0.08680858 | + | |
| - | attr(," | + | |
| - | [1] 1.039345 | + | |
| > cor(x, | > cor(x, | ||
| + | [,1] | ||
| + | [1,] 1 | ||
| + | > plot(x, | ||
| + | > cor(x, | ||
| [,1] | [,1] | ||
| [1,] 1 | [1,] 1 | ||
| > | > | ||
| + | > head(explained+error) | ||
| + | [,1] | ||
| + | [1,] -12.663308 | ||
| + | [2,] 11.620185 | ||
| + | [3,] 52.643911 | ||
| + | [4,] -22.359948 | ||
| + | [5,] | ||
| + | [6,] 22.441115 | ||
| + | > plot(x, | ||
| + | > head(explained+error+m.y == y) | ||
| + | [,1] | ||
| + | [1,] TRUE | ||
| + | [2,] TRUE | ||
| + | [3,] TRUE | ||
| + | [4,] TRUE | ||
| + | [5,] TRUE | ||
| + | [6,] TRUE | ||
| + | > plot(x, | ||
| + | > | ||
| + | > cor(x, | ||
| + | [,1] | ||
| + | [1,] 0.7841619 | ||
| + | > cor(x, | ||
| + | [,1] | ||
| + | [1,] 0.7841619 | ||
| + | > cor(x,y) | ||
| + | [,1] | ||
| + | [1,] 0.7841619 | ||
| > # see this also | > # see this also | ||
| - | > tmp2 <- data.frame(x, | ||
| > round(cor(tmp2), | > round(cor(tmp2), | ||
| - | x explained error y | + | x |
| - | x | + | x 1.000 0.784 1.000 1.000 0.000 |
| - | explained 1.000 1.000 0.000 0.784 | + | y 0.784 1.000 0.784 0.784 0.621 |
| - | error | + | y.hat |
| - | y 0.784 0.784 0.621 1.000 | + | explained 1.000 0.784 1.000 |
| + | error | ||
| > ############### | > ############### | ||
| > | > | ||
| + | > # sum of square values | ||
| > ss.res <- sum(error^2) | > ss.res <- sum(error^2) | ||
| > ss.reg <- sum(explained^2) | > ss.reg <- sum(explained^2) | ||
| Line 922: | Line 776: | ||
| [1] 31366.84 | [1] 31366.84 | ||
| > | > | ||
| - | > k <- 1 # number of IV | + | > # degrees of freedom values |
| - | > df.res <- n - k - 1 | + | > k <- 2 # number of parameters |
| - | > df.reg <- 2 - 1 # number of parameter estimation a and b (slope) | + | > df.reg <- k - 1 # number of parameter estimation a and b (slope) |
| + | > df.res <- n - df.reg - 1 | ||
| > ms.res <- ss.res/ | > ms.res <- ss.res/ | ||
| > ms.reg <- ss.reg/ | > ms.reg <- ss.reg/ | ||
| Line 942: | Line 797: | ||
| [1] 317.87 | [1] 317.87 | ||
| > | > | ||
| + | > # r square | ||
| + | > ss.y | ||
| + | [1] 31366.84 | ||
| + | > ss.reg | ||
| + | [1] 19287.78 | ||
| + | > r.sq <- ss.reg/ss.y | ||
| + | > r.sq | ||
| + | [1] 0.6149098 | ||
| + | > # 위의 r.sq 값이 충분히 컸는지를 알아보는 것은 | ||
| + | > # ss.reg가 충분히 컸는지를 알아보는 것 | ||
| + | > # 이는 IV, x가 y를 설명하는데 충분히 기여했는지 | ||
| + | > # 테스트하는 것. 이는 F-test를 이용해서 하게 된다 | ||
| + | > # F = MS.bet / MS.within | ||
| + | > # regression의 경우는 F = MS.reg / MS.res | ||
| > f.cal <- ms.reg/ | > f.cal <- ms.reg/ | ||
| > f.cal | > f.cal | ||
| [1] 60.67819 | [1] 60.67819 | ||
| - | > pf(f.cal, | + | > pf(f.cal, |
| [1] 2.157284e-09 | [1] 2.157284e-09 | ||
| + | > # check anova test | ||
| > anova(lm.df) | > anova(lm.df) | ||
| Analysis of Variance Table | Analysis of Variance Table | ||
| Line 978: | Line 848: | ||
| F-statistic: | F-statistic: | ||
| - | > # r square | ||
| - | > ss.y | ||
| - | [1] 31366.84 | ||
| - | > ss.reg | ||
| - | [1] 19287.78 | ||
| - | > r.sq <- ss.reg/ss.y | ||
| - | > r.sq | ||
| - | [1] 0.6149098 | ||
| > | > | ||
| > # standard error for b coefficient | > # standard error for b coefficient | ||
| + | > # b coeffficient 값은 estimation | ||
| + | > # n.y 개의 데이터에서 구한 coefficient값 b는 | ||
| + | > # 실제 x, y 모집단을 대표하는 b값은 위의 | ||
| + | > # estimation에서 standard error값을 구간으로 갖는 | ||
| + | > # 범위일 것. | ||
| + | > | ||
| > res <- resid(lm.df) | > res <- resid(lm.df) | ||
| + | > data.frame(head(res)) | ||
| + | | ||
| + | 1 -11.235956 | ||
| + | 2 -5.551819 | ||
| + | 3 28.079646 | ||
| + | 4 -23.826982 | ||
| + | 5 -1.401474 | ||
| + | 6 -28.514250 | ||
| + | > data.frame(head(error)) | ||
| + | head.error. | ||
| + | 1 -11.235956 | ||
| + | 2 | ||
| + | 3 | ||
| + | 4 -23.826982 | ||
| + | 5 | ||
| + | 6 -28.514250 | ||
| > se.b <- sqrt(ms.res/ | > se.b <- sqrt(ms.res/ | ||
| > se.b | > se.b | ||
| [1] 0.285491 | [1] 0.285491 | ||
| - | > b + c(-2*se.b, 2*se.b) | + | > # as x increased by 1 unit, y would increase b +- 2se.b |
| + | > b + c(-2*se.b, 2*se.b) | ||
| [1] 1.652885 2.794849 | [1] 1.652885 2.794849 | ||
| + | > b | ||
| + | [1] 2.223867 | ||
| + | > | ||
| > # to be exact | > # to be exact | ||
| - | > t.crit <- qt(.025, df.res, lower.tail = F) | + | > t.crit <- qt(.975, df.res) |
| - | > # note that t.crit | + | > t.crit |
| + | [1] 2.024394 | ||
| > b + c(-t.crit*se.b, | > b + c(-t.crit*se.b, | ||
| [1] 1.645921 2.801813 | [1] 1.645921 2.801813 | ||
| - | > # see http:// | + | > b |
| - | > | + | [1] 2.223867 |
| - | > | + | > |
| > t.cal <- lm.df$coefficients[2]/ | > t.cal <- lm.df$coefficients[2]/ | ||
| > t.cal | > t.cal | ||
| Line 1011: | Line 900: | ||
| | | ||
| 2.157284e-09 | 2.157284e-09 | ||
| - | > pf(f.cal, | + | > pf(f.cal, |
| [1] 2.157284e-09 | [1] 2.157284e-09 | ||
| > | > | ||
| > # see also | > # see also | ||
| + | > t.cal | ||
| + | | ||
| + | 7.789621 | ||
| > t.cal^2 | > t.cal^2 | ||
| | | ||
| Line 1020: | Line 912: | ||
| > f.cal | > f.cal | ||
| [1] 60.67819 | [1] 60.67819 | ||
| - | > | ||
| > | > | ||
| > | > | ||
c/ms/2025/schedule/w11.lecture.note.1747745335.txt.gz · Last modified: by hkimscil
