c:ms:2025:schedule:w11.lecture.note
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
c:ms:2025:schedule:w11.lecture.note [2025/05/20 21:24] – hkimscil | c:ms:2025:schedule:w11.lecture.note [2025/05/21 08:59] (current) – [code] hkimscil | ||
---|---|---|---|
Line 27: | Line 27: | ||
\begin{eqnarray*} | \begin{eqnarray*} | ||
\text{F} & = & \frac {\text{MS}_\text{between group}} {\text{MS}_\text{within group}} \\ | \text{F} & = & \frac {\text{MS}_\text{between group}} {\text{MS}_\text{within group}} \\ | ||
- | & = & \frac {\text{SP (X,Y)} }{ \sqrt{\text{SS.x}*\text{SS.y}}} | + | & = & \frac {\text{Difference due to the treatment |
+ | & = & \frac {\text{Difference explained by IV } }{ \text{Difference remained random} } \\ | ||
+ | \text{R}^\text{2} & = & \frac {\text{SS}_\text{between}} {\text{SS}_\text{total} } \hfill \\ | ||
\end{eqnarray*} | \end{eqnarray*} | ||
+ | |||
+ | |||
+ | ====== Check in R ====== | ||
< | < | ||
Line 49: | Line 54: | ||
y <- (a*x) + b + re | y <- (a*x) + b + re | ||
y | y | ||
+ | |||
+ | # covariance x and y | ||
cov(x,y) | cov(x,y) | ||
sd(x) | sd(x) | ||
sd(y) | sd(y) | ||
+ | # correlation coefficient | ||
cov(x, | cov(x, | ||
cor(x,y) | cor(x,y) | ||
Line 58: | Line 66: | ||
df.x <- n.x-1 | df.x <- n.x-1 | ||
df.y <- df.x | df.y <- df.x | ||
- | sp.xy <- cov(x,y) * df.x | + | sp.xy <- cov(x,y) * df.x # 혹은 아래처럼 구할 수 있다 |
+ | sp.xy2 <- sum((x-m.x)*(y-m.y)) | ||
+ | sp.xy == sp.xy2 | ||
ss.x <- var(x) * df.x | ss.x <- var(x) * df.x | ||
ss.y <- var(y) * df.y | ss.y <- var(y) * df.y | ||
Line 65: | Line 76: | ||
</ | </ | ||
+ | ====== Check in R: output ====== | ||
< | < | ||
> | > | ||
Line 225: | Line 236: | ||
y <- (a*x) + b + re | y <- (a*x) + b + re | ||
y | y | ||
+ | |||
m.y <- mean(y) | m.y <- mean(y) | ||
df.y <- n.y - 1 | df.y <- n.y - 1 | ||
Line 231: | Line 243: | ||
var(y) | var(y) | ||
+ | # | ||
prod <- (x-m.x)*(y-m.y) | prod <- (x-m.x)*(y-m.y) | ||
sp <- sum(prod) | sp <- sum(prod) | ||
- | sp/df.y | + | sp/ |
- | cov(x,y) | + | cov(x, |
- | s.x <- sd(x) | + | s.x <- sd(x) # x변인에 대한 standard deviation 값 |
- | s.y <- sd(y) | + | s.y <- sd(y) # y |
- | (sp/ | + | (sp/ |
- | cor(x,y) | + | cor(x, |
df.y | df.y | ||
df.x | df.x | ||
- | |||
- | sqrt(ss.x/ | ||
- | sd(x) | ||
- | sqrt(ss.y/ | ||
- | sd(y) | ||
cov(x, | cov(x, | ||
- | (sp/ | + | # covariance를 각각의 sd값을 곱한여 나눠 준 값이 r |
- | sp/ | + | (sp/ |
+ | # 분모 분자에서 n-1을 제거하고 보면 | ||
+ | sp/ | ||
cor(x,y) | cor(x,y) | ||
Line 266: | Line 276: | ||
# note that the red line is y.hat = a + bx, | # note that the red line is y.hat = a + bx, | ||
# where error value y.hat - y is minimized | # where error value y.hat - y is minimized | ||
- | # how to derive a and b | + | |
- | # how to derive coefficient values | + | |
- | # see wiki doc at | + | |
- | # http:// | + | |
lm.df <- lm(y~x, data = df) | lm.df <- lm(y~x, data = df) | ||
summary(lm.df) | summary(lm.df) | ||
Line 276: | Line 283: | ||
a | a | ||
b | b | ||
- | # y = a + b*x 에서 | + | # how to derive |
+ | # how to derive coefficient values | ||
+ | # see wiki doc at | ||
+ | # http:// | ||
b2 <- sp/ss.x | b2 <- sp/ss.x | ||
- | b3 <- cov(x, | + | b3 <- cov(x, |
# m.y = a + (b*m.x) 이므로 | # m.y = a + (b*m.x) 이므로 | ||
a2 <- m.y - (b2*m.x) | a2 <- m.y - (b2*m.x) | ||
Line 292: | Line 302: | ||
# reg, res, total error | # reg, res, total error | ||
# | # | ||
- | y.hat <- (a+(b*x)) | + | y.hat <- a+(b*x) |
y.hat2 <- predict(lm.df) | y.hat2 <- predict(lm.df) | ||
- | y.hat == y.hat2 | + | head(y) |
- | y | + | head(y.hat) |
- | y.hat | + | |
m.y | m.y | ||
- | tmp <- data.frame(y, | + | tmp <- data.frame(x, y, y.hat) |
- | tmp | + | head(tmp) |
- | plot(x,y) # 원래 데이터터 | + | plot(x,y) # 원래 데이터 |
- | plot(x, | + | plot(x, |
plot(x, | plot(x, | ||
- | error <- y-y.hat | + | error <- y - y.hat |
- | explained <- y.hat-m.y | + | explained <- y.hat - m.y |
- | error | + | tmp2 <- data.frame(tmp, |
+ | head(tmp2) | ||
+ | |||
+ | head(y.hat == explained + m.y) | ||
+ | head(y == explained + error + m.y) | ||
# note the below | # note the below | ||
plot(error) | plot(error) | ||
hist(error) | hist(error) | ||
+ | plot(x, | ||
cor(x, | cor(x, | ||
################ | ################ | ||
- | explained | + | plot(x,explained) |
cor(x, | cor(x, | ||
+ | plot(x, | ||
+ | cor(x, | ||
+ | head(explained+error) | ||
+ | plot(x, | ||
+ | head(explained+error+m.y == y) | ||
+ | plot(x, | ||
+ | |||
+ | cor(x, | ||
+ | cor(x, | ||
+ | cor(x,y) | ||
# see this also | # see this also | ||
- | tmp2 <- data.frame(x, | ||
round(cor(tmp2), | round(cor(tmp2), | ||
############### | ############### | ||
+ | # sum of square values | ||
ss.res <- sum(error^2) | ss.res <- sum(error^2) | ||
ss.reg <- sum(explained^2) | ss.reg <- sum(explained^2) | ||
Line 328: | Line 353: | ||
ss.res + ss.reg | ss.res + ss.reg | ||
- | k <- 1 # number of IV | + | # degrees of freedom values |
- | df.res <- n - k - 1 | + | k <- 2 # number of parameters |
- | df.reg <- 2 - 1 # number of parameter estimation a and b (slope) | + | df.reg <- k - 1 # number of parameter estimation a and b (slope) |
+ | df.res <- n - df.reg - 1 | ||
ms.res <- ss.res/ | ms.res <- ss.res/ | ||
ms.reg <- ss.reg/ | ms.reg <- ss.reg/ | ||
Line 342: | Line 368: | ||
ms.res | ms.res | ||
- | f.cal <- ms.reg/ | ||
- | f.cal | ||
- | pf(f.cal, 1, 38, lower.tail = F) | ||
- | anova(lm.df) | ||
- | summary(lm.df) | ||
# r square | # r square | ||
ss.y | ss.y | ||
Line 352: | Line 373: | ||
r.sq <- ss.reg/ss.y | r.sq <- ss.reg/ss.y | ||
r.sq | r.sq | ||
+ | # 위의 r.sq 값이 충분히 컸는지를 알아보는 것은 | ||
+ | # ss.reg가 충분히 컸는지를 알아보는 것 | ||
+ | # 이는 IV, x가 y를 설명하는데 충분히 기여했는지 | ||
+ | # 테스트하는 것. 이는 F-test를 이용해서 하게 된다 | ||
+ | # F = MS.bet / MS.within | ||
+ | # regression의 경우는 F = MS.reg / MS.res | ||
+ | f.cal <- ms.reg/ | ||
+ | f.cal | ||
+ | pf(f.cal, df.reg, df.res, lower.tail = F) | ||
+ | # check anova test | ||
+ | anova(lm.df) | ||
+ | summary(lm.df) | ||
# standard error for b coefficient | # standard error for b coefficient | ||
+ | # b coeffficient 값은 estimation | ||
+ | # n.y 개의 데이터에서 구한 coefficient값 b는 | ||
+ | # 실제 x, y 모집단을 대표하는 b값은 위의 | ||
+ | # estimation에서 standard error값을 구간으로 갖는 | ||
+ | # 범위일 것. | ||
+ | |||
res <- resid(lm.df) | res <- resid(lm.df) | ||
+ | data.frame(head(res)) | ||
+ | data.frame(head(error)) | ||
se.b <- sqrt(ms.res/ | se.b <- sqrt(ms.res/ | ||
se.b | se.b | ||
- | b + c(-2*se.b, 2*se.b) | + | # as x increased by 1 unit, y would increase b +- 2se.b |
- | # to be exact, we use t.critical value instead of 2 | + | b + c(-2*se.b, |
- | t.crit <- qt(.025, df.res, lower.tail = F) | + | b |
- | b + c(-t.crit*se.b, | + | |
+ | # to be exact | ||
+ | t.crit <- qt(.975, df.res) | ||
+ | t.crit | ||
+ | b + c(-t.crit*se.b, | ||
+ | b | ||
t.cal <- lm.df$coefficients[2]/ | t.cal <- lm.df$coefficients[2]/ | ||
Line 368: | Line 413: | ||
pt(t.cal, df.res, lower.tail = F)*2 | pt(t.cal, df.res, lower.tail = F)*2 | ||
- | pf(f.cal, | + | pf(f.cal, |
# see also | # see also | ||
+ | t.cal | ||
t.cal^2 | t.cal^2 | ||
f.cal | f.cal | ||
Line 489: | Line 535: | ||
attr(," | attr(," | ||
[1] 1.039345 | [1] 1.039345 | ||
+ | > | ||
> m.y <- mean(y) | > m.y <- mean(y) | ||
> df.y <- n.y - 1 | > df.y <- n.y - 1 | ||
Line 498: | Line 545: | ||
[1,] 804.2779 | [1,] 804.2779 | ||
> | > | ||
+ | > # | ||
> prod <- (x-m.x)*(y-m.y) | > prod <- (x-m.x)*(y-m.y) | ||
> sp <- sum(prod) | > sp <- sum(prod) | ||
- | > sp/df.y | + | > sp/ |
[1] 222.3867 | [1] 222.3867 | ||
- | > cov(x,y) | + | > cov(x, |
[,1] | [,1] | ||
[1,] 222.3867 | [1,] 222.3867 | ||
> | > | ||
- | > s.x <- sd(x) | + | > s.x <- sd(x) # x변인에 대한 standard deviation 값 |
- | > s.y <- sd(y) | + | > s.y <- sd(y) # y |
- | > (sp/ | + | > (sp/ |
[1] 0.7841619 | [1] 0.7841619 | ||
- | > cor(x,y) | + | > cor(x, |
[,1] | [,1] | ||
[1,] 0.7841619 | [1,] 0.7841619 | ||
Line 518: | Line 566: | ||
> df.x | > df.x | ||
[1] 39 | [1] 39 | ||
- | > | ||
- | > sqrt(ss.x/ | ||
- | [1] 10 | ||
- | > sd(x) | ||
- | [1] 10 | ||
- | > sqrt(ss.y/ | ||
- | [1] 28.35979 | ||
- | > sd(y) | ||
- | [1] 28.35979 | ||
> | > | ||
> cov(x, | > cov(x, | ||
[,1] | [,1] | ||
[1,] 0.7841619 | [1,] 0.7841619 | ||
- | > (sp/ | + | > # covariance를 각각의 sd값을 곱한여 나눠 준 값이 r |
+ | > (sp/ | ||
[1] 0.7841619 | [1] 0.7841619 | ||
- | > sp/ | + | > # 분모 분자에서 n-1을 제거하고 보면 |
+ | > sp/ | ||
[1] 0.7841619 | [1] 0.7841619 | ||
> cor(x,y) | > cor(x,y) | ||
Line 551: | Line 592: | ||
> # note that the red line is y.hat = a + bx, | > # note that the red line is y.hat = a + bx, | ||
> # where error value y.hat - y is minimized | > # where error value y.hat - y is minimized | ||
- | > # how to derive a and b | + | > |
- | > # how to derive coefficient values | + | |
- | > # see wiki doc at | + | |
- | > # http:// | + | |
> lm.df <- lm(y~x, data = df) | > lm.df <- lm(y~x, data = df) | ||
> summary(lm.df) | > summary(lm.df) | ||
Line 583: | Line 621: | ||
> b | > b | ||
[1] 2.223867 | [1] 2.223867 | ||
- | > # y = a + b*x 에서 | + | > # how to derive |
+ | > # how to derive coefficient values | ||
+ | > # see wiki doc at | ||
+ | > # http:// | ||
> b2 <- sp/ss.x | > b2 <- sp/ss.x | ||
- | > b3 <- cov(x, | + | > b3 <- cov(x, |
> # m.y = a + (b*m.x) 이므로 | > # m.y = a + (b*m.x) 이므로 | ||
> a2 <- m.y - (b2*m.x) | > a2 <- m.y - (b2*m.x) | ||
Line 605: | Line 646: | ||
> # reg, res, total error | > # reg, res, total error | ||
> # | > # | ||
- | > y.hat <- (a+(b*x)) | + | > y.hat <- a+(b*x) |
> y.hat2 <- predict(lm.df) | > y.hat2 <- predict(lm.df) | ||
- | > y.hat == y.hat2 | + | > head(y) |
- | [,1] | + | |
- | [1,] TRUE | + | [1,] 175.7367 |
- | [2,] TRUE | + | [2,] 200.0202 |
- | [3,] TRUE | + | [3,] 241.0439 |
- | [4,] TRUE | + | [4,] 166.0401 |
- | [5,] TRUE | + | [5,] 195.5682 |
- | [6,] TRUE | + | [6,] 210.8411 |
- | [7,] TRUE | + | > head(y.hat) |
- | [8,] TRUE | + | |
- | [9,] TRUE | + | [1,] 186.9727 |
- | [10,] TRUE | + | [2,] 205.5720 |
- | [11,] TRUE | + | [3,] 212.9643 |
- | [12,] TRUE | + | [4,] 189.8671 |
- | [13,] TRUE | + | [5,] 196.9697 |
- | [14,] TRUE | + | [6,] 239.3554 |
- | [15,] TRUE | + | |
- | [16,] TRUE | + | |
- | [17,] TRUE | + | |
- | [18,] TRUE | + | |
- | [19,] TRUE | + | |
- | [20,] TRUE | + | |
- | [21,] TRUE | + | |
- | [22,] TRUE | + | |
- | [23,] TRUE | + | |
- | [24,] TRUE | + | |
- | [25,] TRUE | + | |
- | [26,] TRUE | + | |
- | [27,] TRUE | + | |
- | [28,] TRUE | + | |
- | [29,] TRUE | + | |
- | [30,] TRUE | + | |
- | [31,] TRUE | + | |
- | [32,] TRUE | + | |
- | [33,] TRUE | + | |
- | [34,] TRUE | + | |
- | [35,] TRUE | + | |
- | [36,] TRUE | + | |
- | [37,] TRUE | + | |
- | [38,] TRUE | + | |
- | [39,] TRUE | + | |
- | [40,] TRUE | + | |
- | > y | + | |
- | [,1] | + | |
- | [1,] 175.7367 | + | |
- | [2,] 200.0202 | + | |
- | [3,] 241.0439 | + | |
- | [4,] 166.0401 | + | |
- | [5,] 195.5682 | + | |
- | [6,] 210.8411 | + | |
- | [7,] 181.9459 | + | |
- | [8,] 193.8544 | + | |
- | [9,] 225.4417 | + | |
- | [10,] 182.5276 | + | |
- | [11,] 194.5656 | + | |
- | [12,] 232.4421 | + | |
- | [13,] 255.3070 | + | |
- | [14,] 209.0313 | + | |
- | [15,] 219.1602 | + | |
- | [16,] 181.7819 | + | |
- | [17,] 214.3124 | + | |
- | [18,] 187.2961 | + | |
- | [19,] 207.3581 | + | |
- | [20,] 235.0318 | + | |
- | [21,] 199.2244 | + | |
- | [22,] 208.1031 | + | |
- | [23,] 182.2575 | + | |
- | [24,] 201.4683 | + | |
- | [25,] 169.6124 | + | |
- | [26,] 173.7941 | + | |
- | [27,] 161.0332 | + | |
- | [28,] 152.1292 | + | |
- | [29,] 153.4874 | + | |
- | [30,] 147.7652 | + | |
- | [31,] 189.2685 | + | |
- | [32,] 188.5691 | + | |
- | [33,] 155.5620 | + | |
- | [34,] 172.8731 | + | |
- | [35,] 184.9119 | + | |
- | [36,] 183.3182 | + | |
- | [37,] 119.8266 | + | |
- | [38,] 162.6349 | + | |
- | [39,] 170.0872 | + | |
- | [40,] 150.7684 | + | |
- | attr(," | + | |
- | [1] -0.08680858 | + | |
- | attr(," | + | |
- | [1] 1.039345 | + | |
- | > y.hat | + | |
- | [,1] | + | |
- | [1,] 186.9727 | + | |
- | [2,] 205.5720 | + | |
- | [3,] 212.9643 | + | |
- | [4,] 189.8671 | + | |
- | [5,] 196.9697 | + | |
- | [6,] 239.3554 | + | |
- | [7,] 193.0240 | + | |
- | [8,] 181.5673 | + | |
- | [9,] 223.8973 | + | |
- | [10,] 175.6106 | + | |
- | [11,] 182.3963 | + | |
- | [12,] 211.1666 | + | |
- | [13,] 221.1593 | + | |
- | [14,] 193.4777 | + | |
- | [15,] 217.0723 | + | |
- | [16,] 179.7820 | + | |
- | [17,] 188.9258 | + | |
- | [18,] 162.1265 | + | |
- | [19,] 176.7310 | + | |
- | [20,] 212.2383 | + | |
- | [21,] 201.3295 | + | |
- | [22,] 188.1615 | + | |
- | [23,] 197.6116 | + | |
- | [24,] 196.8985 | + | |
- | [25,] 183.6318 | + | |
- | [26,] 197.8900 | + | |
- | [27,] 167.9229 | + | |
- | [28,] 158.6935 | + | |
- | [29,] 168.5770 | + | |
- | [30,] 138.9668 | + | |
- | [31,] 225.1485 | + | |
- | [32,] 187.3333 | + | |
- | [33,] 155.8128 | + | |
- | [34,] 179.8349 | + | |
- | [35,] 189.5992 | + | |
- | [36,] 178.2857 | + | |
- | [37,] 138.0425 | + | |
- | [38,] 188.5297 | + | |
- | [39,] 170.7527 | + | |
- | [40,] 172.1024 | + | |
- | attr(," | + | |
- | [1] -0.08680858 | + | |
- | attr(," | + | |
- | [1] 1.039345 | + | |
> m.y | > m.y | ||
[1] 188.4 | [1] 188.4 | ||
> | > | ||
- | > tmp <- data.frame(y, | + | > tmp <- data.frame(x, y, y.hat) |
- | > tmp | + | > head(tmp) |
- | y y.hat m.y | + | |
- | 1 175.7367 186.9727 | + | 1 |
- | 2 200.0202 205.5720 | + | 2 107.72169 |
- | 3 241.0439 212.9643 | + | 3 111.04574 |
- | 4 166.0401 189.8671 | + | 4 100.65968 |
- | 5 195.5682 196.9697 | + | 5 103.85350 |
- | 6 210.8411 239.3554 | + | 6 122.91296 |
- | 7 181.9459 193.0240 188.4 | + | > plot(x,y) # 원래 데이터 |
- | 8 193.8544 181.5673 188.4 | + | > plot(x, |
- | 9 225.4417 223.8973 188.4 | + | |
- | 10 182.5276 175.6106 188.4 | + | |
- | 11 194.5656 182.3963 188.4 | + | |
- | 12 232.4421 211.1666 188.4 | + | |
- | 13 255.3070 221.1593 188.4 | + | |
- | 14 209.0313 193.4777 188.4 | + | |
- | 15 219.1602 217.0723 188.4 | + | |
- | 16 181.7819 179.7820 188.4 | + | |
- | 17 214.3124 188.9258 188.4 | + | |
- | 18 187.2961 162.1265 188.4 | + | |
- | 19 207.3581 176.7310 188.4 | + | |
- | 20 235.0318 212.2383 188.4 | + | |
- | 21 199.2244 201.3295 188.4 | + | |
- | 22 208.1031 188.1615 188.4 | + | |
- | 23 182.2575 197.6116 188.4 | + | |
- | 24 201.4683 196.8985 188.4 | + | |
- | 25 169.6124 183.6318 188.4 | + | |
- | 26 173.7941 197.8900 188.4 | + | |
- | 27 161.0332 167.9229 188.4 | + | |
- | 28 152.1292 158.6935 188.4 | + | |
- | 29 153.4874 168.5770 188.4 | + | |
- | 30 147.7652 138.9668 188.4 | + | |
- | 31 189.2685 225.1485 188.4 | + | |
- | 32 188.5691 187.3333 188.4 | + | |
- | 33 155.5620 155.8128 188.4 | + | |
- | 34 172.8731 179.8349 188.4 | + | |
- | 35 184.9119 189.5992 188.4 | + | |
- | 36 183.3182 178.2857 188.4 | + | |
- | 37 119.8266 138.0425 188.4 | + | |
- | 38 162.6349 188.5297 188.4 | + | |
- | 39 170.0872 170.7527 188.4 | + | |
- | 40 150.7684 172.1024 188.4 | + | |
- | > plot(x,y) # 원래 데이터터 | + | |
- | > plot(x, | + | |
> plot(x, | > plot(x, | ||
> | > | ||
- | > error <- y-y.hat | + | > error <- y - y.hat |
- | > explained <- y.hat-m.y | + | > explained <- y.hat - m.y |
- | > error | + | > tmp2 <- data.frame(tmp, |
- | [,1] | + | > head(tmp2) |
- | [1,] -11.2359564 | + | x y y.hat explained |
- | [2,] -5.5518187 | + | 1 |
- | [3,] 28.0796455 | + | 2 107.72169 200.0202 205.5720 17.172003 |
- | [4,] -23.8269825 | + | 3 111.04574 241.0439 212.9643 24.564266 |
- | | + | 4 100.65968 166.0401 189.8671 |
- | | + | 5 103.85350 195.5682 196.9697 |
- | [7,] -11.0781515 | + | 6 122.91296 210.8411 239.3554 50.955365 -28.514250 |
- | [8,] 12.2870686 | + | > |
- | | + | > head(y.hat == explained + m.y) |
- | [10,] 6.9169740 | + | |
- | [11,] 12.1692981 | + | [1,] TRUE |
- | [12,] 21.2754922 | + | [2,] TRUE |
- | [13,] 34.1477152 | + | [3,] TRUE |
- | [14,] 15.5535765 | + | [4,] TRUE |
- | [15,] | + | [5,] TRUE |
- | [16,] 1.9999551 | + | [6,] TRUE |
- | [17,] 25.3866091 | + | > head(y == explained + error + m.y) |
- | [18,] | + | |
- | [19,] | + | [1,] TRUE |
- | [20,] 22.7934996 | + | [2,] TRUE |
- | [21,] -2.1051048 | + | [3,] TRUE |
- | [22,] | + | [4,] TRUE |
- | [23,] -15.3540820 | + | [5,] TRUE |
- | [24,] 4.5697607 | + | [6,] TRUE |
- | [25,] -14.0193869 | + | > |
- | [26,] -24.0959095 | + | |
- | [27,] -6.8896271 | + | |
- | [28,] | + | |
- | [29,] -15.0896661 | + | |
- | [30,] 8.7983528 | + | |
- | [31,] -35.8800136 | + | |
- | [32,] 1.2357770 | + | |
- | [33,] | + | |
- | [34,] | + | |
- | [35,] -4.6873617 | + | |
- | [36,] 5.0325839 | + | |
- | [37,] -18.2159090 | + | |
- | [38,] -25.8947992 | + | |
- | [39,] -0.6654576 | + | |
- | [40,] -21.3340113 | + | |
- | attr(," | + | |
- | [1] -0.08680858 | + | |
- | attr(," | + | |
- | [1] 1.039345 | + | |
> # note the below | > # note the below | ||
> plot(error) | > plot(error) | ||
> hist(error) | > hist(error) | ||
+ | > plot(x, | ||
> cor(x, | > cor(x, | ||
[,1] | [,1] | ||
[1,] 7.981058e-17 | [1,] 7.981058e-17 | ||
> ################ | > ################ | ||
- | > explained | + | > plot(x,explained) |
- | | + | |
- | | + | |
- | | + | |
- | | + | |
- | | + | |
- | | + | |
- | | + | |
- | | + | |
- | | + | |
- | | + | |
- | [10,] -12.7894406 | + | |
- | [11,] -6.0037071 | + | |
- | [12,] 22.7665939 | + | |
- | [13,] 32.7592720 | + | |
- | [14,] | + | |
- | [15,] 28.6722546 | + | |
- | [16,] -8.6180356 | + | |
- | [17,] | + | |
- | [18,] -26.2734989 | + | |
- | [19,] -11.6690128 | + | |
- | [20,] 23.8382597 | + | |
- | [21,] 12.9294844 | + | |
- | [22,] -0.2385288 | + | |
- | [23,] | + | |
- | [24,] | + | |
- | [25,] -4.7682316 | + | |
- | [26,] | + | |
- | [27,] -20.4771504 | + | |
- | [28,] -29.7065732 | + | |
- | [29,] -19.8229799 | + | |
- | [30,] -49.4332197 | + | |
- | [31,] 36.7485236 | + | |
- | [32,] -1.0667066 | + | |
- | [33,] -32.5871819 | + | |
- | [34,] -8.5651393 | + | |
- | [35,] | + | |
- | [36,] -10.1143632 | + | |
- | [37,] -50.3574953 | + | |
- | [38,] | + | |
- | [39,] -17.6473510 | + | |
- | [40,] -16.2976456 | + | |
- | attr(," | + | |
- | [1] -0.08680858 | + | |
- | attr(," | + | |
- | [1] 1.039345 | + | |
> cor(x, | > cor(x, | ||
[,1] | [,1] | ||
[1,] 1 | [1,] 1 | ||
+ | > plot(x, | ||
+ | > cor(x, | ||
+ | [,1] | ||
+ | [1,] 1 | ||
+ | > | ||
+ | > head(explained+error) | ||
+ | [,1] | ||
+ | [1,] -12.663308 | ||
+ | [2,] 11.620185 | ||
+ | [3,] 52.643911 | ||
+ | [4,] -22.359948 | ||
+ | [5,] | ||
+ | [6,] 22.441115 | ||
+ | > plot(x, | ||
+ | > head(explained+error+m.y == y) | ||
+ | [,1] | ||
+ | [1,] TRUE | ||
+ | [2,] TRUE | ||
+ | [3,] TRUE | ||
+ | [4,] TRUE | ||
+ | [5,] TRUE | ||
+ | [6,] TRUE | ||
+ | > plot(x, | ||
> | > | ||
+ | > cor(x, | ||
+ | [,1] | ||
+ | [1,] 0.7841619 | ||
+ | > cor(x, | ||
+ | [,1] | ||
+ | [1,] 0.7841619 | ||
+ | > cor(x,y) | ||
+ | [,1] | ||
+ | [1,] 0.7841619 | ||
> # see this also | > # see this also | ||
- | > tmp2 <- data.frame(x, | ||
> round(cor(tmp2), | > round(cor(tmp2), | ||
- | x explained error y | + | x |
- | x | + | x 1.000 0.784 1.000 1.000 0.000 |
- | explained 1.000 1.000 0.000 0.784 | + | y 0.784 1.000 0.784 0.784 0.621 |
- | error | + | y.hat |
- | y 0.784 0.784 0.621 1.000 | + | explained 1.000 0.784 1.000 |
+ | error | ||
> ############### | > ############### | ||
> | > | ||
+ | > # sum of square values | ||
> ss.res <- sum(error^2) | > ss.res <- sum(error^2) | ||
> ss.reg <- sum(explained^2) | > ss.reg <- sum(explained^2) | ||
Line 917: | Line 776: | ||
[1] 31366.84 | [1] 31366.84 | ||
> | > | ||
- | > k <- 1 # number of IV | + | > # degrees of freedom values |
- | > df.res <- n - k - 1 | + | > k <- 2 # number of parameters |
- | > df.reg <- 2 - 1 # number of parameter estimation a and b (slope) | + | > df.reg <- k - 1 # number of parameter estimation a and b (slope) |
+ | > df.res <- n - df.reg - 1 | ||
> ms.res <- ss.res/ | > ms.res <- ss.res/ | ||
> ms.reg <- ss.reg/ | > ms.reg <- ss.reg/ | ||
Line 937: | Line 797: | ||
[1] 317.87 | [1] 317.87 | ||
> | > | ||
+ | > # r square | ||
+ | > ss.y | ||
+ | [1] 31366.84 | ||
+ | > ss.reg | ||
+ | [1] 19287.78 | ||
+ | > r.sq <- ss.reg/ss.y | ||
+ | > r.sq | ||
+ | [1] 0.6149098 | ||
+ | > # 위의 r.sq 값이 충분히 컸는지를 알아보는 것은 | ||
+ | > # ss.reg가 충분히 컸는지를 알아보는 것 | ||
+ | > # 이는 IV, x가 y를 설명하는데 충분히 기여했는지 | ||
+ | > # 테스트하는 것. 이는 F-test를 이용해서 하게 된다 | ||
+ | > # F = MS.bet / MS.within | ||
+ | > # regression의 경우는 F = MS.reg / MS.res | ||
> f.cal <- ms.reg/ | > f.cal <- ms.reg/ | ||
> f.cal | > f.cal | ||
[1] 60.67819 | [1] 60.67819 | ||
- | > pf(f.cal, | + | > pf(f.cal, |
[1] 2.157284e-09 | [1] 2.157284e-09 | ||
+ | > # check anova test | ||
> anova(lm.df) | > anova(lm.df) | ||
Analysis of Variance Table | Analysis of Variance Table | ||
Line 973: | Line 848: | ||
F-statistic: | F-statistic: | ||
- | > # r square | ||
- | > ss.y | ||
- | [1] 31366.84 | ||
- | > ss.reg | ||
- | [1] 19287.78 | ||
- | > r.sq <- ss.reg/ss.y | ||
- | > r.sq | ||
- | [1] 0.6149098 | ||
> | > | ||
> # standard error for b coefficient | > # standard error for b coefficient | ||
+ | > # b coeffficient 값은 estimation | ||
+ | > # n.y 개의 데이터에서 구한 coefficient값 b는 | ||
+ | > # 실제 x, y 모집단을 대표하는 b값은 위의 | ||
+ | > # estimation에서 standard error값을 구간으로 갖는 | ||
+ | > # 범위일 것. | ||
+ | > | ||
> res <- resid(lm.df) | > res <- resid(lm.df) | ||
+ | > data.frame(head(res)) | ||
+ | | ||
+ | 1 -11.235956 | ||
+ | 2 -5.551819 | ||
+ | 3 28.079646 | ||
+ | 4 -23.826982 | ||
+ | 5 -1.401474 | ||
+ | 6 -28.514250 | ||
+ | > data.frame(head(error)) | ||
+ | head.error. | ||
+ | 1 -11.235956 | ||
+ | 2 | ||
+ | 3 | ||
+ | 4 -23.826982 | ||
+ | 5 | ||
+ | 6 -28.514250 | ||
> se.b <- sqrt(ms.res/ | > se.b <- sqrt(ms.res/ | ||
> se.b | > se.b | ||
[1] 0.285491 | [1] 0.285491 | ||
- | > b + c(-2*se.b, 2*se.b) | + | > # as x increased by 1 unit, y would increase b +- 2se.b |
+ | > b + c(-2*se.b, 2*se.b) | ||
[1] 1.652885 2.794849 | [1] 1.652885 2.794849 | ||
+ | > b | ||
+ | [1] 2.223867 | ||
+ | > | ||
> # to be exact | > # to be exact | ||
- | > t.crit <- qt(.025, df.res, lower.tail = F) | + | > t.crit <- qt(.975, df.res) |
- | > # note that t.crit | + | > t.crit |
+ | [1] 2.024394 | ||
> b + c(-t.crit*se.b, | > b + c(-t.crit*se.b, | ||
[1] 1.645921 2.801813 | [1] 1.645921 2.801813 | ||
- | > # see http:// | + | > b |
- | > | + | [1] 2.223867 |
- | > | + | > |
> t.cal <- lm.df$coefficients[2]/ | > t.cal <- lm.df$coefficients[2]/ | ||
> t.cal | > t.cal | ||
Line 1006: | Line 900: | ||
| | ||
2.157284e-09 | 2.157284e-09 | ||
- | > pf(f.cal, | + | > pf(f.cal, |
[1] 2.157284e-09 | [1] 2.157284e-09 | ||
> | > | ||
> # see also | > # see also | ||
+ | > t.cal | ||
+ | | ||
+ | 7.789621 | ||
> t.cal^2 | > t.cal^2 | ||
| | ||
Line 1015: | Line 912: | ||
> f.cal | > f.cal | ||
[1] 60.67819 | [1] 60.67819 | ||
- | > | ||
> | > | ||
> | > |
c/ms/2025/schedule/w11.lecture.note.1747743846.txt.gz · Last modified: 2025/05/20 21:24 by hkimscil