====== Code ======
You need to install some packages first:
<code>
install.packages("ggplot2")
install.packages("
install.packages("
</code>
<code>
################
rnorm2 <- function(n, mean, sd) { mean + sd * scale(rnorm(n)) }
n <- 16
set.seed(101)

points(rand.00,

b <- 170 / 265
b <- round(b,1)
b
y <- b * x + rand.00

lm.y.x <- lm(y ~ x, data = df)
summary(lm.y.x)

###########################
# double check with this
###########################
str(lm.y.x)
y.res <- lm.y.x$residuals
y.pre <- lm.y.x$fitted.values
y.exp <- y.pre - m.y
# plot each piece against x
plot(x, y.res)
plot(x, y.pre)
plot(x, y.exp)
###########################
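# A quick numerical check (an added sketch): the three pieces above
# should satisfy SS(total) = SS(explained) + SS(residual).
sum((y - m.y)^2)              # total sum of squares
sum(y.exp^2) + sum(y.res^2)   # explained + residual
all.equal(sum((y - m.y)^2), sum(y.exp^2) + sum(y.res^2))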
lm.y.x$coefficients
intercept <- lm.y.x$coefficients[1]

b
a
# we got these from the above
slope
intercept
summary(lm.y.x)
lm.y.x$coefficients
# str(lm.y.x)
r.square <- ss.reg / ss.y
r.square
sqrt(r.square)
cor(x,y)
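# Note (an added sketch): sqrt(r.square) recovers only the magnitude
# of r. With a negative slope, cor(x,y) would be negative while
# sqrt(r.square) stays positive; a sign-safe version:
sign(slope) * sqrt(r.square)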
df.tot <- df.y

res <- lm.y.x$residuals
reg <- lm.y.x$fitted.values-m.y

# The above can be derived as follows
y.hat <- intercept + (slope * x)
y.hat
head(y.hat)
head(lm.y.x$fitted.values)
y.pre <- y.hat
y.res <- y - y.pre
y.res
head(res)
head(y.res)

y.reg <- y.pre - m.y
head(reg)
head(y.reg)
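# A stricter check than head() (an added sketch): the hand-derived
# vectors should match lm()'s exactly. Here y.res and y.reg are
# n x 1 matrices (x and y were generated via scale()), so flatten
# them before comparing.
all.equal(as.vector(y.res), unname(res))
all.equal(as.vector(y.reg), unname(reg))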
########################################
# This is essentially a perfect linear fit

summary(lm.res.x)
summary(lm.y.x)
########################################
# Now let's get back to the lm output
# this is to evaluate the significance of
# the slope of x (b)
summary(lm.y.x)

# the slope, we already got this
b <- slope
# intercept
a <- intercept

# the real data look like this
plot(x,y)
abline(h = mean(y), lwd=1.5, col="red")
abline(lm(y ~ x), lwd=1.5, col="blue")

# blue line is the regression line
# this slope is an estimate of the real
# (population) slope rather than just
# the sample we are using right now
library(ggplot2)

ggplot(data = df, aes(x,y)) +
  geom_point() +
  geom_smooth(method="lm")

# the real slope should reside within
# the violet area. The area is estimated
# with the standard error of the slope
# (regression coefficient)
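# The same band in base R (an added sketch): predict() with
# interval = "confidence" returns fit, lwr, and upr columns for the
# 95% confidence interval around the regression line.
head(predict(lm.y.x, interval = "confidence"))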

# in words:
# the se for b comes from summary(lm.y.x)

summary(lm.y.x)
se.b <- 0.2071   # Std. Error for x, read off the summary output
ci.se.b <- 1.96 * se.b
# this is the lowest estimate
b - ci.se.b
# the highest estimate
b + ci.se.b
# calculated estimate
b
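# R computes this interval directly (an added sketch); confint()
# uses the exact t quantile with df = n-2 instead of 1.96, so its
# bounds differ slightly from the hand calculation above.
confint(lm.y.x, level = 0.95)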

# how to get se for b?
# https://
# see also http://

ss.res
ss.x
# we didn't get ss.x above
ss.x <- sum((x-m.x)^2)
ss.x

sqrt( (ss.res/(n-2)) / ss.x )
# the above is the same as the below
# (note the parentheses: ss.res/(n-2)*ss.x would multiply by ss.x)
sqrt( ss.res / ((n-2)*ss.x) )
sqrt( 1/(n-2) * (ss.res/ss.x) )
se.b <- sqrt( 1/(n-2) * (ss.res/ss.x) )
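# Cross-check (an added sketch): the coefficient standard errors are
# the square roots of the diagonal of the coefficient
# variance-covariance matrix.
sqrt(diag(vcov(lm.y.x)))
se.b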

# now
summary(lm.y.x)
# if b is 0, b does not do anything (null hypothesis)
# To test whether b is working, we do a t-test:
# t.calculated = (b (current coefficient) - 0 (not working)) / se,
# then we test whether this value is significant with
# pt(t.b.cal, df=n-2)

t.b.cal <- (b - 0)/se.b
t.b.cal
# k = num of predictor variables
k <- 1
df.t <- n-k-1

2*pt(t.b.cal, df.t, lower.tail = F)
pf(t.b.cal^2, 1, df.t, lower.tail = F)
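# The same t and p values can be read straight off the fitted model:
# row "x" of the coefficient table holds the slope's statistics.
coef(summary(lm.y.x))["x", ]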
</code>
====== Output ======
<code>
> ################
> rnorm2 <- function(n, mean, sd) { mean + sd * scale(rnorm(n)) }
> n <- 16
> 
> set.seed(101)

> points(rand.00,
> 
> b <- 170 / 265
> b <- round(b,1)
> b
[1] 0.6
> y <- b * x + rand.00
> 
> head(df)
         x        y
1 256.4615 144.9723
2 273.7812 167.6050
3 249.5828 141.2775
4 267.1155 135.1836
5 269.0162 161.7509
6 286.0342 183.7182
> 
> # plot method 0

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,	Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727
> 
> ###########################
> # double check with this
> ###########################
> str(lm.y.x)
List of 12
 $ coefficients : Named num [1:2] -52.9 0.8
  ..- attr(*, "names")= chr [1:2] "(Intercept)" "x"
 $ residuals    : Named num [1:16]
  ..- attr(*, "names")= chr [1:16] "1" "2" "3" "4" ...
 $ effects      : Named num [1:16]
  ..- attr(*, "names")= chr [1:16] "(Intercept)" "x" "" "" ...
 $ rank         : int 2
 $ fitted.values: Named num [1:16]
  ..- attr(*, "names")= chr [1:16] "1" "2" "3" "4" ...
 $ assign       : int [1:2] 0 1
 $ qr           :List of 5
  ..$ qr   : num [1:16, 1:2] -4 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 ...
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : chr [1:16] "1" "2" "3" "4" ...
  .. .. ..$ : chr [1:2] "(Intercept)" "x"
  .. ..- attr(*, "assign")= int [1:2] 0 1
  ..$ qraux: num [1:2] 1.25 1.18
  ..$ pivot: int [1:2] 1 2
  ..$ tol  : num 1e-07
  ..$ rank : int 2
  ..- attr(*, "class")= chr "qr"
 $ df.residual  : int 14
 $ xlevels      : Named list()
 $ call         : language lm(formula = y ~ x, data = df)
 $ terms        :Classes 'terms', 'formula'  language y ~ x
  .. ..- attr(*, "variables")= language list(y, x)
  .. ..- attr(*, "factors")= int [1:2, 1] 0 1
  .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. ..$ : chr [1:2] "y" "x"
  .. .. .. ..$ : chr "x"
  .. ..- attr(*, "term.labels")= chr "x"
  .. ..- attr(*, "order")= int 1
  .. ..- attr(*, "intercept")= int 1
  .. ..- attr(*, "response")= int 1
  .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
  .. ..- attr(*, "predvars")= language list(y, x)
  .. ..- attr(*, "dataClasses")= Named chr [1:2] "nmatrix.1" "nmatrix.1"
  .. .. ..- attr(*, "names")= chr [1:2] "y" "x"
 $ model        :'data.frame':	16 obs. of  2 variables:
  ..$ y: num [1:16] 145 168 141 135 162 ...
  ..$ x: num [1:16] 256 274 250 267 269 ...
  ..- attr(*, "terms")=Classes 'terms', 'formula'  language y ~ x
  .. .. ..- attr(*, "variables")= language list(y, x)
  .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
  .. .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. .. ..$ : chr [1:2] "y" "x"
  .. .. .. .. ..$ : chr "x"
  .. .. ..- attr(*, "term.labels")= chr "x"
  .. .. ..- attr(*, "order")= int 1
  .. .. ..- attr(*, "intercept")= int 1
  .. .. ..- attr(*, "response")= int 1
  .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
  .. .. ..- attr(*, "predvars")= language list(y, x)
  .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "nmatrix.1" "nmatrix.1"
  .. .. .. ..- attr(*, "names")= chr [1:2] "y" "x"
 - attr(*, "class")= chr "lm"
> y.res <- lm.y.x$residuals
> y.pre <- lm.y.x$fitted.values
> y.exp <- y.pre - m.y
> plot(x, y.res)
> plot(x, y.pre)
> plot(x, y.exp)
> ###########################
> lm.y.x$coefficients
(Intercept)           x 
-52.8817788   0.7995539 
> intercept <- lm.y.x$coefficients[1]
> slope <- lm.y.x$coefficients[2]
> intercept
(Intercept) 
  -52.88178 
> slope
        x 
0.7995539 
> 
> # library(ggplot2)

> ss.y <- sum((y-mean(y))^2)
> ss.y
[1] 4183.193
> df.y <- length(y)-1
> df.y
[1] 15
> ss.y/df.y
[1] 278.8795
> var(y)
         [,1]
[1,] 278.8795
> 
> m.x <- mean(x)

> df.tot <- n-1
> sp/df.tot
[1] 179.8996
> cov.xy <- cov(x,y)
> 

> r.xy
          [,1]
[1,] 0.7181756
> cor(x,y)
          [,1]
[1,] 0.7181756
> cor.test(x,y)

data:  x and y
t = 3.8616, df = 14, p-value = 0.001727
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.3454526
sample estimates:
      cor 
0.7181756 
> 

> b
          [,1]
[1,] 0.7995539
> a
          [,1]
[1,] -52.88178
> # we got these from the above
> slope
        x 
0.7995539 
> intercept
(Intercept) 
  -52.88178 
> 
> 

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,	Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727
> lm.y.x$coefficients
(Intercept)           x 
-52.8817788   0.7995539 
> # str(lm.y.x)
> 

> y <- y
> m.y
[1] 159
> 
> res <- y - y.pred
> reg <- y.pred - m.y
> ss.y
[1] 4183.193
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> ss.y
[1] 4183.193
> ss.res
[1] 2025.602
> ss.reg
[1] 2157.592
> ss.res + ss.reg
[1] 4183.193
> 
> r.square <- ss.reg / ss.y
> r.square
[1] 0.5157762
> sqrt(r.square)
[1] 0.7181756
> cor(x,y)
          [,1]
[1,] 0.7181756
> 
> df.tot <- df.y

[1] 1
> df.res
[1] 14
> df.tot
[1] 15
> 
> ss.tot <- ss.y
> ss.reg
[1] 2157.592
> ss.res
[1] 2025.602
> ss.tot
[1] 4183.193
> 
> ms.reg <- ss.reg / df.reg
> ms.res <- ss.res / df.res
> ms.reg
[1] 2157.592
> ms.res
[1] 144.6858
> 
> f.cal <- ms.reg/ms.res
> f.cal
[1] 14.91225
> 
> p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
> p.val
[1] 0.001727459
> anova(lm.y.x)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value   Pr(>F)   
x          1 2157.6 2157.59  14.912 0.001727 **
Residuals 14 2025.6  144.69                    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

> res <- lm.y.x$residuals
> reg <- lm.y.x$fitted.values-m.y
> 
> # The above can be derived as follows
> y.hat <- intercept + (slope * x)
> y.hat
          [,1]
 [1,] 152.1730
 [2,] 166.0210
 [3,] 146.6731
 [4,] 160.6914
 [5,] 162.2112
 [6,] 175.8179
 [7,] 167.0666
 [8,] 155.5354
 [9,] 171.7678
[10,] 153.7931
[11,] 165.6110
[12,] 144.7831
[13,] 179.8185
[14,] 134.1906
[15,] 153.5815
[16,] 154.2648
attr(,"scaled:center")
[1] 0.1070574
attr(,"scaled:scale")
[1] 0.7608404
> head(y.hat)
         [,1]
[1,] 152.1730
[2,] 166.0210
[3,] 146.6731
[4,] 160.6914
[5,] 162.2112
[6,] 175.8179
> head(lm.y.x$fitted.values)
       1        2        3        4        5        6 
152.1730 166.0210 146.6731 160.6914 162.2112 175.8179 
> y.pre <- y.hat
> y.res <- y - y.pre
> y.res
             [,1]
 [1,]  -7.2007773
 [2,]   1.5840208
 [3,]  -5.3956693
 [4,] -25.5078178
 [5,]  -0.4602376
 [6,]   7.9002773
 [7,]
 [8,] -16.3176727
 [9,]
[10,] -15.1613471
[11,]
[12,]
[13,]
[14,]  15.4541201
[15,]  15.9625789
[16,]
attr(,"scaled:center")
[1] 0.1070574
attr(,"scaled:scale")
[1] 0.7608404
> head(res)
          1           2           3           4           5           6 
 -7.2007773   1.5840208  -5.3956693 -25.5078178  -0.4602376   7.9002773 
> head(y.res)
            [,1]
[1,]  -7.2007773
[2,]   1.5840208
[3,]  -5.3956693
[4,] -25.5078178
[5,]  -0.4602376
[6,]   7.9002773
> 
> y.reg <- y.pre - m.y
> head(reg)

> head(y.reg)
           [,1]
[1,]  -6.826963
[2,]
[3,] -12.326872
[4,]
[5,]
[6,]  16.817938
> ########################################
> 
> # This is essentially a perfect linear fit

Residuals:
       Min         1Q     Median         3Q        Max 
-1.216e-14 -9.993e-15 -2.381e-15  7.912e-15

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.119e+02
x
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error:
Multiple R-squared:      1,	Adjusted R-squared:      1 
F-statistic:
Warning message:
In summary.lm(lm.reg.x) :
  essentially perfect fit: summary may be unreliable
> 
> # The relationship between residuals and

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.956e-14  5.495e+01
x

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:
F-statistic:
> summary(lm.y.x)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,	Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727
> 
> ########################################
> # Now let's get back to the lm output
> # this is to evaluate the significance of
> # the slope of x (b)
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,	Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> 
> # the slope, we already got this
> b <- slope
> # intercept
> a <- intercept
> 
> # the real data look like this
> plot(x,y)
> abline(h = mean(y), lwd=1.5, col="red")
> abline(lm(y ~ x), lwd=1.5, col="blue")
> 
> # blue line is the regression line
> # this slope is an estimate of the real
> # (population) slope rather than just
> # the sample we are using right now
> library(ggplot2)
> 
> ggplot(data = df, aes(x,y)) +
+   geom_point() +
+   geom_smooth(method="lm")
`geom_smooth()` using formula = 'y ~ x'
> 
> # the real slope should reside within
> # the violet area. The area is estimated
> # with the standard error of the slope
> # (regression coefficient)
> 
> # in words:
> # the se for b comes from summary(lm.y.x)
> 
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,	Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> se.b <- 0.2071
> ci.se.b <- 1.96 * se.b
> # this is the lowest estimate
> b - ci.se.b
        x 
0.3936379 
> # the highest estimate
> b + ci.se.b
      x 
1.20547 
> # calculated estimate
> b
        x 
0.7995539 
> 
> # how to get se for b?
> # https://
> # see also http://
> 
> ss.res
[1] 2025.602
> ss.x
[1] 3375
> # we didn't get ss.x above
> ss.x <- sum((x-m.x)^2)
> ss.x
[1] 3375
> 
> sqrt( (ss.res/(n-2)) / ss.x )
[1] 0.2070504
> # the above is the same as the below
> sqrt( ss.res / ((n-2)*ss.x) )
[1] 0.2070504
> sqrt( 1/(n-2) * (ss.res/ss.x) )
[1] 0.2070504
> se.b <- sqrt( 1/(n-2) * (ss.res/ss.x) )
> 
> # now
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,	Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> # if b is 0, b does not do anything (null hypothesis)
> # To test whether b is working, we do a t-test:
> # t.calculated = (b (current coefficient) - 0 (not working)) / se,
> # then we test whether this value is significant with
> # pt(t.b.cal, df=n-2)
> 
> t.b.cal <- (b - 0)/se.b
> t.b.cal
       x 
3.861639 
> # k = num of predictor variables
> k <- 1
> df.t <- n-k-1
> 
> 2*pt(t.b.cal, df.t, lower.tail = F)
          x 
0.001727459 
> pf(t.b.cal^2, 1, df.t, lower.tail = F)
          x 
0.001727459 
> 
</code>
====== Graphics output ======
(The figures here are the plots produced by the session above: the scatter of the generated data, x plotted against the residual, predicted, and explained values, the base-R scatter with the mean line and regression line, and the ggplot2 scatter with the fitted line and its confidence band.)