====== Code ======
You need to install some packages first; ggplot2 is the only package the code below actually loads.
<code>
install.packages("ggplot2")
</code>
<code>
################
rnorm2 <- function(n, mean, sd) { mean + sd * scale(rnorm(n)) }
n <- 16
set.seed(101)
x <- rnorm2(n, 265, 15)
rand.00 <- rnorm2(n, 0, 12)
rand.01 <- rnorm2(n, 0, 240)
plot(rand.01)
points(rand.00, col = "red")
b <- 170 / 265
b <- round(b, 1)
b
y <- b * x + rand.00

df <- data.frame(x, y)
head(df)

# plot method 0
plot(x, y)

# method 1
plot(x, y, pch = 1, cex = 1, col = "black")
abline(h = mean(y), lwd = 1.5, col = "red")
abline(lm(y ~ x), lwd = 1.5, col = "blue")

# method 2
lm.y.x <- lm(y ~ x, data = df)
summary(lm.y.x)

###########################
# double check with this
###########################
str(lm.y.x)
m.y <- mean(y)   # mean of y, needed for the explained part below
y.res <- lm.y.x$residuals
y.pre <- lm.y.x$fitted.values
y.exp <- y.pre - m.y
plot(x, y.res)
plot(x, y.pre)
plot(x, y.exp)
###########################
lm.y.x$coefficients
intercept <- lm.y.x$coefficients[1]
slope <- lm.y.x$coefficients[2]
intercept
slope

# library(ggplot2)
# ggplot(data=df, aes(x=x, y=y)) +
#   geom_point(color="red") +
#   geom_hline(aes(yintercept=mean(y))) +
#   geom_abline(intercept=intercept, slope=slope)
#
#####
ss.y <- sum((y-mean(y))^2)
ss.y
df.y <- length(y)-1
df.y
ss.y/df.y
var(y)

m.x <- mean(x)
m.y <- mean(y)
sp <- sum((x-m.x)*(y-m.y))
df.tot <- n-1
sp/df.tot
cov.xy <- cov(x,y)

sd.x <- sd(x)
sd.y <- sd(y)

r.xy <- cov.xy / (sd.x * sd.y)
r.xy
cor(x,y)
cor.test(x, y)
b <- cov(x,y) / var(x)
# note that b = cov(x,y) / var(x)
# = (sp(x,y)/(n-1)) / (ss(x)/(n-1))
# = sp(x,y) / ss(x)
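# quick numerical check of the identity above
# (sp and m.x were computed earlier; this check is
# an addition and does not appear in the output below)
sp / sum((x-m.x)^2)   # sp(x,y) / ss(x)
cov(x,y) / var(x)     # identical value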
# m.y = a + b * m.x, so:
a <- m.y - (b * m.x)
b
a
# we got these from the above
slope
intercept

summary(lm.y.x)
lm.y.x$coefficients
# str(lm.y.x)

y.pred <- lm.y.x$fitted.values
y <- y
m.y

res <- y - y.pred
reg <- y.pred - m.y
ss.y
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.y
ss.res
ss.reg
ss.res + ss.reg

r.square <- ss.reg / ss.y
r.square
sqrt(r.square)
cor(x,y)

df.tot <- df.y
df.reg <- 2 - 1
df.res <- df.tot - df.reg

df.reg
df.res
df.tot

ss.tot <- ss.y
ss.reg
ss.res
ss.tot

ms.reg <- ss.reg / df.reg
ms.res <- ss.res / df.res
ms.reg
ms.res

f.cal <- ms.reg/ms.res
f.cal
p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
p.val
anova(lm.y.x)

########################################
# also make sure that you understand this
########################################
res <- lm.y.x$residuals
reg <- lm.y.x$fitted.values - m.y

# The above can be derived as follows
y.hat <- intercept + (slope * x)
y.hat
head(y.hat)
head(lm.y.x$fitted.values)
y.pre <- y.hat
y.res <- y - y.pre
y.res
head(res)
head(y.res)
| + | |||
| + | y.reg <- y.pre - m.y | ||
| + | head(reg) | ||
| + | head(y.reg) | ||
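# compact check that the manual and lm-based pieces agree
# (as.vector drops the matrix/names attributes; this check is
# an addition and does not appear in the output below)
all.equal(as.vector(y.res), as.vector(res), check.attributes = FALSE)
all.equal(as.vector(y.reg), as.vector(reg), check.attributes = FALSE)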
########################################

# This is essentially a perfect linear
# relationship between the regression
# line and the x data
lm.reg.x <- lm(reg~x, data = df)
summary(lm.reg.x)

# The relationship between the residuals
# and x should be zero
lm.res.x <- lm(res~x, data = df)
summary(lm.res.x)
summary(lm.y.x)

########################################
# Now let's get back to the lm output.
# This is to evaluate the significance
# of the slope of x (b)
summary(lm.y.x)

# the slope, we already got this
b <- slope
# the intercept
a <- intercept
| + | |||
| + | # the real data look like this | ||
| + | plot(x,y) | ||
| + | abline(h = mean(y), lwd=1.5, col=" | ||
| + | abline(lm(y ~ x), lwd=1.5, col=" | ||
| + | |||
| + | # blue line is regression line | ||
| + | # this slope is an estimate of a real | ||
| + | # thing (population) rather than the | ||
| + | # sample we are using right now) | ||
| + | library(ggplot2) | ||
| + | |||
| + | ggplot(data = df, aes(x,y)) + | ||
| + | geom_point() + | ||
| + | geom_smooth(method=" | ||
| + | |||
| + | # the real slope should reside in between | ||
| + | # the violet area. The area is estimated | ||
| + | # with standard error of slop (regression | ||
| + | # coefficient) | ||
| + | |||
| + | # in word | ||
| + | # se for b is | ||
| + | # from summary(lm.y.x) | ||
| + | |||
| + | summary(lm.y.x) | ||
| + | se.b <- 0.0401 | ||
| + | ci.se.b <- 1.96 * se.b | ||
| + | # this is the lowest estimate | ||
| + | b - ci.se.b | ||
| + | # the highest estimate | ||
| + | b + ci.se.b | ||
| + | # calculated estimate | ||
| + | b | ||
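# Note: 1.96 is the large-sample (normal) multiplier.
# With df = n-2 = 14 the exact multiplier is qt(.975, 14),
# which is what confint() uses (this check is an addition
# and does not appear in the output below)
qt(0.975, df = 14) * se.b
confint(lm.y.x, level = 0.95)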
| + | |||
| + | # how to get se for b? | ||
| + | # https:// | ||
| + | # see also http:// | ||
| + | |||
| + | ss.res | ||
| + | ss.x | ||
| + | # we didn't get ss.x | ||
| + | ss.x <- sum((x-m.x)^2) | ||
| + | ss.x | ||
| + | |||
| + | sqrt( (ss.res/ | ||
| + | # the above is the same as the below | ||
| + | sqrt( ss.res / (n-2)*ss.x ) | ||
| + | sqrt( 1/(n-2) * (ss.res/ | ||
| + | se.b <- sqrt( 1/(n-2) * (ss.res/ | ||
| + | |||
| + | # now | ||
| + | summary(lm.y.x) | ||
| + | # if b is 0, b does not do anything (null hypothesis) | ||
| + | # To test if b is working, we do t-test by | ||
| + | # b (current coefficient) - 0 (not working) / se | ||
| + | # = t.calculated. We test if this is significant | ||
| + | # tp(t.b.cal, df=n-2) | ||
| + | |||
| + | t.b.cal <- (b - 0)/ | ||
| + | t.b.cal | ||
| + | # k = num of predictor variables | ||
| + | k <- 1 | ||
| + | df.t <- n-k-1 | ||
| + | |||
| + | 2*pt(t.b.cal, | ||
| + | pf(t.b.cal^2, | ||
| </ | </ | ||
| + | |||
| + | ====== Output ====== | ||
| + | < | ||
| + | > ################ | ||
> rnorm2 <- function(n, mean, sd) { mean + sd * scale(rnorm(n)) }
> n <- 16
> 
> set.seed(101)
> x <- rnorm2(n, 265, 15)
> rand.00 <- rnorm2(n, 0, 12)
> 
> rand.01 <- rnorm2(n, 0, 240)
> plot(rand.01)
> points(rand.00, col = "red")
> 
> b <- 170 / 265
> b <- round(b,1)
> b
[1] 0.6
> y <- b * x + rand.00
> 
> df <- data.frame(x, y)
> head(df)
         x        y
1 256.4615 144.9723
2 273.7812 167.6050
3 249.5828 141.2775
4 267.1155 135.1836
5 269.0162 161.7509
6 286.0342 183.7182
> 
> # plot method 0
> plot(x,y)
> 
> # method 1
> plot(x, y, pch = 1, cex = 1, col = "black")
> abline(h = mean(y), lwd=1.5, col="red")
> abline(lm(y ~ x), lwd=1.5, col="blue")
> 
> # method 2
> lm.y.x <- lm(y ~ x, data = df)
> summary(lm.y.x)
| + | |||
| + | Call: | ||
| + | lm(formula = y ~ x, data = df) | ||
| + | |||
| + | Residuals: | ||
| + | Min 1Q Median | ||
| + | -25.508 | ||
| + | |||
| + | Coefficients: | ||
| + | Estimate Std. Error t value Pr(> | ||
| + | (Intercept) -52.8818 | ||
| + | x | ||
| + | --- | ||
| + | Signif. codes: | ||
| + | |||
| + | Residual standard error: 12.03 on 14 degrees of freedom | ||
| + | Multiple R-squared: | ||
| + | F-statistic: | ||
| + | |||
| + | > | ||
| + | > ########################### | ||
| + | > # double check with this | ||
| + | > ########################### | ||
| + | > str(lm.y.x) | ||
| + | List of 12 | ||
| + | $ coefficients : Named num [1:2] -52.9 0.8 | ||
| + | ..- attr(*, " | ||
| + | $ residuals | ||
| + | ..- attr(*, " | ||
| + | $ effects | ||
| + | ..- attr(*, " | ||
| + | $ rank : int 2 | ||
| + | $ fitted.values: | ||
| + | ..- attr(*, " | ||
| + | $ assign | ||
| + | $ qr :List of 5 | ||
| + | ..$ qr : num [1:16, 1:2] -4 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 ... | ||
| + | .. ..- attr(*, " | ||
| + | .. .. ..$ : chr [1:16] " | ||
| + | .. .. ..$ : chr [1:2] " | ||
| + | .. ..- attr(*, " | ||
| + | ..$ qraux: num [1:2] 1.25 1.18 | ||
| + | ..$ pivot: int [1:2] 1 2 | ||
| + | ..$ tol : num 1e-07 | ||
| + | ..$ rank : int 2 | ||
| + | ..- attr(*, " | ||
| + | $ df.residual | ||
| + | $ xlevels | ||
| + | $ call : language lm(formula = y ~ x, data = df) | ||
| + | $ terms :Classes ' | ||
| + | .. ..- attr(*, " | ||
| + | .. ..- attr(*, " | ||
| + | .. .. ..- attr(*, " | ||
| + | .. .. .. ..$ : chr [1:2] " | ||
| + | .. .. .. ..$ : chr " | ||
| + | .. ..- attr(*, " | ||
| + | .. ..- attr(*, " | ||
| + | .. ..- attr(*, " | ||
| + | .. ..- attr(*, " | ||
| + | .. ..- attr(*, " | ||
| + | .. ..- attr(*, " | ||
| + | .. ..- attr(*, " | ||
| + | .. .. ..- attr(*, " | ||
| + | $ model :' | ||
| + | ..$ y: num [1:16] 145 168 141 135 162 ... | ||
| + | ..$ x: num [1:16] 256 274 250 267 269 ... | ||
| + | ..- attr(*, " | ||
| + | .. .. ..- attr(*, " | ||
| + | .. .. ..- attr(*, " | ||
| + | .. .. .. ..- attr(*, " | ||
| + | .. .. .. .. ..$ : chr [1:2] " | ||
| + | .. .. .. .. ..$ : chr " | ||
| + | .. .. ..- attr(*, " | ||
| + | .. .. ..- attr(*, " | ||
| + | .. .. ..- attr(*, " | ||
| + | .. .. ..- attr(*, " | ||
| + | .. .. ..- attr(*, " | ||
| + | .. .. ..- attr(*, " | ||
| + | .. .. ..- attr(*, " | ||
| + | .. .. .. ..- attr(*, " | ||
| + | - attr(*, " | ||
> y.res <- lm.y.x$residuals
> y.pre <- lm.y.x$fitted.values
> m.y <- mean(y)
> y.exp <- y.pre - m.y
> plot(x, y.res)
> plot(x, y.pre)
> plot(x, y.exp)
> ###########################
> lm.y.x$coefficients
(Intercept)           x 
-52.8817788   0.7995539 
> intercept <- lm.y.x$coefficients[1]
> slope <- lm.y.x$coefficients[2]
> intercept
(Intercept) 
  -52.88178 
> slope
        x 
0.7995539 
> 
> # library(ggplot2)
> # ggplot(data=df, aes(x=x, y=y)) +
> #   geom_point(color="red") +
> #   geom_hline(aes(yintercept=mean(y))) +
> #   geom_abline(intercept=intercept, slope=slope)
> #
> #####
> ss.y <- sum((y-mean(y))^2)
> ss.y
[1] 4183.193
> df.y <- length(y)-1
> df.y
[1] 15
> ss.y/df.y
[1] 278.8795
> var(y)
         [,1]
[1,] 278.8795
> 
> m.x <- mean(x)
> m.y <- mean(y)
> sp <- sum((x-m.x)*(y-m.y))
> df.tot <- n-1
> sp/df.tot
[1] 179.8996
> cov.xy <- cov(x,y)
> 
> sd.x <- sd(x)
> sd.y <- sd(y)
> 
> r.xy <- cov.xy / (sd.x * sd.y)
> r.xy
          [,1]
[1,] 0.7181756
> cor(x,y)
          [,1]
[1,] 0.7181756
> cor.test(x, y)

	Pearson's product-moment correlation

data:  x and y
t = 3.8616, df = 14, p-value = 0.001727
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 ...
sample estimates:
      cor 
0.7181756 

> 
> b <- cov(x,y) / var(x)
> # note that b = cov(x,y) / var(x)
> # = (sp(x,y)/(n-1)) / (ss(x)/(n-1))
> # = sp(x,y) / ss(x)
> # m.y = a + b * m.x, so:
> a <- m.y - (b * m.x)
> b
          [,1]
[1,] 0.7995539
> a
          [,1]
[1,] -52.88178
> # we got these from the above
> slope
        x 
0.7995539 
> intercept
(Intercept) 
  -52.88178 
> 
| + | > | ||
| + | > summary(lm.y.x) | ||
| + | |||
| + | Call: | ||
| + | lm(formula = y ~ x, data = df) | ||
| + | |||
| + | Residuals: | ||
| + | Min 1Q Median | ||
| + | -25.508 | ||
| + | |||
| + | Coefficients: | ||
| + | Estimate Std. Error t value Pr(> | ||
| + | (Intercept) -52.8818 | ||
| + | x | ||
| + | --- | ||
| + | Signif. codes: | ||
| + | |||
| + | Residual standard error: 12.03 on 14 degrees of freedom | ||
| + | Multiple R-squared: | ||
| + | F-statistic: | ||
| + | |||
> lm.y.x$coefficients
(Intercept)           x 
-52.8817788   0.7995539 
> # str(lm.y.x)
> 
> y.pred <- lm.y.x$fitted.values
> y <- y
> m.y
[1] 159
> 
> res <- y - y.pred
> reg <- y.pred - m.y
> ss.y
[1] 4183.193
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> ss.y
[1] 4183.193
> ss.res
[1] 2025.602
> ss.reg
[1] 2157.592
> ss.res + ss.reg
[1] 4183.193
> 
> r.square <- ss.reg / ss.y
> r.square
[1] 0.5157762
> sqrt(r.square)
[1] 0.7181756
> cor(x,y)
          [,1]
[1,] 0.7181756
> 
> df.tot <- df.y
> df.reg <- 2 - 1
> df.res <- df.tot - df.reg
> 
> df.reg
[1] 1
> df.res
[1] 14
> df.tot
[1] 15
> 
> ss.tot <- ss.y
> ss.reg
[1] 2157.592
> ss.res
[1] 2025.602
> ss.tot
[1] 4183.193
> 
> ms.reg <- ss.reg / df.reg
> ms.res <- ss.res / df.res
> ms.reg
[1] 2157.592
> ms.res
[1] 144.6858
> 
> f.cal <- ms.reg/ms.res
> f.cal
[1] 14.91225
> 
> p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
> p.val
[1] 0.001727459
> anova(lm.y.x)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value   Pr(>F)   
x          1 2157.6 2157.59  14.912 0.001727 **
Residuals 14 2025.6  144.69                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> 
> ########################################
> # also make sure that you understand this
> ########################################
| + | > | ||
| + | > res <- lm.y.x$residuals | ||
| + | > reg <- lm.y.x$fitted.values-m.y | ||
| + | > | ||
| + | > # The above can be derived as follows | ||
| + | > y.hat <- intercept + (slope * x) | ||
| + | > y.hat | ||
| + | [,1] | ||
| + | [1,] 152.1730 | ||
| + | [2,] 166.0210 | ||
| + | [3,] 146.6731 | ||
| + | [4,] 160.6914 | ||
| + | [5,] 162.2112 | ||
| + | [6,] 175.8179 | ||
| + | [7,] 167.0666 | ||
| + | [8,] 155.5354 | ||
| + | [9,] 171.7678 | ||
| + | [10,] 153.7931 | ||
| + | [11,] 165.6110 | ||
| + | [12,] 144.7831 | ||
| + | [13,] 179.8185 | ||
| + | [14,] 134.1906 | ||
| + | [15,] 153.5815 | ||
| + | [16,] 154.2648 | ||
| + | attr(," | ||
| + | [1] 0.1070574 | ||
| + | attr(," | ||
| + | [1] 0.7608404 | ||
| + | > head(y.hat) | ||
| + | [,1] | ||
| + | [1,] 152.1730 | ||
| + | [2,] 166.0210 | ||
| + | [3,] 146.6731 | ||
| + | [4,] 160.6914 | ||
| + | [5,] 162.2112 | ||
| + | [6,] 175.8179 | ||
| + | > head(lm.y.x$fitted.values) | ||
| + | | ||
| + | 152.1730 166.0210 146.6731 160.6914 162.2112 175.8179 | ||
| + | > y.pre <- y.hat | ||
| + | > y.res <- y - y.pre | ||
| + | > y.res | ||
| + | [,1] | ||
| + | | ||
| + | | ||
| + | | ||
| + | [4,] -25.5078178 | ||
| + | | ||
| + | | ||
| + | | ||
| + | [8,] -16.3176727 | ||
| + | | ||
| + | [10,] -15.1613471 | ||
| + | [11,] | ||
| + | [12,] | ||
| + | [13,] | ||
| + | [14,] 15.4541201 | ||
| + | [15,] 15.9625789 | ||
| + | [16,] | ||
| + | attr(," | ||
| + | [1] 0.1070574 | ||
| + | attr(," | ||
| + | [1] 0.7608404 | ||
| + | > head(res) | ||
| + | 1 | ||
| + | | ||
| + | > head(y.res) | ||
| + | [,1] | ||
| + | [1,] -7.2007773 | ||
| + | [2,] | ||
| + | [3,] -5.3956693 | ||
| + | [4,] -25.5078178 | ||
| + | [5,] -0.4602376 | ||
| + | [6,] | ||
| + | > | ||
| + | > y.reg <- y.pre - m.y | ||
| + | > head(reg) | ||
| + | | ||
| + | | ||
| + | > head(y.reg) | ||
| + | [,1] | ||
| + | [1,] -6.826963 | ||
| + | [2,] | ||
| + | [3,] -12.326872 | ||
| + | [4,] | ||
| + | [5,] | ||
| + | [6,] 16.817938 | ||
| + | > ######################################## | ||
| + | > | ||
| + | > # This is essentially a perfect linear | ||
| + | > # relationship between the regression | ||
| + | > # line and x data | ||
| + | > lm.reg.x <- lm(reg~x, data = df) | ||
| + | > summary(lm.reg.x) | ||
| + | |||
| + | Call: | ||
| + | lm(formula = reg ~ x, data = df) | ||
| + | |||
| + | Residuals: | ||
| + | | ||
| + | -1.216e-14 -9.993e-15 -2.381e-15 | ||
| + | |||
| + | Coefficients: | ||
| + | Estimate Std. Error t value Pr(> | ||
| + | (Intercept) -2.119e+02 | ||
| + | x 7.996e-01 | ||
| + | --- | ||
| + | Signif. codes: | ||
| + | |||
| + | Residual standard error: 1.165e-14 on 14 degrees of freedom | ||
| + | Multiple R-squared: | ||
| + | F-statistic: | ||
| + | |||
| + | 경고메시지(들): | ||
| + | summary.lm(lm.reg.x)에서: | ||
| + | essentially perfect fit: summary may be unreliable | ||
| + | > | ||
| + | > # The relationship between residuals and | ||
| + | > # x should be zero | ||
| + | > lm.res.x <- lm(res~x, data = df) | ||
| + | > summary(lm.res.x) | ||
| + | |||
| + | Call: | ||
| + | lm(formula = res ~ x, data = df) | ||
| + | |||
| + | Residuals: | ||
| + | Min 1Q Median | ||
| + | -25.508 | ||
| + | |||
| + | Coefficients: | ||
| + | Estimate Std. Error t value Pr(>|t|) | ||
| + | (Intercept) -1.956e-14 | ||
| + | x 6.880e-17 | ||
| + | |||
| + | Residual standard error: 12.03 on 14 degrees of freedom | ||
| + | Multiple R-squared: | ||
| + | F-statistic: | ||
| + | |||
| + | > summary(lm.y.x) | ||
| + | |||
| + | Call: | ||
| + | lm(formula = y ~ x, data = df) | ||
| + | |||
| + | Residuals: | ||
| + | Min 1Q Median | ||
| + | -25.508 | ||
| + | |||
| + | Coefficients: | ||
| + | Estimate Std. Error t value Pr(> | ||
| + | (Intercept) -52.8818 | ||
| + | x | ||
| + | --- | ||
| + | Signif. codes: | ||
| + | |||
| + | Residual standard error: 12.03 on 14 degrees of freedom | ||
| + | Multiple R-squared: | ||
| + | F-statistic: | ||
| + | |||
| + | > | ||
| + | > ######################################## | ||
| + | > # Now let's get back to the lm output | ||
| + | > # this is to evalue the significance of | ||
| + | > # the slope of x (b) | ||
| + | > summary(lm.y.x) | ||
| + | |||
| + | Call: | ||
| + | lm(formula = y ~ x, data = df) | ||
| + | |||
| + | Residuals: | ||
| + | Min 1Q Median | ||
| + | -25.508 | ||
| + | |||
| + | Coefficients: | ||
| + | Estimate Std. Error t value Pr(> | ||
| + | (Intercept) -52.8818 | ||
| + | x | ||
| + | --- | ||
| + | Signif. codes: | ||
| + | |||
| + | Residual standard error: 12.03 on 14 degrees of freedom | ||
| + | Multiple R-squared: | ||
| + | F-statistic: | ||
| + | |||
| + | > | ||
| + | > # the slope, we already got this | ||
| + | > b <- slope | ||
| + | > # intercept | ||
| + | > a <- intercept | ||
| + | > | ||
| + | > # the real data look like this | ||
| + | > plot(x,y) | ||
| + | > abline(h = mean(y), lwd=1.5, col=" | ||
| + | > abline(lm(y ~ x), lwd=1.5, col=" | ||
| + | > | ||
| + | > # blue line is regression line | ||
| + | > # this slope is an estimate of a real | ||
| + | > # thing (population) rather than the | ||
| + | > # sample we are using right now) | ||
| + | > library(ggplot2) | ||
| + | > | ||
| + | > ggplot(data = df, aes(x,y)) + | ||
| + | + | ||
| + | + | ||
| + | `geom_smooth()` using formula = 'y ~ x' | ||
| + | > | ||
| + | > # the real slope should reside in between | ||
| + | > # the violet area. The area is estimated | ||
| + | > # with standard error of slop (regression | ||
| + | > # coefficient) | ||
| + | > | ||
| + | > # in word | ||
| + | > # se for b is | ||
| + | > # from summary(lm.y.x) | ||
| + | > | ||
| + | > summary(lm.y.x) | ||
| + | |||
| + | Call: | ||
| + | lm(formula = y ~ x, data = df) | ||
| + | |||
| + | Residuals: | ||
| + | Min 1Q Median | ||
| + | -25.508 | ||
| + | |||
| + | Coefficients: | ||
| + | Estimate Std. Error t value Pr(> | ||
| + | (Intercept) -52.8818 | ||
| + | x | ||
| + | --- | ||
| + | Signif. codes: | ||
| + | |||
| + | Residual standard error: 12.03 on 14 degrees of freedom | ||
| + | Multiple R-squared: | ||
| + | F-statistic: | ||
| + | |||
| + | > se.b <- 0.0401 | ||
| + | > ci.se.b <- 1.96 * se.b | ||
| + | > # this is the lowest estimate | ||
| + | > b - ci.se.b | ||
| + | x | ||
| + | 0.7209579 | ||
| + | > # the highest estimate | ||
| + | > b + ci.se.b | ||
| + | x | ||
| + | 0.8781499 | ||
| + | > # calculated estimate | ||
| + | > b | ||
| + | x | ||
| + | 0.7995539 | ||
| + | > | ||
| + | > # how to get se for b? | ||
| + | > # https:// | ||
| + | > # see also http:// | ||
| + | > | ||
| + | > ss.res | ||
| + | [1] 2025.602 | ||
| + | > ss.x | ||
| + | [1] 3375 | ||
| + | > # we didn't get ss.x | ||
| + | > ss.x <- sum((x-m.x)^2) | ||
| + | > ss.x | ||
| + | [1] 3375 | ||
| + | > | ||
| + | > sqrt( (ss.res/ | ||
| + | [1] 0.2070504 | ||
| + | > # the above is the same as the below | ||
| + | > sqrt( ss.res / (n-2)*ss.x ) | ||
| + | [1] 698.7952 | ||
| + | > sqrt( 1/(n-2) * (ss.res/ | ||
| + | [1] 0.2070504 | ||
| + | > se.b <- sqrt( 1/(n-2) * (ss.res/ | ||
| + | > | ||
| + | > # now | ||
| + | > summary(lm.y.x) | ||
| + | |||
| + | Call: | ||
| + | lm(formula = y ~ x, data = df) | ||
| + | |||
| + | Residuals: | ||
| + | Min 1Q Median | ||
| + | -25.508 | ||
| + | |||
| + | Coefficients: | ||
| + | Estimate Std. Error t value Pr(> | ||
| + | (Intercept) -52.8818 | ||
| + | x | ||
| + | --- | ||
| + | Signif. codes: | ||
| + | |||
| + | Residual standard error: 12.03 on 14 degrees of freedom | ||
| + | Multiple R-squared: | ||
| + | F-statistic: | ||
| + | |||
| + | > # if b is 0, b does not do anything (null hypothesis) | ||
| + | > # To test if b is working, we do t-test by | ||
| + | > # b (current coefficient) - 0 (not working) / se | ||
| + | > # = t.calculated. We test if this is significant | ||
| + | > # tp(t.b.cal, df=n-2) | ||
| + | > | ||
| + | > t.b.cal <- (b - 0)/ | ||
| + | > t.b.cal | ||
| + | | ||
| + | 3.861639 | ||
| + | > # k = num of predictor variables | ||
| + | > k <- 1 | ||
| + | > df.t <- n-k-1 | ||
| + | > | ||
| + | > 2*pt(t.b.cal, | ||
| + | x | ||
| + | 0.001727459 | ||
| + | > pf(t.b.cal^2, | ||
| + | x | ||
| + | 0.001727459 | ||
| + | > | ||
| + | </ | ||
====== Graphics output ======
(The image embeds for this revision are truncated. The code above produces the corresponding plots: the rand.01 scatter with rand.00 overlaid, the x-y scatter with the mean and regression lines, the plots of the residuals, fitted values, and explained part against x, and the ggplot scatter with the lm fit and its confidence band.)