This is an old revision of the document!
Table of Contents
Multiple Regression
> datavar <- read.csv("http://commres.net/wiki/_media/r/dataset_hlr.csv")
> str(datavar)
'data.frame': 29 obs. of 5 variables:
$ YEAR : int 1 2 3 4 5 6 7 8 9 10 ...
$ ROLL : int 5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
$ UNEM : num 8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
$ HGRAD: int 9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
$ INC : int 1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...
>
onePredictorModel <- lm(ROLL ~ UNEM, data = datavar) twoPredictorModel <- lm(ROLL ~ UNEM + HGRAD, data = datavar) threePredictorModel <- lm(ROLL ~ UNEM + HGRAD + INC, data = datavar)
summary(onePredictorModel) summary(twoPredictorModel) summary(threePredictorModel)
> summary(onePredictorModel)
Call:
lm(formula = ROLL ~ UNEM, data = datavar)
Residuals:
Min 1Q Median 3Q Max
-7640.0 -1046.5 602.8 1934.3 4187.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3957.0 4000.1 0.989 0.3313
UNEM 1133.8 513.1 2.210 0.0358 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3049 on 27 degrees of freedom
Multiple R-squared: 0.1531, Adjusted R-squared: 0.1218
F-statistic: 4.883 on 1 and 27 DF, p-value: 0.03579
> summary(twoPredictorModel)
Call:
lm(formula = ROLL ~ UNEM + HGRAD, data = datavar)
Residuals:
Min 1Q Median 3Q Max
-2102.2 -861.6 -349.4 374.5 3603.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -8.256e+03 2.052e+03 -4.023 0.00044 ***
UNEM 6.983e+02 2.244e+02 3.111 0.00449 **
HGRAD 9.423e-01 8.613e-02 10.941 3.16e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1313 on 26 degrees of freedom
Multiple R-squared: 0.8489, Adjusted R-squared: 0.8373
F-statistic: 73.03 on 2 and 26 DF, p-value: 2.144e-11
>
> summary(threePredictorModel)
Call:
lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)
Residuals:
Min 1Q Median 3Q Max
-1148.84 -489.71 -1.88 387.40 1425.75
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -9.153e+03 1.053e+03 -8.691 5.02e-09 ***
UNEM 4.501e+02 1.182e+02 3.809 0.000807 ***
HGRAD 4.065e-01 7.602e-02 5.347 1.52e-05 ***
INC 4.275e+00 4.947e-01 8.642 5.59e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 670.4 on 25 degrees of freedom
Multiple R-squared: 0.9621, Adjusted R-squared: 0.9576
F-statistic: 211.5 on 3 and 25 DF, p-value: < 2.2e-16
anova(onePredictorModel, twoPredictorModel, threePredictorModel) Analysis of Variance Table Model 1: ROLL ~ UNEM Model 2: ROLL ~ UNEM + HGRAD Model 3: ROLL ~ UNEM + HGRAD + INC Res.Df RSS Df Sum of Sq F Pr(>F) 1 27 251084710 2 26 44805568 1 206279143 458.92 < 2.2e-16 *** 3 25 11237313 1 33568255 74.68 5.594e-09 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 >
Housing
e.g.
etc
library(tidyverse)
data("marketing", package = "datarium")
head(marketing, 4)
- Data, marketing is the impact of the amount of money spent on three advertising medias (youtube, facebook and newspaper) on sales.
- Note that to list all the independent (explanatory) variables, you could use
lm (sales ~ ., data=“marketing”). - You could also use
-sign to subtract ivs.lm(sales ~ . - newspapers, data = “marketing”)
mod <- lm(sales ~ . , data=marketing) summary(mod) summary(mod)$coefficients
mod2 <- lm(sales ~ . - newspaper, data = marketing) summary(mod2)
> mod <- lm(sales ~ . , data=marketing)
> summary(mod)
Call:
lm(formula = sales ~ ., data = marketing)
Residuals:
Min 1Q Median 3Q Max
-10.5932 -1.0690 0.2902 1.4272 3.3951
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.526667 0.374290 9.422 <2e-16 ***
youtube 0.045765 0.001395 32.809 <2e-16 ***
facebook 0.188530 0.008611 21.893 <2e-16 ***
newspaper -0.001037 0.005871 -0.177 0.86
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.023 on 196 degrees of freedom
Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
> summary(mod)$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.526667243 0.374289884 9.4222884 1.267295e-17
youtube 0.045764645 0.001394897 32.8086244 1.509960e-81
facebook 0.188530017 0.008611234 21.8934961 1.505339e-54
newspaper -0.001037493 0.005871010 -0.1767146 8.599151e-01
> mod2 <- lm(sales ~ . - newspaper, data = marketing)
> summary(mod2)
Call:
lm(formula = sales ~ . - newspaper, data = marketing)
Residuals:
Min 1Q Median 3Q Max
-10.5572 -1.0502 0.2906 1.4049 3.3994
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.50532 0.35339 9.919 <2e-16 ***
youtube 0.04575 0.00139 32.909 <2e-16 ***
facebook 0.18799 0.00804 23.382 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.018 on 197 degrees of freedom
Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
sales = 3.5 + 0.045*youtube + 0.187*facebook.
Prediction
Predict(lm.fit, newdata=data.frame(x1=c(2,3,4), x2=c(3,3,3))
Year <- c(2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2016, 2016,
2016, 2016, 2016, 2016, 2016, 2016, 2016,
2016, 2016, 2016)
Month <- c(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
Interest_Rate <- c(2.75, 2.5, 2.5, 2.5, 2.5, 2.5,
2.5, 2.25, 2.25, 2.25, 2, 2, 2, 1.75,
1.75, 1.75, 1.75, 1.75, 1.75, 1.75,
1.75, 1.75, 1.75, 1.75)
Unemployment_Rate <- c(5.3, 5.3, 5.3, 5.3, 5.4, 5.6,
5.5, 5.5, 5.5, 5.6, 5.7, 5.9, 6, 5.9,
5.8, 6.1, 6.2, 6.1, 6.1, 6.1, 5.9, 6.2,
6.2, 6.1)
Stock_Index_Price <- c(1464, 1394, 1357, 1293, 1256,
1254, 1234, 1195, 1159, 1167, 1130, 1075,
1047, 965, 943, 958, 971, 949, 884, 866,
876, 822, 704, 719)
dat.stock <- data.frame(Year, Month, Interest_Rate, Unemployment_Rate, Stock_Index_Price) dat.stock Year Month Interest_Rate Unemployment_Rate Stock_Index_Price 1 2017 12 2.75 5.3 1464 2 2017 11 2.50 5.3 1394 3 2017 10 2.50 5.3 1357 4 2017 9 2.50 5.3 1293 5 2017 8 2.50 5.4 1256 6 2017 7 2.50 5.6 1254 7 2017 6 2.50 5.5 1234 8 2017 5 2.25 5.5 1195 9 2017 4 2.25 5.5 1159 10 2017 3 2.25 5.6 1167 11 2017 2 2.00 5.7 1130 12 2017 1 2.00 5.9 1075 13 2016 12 2.00 6.0 1047 14 2016 11 1.75 5.9 965 15 2016 10 1.75 5.8 943 16 2016 9 1.75 6.1 958 17 2016 8 1.75 6.2 971 18 2016 7 1.75 6.1 949 19 2016 6 1.75 6.1 884 20 2016 5 1.75 6.1 866 21 2016 4 1.75 5.9 876 22 2016 3 1.75 6.2 822 23 2016 2 1.75 6.2 704 24 2016 1 1.75 6.1 719 >
lm.stock <- lm(Stock_Index_Price~Interest_Rate+Unemployment_Rate, data=dat.stock) summary(lm.stock) lm.stock$coefficient coefficients(lm.stock)
> lm.stock <- lm(Stock_Index_Price~Interest_Rate+Unemployment_Rate, data=dat.stock)
> summary(lm.stock)
Call:
lm(formula = Stock_Index_Price ~ Interest_Rate + Unemployment_Rate,
data = dat.stock)
Residuals:
Min 1Q Median 3Q Max
-158.205 -41.667 -6.248 57.741 118.810
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1798.4 899.2 2.000 0.05861 .
Interest_Rate 345.5 111.4 3.103 0.00539 **
Unemployment_Rate -250.1 117.9 -2.121 0.04601 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 70.56 on 21 degrees of freedom
Multiple R-squared: 0.8976, Adjusted R-squared: 0.8879
F-statistic: 92.07 on 2 and 21 DF, p-value: 4.043e-11
> lm.stock$coefficient
(Intercept) Interest_Rate Unemployment_Rate
1798.4040 345.5401 -250.1466
> coefficients(lm.stock)
(Intercept) Interest_Rate Unemployment_Rate
1798.4040 345.5401 -250.1466
>
>
dat.stock.new.iv <- data.frame(Interest_Rate=c(5,6,2,1), Unemployment_Rate=c(8,8.5,4,3)) predict(lm.stock, newdata=dat.stock.new.iv)
> dat.stock.new.iv <- data.frame(Interest_Rate=c(5,6,2,1), Unemployment_Rate=c(8,8.5,4,3))
> predict(lm.stock, newdata=dat.stock.new.iv)
1 2 3 4
1524.932 1745.399 1488.898 1393.504
>
Partial, Semi-partial Correlation and R squared value
아래에서, summary(lm.stock)은
> lm.stock <- lm(Stock_Index_Price~Interest_Rate+Unemployment_Rate, data=dat.stock)
> summary(lm.stock)
Call:
lm(formula = Stock_Index_Price ~ Interest_Rate + Unemployment_Rate,
data = dat.stock)
Residuals:
Min 1Q Median 3Q Max
-158.205 -41.667 -6.248 57.741 118.810
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1798.4 899.2 2.000 0.05861 .
Interest_Rate 345.5 111.4 3.103 0.00539 **
Unemployment_Rate -250.1 117.9 -2.121 0.04601 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 70.56 on 21 degrees of freedom
Multiple R-squared: 0.8976, Adjusted R-squared: 0.8879
F-statistic: 92.07 on 2 and 21 DF, p-value: 4.043e-11
>
> anova(lm.stock)
Analysis of Variance Table
Response: Stock_Index_Price
Df Sum Sq Mean Sq F value Pr(>F)
Interest_Rate 1 894463 894463 179.6477 9.231e-12 ***
Unemployment_Rate 1 22394 22394 4.4977 0.04601 *
Residuals 21 104559 4979
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
| df | SS | ms | F | |
| interest | 1 (a) | 894463 (1) | 894463 | 179.6471179 |
| unemp | 1 (b) | 22394 (2) | 22394 | 4.497690299 |
| res | 21 © | 104559 (3) | 4979 | |
| total | 23 | 1021416 (4) | ||
| interst + enemp | 916857 (5) |
(4) = (1) + (2) + (3)
(5) = (1) + (2)
from the above summary(lm.stock)
Multiple R-squared: 0.8976, . . . .
F-statistic: 92.07 on 2 and 21 DF, p-value: 4.043e-11
\begin{eqnarray*} R^2 & = & \dfrac {SS_{reg}}{SS_{total}} \\ & = & \dfrac{(5)}{(4)}, \qquad \text{from the table} \\ & = & \dfrac {916857}{1021416} \\ & = & 0.897633286 \\ & = & 0.8976 \end{eqnarray*}
\begin{eqnarray*} F & = & \dfrac {MS_{reg}} {MS_{res}} \\ & = & \dfrac {(5)/(a)+(b)}{(3)/(c)} \\ & = & \dfrac {916857/2}{104559/21} \\ & = & 92.0724041 \\ & = & 92.07 \qquad (df = 2, 21) \end{eqnarray*}
그런데, 각각의 IV와 DV 간의 correlation 값은:
> cor.test(Stock_Index_Price, Interest_Rate)
Pearson's product-moment correlation
data: Stock_Index_Price and Interest_Rate
t = 12.45, df = 22, p-value = 1.954e-11
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8552498 0.9721916
sample estimates:
cor
0.9357932
> cor.test(Stock_Index_Price, Unemployment_Rate)
Pearson's product-moment correlation
data: Stock_Index_Price and Unemployment_Rate
t = -11.196, df = 22, p-value = 1.487e-10
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.9662308 -0.8264284
sample estimates:
cor
-0.9223376
>
Stock_Index_Price ~ Interest_Rate = 0.9357932
Stock_Index_Price ~ Unemployment_Rate = -0.9223376
각각의 r square 값은
Stock_Index_Price ~ Interest_Rate -> 0.875708913
Stock_Index_Price ~ Unemployment_Rate -> 0.850706648
이 둘의 합은 1.726415562 로 1을 넘는다. 즉, 각 독립변인의 영향력은 중복 (독립변인 간의 상관관계) 되어 있다는 뜻.
install.packages("ppcor")
library(ppcor)
> pcor.test(Stock_Index_Price, Interest_Rate, Unemployment_Rate) estimate p.value statistic n gp Method 1 0.560649 0.005389174 3.102717 24 1 pearson
> spcor.test(Stock_Index_Price, Interest_Rate, Unemployment_Rate) estimate p.value statistic n gp Method 1 0.2166264 0.3207966 1.016852 24 1 pearson
0.2166264^2 = 0.046926997
b + c / a + b + c + d = 0.8757
b / a + b + c + d = 0.0467
c / a + b + c + d = 0.8288
> spcor.test(Stock_Index_Price, Unemployment_Rate, Interest_Rate)
estimate p.value statistic n gp Method
1 -0.1480697 0.5001535 -0.6861036 24 1 pearson
>
(-0.1480697)^2 = 0.021924636 = 0.0219
d + c / a + b + c + d = 0.8507
d / a + b + c + d = 0.0219
c / a + b + c + d = 0.8288
위와 아래의 c / a + b + c + d = 0.8288 가 일치하므로, 독립변인 둘의 겹치는 영향력이 크다는 것을 알수 있다.
> lm.stock.int <- lm(Stock_Index_Price~Interest_Rate, data=dat.stock)
> summary(lm.stock.int)
Call:
lm(formula = Stock_Index_Price ~ Interest_Rate, data = dat.stock)
Residuals:
Min 1Q Median 3Q Max
-183.892 -30.181 4.455 56.608 101.057
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -99.46 95.21 -1.045 0.308
Interest_Rate 564.20 45.32 12.450 1.95e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 75.96 on 22 degrees of freedom
Multiple R-squared: 0.8757, Adjusted R-squared: 0.8701
F-statistic: 155 on 1 and 22 DF, p-value: 1.954e-11
> lm.stock.unemp <- lm(Stock_Index_Price ~ Unemployment_Rate, data=dat.stock)
> summary(lm.stock.unemp)
Call:
lm(formula = Stock_Index_Price ~ Unemployment_Rate, data = dat.stock)
Residuals:
Min 1Q Median 3Q Max
-159.671 -41.996 2.089 72.381 151.226
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4471.3 304.2 14.7 7.41e-13 ***
Unemployment_Rate -589.0 52.6 -11.2 1.49e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 83.25 on 22 degrees of freedom
Multiple R-squared: 0.8507, Adjusted R-squared: 0.8439
F-statistic: 125.4 on 1 and 22 DF, p-value: 1.487e-10
>
>
> anova(lm.stock.int, lm.stock) Analysis of Variance Table Model 1: Stock_Index_Price ~ Interest_Rate Model 2: Stock_Index_Price ~ Interest_Rate + Unemployment_Rate Res.Df RSS Df Sum of Sq F Pr(>F) 1 22 126953 2 21 104559 1 22394 4.4977 0.04601 * --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> anova(lm.stock.unemp, lm.stock) Analysis of Variance Table Model 1: Stock_Index_Price ~ Unemployment_Rate Model 2: Stock_Index_Price ~ Interest_Rate + Unemployment_Rate Res.Df RSS Df Sum of Sq F Pr(>F) 1 22 152491 2 21 104559 1 47932 9.6269 0.005389 ** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 >
Alternatively,
> install.packages("lmtest")
> library(lmtest)
> lrtest(lm.stock.int, lm.stock)
Likelihood ratio test
Model 1: Stock_Index_Price ~ Interest_Rate
Model 2: Stock_Index_Price ~ Interest_Rate + Unemployment_Rate
#Df LogLik Df Chisq Pr(>Chisq)
1 3 -136.94
2 4 -134.61 1 4.6576 0.03092 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>

