User Tools

Site Tools


r:multiple_regression

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
Last revisionBoth sides next revision
r:multiple_regression [2019/11/08 10:59] – [Prediction] hkimscilr:multiple_regression [2021/11/04 08:18] – [Multiple Regression] hkimscil
Line 1: Line 1:
 ====== Multiple Regression ====== ====== Multiple Regression ======
 {{:R:dataset_hlr.csv|data file}} {{:R:dataset_hlr.csv|data file}}
-<code>> datavar <- read.csv("http://commres.net/wiki/_media/r/dataset_hlr.csv")+University of New Mexico enrollment data (for 30 years)  
 +ROLL: # of enrollment  
 +UNEM: unemployment level 
 +HGRAD: # of High school graduates  
 +INC: income level 
 +<code> 
 +# data import 
 +> datavar <- read.csv("http://commres.net/wiki/_media/r/dataset_hlr.csv")
 > str(datavar) > str(datavar)
 'data.frame': 29 obs. of  5 variables: 'data.frame': 29 obs. of  5 variables:
Line 11: Line 18:
  
 </code> </code>
- 
 <code> <code>
-onePredictorModel <- lm(ROLL ~ UNEM, data = datavar) +two.predictor.model <- lm(ROLL ~ UNEM + HGRAD, datavar) 
-twoPredictorModel <- lm(ROLL ~ UNEM + HGRAD, data = datavar) +summary(two.predictor.model) 
-threePredictorModel <- lm(ROLL ~ UNEM + HGRAD + INC, data = datavar)+two.predictor.model
 </code> </code>
  
-<code>summary(onePredictorModel+<code> 
-summary(twoPredictorModel+three.predictor.model <- lm(ROLL ~ UNEM + HGRAD + INC, datavar
-summary(threePredictorModel)+summary(three.predictor.model
 +three.predictor.model
 </code> </code>
  
-<code>> summary(onePredictorModel) +<code> 
- +two.predictor.model <- lm(ROLL ~ UNEM + HGRAD, datavar) 
-Call: +summary(two.predictor.model)
-lm(formula = ROLL ~ UNEM, data = datavar) +
- +
-Residuals: +
-    Min      1Q  Median      3Q     Max  +
--7640.0 -1046.5   602.8  1934.3  4187.2  +
- +
-Coefficients: +
-            Estimate Std. Error t value Pr(>|t|)   +
-(Intercept)   3957.0     4000.1   0.989   0.3313   +
-UNEM          1133.8      513.1   2.210   0.0358 * +
---- +
-Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +
- +
-Residual standard error: 3049 on 27 degrees of freedom +
-Multiple R-squared:  0.1531, Adjusted R-squared:  0.1218  +
-F-statistic: 4.883 on 1 and 27 DF,  p-value: 0.03579 +
-</code> +
- +
-<code>> summary(twoPredictorModel)+
  
 Call: Call:
Line 65: Line 53:
 F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11 F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11
  
-> </code>+two.predictor.model 
 + 
 +Call: 
 +lm(formula = ROLL ~ UNEM + HGRAD, data = datavar) 
 + 
 +Coefficients: 
 +(Intercept)         UNEM        HGRAD   
 + -8255.7511     698.2681       0.9423   
 + 
 +>  
 +</code> 
 <code> <code>
-> summary(threePredictorModel)+> three.predictor.model <- lm(ROLL ~ UNEM + HGRAD + INC, datavar) 
 +> summary(three.predictor.model)
  
 Call: Call:
Line 89: Line 89:
 F-statistic: 211.5 on 3 and 25 DF,  p-value: < 2.2e-16 F-statistic: 211.5 on 3 and 25 DF,  p-value: < 2.2e-16
  
 +> three.predictor.model
 +
 +Call:
 +lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)
 +
 +Coefficients:
 +(Intercept)         UNEM        HGRAD          INC  
 + -9153.2545     450.1245       0.4065       4.2749  
 +
 +
 </code> </code>
  
-<code>anova(onePredictorModeltwoPredictorModelthreePredictorModel+만약에  
-Analysis of Variance Table+  * unemployment rate (UNEM) = 9%12%3% 
 +  * spring high school graduating class (HGRAD) = 100000, 98000, 78000 
 +  * a per capita income (INC) of $30,000, $28,000, $19,000 
 +  * 일 때, enrollment는 어떻게 predict할 수 있을까?
  
-Model 1: ROLL ~ UNEM +위에서 얻은 prediction model은 아래와 같다.  
 -Model 2: ROLL ~ UNEM + HGRAD +$$ \hat{Y} = -9153.2545 + 450.1245 \cdot UNEM + 0.4065 \cdot HGRAD + 4.2749 \cdot INC $$ 
-Model 3: ROLL ~ UNEM + HGRAD + INC +여기에 위의 정보를 대입해 보면 된다.  
-  Res.Df       RSS Df Sum of Sq      F    Pr(>F)     + 
-1     27 251084710                                   +<code> 
 -2     26  44805568  1 206279143 458.92 2.2e-16 *** +new.data <- data.frame(UNEM=c(9, 10, 15), HGRAD=c(100000, 98000, 78000), INC=c(30000, 28000, 19000)) 
-    25  11237313  1  33568255  74.68 5.594e-09 *** +predict(three.predictor.model, newdata=new.data) 
---- +</code> 
-Signifcodes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1+ 
 +<code> 
 +> new.data <- data.frame(UNEM=c(9, 10, 15), HGRAD=c(100000, 98000, 78000), INC=c(30000, 28000, 19000)) 
 +> predict(three.predictor.model, newdata=new.data) 
 +       1        2        3  
 +163792.0 154879.4 110526.6 
  
 +</code>
 +\begin{align*}
 +\hat{Y} & = -9153.2545 + 450.1245 \cdot \text{UNEM} + 0.4065 \cdot \text{HGRAD} + 4.2749 \cdot \text{INC}  \\ 
 +163792.0 & = -9153.2545 + 450.1245 \cdot (9) + 0.4065 \cdot (100000) + 4.2749 \cdot (30000) \\
 +154879.4 & = -9153.2545 + 450.1245 \cdot (10) + 0.4065 \cdot (98000) + 4.2749 \cdot (28000) \\
 +110526.6 & = -9153.2545 + 450.1245 \cdot (15) + 0.4065 \cdot (78000) + 4.2749 \cdot (19000) \\
 +
 +\end{align*}
 +
 +beta coefficient 살펴보기 
 +see [[:beta coefficients]]
 +<code>
 +# install.packages('lm.beta')
 +# library(lm.beta)
 +lm.beta(three.predictor.model)
 +</code>
 +
 +<code>
 +> # install.packages('lm.beta')
 +> # library(lm.beta)
 +> lm.beta(three.predictor.model)
 +
 +Call:
 +lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)
 +
 +Standardized Coefficients::
 +(Intercept)        UNEM       HGRAD         INC 
 +  0.0000000   0.1553619   0.3656177   0.6061762 
 +
 +
 +</code>
 +by hand
 +<code>
 +# coefficient * (sd(x)/sd(y)) 이므로
 +
 +attach(datavar)
 +sd.roll <- sd(ROLL)
 +sd.unem <- sd(UNEM)
 +sd.hgrad <- sd(HGRAD)
 +sd.inc <- sd(INC)
 +
 +b.unem <- three.predictor.model$coefficients[2]
 +b.hgrad <- three.predictor.model$coefficients[3] 
 +b.inc <- three.predictor.model$coefficients[4]
 +
 +## or
 +b.unem <- 4.501e+02
 +b.hgrad <- 4.065e-01
 +b.inc <- 4.275e+00
 +
 +
 +b.unem * (sd.unem / sd.roll)
 +b.hgrad * (sd.hgrad / sd.roll)
 +b.inc * (sd.inc / sd.roll)
 +
 +lm.beta(three.predictor.model)
 +
 +</code>
 +output of the above
 +<code>
 +> sd.roll <- sd(ROLL)
 +> sd.unem <- sd(UNEM)
 +> sd.hgrad <- sd(HGRAD)
 +> sd.inc <- sd(INC)
 +
 +> b.unem <- three.predictor.model$coefficients[2]
 +> b.hgrad <- three.predictor.model$coefficients[3] 
 +> b.inc <- three.predictor.model$coefficients[4]
 +
 +> ## or
 +> b.unem <- 4.501e+02
 +> b.hgrad <- 4.065e-01
 +> b.inc <- 4.275e+00
 +
 +
 +> b.unem * (sd.unem / sd.roll)
 +[1] 0.1554
 +> b.hgrad * (sd.hgrad / sd.roll)
 +[1] 0.3656
 +> b.inc * (sd.inc / sd.roll)
 +[1] 0.6062
 +
 +> lm.beta(three.predictor.model)
 +
 +Call:
 +lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)
 +
 +Standardized Coefficients::
 +(Intercept)        UNEM       HGRAD         INC 
 +     0.0000      0.1554      0.3656      0.6062 
 +
 +
 +</code>
 +
 +see also [[:sequential_regression#eg_3_college_enrollment_in_new_mexico_university|Sequential method]] regression modeling by hand
 +see also [[:statistical regression methods]] regression modeling by computing 
 +
 +<code>
 +> fit <- three.predictor.model
 +> step <- stepAIC(fit, direction="both")
 +Start:  AIC=381.2
 +ROLL ~ UNEM + HGRAD + INC
 +
 +        Df Sum of Sq      RSS AIC
 +<none>               11237313 381
 +- UNEM     6522098 17759411 392
 +- HGRAD  1  12852039 24089352 401
 +- INC    1  33568255 44805568 419
 +
 +
 </code> </code>
 ====== Housing ====== ====== Housing ======
Line 111: Line 239:
  
 ====== etc ====== ====== etc ======
 +{{:marketing_from_datarium.csv}}
 <code> <code>
 +marketing <- read.csv("http://commres.net/wiki/_media/marketing_from_datarium.csv")
 +</code>
 +
 +<code>
 +# install.packages("tidyverse", dep=TRUE)
 library(tidyverse) library(tidyverse)
 data("marketing", package = "datarium") data("marketing", package = "datarium")
Line 119: Line 253:
   * Note that to list all the independent (explanatory) variables, you could use ''lm (sales ~ ., data="marketing")''.   * Note that to list all the independent (explanatory) variables, you could use ''lm (sales ~ ., data="marketing")''.
   * You could also use ''-'' sign to subtract ivs. ''lm(sales ~ . - newspapers, data = "marketing")''   * You could also use ''-'' sign to subtract ivs. ''lm(sales ~ . - newspapers, data = "marketing")''
 +
      
 <code> <code>
Line 345: Line 480:
 | interest  | 1 (a)  | 894463 (1)  | 894463  | 179.6471179  | | interest  | 1 (a)  | 894463 (1)  | 894463  | 179.6471179  |
 | unemp  | 1 (b)  | 22394 (2) | 22394  | 4.497690299  | | unemp  | 1 (b)  | 22394 (2) | 22394  | 4.497690299  |
-| res  | 21 (c)  | 104559 (3) | 4979  |   |+| res  | 21 %%(%%c%%)%%  | 104559 (3) | 4979  |   |
 | total  | 23  | 1021416 (4)  |     | | total  | 23  | 1021416 (4)  |     |
 -interst \\ + enemp  |   | 916857 (5)  |     |+interest \\ + unemp  |   | 916857 (5)  |     |
  
 (4) = (1) + (2) + (3) (4) = (1) + (2) + (3)
Line 534: Line 669:
 Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 > </code> > </code>
 +====== e.g. 5 ======
 +http://www.alexanderdemos.org/Class2.html
 +
 +<code>
 +#packages we will need to conduct to create and graph our data
 +library(MASS) #create data
 +library(car) #graph data
 +py1 =.6 #Cor between X1 (Practice Time) and Memory Errors
 +py2 =.4 #Cor between X2 (Performance Anxiety) and Memory Errors
 +p12= .3 #Cor between X1 (Practice Time) and X2 (Performance Anxiety)
 +Means.X1X2Y<- c(10,10,10) #set the means of X and Y variables
 +CovMatrix.X1X2Y <- matrix(c(1,p12,py1,
 +                            p12,1,py2,
 +                            py1,py2,1),3,3) # creates the covariate matrix 
 +#build the correlated variables. Note: empirical=TRUE means make the correlation EXACTLY r. 
 +# if we say empirical=FALSE, the correlation would be normally distributed around r
 +set.seed(42)
 +CorrDataT<-mvrnorm(n=100, mu=Means.X1X2Y,Sigma=CovMatrix.X1X2Y, empirical=TRUE)
 +#Convert them to a "Data.Frame" & add our labels to the vectors we created
 +CorrDataT<-as.data.frame(CorrDataT)
 +colnames(CorrDataT) <- c("Practice","Anxiety","Memory")
 +#make the scatter plots
 +scatterplot(Memory~Practice,CorrDataT)
 +scatterplot(Memory~Anxiety,CorrDataT)
 +scatterplot(Anxiety~Practice,CorrDataT)
 +# Pearson Correlations
 +ry1<-cor(CorrDataT$Memory,CorrDataT$Practice)
 +ry2<-cor(CorrDataT$Memory,CorrDataT$Anxiety)
 +r12<-cor(CorrDataT$Anxiety,CorrDataT$Practice)
 +</code>
 +
 +<code>
 +ry1
 +ry2
 +r12
 +ry1^2
 +ry2^2
 +r12^2
 +</code>
 +
 +<code>
 +> ry1
 +[1] 0.6
 +> ry2
 +[1] 0.4
 +> r12
 +[1] 0.3
 +
 +> ry1^2
 +[1] 0.36
 +> ry2^2
 +[1] 0.16
 +> r12^2
 +[1] 0.09
 +
 +</code>
 +
 +<code>
 +> lm.m.pa <- lm(Memory~Practice+Anxiety, data=CorrDataT)
 +> summary(lm.m.pa)
 +
 +Call:
 +lm(formula = Memory ~ Practice + Anxiety, data = CorrDataT)
 +
 +Residuals:
 +     Min       1Q   Median       3Q      Max 
 +-1.99998 -0.54360  0.04838  0.43878  2.74186 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)    
 +(Intercept)  2.30769    0.96783   2.384  0.01905 *  
 +Practice     0.52747    0.08153   6.469 4.01e-09 ***
 +Anxiety      0.24176    0.08153   2.965  0.00381 ** 
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 0.7739 on 97 degrees of freedom
 +Multiple R-squared:  0.4132, Adjusted R-squared:  0.4011 
 +F-statistic: 34.15 on 2 and 97 DF,  p-value: 5.919e-12
 +
 +</code>
 +
 +{{  http://www.alexanderdemos.org/RegressionClass/Ballantine.jpg?200  }}
 +
 +a+b+c+e = variance of y = total variance = 100% = 1 이라고 보면 
 +r<sup>2</sup> = (a + b + c) / (a + b + c + e) = a + b + c = 0.4132 (from summary(lm.m.pa))
 +
 +이 중에서 우리는 이미 
 +a + c = 0.6<sup>2</sup> = 0.36
 +b + c = 0.4<sup>2</sup> = 0.16
 +
 +따라서 Practice를 제어한 Anxiety의 영향력은 
 +r<sup>2</sup> - (a+c) = 0.4132 - 0.36 = 0.0532
 +반대로, Anxiety를 제어한 Practice의 영향력은 
 +r<sup>2</sup> - (b+c) = 0.4132 - 0.16 = 0.2532
  
r/multiple_regression.txt · Last modified: 2023/10/19 08:23 by hkimscil

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki