User Tools

Site Tools


r:multiple_regression

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
Last revisionBoth sides next revision
r:multiple_regression [2019/11/08 10:59] – [Prediction] hkimscilr:multiple_regression [2021/11/04 08:18] – [Multiple Regression] hkimscil
Line 1: Line 1:
 ====== Multiple Regression ====== ====== Multiple Regression ======
 {{:R:dataset_hlr.csv|data file}} {{:R:dataset_hlr.csv|data file}}
-<code>> datavar <- read.csv("http://commres.net/wiki/_media/r/dataset_hlr.csv")+University of New Mexico enrollment data (for 30 years)  
 +ROLL: # of enrollment  
 +UNEM: unemployment level 
 +HGRAD: # of High school graduates  
 +INC: income level 
 +<code> 
 +# data import 
 +> datavar <- read.csv("http://commres.net/wiki/_media/r/dataset_hlr.csv")
 > str(datavar) > str(datavar)
 'data.frame': 29 obs. of  5 variables: 'data.frame': 29 obs. of  5 variables:
Line 11: Line 18:
  
 </code> </code>
- 
 <code> <code>
-onePredictorModel <- lm(ROLL ~ UNEM, data = datavar) +two.predictor.model <- lm(ROLL ~ UNEM + HGRAD, datavar) 
-twoPredictorModel <- lm(ROLL ~ UNEM + HGRAD, data = datavar) +summary(two.predictor.model) 
-threePredictorModel <- lm(ROLL ~ UNEM + HGRAD + INC, data = datavar)+two.predictor.model
 </code> </code>
  
-<code>summary(onePredictorModel+<code> 
-summary(twoPredictorModel+three.predictor.model <- lm(ROLL ~ UNEM + HGRAD + INC, datavar
-summary(threePredictorModel)+summary(three.predictor.model
 +three.predictor.model
 </code> </code>
  
-<code>> summary(onePredictorModel) +<code> 
- +two.predictor.model <- lm(ROLL ~ UNEM + HGRAD, datavar) 
-Call: +summary(two.predictor.model)
-lm(formula = ROLL ~ UNEM, data = datavar) +
- +
-Residuals: +
-    Min      1Q  Median      3Q     Max  +
--7640.0 -1046.5   602.8  1934.3  4187.2  +
- +
-Coefficients: +
-            Estimate Std. Error t value Pr(>|t|)   +
-(Intercept)   3957.0     4000.1   0.989   0.3313   +
-UNEM          1133.8      513.1   2.210   0.0358 * +
---- +
-Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +
- +
-Residual standard error: 3049 on 27 degrees of freedom +
-Multiple R-squared:  0.1531, Adjusted R-squared:  0.1218  +
-F-statistic: 4.883 on 1 and 27 DF,  p-value: 0.03579 +
-</code> +
- +
-<code>> summary(twoPredictorModel)+
  
 Call: Call:
Line 65: Line 53:
 F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11 F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11
  
-> </code>+two.predictor.model 
 + 
 +Call: 
 +lm(formula = ROLL ~ UNEM + HGRAD, data = datavar) 
 + 
 +Coefficients: 
 +(Intercept)         UNEM        HGRAD   
 + -8255.7511     698.2681       0.9423   
 + 
 +>  
 +</code> 
 <code> <code>
-> summary(threePredictorModel)+> three.predictor.model <- lm(ROLL ~ UNEM + HGRAD + INC, datavar) 
 +> summary(three.predictor.model)
  
 Call: Call:
Line 89: Line 89:
 F-statistic: 211.5 on 3 and 25 DF,  p-value: < 2.2e-16 F-statistic: 211.5 on 3 and 25 DF,  p-value: < 2.2e-16
  
 +> three.predictor.model
 +
 +Call:
 +lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)
 +
 +Coefficients:
 +(Intercept)         UNEM        HGRAD          INC  
 + -9153.2545     450.1245       0.4065       4.2749  
 +
 +
 </code> </code>
  
-<code>anova(onePredictorModeltwoPredictorModelthreePredictorModel+만약에  
-Analysis of Variance Table+  * unemployment rate (UNEM) = 9%12%3% 
 +  * spring high school graduating class (HGRAD) = 100000, 98000, 78000 
 +  * a per capita income (INC) of $30,000, $28,000, $19,000 
 +  * 일 때, enrollment는 어떻게 predict할 수 있을까?
  
-Model 1: ROLL ~ UNEM +위에서 얻은 prediction model은 아래와 같다.  
 -Model 2: ROLL ~ UNEM + HGRAD +$$ \hat{Y} = -9153.2545 + 450.1245 \cdot UNEM + 0.4065 \cdot HGRAD + 4.2749 \cdot INC $$ 
-Model 3: ROLL ~ UNEM + HGRAD + INC +여기에 위의 정보를 대입해 보면 된다.  
-  Res.Df       RSS Df Sum of Sq      F    Pr(>F)     + 
-1     27 251084710                                   +<code> 
 -2     26  44805568  1 206279143 458.92 2.2e-16 *** +new.data <- data.frame(UNEM=c(9, 10, 15), HGRAD=c(100000, 98000, 78000), INC=c(30000, 28000, 19000)) 
-    25  11237313  1  33568255  74.68 5.594e-09 *** +predict(three.predictor.model, newdata=new.data) 
---- +</code> 
-Signifcodes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1+ 
 +<code> 
 +> new.data <- data.frame(UNEM=c(9, 10, 15), HGRAD=c(100000, 98000, 78000), INC=c(30000, 28000, 19000)) 
 +> predict(three.predictor.model, newdata=new.data) 
 +       1        2        3  
 +163792.0 154879.4 110526.6 
  
 +</code>
 +\begin{align*}
 +\hat{Y} & = -9153.2545 + 450.1245 \cdot \text{UNEM} + 0.4065 \cdot \text{HGRAD} + 4.2749 \cdot \text{INC}  \\ 
 +163792.0 & = -9153.2545 + 450.1245 \cdot (9) + 0.4065 \cdot (100000) + 4.2749 \cdot (30000) \\
 +154879.4 & = -9153.2545 + 450.1245 \cdot (10) + 0.4065 \cdot (98000) + 4.2749 \cdot (28000) \\
 +110526.6 & = -9153.2545 + 450.1245 \cdot (15) + 0.4065 \cdot (78000) + 4.2749 \cdot (19000) \\
 +
 +\end{align*}
 +
 +beta coefficient 살펴보기 
 +see [[:beta coefficients]]
 +<code>
 +# install.packages('lm.beta')
 +# library(lm.beta)
 +lm.beta(three.predictor.model)
 +</code>
 +
 +<code>
 +> # install.packages('lm.beta')
 +> # library(lm.beta)
 +> lm.beta(three.predictor.model)
 +
 +Call:
 +lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)
 +
 +Standardized Coefficients::
 +(Intercept)        UNEM       HGRAD         INC 
 +  0.0000000   0.1553619   0.3656177   0.6061762 
 +
 +
 +</code>
 +by hand
 +<code>
 +# coefficient * (sd(x)/sd(y)) 이므로
 +
 +attach(datavar)
 +sd.roll <- sd(ROLL)
 +sd.unem <- sd(UNEM)
 +sd.hgrad <- sd(HGRAD)
 +sd.inc <- sd(INC)
 +
 +b.unem <- three.predictor.model$coefficients[2]
 +b.hgrad <- three.predictor.model$coefficients[3] 
 +b.inc <- three.predictor.model$coefficients[4]
 +
 +## or
 +b.unem <- 4.501e+02
 +b.hgrad <- 4.065e-01
 +b.inc <- 4.275e+00
 +
 +
 +b.unem * (sd.unem / sd.roll)
 +b.hgrad * (sd.hgrad / sd.roll)
 +b.inc * (sd.inc / sd.roll)
 +
 +lm.beta(three.predictor.model)
 +
 +</code>
 +output of the above
 +<code>
 +> sd.roll <- sd(ROLL)
 +> sd.unem <- sd(UNEM)
 +> sd.hgrad <- sd(HGRAD)
 +> sd.inc <- sd(INC)
 +
 +> b.unem <- three.predictor.model$coefficients[2]
 +> b.hgrad <- three.predictor.model$coefficients[3] 
 +> b.inc <- three.predictor.model$coefficients[4]
 +
 +> ## or
 +> b.unem <- 4.501e+02
 +> b.hgrad <- 4.065e-01
 +> b.inc <- 4.275e+00
 +
 +
 +> b.unem * (sd.unem / sd.roll)
 +[1] 0.1554
 +> b.hgrad * (sd.hgrad / sd.roll)
 +[1] 0.3656
 +> b.inc * (sd.inc / sd.roll)
 +[1] 0.6062
 +
 +> lm.beta(three.predictor.model)
 +
 +Call:
 +lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)
 +
 +Standardized Coefficients::
 +(Intercept)        UNEM       HGRAD         INC 
 +     0.0000      0.1554      0.3656      0.6062 
 +
 +
 +</code>
 +
 +see also [[:sequential_regression#eg_3_college_enrollment_in_new_mexico_university|Sequential method]] regression modeling by hand
 +see also [[:statistical regression methods]] regression modeling by computing 
 +
 +<code>
 +> fit <- three.predictor.model
 +> step <- stepAIC(fit, direction="both")
 +Start:  AIC=381.2
 +ROLL ~ UNEM + HGRAD + INC
 +
 +        Df Sum of Sq      RSS AIC
 +<none>               11237313 381
 +- UNEM     6522098 17759411 392
 +- HGRAD  1  12852039 24089352 401
 +- INC    1  33568255 44805568 419
 +
 +
 </code> </code>
 ====== Housing ====== ====== Housing ======
Line 111: Line 239:
  
 ====== etc ====== ====== etc ======
 +{{:marketing_from_datarium.csv}}
 <code> <code>
 +marketing <- read.csv("http://commres.net/wiki/_media/marketing_from_datarium.csv")
 +</code>
 +
 +<code>
 +# install.packages("tidyverse", dep=TRUE)
 library(tidyverse) library(tidyverse)
 data("marketing", package = "datarium") data("marketing", package = "datarium")
Line 119: Line 253:
   * Note that to list all the independent (explanatory) variables, you could use ''lm (sales ~ ., data="marketing")''.   * Note that to list all the independent (explanatory) variables, you could use ''lm (sales ~ ., data="marketing")''.
   * You could also use ''-'' sign to subtract ivs. ''lm(sales ~ . - newspapers, data = "marketing")''   * You could also use ''-'' sign to subtract ivs. ''lm(sales ~ . - newspapers, data = "marketing")''
 +
      
 <code> <code>
Line 345: Line 480:
 | interest  | 1 (a)  | 894463 (1)  | 894463  | 179.6471179  | | interest  | 1 (a)  | 894463 (1)  | 894463  | 179.6471179  |
 | unemp  | 1 (b)  | 22394 (2) | 22394  | 4.497690299  | | unemp  | 1 (b)  | 22394 (2) | 22394  | 4.497690299  |
-| res  | 21 (c)  | 104559 (3) | 4979  |   |+| res  | 21 %%(%%c%%)%%  | 104559 (3) | 4979  |   |
 | total  | 23  | 1021416 (4)  |     | | total  | 23  | 1021416 (4)  |     |
 -interst \\ + enemp  |   | 916857 (5)  |     |+interest \\ + unemp  |   | 916857 (5)  |     |
  
 (4) = (1) + (2) + (3) (4) = (1) + (2) + (3)
Line 534: Line 669:
 Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 > </code> > </code>
 +====== e.g. 5 ======
 +http://www.alexanderdemos.org/Class2.html
 +
 +<code>
 +#packages we will need to conduct to create and graph our data
 +library(MASS) #create data
 +library(car) #graph data
 +py1 =.6 #Cor between X1 (Practice Time) and Memory Errors
 +py2 =.4 #Cor between X2 (Performance Anxiety) and Memory Errors
 +p12= .3 #Cor between X1 (Practice Time) and X2 (Performance Anxiety)
 +Means.X1X2Y<- c(10,10,10) #set the means of X and Y variables
 +CovMatrix.X1X2Y <- matrix(c(1,p12,py1,
 +                            p12,1,py2,
 +                            py1,py2,1),3,3) # creates the covariate matrix 
 +#build the correlated variables. Note: empirical=TRUE means make the correlation EXACTLY r. 
 +# if we say empirical=FALSE, the correlation would be normally distributed around r
 +set.seed(42)
 +CorrDataT<-mvrnorm(n=100, mu=Means.X1X2Y,Sigma=CovMatrix.X1X2Y, empirical=TRUE)
 +#Convert them to a "Data.Frame" & add our labels to the vectors we created
 +CorrDataT<-as.data.frame(CorrDataT)
 +colnames(CorrDataT) <- c("Practice","Anxiety","Memory")
 +#make the scatter plots
 +scatterplot(Memory~Practice,CorrDataT)
 +scatterplot(Memory~Anxiety,CorrDataT)
 +scatterplot(Anxiety~Practice,CorrDataT)
 +# Pearson Correlations
 +ry1<-cor(CorrDataT$Memory,CorrDataT$Practice)
 +ry2<-cor(CorrDataT$Memory,CorrDataT$Anxiety)
 +r12<-cor(CorrDataT$Anxiety,CorrDataT$Practice)
 +</code>
 +
 +<code>
 +ry1
 +ry2
 +r12
 +ry1^2
 +ry2^2
 +r12^2
 +</code>
 +
 +<code>
 +> ry1
 +[1] 0.6
 +> ry2
 +[1] 0.4
 +> r12
 +[1] 0.3
 +
 +> ry1^2
 +[1] 0.36
 +> ry2^2
 +[1] 0.16
 +> r12^2
 +[1] 0.09
 +
 +</code>
 +
 +<code>
 +> lm.m.pa <- lm(Memory~Practice+Anxiety, data=CorrDataT)
 +> summary(lm.m.pa)
 +
 +Call:
 +lm(formula = Memory ~ Practice + Anxiety, data = CorrDataT)
 +
 +Residuals:
 +     Min       1Q   Median       3Q      Max 
 +-1.99998 -0.54360  0.04838  0.43878  2.74186 
 +
 +Coefficients:
 +            Estimate Std. Error t value Pr(>|t|)    
 +(Intercept)  2.30769    0.96783   2.384  0.01905 *  
 +Practice     0.52747    0.08153   6.469 4.01e-09 ***
 +Anxiety      0.24176    0.08153   2.965  0.00381 ** 
 +---
 +Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 +
 +Residual standard error: 0.7739 on 97 degrees of freedom
 +Multiple R-squared:  0.4132, Adjusted R-squared:  0.4011 
 +F-statistic: 34.15 on 2 and 97 DF,  p-value: 5.919e-12
 +
 +</code>
 +
 +{{  http://www.alexanderdemos.org/RegressionClass/Ballantine.jpg?200  }}
 +
 +a+b+c+e = variance of y = total variance = 100% = 1 이라고 보면 
 +r<sup>2</sup> = (a + b + c) / (a + b + c + e) = a + b + c = 0.4132 (from summary(lm.m.pa))
 +
 +이 중에서 우리는 이미 
 +a + c = 0.6<sup>2</sup> = 0.36
 +b + c = 0.4<sup>2</sup> = 0.16
 +
 +따라서 Practice를 제어한 Anxiety의 영향력은 
 +r<sup>2</sup> - (a+c) = 0.4132 - 0.36 = 0.0532
 +반대로, Anxiety를 제어한 Practice의 영향력은 
 +r<sup>2</sup> - (b+c) = 0.4132 - 0.16 = 0.2532
  
r/multiple_regression.txt · Last modified: 2023/10/19 08:23 by hkimscil

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki