====== Multiple Regression ======
{{:R:dataset_hlr.csv|data file}}
University of New Mexico enrollment data (29 years)
  * ROLL: # of enrollment
  * UNEM: unemployment rate
  * HGRAD: # of high school graduates
  * INC: per capita income
<code>
# data import
> datavar <- read.csv("http://commres.net/wiki/_media/r/dataset_hlr.csv")
> str(datavar)
'data.frame': 29 obs. of  5 variables:
  
</code>
<code>
two.predictor.model <- lm(ROLL ~ UNEM + HGRAD, datavar)
summary(two.predictor.model)
two.predictor.model
</code>

<code>
three.predictor.model <- lm(ROLL ~ UNEM + HGRAD + INC, datavar)
summary(three.predictor.model)
three.predictor.model
</code>
  
<code>
two.predictor.model <- lm(ROLL ~ UNEM + HGRAD, datavar)
summary(two.predictor.model)

Call:
lm(formula = ROLL ~ UNEM + HGRAD, data = datavar)
...
F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11

two.predictor.model

Call:
lm(formula = ROLL ~ UNEM + HGRAD, data = datavar)

Coefficients:
(Intercept)         UNEM        HGRAD   
 -8255.7511     698.2681       0.9423   

> 
</code>
<code>
> three.predictor.model <- lm(ROLL ~ UNEM + HGRAD + INC, datavar)
> summary(three.predictor.model)

Call:
lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)
...
F-statistic: 211.5 on 3 and 25 DF,  p-value: < 2.2e-16

> three.predictor.model

Call:
lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)

Coefficients:
(Intercept)         UNEM        HGRAD          INC  
 -9153.2545     450.1245       0.4065       4.2749  

</code>
  
Suppose
  * unemployment rate (UNEM) = 9%, 10%, 15%
  * spring high school graduating class (HGRAD) = 100000, 98000, 78000
  * per capita income (INC) = \$30000, \$28000, \$19000
  * Given these values, how can we predict enrollment?

The prediction model obtained above is:
$$ \hat{Y} = -9153.2545 + 450.1245 \cdot \text{UNEM} + 0.4065 \cdot \text{HGRAD} + 4.2749 \cdot \text{INC} $$
Substituting these values into the model:

<code>
new.data <- data.frame(UNEM=c(9, 10, 15), HGRAD=c(100000, 98000, 78000), INC=c(30000, 28000, 19000))
predict(three.predictor.model, newdata=new.data)
</code>
<code>
> new.data <- data.frame(UNEM=c(9, 10, 15), HGRAD=c(100000, 98000, 78000), INC=c(30000, 28000, 19000))
> predict(three.predictor.model, newdata=new.data)
       1        2        3 
163792.0 154879.4 110526.6 

</code>
\begin{align*}
\hat{Y} & = -9153.2545 + 450.1245 \cdot \text{UNEM} + 0.4065 \cdot \text{HGRAD} + 4.2749 \cdot \text{INC}  \\
163792.0 & = -9153.2545 + 450.1245 \cdot (9) + 0.4065 \cdot (100000) + 4.2749 \cdot (30000) \\
154879.4 & = -9153.2545 + 450.1245 \cdot (10) + 0.4065 \cdot (98000) + 4.2749 \cdot (28000) \\
110526.6 & = -9153.2545 + 450.1245 \cdot (15) + 0.4065 \cdot (78000) + 4.2749 \cdot (19000)
\end{align*}
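
The same arithmetic can be checked in R with the model's stored coefficients (a minimal sketch, not part of the original session):
<code>
# hypothetical check: rebuild the first prediction from the fitted coefficients
b <- coef(three.predictor.model)
as.numeric(b[1] + b[2]*9 + b[3]*100000 + b[4]*30000)
# should agree with predict()'s first value, 163792.0
</code>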

Examining the beta coefficients
see [[:beta coefficients]]
<code>
# install.packages('lm.beta')
library(lm.beta)
lm.beta(three.predictor.model)
</code>

<code>
> # install.packages('lm.beta')
> library(lm.beta)
> lm.beta(three.predictor.model)

Call:
lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)

Standardized Coefficients::
(Intercept)        UNEM       HGRAD         INC 
  0.0000000   0.1553619   0.3656177   0.6061762 

</code>
By hand:
<code>
# since beta = coefficient * (sd(x) / sd(y))

attach(datavar)
sd.roll <- sd(ROLL)
sd.unem <- sd(UNEM)
sd.hgrad <- sd(HGRAD)
sd.inc <- sd(INC)

b.unem <- three.predictor.model$coefficients[2]
b.hgrad <- three.predictor.model$coefficients[3]
b.inc <- three.predictor.model$coefficients[4]

## or
b.unem <- 4.501e+02
b.hgrad <- 4.065e-01
b.inc <- 4.275e+00

b.unem * (sd.unem / sd.roll)
b.hgrad * (sd.hgrad / sd.roll)
b.inc * (sd.inc / sd.roll)

lm.beta(three.predictor.model)
</code>
Output of the above:
<code>
> sd.roll <- sd(ROLL)
> sd.unem <- sd(UNEM)
> sd.hgrad <- sd(HGRAD)
> sd.inc <- sd(INC)

> b.unem <- three.predictor.model$coefficients[2]
> b.hgrad <- three.predictor.model$coefficients[3]
> b.inc <- three.predictor.model$coefficients[4]

> ## or
> b.unem <- 4.501e+02
> b.hgrad <- 4.065e-01
> b.inc <- 4.275e+00

> b.unem * (sd.unem / sd.roll)
[1] 0.1554
> b.hgrad * (sd.hgrad / sd.roll)
[1] 0.3656
> b.inc * (sd.inc / sd.roll)
[1] 0.6062

> lm.beta(three.predictor.model)

Call:
lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)

Standardized Coefficients::
(Intercept)        UNEM       HGRAD         INC 
     0.0000      0.1554      0.3656      0.6062 

</code>
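
As a cross-check (a sketch, not from the original page), the same betas fall out of refitting the model on z-scores with ''scale()'': the slopes of a fully standardized regression are the beta coefficients.
<code>
# standardize every variable, refit, and read the slopes as betas
std.model <- lm(scale(ROLL) ~ scale(UNEM) + scale(HGRAD) + scale(INC), data = datavar)
round(coef(std.model), 4)
# expected to match lm.beta: UNEM 0.1554, HGRAD 0.3656, INC 0.6062
</code>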

See also [[:sequential_regression#eg_3_college_enrollment_in_new_mexico_university|Sequential method]] for regression modeling by hand.
See also [[:statistical regression methods]] for regression modeling by computer.

<code>
> library(MASS)   # for stepAIC()
> fit <- three.predictor.model
> step <- stepAIC(fit, direction="both")
Start:  AIC=381.2
ROLL ~ UNEM + HGRAD + INC

        Df Sum of Sq      RSS AIC
<none>               11237313 381
- UNEM   1   6522098 17759411 392
- HGRAD  1  12852039 24089352 401
- INC    1  33568255 44805568 419

</code>
====== Housing ======

====== etc ======
{{:marketing_from_datarium.csv}}
<code>
marketing <- read.csv("http://commres.net/wiki/_media/marketing_from_datarium.csv")
</code>

<code>
# install.packages("tidyverse", dep=TRUE)
# install.packages("datarium")
library(tidyverse)
data("marketing", package = "datarium")
</code>
  * Note that to list all the independent (explanatory) variables, you could use ''lm(sales ~ ., data = marketing)''.
  * You could also use the ''-'' sign to drop IVs: ''lm(sales ~ . - newspaper, data = marketing)''. See the sketch below.
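
A minimal sketch of this dot notation (assuming the datarium ''marketing'' columns youtube, facebook, and newspaper):
<code>
# '.' expands to every remaining column; '-' drops one of them
m.all <- lm(sales ~ ., data = marketing)              # youtube + facebook + newspaper
m.sub <- lm(sales ~ . - newspaper, data = marketing)  # youtube + facebook only
summary(m.all)
</code>
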
^ source  ^ df  ^ SS  ^ MS  ^ F  ^
| interest  | 1 (a)  | 894463 (1)  | 894463  | 179.6471179  |
| unemp  | 1 (b)  | 22394 (2)  | 22394  | 4.497690299  |
| res  | 21 %%(%%c%%)%%  | 104559 (3)  | 4979  |   |
| total  | 23  | 1021416 (4)  |   |   |
| interest \\ + unemp  |   | 916857 (5)  |   |   |
  
(4) = (1) + (2) + (3)
<code>
...
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> </code>
====== e.g. 5 ======
http://www.alexanderdemos.org/Class2.html

<code>
# packages needed to create and graph our data
library(MASS)  # create data (mvrnorm)
library(car)   # graph data (scatterplot)
py1 = .6  # cor between X1 (Practice Time) and Memory Errors
py2 = .4  # cor between X2 (Performance Anxiety) and Memory Errors
p12 = .3  # cor between X1 (Practice Time) and X2 (Performance Anxiety)
Means.X1X2Y <- c(10,10,10)  # set the means of the X and Y variables
CovMatrix.X1X2Y <- matrix(c(1,p12,py1,
                            p12,1,py2,
                            py1,py2,1),3,3)  # create the covariance (correlation) matrix
# build the correlated variables. Note: empirical=TRUE makes the correlations EXACTLY r;
# with empirical=FALSE, the sample correlations would vary randomly around r
set.seed(42)
CorrDataT <- mvrnorm(n=100, mu=Means.X1X2Y, Sigma=CovMatrix.X1X2Y, empirical=TRUE)
# convert to a data.frame and label the vectors we created
CorrDataT <- as.data.frame(CorrDataT)
colnames(CorrDataT) <- c("Practice","Anxiety","Memory")
# make the scatter plots
scatterplot(Memory~Practice, CorrDataT)
scatterplot(Memory~Anxiety, CorrDataT)
scatterplot(Anxiety~Practice, CorrDataT)
# Pearson correlations
ry1 <- cor(CorrDataT$Memory, CorrDataT$Practice)
ry2 <- cor(CorrDataT$Memory, CorrDataT$Anxiety)
r12 <- cor(CorrDataT$Anxiety, CorrDataT$Practice)
</code>

<code>
ry1
ry2
r12
ry1^2
ry2^2
r12^2
</code>

<code>
> ry1
[1] 0.6
> ry2
[1] 0.4
> r12
[1] 0.3

> ry1^2
[1] 0.36
> ry2^2
[1] 0.16
> r12^2
[1] 0.09

</code>

<code>
> lm.m.pa <- lm(Memory~Practice+Anxiety, data=CorrDataT)
> summary(lm.m.pa)

Call:
lm(formula = Memory ~ Practice + Anxiety, data = CorrDataT)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.99998 -0.54360  0.04838  0.43878  2.74186 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.30769    0.96783   2.384  0.01905 *  
Practice     0.52747    0.08153   6.469 4.01e-09 ***
Anxiety      0.24176    0.08153   2.965  0.00381 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.7739 on 97 degrees of freedom
Multiple R-squared:  0.4132, Adjusted R-squared:  0.4011 
F-statistic: 34.15 on 2 and 97 DF,  p-value: 5.919e-12

</code>

{{  http://www.alexanderdemos.org/RegressionClass/Ballantine.jpg?200  }}

If we treat a + b + c + e = variance of Y = total variance = 100% = 1, then
R<sup>2</sup> = (a + b + c) / (a + b + c + e) = a + b + c = 0.4132 (from summary(lm.m.pa))

Of these, we already know:
a + c = 0.6<sup>2</sup> = 0.36
b + c = 0.4<sup>2</sup> = 0.16

Therefore, the unique contribution of Anxiety, controlling for Practice, is
R<sup>2</sup> - (a + c) = 0.4132 - 0.36 = 0.0532
Conversely, the unique contribution of Practice, controlling for Anxiety, is
R<sup>2</sup> - (b + c) = 0.4132 - 0.16 = 0.2532
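
These unique shares (squared semipartial correlations) can be verified directly from quantities computed above; a small check, not part of the original session:
<code>
# unique share of each IV = full-model R^2 minus the other IV's squared zero-order r
r2.full <- summary(lm.m.pa)$r.squared  # 0.4132
r2.full - ry1^2   # Anxiety beyond Practice: 0.4132 - 0.36 = 0.0532
r2.full - ry2^2   # Practice beyond Anxiety: 0.4132 - 0.16 = 0.2532
</code>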
  