====== Beta coefficients in linear regression ======
{{:pasted:20190521-113150.png?200}}
\begin{align*}
\large{\beta = b * \frac{sd(x)}{sd(y)}} \
\end{align*}
# import test score data "tests_cor.csv"
tests <- read.csv("http://commres.net/wiki/_media/r/tests_cor.csv")
colnames(tests) <- c("ser", "sat", "clep", "gpa")
tests <- subset(tests, select=c("sat", "clep", "gpa"))
attach(tests)
lm.gpa.clepsat <- lm(gpa ~ clep + sat, data = tests)
summary(lm.gpa.clepsat)
Call:
lm(formula = gpa ~ clep + sat, data = tests)
Residuals:
Min 1Q Median 3Q Max
-0.197888 -0.128974 -0.000528 0.131170 0.226404
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.1607560 0.4081117 2.844 0.0249 *
clep 0.0729294 0.0253799 2.874 0.0239 *
sat -0.0007015 0.0012564 -0.558 0.5940
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1713 on 7 degrees of freedom
Multiple R-squared: 0.7778, Adjusted R-squared: 0.7143
F-statistic: 12.25 on 2 and 7 DF, p-value: 0.005175
>
> sd.clep <- sd(clep)
> sd.sat <- sd(sat)
> sd.gpa <- sd(gpa)
> lm.gpa.clepsat <- lm(gpa ~ clep + sat, data = tests)
> summary(lm.gpa.clepsat)
Call:
lm(formula = gpa ~ clep + sat, data = tests)
Residuals:
Min 1Q Median 3Q Max
-0.197888 -0.128974 -0.000528 0.131170 0.226404
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.1607560 0.4081117 2.844 0.0249 *
clep 0.0729294 0.0253799 2.874 0.0239 *
sat -0.0007015 0.0012564 -0.558 0.5940
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1713 on 7 degrees of freedom
Multiple R-squared: 0.7778, Adjusted R-squared: 0.7143
F-statistic: 12.25 on 2 and 7 DF, p-value: 0.005175
> b.clep <- 0.0729294
> b.sat <- -0.0007015
> beta.clep <- b.clep * (sd.clep/sd.gpa)
> beta.sat <- b.sat * (sd.sat/sd.gpa)
> lm.beta(lm.gpa.clepsat)
Call:
lm(formula = gpa ~ clep + sat, data = tests)
Standardized Coefficients::
(Intercept) clep sat
0.0000000 1.0556486 -0.2051189
> beta.clep
[1] 1.055648
> beta.sat
[1] -0.2051187
>
====== e.g. ======
# get marketing data
marketing <- read.csv("http://commres.net/wiki/_media/marketing_from_datarium.csv")
head(marketing)
# note that I need - X to get rid of X column in the marketing data
mod <- lm(sales ~ . - X, data=marketing)
summary(mod)
> marketing <- read.csv("http://commres.net/wiki/_media/marketing_from_datarium.csv")
> head(marketing)
X youtube facebook newspaper sales
1 1 276.12 45.36 83.04 26.52
2 2 53.40 47.16 54.12 12.48
3 3 20.64 55.08 83.16 11.16
4 4 181.80 49.56 70.20 22.20
5 5 216.96 12.96 70.08 15.48
6 6 10.44 58.68 90.00 8.64
# note that I need - X to get rid of X column in the marketing data
> mod <- lm(sales ~ . - X, data=marketing)
> summary(mod)
Call:
lm(formula = sales ~ . - X, data = marketing)
Residuals:
Min 1Q Median 3Q Max
-10.5932 -1.0690 0.2902 1.4272 3.3951
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.526667 0.374290 9.422 <2e-16 ***
youtube 0.045765 0.001395 32.809 <2e-16 ***
facebook 0.188530 0.008611 21.893 <2e-16 ***
newspaper -0.001037 0.005871 -0.177 0.86
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.023 on 196 degrees of freedom
Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
install.packages(lm.beta)
library(lm.beta)
lm.beta(mod)
lm.beta(mod)
Call:
lm(formula = sales ~ . - X, data = marketing)
Standardized Coefficients::
(Intercept) youtube facebook newspaper
0.000000000 0.753065912 0.536481550 -0.004330686
>
These beta coefficients also can be got from the coefficents from standardized data.
mod.formula <- sales ~ youtube + facebook + newspaper
all.vars(mod.formula)
marketing.temp <- sapply(marketing[ , all.vars(mod.formula)], scale)
head(marketing.temp)
mod.scaled <- lm(sales ~ ., data=marketing.scaled)
head(marketing.scaled)
coefficients(mod.scaled)
> mod.formula <- sales ~ youtube + facebook + newspaper
> all.vars(mod.formula)
[1] "sales" "youtube" "facebook" "newspaper"
> marketing.temp <- sapply(marketing[ , all.vars(mod.formula)], scale)
> head(marketing.temp)
sales youtube facebook newspaper
[1,] 1.5481681 0.96742460 0.9790656 1.7744925
[2,] -0.6943038 -1.19437904 1.0800974 0.6679027
[3,] -0.9051345 -1.51235985 1.5246374 1.7790842
[4,] 0.8581768 0.05191939 1.2148065 1.2831850
[5,] -0.2151431 0.39319551 -0.8395070 1.2785934
[6,] -1.3076295 -1.61136487 1.7267010 2.0408088
> mod.scaled <- lm(sales ~ ., data=marketing.scaled)
> head(marketing.scaled)
sales youtube facebook newspaper
1 1.5481681 0.96742460 0.9790656 1.7744925
2 -0.6943038 -1.19437904 1.0800974 0.6679027
3 -0.9051345 -1.51235985 1.5246374 1.7790842
4 0.8581768 0.05191939 1.2148065 1.2831850
5 -0.2151431 0.39319551 -0.8395070 1.2785934
6 -1.3076295 -1.61136487 1.7267010 2.0408088
> coefficients(mod.scaled)
(Intercept) youtube facebook newspaper
-5.034110e-16 7.530659e-01 5.364815e-01 -4.330686e-03
>
>
check out that
''lm.beta(mod) == coefficients(mod.scaled)''
and
베타를 구하고 나면 서로의 계수값을 절대비교할 수 있다.
''lm.beta(mod)''의 아웃풋을 보고
''youtube'', ''facebook'', and ''newspaper'' 순으로 설명력을 갖는다고 말할 수 있다 (혹은 newspaper를 분석에서 제외하고 다시 분석하여 둘만의 설명력을 보는 것도 방법이다.