====== Beta coefficients in linear regression ====== {{:pasted:20190521-113150.png?200}} \begin{align*} \large{\beta = b * \frac{sd(x)}{sd(y)}} \ \end{align*} # import test score data "tests_cor.csv" tests <- read.csv("http://commres.net/wiki/_media/r/tests_cor.csv") colnames(tests) <- c("ser", "sat", "clep", "gpa") tests <- subset(tests, select=c("sat", "clep", "gpa")) attach(tests) lm.gpa.clepsat <- lm(gpa ~ clep + sat, data = tests) summary(lm.gpa.clepsat) Call: lm(formula = gpa ~ clep + sat, data = tests) Residuals: Min 1Q Median 3Q Max -0.197888 -0.128974 -0.000528 0.131170 0.226404 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.1607560 0.4081117 2.844 0.0249 * clep 0.0729294 0.0253799 2.874 0.0239 * sat -0.0007015 0.0012564 -0.558 0.5940 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.1713 on 7 degrees of freedom Multiple R-squared: 0.7778, Adjusted R-squared: 0.7143 F-statistic: 12.25 on 2 and 7 DF, p-value: 0.005175 > > sd.clep <- sd(clep) > sd.sat <- sd(sat) > sd.gpa <- sd(gpa) > lm.gpa.clepsat <- lm(gpa ~ clep + sat, data = tests) > summary(lm.gpa.clepsat) Call: lm(formula = gpa ~ clep + sat, data = tests) Residuals: Min 1Q Median 3Q Max -0.197888 -0.128974 -0.000528 0.131170 0.226404 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.1607560 0.4081117 2.844 0.0249 * clep 0.0729294 0.0253799 2.874 0.0239 * sat -0.0007015 0.0012564 -0.558 0.5940 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.1713 on 7 degrees of freedom Multiple R-squared: 0.7778, Adjusted R-squared: 0.7143 F-statistic: 12.25 on 2 and 7 DF, p-value: 0.005175 > b.clep <- 0.0729294 > b.sat <- -0.0007015 > beta.clep <- b.clep * (sd.clep/sd.gpa) > beta.sat <- b.sat * (sd.sat/sd.gpa) > lm.beta(lm.gpa.clepsat) Call: lm(formula = gpa ~ clep + sat, data = tests) Standardized Coefficients:: (Intercept) clep sat 0.0000000 1.0556486 -0.2051189 > beta.clep [1] 1.055648 > beta.sat [1] -0.2051187 > ====== e.g. ====== # get marketing data marketing <- read.csv("http://commres.net/wiki/_media/marketing_from_datarium.csv") head(marketing) # note that I need - X to get rid of X column in the marketing data mod <- lm(sales ~ . - X, data=marketing) summary(mod) > marketing <- read.csv("http://commres.net/wiki/_media/marketing_from_datarium.csv") > head(marketing) X youtube facebook newspaper sales 1 1 276.12 45.36 83.04 26.52 2 2 53.40 47.16 54.12 12.48 3 3 20.64 55.08 83.16 11.16 4 4 181.80 49.56 70.20 22.20 5 5 216.96 12.96 70.08 15.48 6 6 10.44 58.68 90.00 8.64 # note that I need - X to get rid of X column in the marketing data > mod <- lm(sales ~ . - X, data=marketing) > summary(mod) Call: lm(formula = sales ~ . - X, data = marketing) Residuals: Min 1Q Median 3Q Max -10.5932 -1.0690 0.2902 1.4272 3.3951 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 3.526667 0.374290 9.422 <2e-16 *** youtube 0.045765 0.001395 32.809 <2e-16 *** facebook 0.188530 0.008611 21.893 <2e-16 *** newspaper -0.001037 0.005871 -0.177 0.86 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 2.023 on 196 degrees of freedom Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956 F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16 install.packages(lm.beta) library(lm.beta) lm.beta(mod) lm.beta(mod) Call: lm(formula = sales ~ . - X, data = marketing) Standardized Coefficients:: (Intercept) youtube facebook newspaper 0.000000000 0.753065912 0.536481550 -0.004330686 > These beta coefficients also can be got from the coefficents from standardized data. mod.formula <- sales ~ youtube + facebook + newspaper all.vars(mod.formula) marketing.temp <- sapply(marketing[ , all.vars(mod.formula)], scale) head(marketing.temp) mod.scaled <- lm(sales ~ ., data=marketing.scaled) head(marketing.scaled) coefficients(mod.scaled) > mod.formula <- sales ~ youtube + facebook + newspaper > all.vars(mod.formula) [1] "sales" "youtube" "facebook" "newspaper" > marketing.temp <- sapply(marketing[ , all.vars(mod.formula)], scale) > head(marketing.temp) sales youtube facebook newspaper [1,] 1.5481681 0.96742460 0.9790656 1.7744925 [2,] -0.6943038 -1.19437904 1.0800974 0.6679027 [3,] -0.9051345 -1.51235985 1.5246374 1.7790842 [4,] 0.8581768 0.05191939 1.2148065 1.2831850 [5,] -0.2151431 0.39319551 -0.8395070 1.2785934 [6,] -1.3076295 -1.61136487 1.7267010 2.0408088 > mod.scaled <- lm(sales ~ ., data=marketing.scaled) > head(marketing.scaled) sales youtube facebook newspaper 1 1.5481681 0.96742460 0.9790656 1.7744925 2 -0.6943038 -1.19437904 1.0800974 0.6679027 3 -0.9051345 -1.51235985 1.5246374 1.7790842 4 0.8581768 0.05191939 1.2148065 1.2831850 5 -0.2151431 0.39319551 -0.8395070 1.2785934 6 -1.3076295 -1.61136487 1.7267010 2.0408088 > coefficients(mod.scaled) (Intercept) youtube facebook newspaper -5.034110e-16 7.530659e-01 5.364815e-01 -4.330686e-03 > > check out that ''lm.beta(mod) == coefficients(mod.scaled)'' and 베타를 구하고 나면 서로의 계수값을 절대비교할 수 있다. ''lm.beta(mod)''의 아웃풋을 보고 ''youtube'', ''facebook'', and ''newspaper'' 순으로 설명력을 갖는다고 말할 수 있다 (혹은 newspaper를 분석에서 제외하고 다시 분석하여 둘만의 설명력을 보는 것도 방법이다.