This is an old revision of the document!
Table of Contents
Categorical variables
2 groups
rscr01
rm(list=ls())
library(dplyr)
library(ggplot2)
ss <- function(x) {
sum((x-mean(x))^2)
}
sp <- function(x, y) {
sum((x-mean(x))*(y-mean(y)))
}
# 1. Generate numerical variable
set.seed(12)
n <- 30
n.1 <- round(rnorm(n, mean = 50, sd = 20),0) # Continuous variable
n.2 = round(rnorm(n, 55, 20),0)
n.3 = round(rnorm(n, 60, 22),0)
sc <- data.frame(n.1,n.2,n.3)
sc.s <- stack(sc)
sc.s <- sc.s[-2]
head(sc.s)
c.1 <- (rep("lo", n))
c.2 <- (rep("mid", n))
c.3 <- rep("hi", n)
ca <- data.frame(c.1,c.2,c.3)
head(ca)
ca.s <- stack(ca)
ca.s <- ca.s[-2]
head(ca.s)
df <- data.frame(sc.s, ca.s)
colnames(df) <-c("score", "group")
df$group <- factor(df$group)
str(df)
head(df)
amean <- aggregate(df[, 1], list(df$group), mean)
colnames(amean)=c("group", "mean")
amean
boxplot(df$score~df$group)
# 세 그룹 간의 차이를 보려면
# aov 테스트를 (F-test) 하면
# 된다.
fm.df <- aov(score~group, data=df)
summary(fm.df)
# 이것을 regression을 해도 된다
m.df <- lm(score~group, data=df)
summary(m.df)
anova(m.df)
summary(fm.df)
boxplot(df$score~df$group)
amean
# 2. Manually create dummy variables (k - 1 = 3 variables needed)
df$lo <- ifelse(df$group == "lo", 1, 0)
df$mid <- ifelse(df$group == "mid", 1, 0)
df$hi <- ifelse(df$group == "hi", 1, 0)
head(df)
m.tmp <- lm(score~lo+mid, data=df)
summary(m.tmp)
summary(m.df)
m.tmp2 <- lm(score~mid+hi, data=df)
summary(m.tmp2)
b.hi <- m.tmp$coefficients[1]
b.lo <- m.tmp$coefficients[2]
b.mid <- m.tmp$coefficients[3]
b.lo
b.mid
b.hi
mean.hi <- b.hi
mean.mid <- mean.hi+b.mid
mean.lo <- mean.hi+b.lo
data.frame(mean.hi, mean.lo, mean.mid)
amean
am.df <- aov(score~group, df)
summary(am.df)
posthoc.am.df <- TukeyHSD(am.df)
posthoc.am.df
# t-test의 의미 = baseline과 (절편) 차이가 있는가 테스트
# basedline의 t-test는 baseline이 0과 다른가?
# 다른 조건과 차이를 테스트하는 것은 아님 (주의)
summary(m.df)
# 위에서 grouplo의 significance는 lo 그룹과 hi 그룹이
# (절편, baseline) 차이가 있는가를 테스트한 것
# 마찬가지로 groupmid의 t-test는 baseline인 grouphi와
# 차이가 있는지 본 것으로 not significant
# 여기서 보지 못한 것은 mid와 lo그룹 간의 차이가 있는지를
# 보는 것이다
# 이를 위해서는 아래를 보면 된다
summary(m.tmp2)
# 위의 결과는 baseline인 lo가 mid와는 차이가 없는 것을
# 나타낸다. lo와 hi는 존재
# 위는 두개의 regression을 해서 그룹 간 차이를
# post hoc test처럼 한 것이고 이를 한 번에 처리하는
# 방법은
# install.packages("emmeans")
library(emmeans)
# Calculate estimated marginal means for each group
gmeans <- emmeans(m.df, specs = ~ group)
# Perform pairwise comparisons using Tukey's adjustment (Default)
pairs(gmeans, adjust = "tukey")
rout01
> rm(list=ls())
>
> library(dplyr)
> library(ggplot2)
>
> ss <- function(x) {
+ sum((x-mean(x))^2)
+ }
> sp <- function(x, y) {
+ sum((x-mean(x))*(y-mean(y)))
+ }
>
> # 1. Generate numerical variable
> set.seed(12)
> n <- 30
> n.1 <- round(rnorm(n, mean = 50, sd = 20),0) # Continuous variable
> n.2 = round(rnorm(n, 55, 20),0)
> n.3 = round(rnorm(n, 60, 22),0)
> sc <- data.frame(n.1,n.2,n.3)
> sc.s <- stack(sc)
> sc.s <- sc.s[-2]
> head(sc.s)
values
1 20
2 82
3 31
4 32
5 10
6 45
>
> c.1 <- (rep("lo", n))
> c.2 <- (rep("mid", n))
> c.3 <- rep("hi", n)
> ca <- data.frame(c.1,c.2,c.3)
> head(ca)
c.1 c.2 c.3
1 lo mid hi
2 lo mid hi
3 lo mid hi
4 lo mid hi
5 lo mid hi
6 lo mid hi
> ca.s <- stack(ca)
> ca.s <- ca.s[-2]
> head(ca.s)
values
1 lo
2 lo
3 lo
4 lo
5 lo
6 lo
>
> df <- data.frame(sc.s, ca.s)
> colnames(df) <-c("score", "group")
> df$group <- factor(df$group)
> str(df)
'data.frame': 90 obs. of 2 variables:
$ score: num 20 82 31 32 10 45 44 37 48 59 ...
$ group: Factor w/ 3 levels "hi","lo","mid": 2 2 2 2 2 2 2 2 2 2 ...
>
> head(df)
score group
1 20 lo
2 82 lo
3 31 lo
4 32 lo
5 10 lo
6 45 lo
> amean <- aggregate(df[, 1], list(df$group), mean)
> colnames(amean)=c("group", "mean")
> amean
group mean
1 hi 62.33333
2 lo 46.96667
3 mid 54.10000
> boxplot(df$score~df$group)
>
> # 세 그룹 간의 차이를 보려면
> # aov 테스트를 (F-test) 하면
> # 된다.
> fm.df <- aov(score~group, data=df)
> summary(fm.df)
Df Sum Sq Mean Sq F value Pr(>F)
group 2 3548 1774.0 5.278 0.00686 **
Residuals 87 29242 336.1
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> # 이것을 regression을 해도 된다
> m.df <- lm(score~group, data=df)
> summary(m.df)
Call:
lm(formula = score ~ group, data = df)
Residuals:
Min 1Q Median 3Q Max
-49.333 -11.242 -1.533 12.000 43.033
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.333 3.347 18.622 < 2e-16 ***
grouplo -15.367 4.734 -3.246 0.00166 **
groupmid -8.233 4.734 -1.739 0.08552 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared: 0.1082, Adjusted R-squared: 0.0877
F-statistic: 5.278 on 2 and 87 DF, p-value: 0.006863
> anova(m.df)
Analysis of Variance Table
Response: score
Df Sum Sq Mean Sq F value Pr(>F)
group 2 3548.1 1774.03 5.278 0.006863 **
Residuals 87 29242.3 336.12
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> summary(fm.df)
Df Sum Sq Mean Sq F value Pr(>F)
group 2 3548 1774.0 5.278 0.00686 **
Residuals 87 29242 336.1
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> boxplot(df$score~df$group)
> amean
group mean
1 hi 62.33333
2 lo 46.96667
3 mid 54.10000
>
> # 2. Manually create dummy variables (k - 1 = 3 variables needed)
> df$lo <- ifelse(df$group == "lo", 1, 0)
> df$mid <- ifelse(df$group == "mid", 1, 0)
> df$hi <- ifelse(df$group == "hi", 1, 0)
>
> head(df)
score group lo mid hi
1 20 lo 1 0 0
2 82 lo 1 0 0
3 31 lo 1 0 0
4 32 lo 1 0 0
5 10 lo 1 0 0
6 45 lo 1 0 0
> m.tmp <- lm(score~lo+mid, data=df)
> summary(m.tmp)
Call:
lm(formula = score ~ lo + mid, data = df)
Residuals:
Min 1Q Median 3Q Max
-49.333 -11.242 -1.533 12.000 43.033
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.333 3.347 18.622 < 2e-16 ***
lo -15.367 4.734 -3.246 0.00166 **
mid -8.233 4.734 -1.739 0.08552 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared: 0.1082, Adjusted R-squared: 0.0877
F-statistic: 5.278 on 2 and 87 DF, p-value: 0.006863
> summary(m.df)
Call:
lm(formula = score ~ group, data = df)
Residuals:
Min 1Q Median 3Q Max
-49.333 -11.242 -1.533 12.000 43.033
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.333 3.347 18.622 < 2e-16 ***
grouplo -15.367 4.734 -3.246 0.00166 **
groupmid -8.233 4.734 -1.739 0.08552 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared: 0.1082, Adjusted R-squared: 0.0877
F-statistic: 5.278 on 2 and 87 DF, p-value: 0.006863
> m.tmp2 <- lm(score~mid+hi, data=df)
> summary(m.tmp2)
Call:
lm(formula = score ~ mid + hi, data = df)
Residuals:
Min 1Q Median 3Q Max
-49.333 -11.242 -1.533 12.000 43.033
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.967 3.347 14.031 < 2e-16 ***
mid 7.133 4.734 1.507 0.13545
hi 15.367 4.734 3.246 0.00166 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared: 0.1082, Adjusted R-squared: 0.0877
F-statistic: 5.278 on 2 and 87 DF, p-value: 0.006863
>
> b.hi <- m.tmp$coefficients[1]
> b.lo <- m.tmp$coefficients[2]
> b.mid <- m.tmp$coefficients[3]
> b.lo
lo
-15.36667
> b.mid
mid
-8.233333
> b.hi
(Intercept)
62.33333
>
> mean.hi <- b.hi
> mean.mid <- mean.hi+b.mid
> mean.lo <- mean.hi+b.lo
> data.frame(mean.hi, mean.lo, mean.mid)
mean.hi mean.lo mean.mid
(Intercept) 62.33333 46.96667 54.1
> amean
group mean
1 hi 62.33333
2 lo 46.96667
3 mid 54.10000
>
> am.df <- aov(score~group, df)
> summary(am.df)
Df Sum Sq Mean Sq F value Pr(>F)
group 2 3548 1774.0 5.278 0.00686 **
Residuals 87 29242 336.1
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> posthoc.am.df <- TukeyHSD(am.df)
> posthoc.am.df
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = score ~ group, data = df)
$group
diff lwr upr p adj
lo-hi -15.366667 -26.654078 -4.079255 0.0046872
mid-hi -8.233333 -19.520745 3.054078 0.1965275
mid-lo 7.133333 -4.154078 18.420745 0.2926865
>
> # t-test의 의미 = baseline과 (절편) 차이가 있는가 테스트
> # basedline의 t-test는 baseline이 0과 다른가?
> # 다른 조건과 차이를 테스트하는 것은 아님 (주의)
> summary(m.df)
Call:
lm(formula = score ~ group, data = df)
Residuals:
Min 1Q Median 3Q Max
-49.333 -11.242 -1.533 12.000 43.033
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.333 3.347 18.622 < 2e-16 ***
grouplo -15.367 4.734 -3.246 0.00166 **
groupmid -8.233 4.734 -1.739 0.08552 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared: 0.1082, Adjusted R-squared: 0.0877
F-statistic: 5.278 on 2 and 87 DF, p-value: 0.006863
> # 위에서 grouplo의 significance는 lo 그룹과 hi 그룹이
> # (절편, baseline) 차이가 있는가를 테스트한 것
> # 마찬가지로 groupmid의 t-test는 baseline인 grouphi와
> # 차이가 있는지 본 것으로 not significant
> # 여기서 보지 못한 것은 mid와 lo그룹 간의 차이가 있는지를
> # 보는 것이다
> # 이를 위해서는 아래를 보면 된다
> summary(m.tmp2)
Call:
lm(formula = score ~ mid + hi, data = df)
Residuals:
Min 1Q Median 3Q Max
-49.333 -11.242 -1.533 12.000 43.033
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.967 3.347 14.031 < 2e-16 ***
mid 7.133 4.734 1.507 0.13545
hi 15.367 4.734 3.246 0.00166 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared: 0.1082, Adjusted R-squared: 0.0877
F-statistic: 5.278 on 2 and 87 DF, p-value: 0.006863
> # 위의 결과는 baseline인 lo가 mid와는 차이가 없는 것을
> # 나타낸다. lo와 hi는 존재
> # 위는 두개의 regression을 해서 그룹 간 차이를
> # post hoc test처럼 한 것이고 이를 한 번에 처리하는
> # 방법은
>
> # install.packages("emmeans")
> library(emmeans)
> # Calculate estimated marginal means for each group
> gmeans <- emmeans(m.df, specs = ~ group)
> # Perform pairwise comparisons using Tukey's adjustment (Default)
> pairs(gmeans, adjust = "tukey")
contrast estimate SE df t.ratio p.value
hi - lo 15.37 4.73 87 3.246 0.0047
hi - mid 8.23 4.73 87 1.739 0.1965
lo - mid -7.13 4.73 87 -1.507 0.2927
P value adjustment: tukey method for comparing a family of 3 estimates
2 group: e.g. 2
data: for spss
elemapi2.sav
elemapi2_categories.sps
data: for r
elemapi2.csv
rscrp.02
####################
# 다른 예
####################
rm(list=ls())
# library(dplyr)
library(ggplot2)
ss <- function(x) {
sum((x-mean(x))^2)
}
sp <- function(x, y) {
sum((x-mean(x))*(y-mean(y)))
}
datavar <- read.csv("http://commres.net/_media/r/elemapi2.csv")
head(datavar)
rout.02
> ####################
> # 다른 예
> ####################
> rm(list=ls())
>
> # library(dplyr)
> library(ggplot2)
>
> ss <- function(x) {
+ sum((x-mean(x))^2)
+ }
> sp <- function(x, y) {
+ sum((x-mean(x))*(y-mean(y)))
+ }
>
> datavar <- read.csv("http://commres.net/_media/r/elemapi2.csv")
> head(datavar)
snum dnum api00 api99 growth meals ell yr_rnd mobility acs_k3 acs_46 not_hsg hsg some_col
1 906 41 693 600 93 67 9 0 11 16 22 0 0 0
2 889 41 570 501 69 92 21 0 33 15 32 0 0 0
3 887 41 546 472 74 97 29 0 36 17 25 0 0 0
4 876 41 571 487 84 90 27 0 27 20 30 36 45 9
5 888 41 478 425 53 89 30 0 44 18 31 50 50 0
6 4284 98 858 844 14 10 3 0 10 20 33 1 8 24
col_grad grad_sch avg_ed full emer enroll mealcat collcat
1 0 0 NA 76 24 247 2 1
2 0 0 NA 79 19 463 3 1
3 0 0 NA 68 29 395 3 1
4 9 0 1.91 87 11 418 3 1
5 0 0 1.50 87 13 520 3 1
6 36 31 3.89 100 0 343 1 2
>
datavar data set의 변인 설명
Variable Labels Variable Position Label snum 1 school number dnum 2 district number api00 3 api 2000 api99 4 api 1999 growth 5 growth 1999 to 2000 meals 6 pct free meals ell 7 english language learners yr_rnd 8 year round school mobility 9 pct 1st year in school acs_k3 10 avg class size k-3 acs_46 11 avg class size 4-6 not_hsg 12 parent not hsg hsg 13 parent hsg some_col 14 parent some college col_grad 15 parent college grad grad_sch 16 parent grad school avg_ed 17 avg parent ed full 18 pct full credential emer 19 pct emer credential enroll 20 number of students mealcat 21 Percentage free meals in 3 categories collcat 22 <none> Variables in the working file yr_rnd: 0 = 방학있음 1 = 방학없음 mealcat: 1 = 0-46% free meals 2 = 47-80 3 = 81-100
- pct (%) of emer (emergency) credentials: Some states will grant an emergency certificate or permit at the request of a school district that has posted and failed to find a qualified candidate for a teacher vacancy. It typically allows the candidate to serve in a temporary capacity for the duration of a school year.
- pct (%) of full credentials: To be fully certified, it generally means that a teacher must have graduated from an accredited college, completed an approved teacher credential program and passed a test of their academic skills.
위의 변인들 중에서 “무방학학교”가 성적에 어떤 영향을 미칠 것인가를 알아 보기 위해서 regression 테스트를 시행하였다. 아래는 그 결과이다.
rscrp.03
# data 정리
df <- datavar[,c(1,3,6,7,8,9,18,21)]
head(df)
str(df)
df$yr_rnd <- factor(df$yr_rnd, levels=c(0,1), labels=c("break", "nobr"))
df$mealcat <- factor(df$mealcat, levels=c(1:3), labels=c("to46", "to80", "to100"))
str(df)
# regression
m.br <- lm(api00~yr_rnd, data=df)
summary(m.br)
# making dummy variables
df$nobr <- ifelse(df$yr_rnd == "nobr", 1, 0)
df$br<- ifelse(df$yr_rnd == "break", 1, 0)
head(df)
m.br2 <- lm(api00~nobr,data=df)
summary(m.br2)
m.br3 <- lm(api00~br, data=df)
summary(m.br3)
# baseline 변화
df$yr_rnd <- relevel(factor(df$yr_rnd), ref = "nobr")
m.br4 <- lm(api00 ~ yr_rnd, data = df)
summary(m.br4)
# 원래로 복원
df$yr_rnd <- relevel(factor(df$yr_rnd), ref = "break")
head(df)
tmp <- aggregate(df[, 2], list(df$yr_rnd), mean)
mean.br <- tmp[[2]][1]
mean.nobr <- tmp[[2]][2]
data.frame(mean.br, mean.nobr)
boxplot(df$api00~df$yr_rnd)
t.out <- t.test(df$api00~df$yr_rnd, var.eq=T)
t.out
t.out$estimate
# str(t.out)
t.value <- t.out$statistic
f.out <- aov(df$api00~df$yr_rnd, data=df)
summary(f.out)
f.value <- summary(f.out)[[1]][1, "F value"]
summary(m.br)
f.value.lm <- summary(m.br)$fstatistic[1]
t.value.lm <- summary(m.br)$coefficients["yr_rndnobr", "t value"]
data.frame(t.value, t.value^2, f.value, f.value.lm, t.value.lm)
plot(df$yr_rnd,df$api00)
df %>%
ggplot(aes(x = yr_rnd, y = api00, color = factor(yr_rnd))) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = FALSE) +
scale_color_brewer(palette = "Set1", name = "breaks") +
labs(title = "Regression: with yr_rnd break",
x = " (yr_rnd)",
y = "api00")
summary(m.br)
rout.03
> # data 정리
> df <- datavar[,c(1,3,6,7,8,9,18,21)]
> head(df)
snum api00 meals ell yr_rnd mobility full mealcat
1 906 693 67 9 0 11 76 2
2 889 570 92 21 0 33 79 3
3 887 546 97 29 0 36 68 3
4 876 571 90 27 0 27 87 3
5 888 478 89 30 0 44 87 3
6 4284 858 10 3 0 10 100 1
> str(df)
'data.frame': 400 obs. of 8 variables:
$ snum : int 906 889 887 876 888 4284 4271 2910 2899 2887 ...
$ api00 : int 693 570 546 571 478 858 918 831 860 737 ...
$ meals : int 67 92 97 90 89 10 5 2 5 29 ...
$ ell : int 9 21 29 27 30 3 2 3 6 15 ...
$ yr_rnd : int 0 0 0 0 0 0 0 0 0 0 ...
$ mobility: int 11 33 36 27 44 10 16 44 10 17 ...
$ full : int 76 79 68 87 87 100 100 96 100 96 ...
$ mealcat : int 2 3 3 3 3 1 1 1 1 1 ...
> df$yr_rnd <- factor(df$yr_rnd, levels=c(0,1), labels=c("break", "nobr"))
> df$mealcat <- factor(df$mealcat, levels=c(1:3), labels=c("to46", "to80", "to100"))
> str(df)
'data.frame': 400 obs. of 8 variables:
$ snum : int 906 889 887 876 888 4284 4271 2910 2899 2887 ...
$ api00 : int 693 570 546 571 478 858 918 831 860 737 ...
$ meals : int 67 92 97 90 89 10 5 2 5 29 ...
$ ell : int 9 21 29 27 30 3 2 3 6 15 ...
$ yr_rnd : Factor w/ 2 levels "break","nobr": 1 1 1 1 1 1 1 1 1 1 ...
$ mobility: int 11 33 36 27 44 10 16 44 10 17 ...
$ full : int 76 79 68 87 87 100 100 96 100 96 ...
$ mealcat : Factor w/ 3 levels "to46","to80",..: 2 3 3 3 3 1 1 1 1 1 ...
>
> # regression
> m.br <- lm(api00~yr_rnd, data=df)
> summary(m.br)
Call:
lm(formula = api00 ~ yr_rnd, data = df)
Residuals:
Min 1Q Median 3Q Max
-273.539 -95.662 0.967 103.341 297.967
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 684.54 7.14 95.88 <2e-16 ***
yr_rndnobr -160.51 14.89 -10.78 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared: 0.226, Adjusted R-squared: 0.2241
F-statistic: 116.2 on 1 and 398 DF, p-value: < 2.2e-16
>
> # making dummy variables
> df$nobr <- ifelse(df$yr_rnd == "nobr", 1, 0)
> df$br<- ifelse(df$yr_rnd == "break", 1, 0)
> head(df)
snum api00 meals ell yr_rnd mobility full mealcat nobr br
1 906 693 67 9 break 11 76 to80 0 1
2 889 570 92 21 break 33 79 to100 0 1
3 887 546 97 29 break 36 68 to100 0 1
4 876 571 90 27 break 27 87 to100 0 1
5 888 478 89 30 break 44 87 to100 0 1
6 4284 858 10 3 break 10 100 to46 0 1
>
> m.br2 <- lm(api00~nobr,data=df)
> summary(m.br2)
Call:
lm(formula = api00 ~ nobr, data = df)
Residuals:
Min 1Q Median 3Q Max
-273.539 -95.662 0.967 103.341 297.967
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 684.54 7.14 95.88 <2e-16 ***
nobr -160.51 14.89 -10.78 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared: 0.226, Adjusted R-squared: 0.2241
F-statistic: 116.2 on 1 and 398 DF, p-value: < 2.2e-16
>
> m.br3 <- lm(api00~br, data=df)
> summary(m.br3)
Call:
lm(formula = api00 ~ br, data = df)
Residuals:
Min 1Q Median 3Q Max
-273.539 -95.662 0.967 103.341 297.967
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 524.03 13.06 40.11 <2e-16 ***
br 160.51 14.89 10.78 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared: 0.226, Adjusted R-squared: 0.2241
F-statistic: 116.2 on 1 and 398 DF, p-value: < 2.2e-16
>
> # baseline 변화
> df$yr_rnd <- relevel(factor(df$yr_rnd), ref = "nobr")
> m.br4 <- lm(api00 ~ yr_rnd, data = df)
> summary(m.br4)
Call:
lm(formula = api00 ~ yr_rnd, data = df)
Residuals:
Min 1Q Median 3Q Max
-273.539 -95.662 0.967 103.341 297.967
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 524.03 13.06 40.11 <2e-16 ***
yr_rndbreak 160.51 14.89 10.78 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared: 0.226, Adjusted R-squared: 0.2241
F-statistic: 116.2 on 1 and 398 DF, p-value: < 2.2e-16
> # 원래로 복원
> df$yr_rnd <- relevel(factor(df$yr_rnd), ref = "break")
>
> head(df)
snum api00 meals ell yr_rnd mobility full mealcat nobr br
1 906 693 67 9 break 11 76 to80 0 1
2 889 570 92 21 break 33 79 to100 0 1
3 887 546 97 29 break 36 68 to100 0 1
4 876 571 90 27 break 27 87 to100 0 1
5 888 478 89 30 break 44 87 to100 0 1
6 4284 858 10 3 break 10 100 to46 0 1
> tmp <- aggregate(df[, 2], list(df$yr_rnd), mean)
> mean.br <- tmp[[2]][1]
> mean.nobr <- tmp[[2]][2]
> data.frame(mean.br, mean.nobr)
mean.br mean.nobr
1 684.539 524.0326
>
> boxplot(df$api00~df$yr_rnd)
>
> t.out <- t.test(df$api00~df$yr_rnd, var.eq=T)
> t.out
Two Sample t-test
data: df$api00 by df$yr_rnd
t = 10.782, df = 398, p-value < 2.2e-16
alternative hypothesis: true difference in means between group break and group nobr is not equal to 0
95 percent confidence interval:
131.2390 189.7737
sample estimates:
mean in group break mean in group nobr
684.5390 524.0326
> t.out$estimate
mean in group break mean in group nobr
684.5390 524.0326
> # str(t.out)
> t.value <- t.out$statistic
>
> f.out <- aov(df$api00~df$yr_rnd, data=df)
> summary(f.out)
Df Sum Sq Mean Sq F value Pr(>F)
df$yr_rnd 1 1825001 1825001 116.2 <2e-16 ***
Residuals 398 6248671 15700
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> f.value <- summary(f.out)[[1]][1, "F value"]
>
> summary(m.br)
Call:
lm(formula = api00 ~ yr_rnd, data = df)
Residuals:
Min 1Q Median 3Q Max
-273.539 -95.662 0.967 103.341 297.967
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 684.54 7.14 95.88 <2e-16 ***
yr_rndnobr -160.51 14.89 -10.78 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared: 0.226, Adjusted R-squared: 0.2241
F-statistic: 116.2 on 1 and 398 DF, p-value: < 2.2e-16
> f.value.lm <- summary(m.br)$fstatistic[1]
> t.value.lm <- summary(m.br)$coefficients["yr_rndnobr", "t value"]
>
> data.frame(t.value, t.value^2, f.value, f.value.lm, t.value.lm)
t.value t.value.2 f.value f.value.lm t.value.lm
t 10.7815 116.2407 116.2407 116.2407 -10.7815
>
> plot(df$yr_rnd,df$api00)
>
> df %>%
+ ggplot(aes(x = yr_rnd, y = api00, color = factor(yr_rnd))) +
+ geom_point(size = 3) +
+ geom_smooth(method = "lm", se = FALSE) +
+ scale_color_brewer(palette = "Set1", name = "breaks") +
+ labs(title = "Regression: with yr_rnd break",
+ x = " (yr_rnd)",
+ y = "api00")
`geom_smooth()` using formula = 'y ~ x'
>
> summary(m.br)
Call:
lm(formula = api00 ~ yr_rnd, data = df)
Residuals:
Min 1Q Median 3Q Max
-273.539 -95.662 0.967 103.341 297.967
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 684.54 7.14 95.88 <2e-16 ***
yr_rndnobr -160.51 14.89 -10.78 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared: 0.226, Adjusted R-squared: 0.2241
F-statistic: 116.2 on 1 and 398 DF, p-value: < 2.2e-16
>


위의 아웃풋을 살펴 보면, 학생들의 성적이 가지는 총 변량의 (Sum of Square Total) 약 22.6% 를 방학이 있고 없고로 구분되는 yr_rnd 변인이 설명을 하고 있으며, 이는 통계적으로 유의미한 것이다 (F(1, 398) = 116.241, p < .001). 위의 regression output은 yr_rnd 변인 중에서 방학이 있는 특성이 baseline이 되어 있으며, 이를 염두에 두고 regression식을 적어 보면 아래와 같다.
y hat = 684.54 - 160.51 * (yr_rndno_break)
- yr_rndno_break: yr_rndno_break = 1
- y hat = 684.54 - 160.51 * (1)
- y hat = 524.03
- yr_rndbreak: yr_rndnobreak = 0
- y hat = 684.54 - 160.51 * (0)
- y hat = 684.54
위 회귀식에서 r은
y hat = a + bX 의 형식에서 X 로 no_break를 썼음을 (yr_rndno_break) 알 수 있다. 이 경우에는 yr_rndno_break를 1로 넣어서 해석을 한다. 즉, no break일 경우에는 y hat = 684.54 - 160.51(1) 이라는 것이다. 반대로 break일 경우에는 뒤쪽 부분이 해당이 안되므로 0으로 대체한다. 따라서 이 경우의 회귀식은 y hat = 684.54이다. 이 둘을 비교해보면 no break일 경우에는
y hat = 684.54 - 160.51(1) = 524.03 이고
break일 경우에는
y hat = 684.54 - 160.51(0) = 684.54 라는 것이다. 다시 이야기하면 break가 없는 학교의 평균 api점수는 524.03점인 반면에 break가 있는 학교의 평균은 684.54 이다. 이 점수의 차이는 두 집단의 평균을 비교하는 것과 같은 형태를 (형식) 갖는다. 즉, t-test를 하는 것과 마찬가지이다. 위에서 얻은
- t.value
- F.value
- t.value.lm
- F.value.lm 은 모두 같은 원리로 두 그룹을 (집단의 api 평균을) 테스트 한 것이다.
위의 그래프는 두 그룹의 (break, no break 학교그룹) api 점수 분포를 나타내 주는 것이다. 그리고 직선은 두 그룹 평균을 연결한 선으로 $\hat{Y} = 684.539 - 160.506 X$ 으로 표현된다.
이와 같이 종류변인(category, nominal)을 가지고서도 regression 테스트를 할 수 있으며, 사실 이는 t-test나 F-test와 다르지 않다. 위에서 주의해야 할 점은 두 변인의 종류를 손으로 coding할 때, 1과 2가 아닌, 0과 1로 하였다는 점이다. 이렇게 하는 이유는 해석하기에 편하기 때문이며, 이것이 보통의 방법이다. 그러나, 1과 2로 coding 데이터를 이용해도 크게 다른지 않은 결과를 구하게 된다. 다른 점이라면, 절편에 해당되는 상수값이 다르게 되며, coefficient값은 위의 분석과 동일한 값을 갖게 된다. 그리고 일반적으로 r에서는 dummy variables을 얻기 위한 coding을 하지 않아도 r이 스스로 구하여 계산을 한다. 즉, dummy variable 을 구하지 않은 채로 lm(api00 ~ yr_rnd, data=df) 만으로도 계산이 된다.
Regression with a categorical IV with 3 attributes
rs.3att
m.mealcat <- lm(api00 ~ mealcat, data=df) summary(m.mealcat)
ro.3att
> #######################################
> # categorical IV with 3 or more attributes
> #######################################
> m.mealcat <- lm(api00 ~ mealcat, data=df)
> summary(m.mealcat)
Call:
lm(formula = api00 ~ mealcat, data = df)
Residuals:
Min 1Q Median 3Q Max
-253.394 -47.883 0.282 52.282 185.620
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 805.718 6.169 130.60 <2e-16 ***
mealcatto80 -166.324 8.708 -19.10 <2e-16 ***
mealcatto100 -301.338 8.629 -34.92 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 70.61 on 397 degrees of freedom
Multiple R-squared: 0.7548, Adjusted R-squared: 0.7536
F-statistic: 611.1 on 2 and 397 DF, p-value: < 2.2e-16
>
y hat = 805.718 - 166.324 * to80 - 301.338 * to100 mealcat0-46 (to46 으로 대체) mealcat47-80 (to80 으로 대체) maelcat81-100 (to100 으로 대체)
이에 대한 해석도 앞에서의 것과 마찬가지이다.
- y hat = 805.718 - 166.324*mg2 - 301.338*mg3
- mg1 = 1, mg2 = 0, mg3 = 0 일 경우
- y hat = 805.718 - 166.324*(0) - 301.338*(0)
- y hat = 805.718
- mg1 = 0, mg2 = 1, mg3 = 0 일 경우
- y hat = 805.718 - 166.324*(1) - 301.338*(0)
- y hat = 805.718 - 166.324
- y hat = 639.394
- mg1 = 0, mg2 = 0, mg3 = 1 일 경우
- y hat = 805.718 - 166.324*(0) - 301.338*(1)
- y hat = 805.718 - 301.338
- y hat = 504.38
- 즉, 무료급식의 퍼센티지가 높을 수록 api점수가 낮음을 알 수 있다. 이렇게 무료급식 퍼센티지를 독립변인으로 종속변인인 api00점수를 (학력점수) 봤을 때, 그 설명력이 통계학적으로 유효한가는 regression output에서 (summary(mod2))
- F-value 와 p-value를 가지고 판단한다.
- (F (2, 397) = 611.1; p-value < 2.2e-16)
- 위에서 2, 397 은 각각 between degrees of freedom 과 within degrees of freedom 이다. 이를 보고도 우리는
- 총 400개의 학교가 데이터에 참여했음을 알 수 있고 (2 + 397 에 1을 더한 값),
- 독립변인의 종류가 3가지 (df = 2 이므로) 임을 알 수 있다.
- R square value 는 설명력의 크기를 알려준다.
- 0.7548 즉, 75.48% 를 독립변인이 종속변인을 설명한다 (상당한 크기임을 알 수 있다).
Regression with a continuous and a categorical IV
rs.04
#######################################
#######################################
# with a continuous and a category IV
#######################################
#######################################
m.mealsyr_rnd <- lm(api00~meals+yr_rnd, data=df)
summary(m.mealsyr_rnd)
m.mealsxyr_rnd <- lm(api00~meals*yr_rnd, data=df)
summary(m.mealsxyr_rnd)
# 중요: 위에서 yr_rnd의 설명력이 t test를 보면
# 사라짐. 왜?
# interaction을 추가하면서 yr_rnd의 설명력은
# (방학이 있고 없음의 설명력은) meals의 percentage가
# 0 이 될때의 것으로 설정된다. 0이 되는 데이터가
# 극단치이고 이 경우에 nobr인 학교도 없으므로 계산된
# se값이 극대화되어 t값이 작아지게 된다. 이를 바로
# 잡으려면 meals의 0이 중간에 가도록 한 후에
# regression을 하면 meals가 0 일때가 극단치가 되지
# 않으므로 결과가 옳게 나온다
df %>%
ggplot(aes(x = meals, y = api00, color = factor(yr_rnd))) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = FALSE) +
scale_color_brewer(palette = "Set1", name = "yr_rnd") +
labs(title = "Multiple Regression: Interaction by yr_rnd break",
x = " (free meals percentage)",
y = "api00")
# 1. 평균을 빼준 값을 새로운 변인으로 저장
df$meals_centered <-
df$meals - mean(df$meals)
# 2. Run the model again using the centered variable
m_centered <- lm(api00 ~ meals_centered * yr_rnd,
data = df)
# 3. Check the new summary
summary(m_centered)
df %>%
ggplot(aes(x = meals_centered, y = api00, color = factor(yr_rnd))) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = FALSE) +
scale_color_brewer(palette = "Set1", name = "yr_rnd") +
labs(title = "Multiple Regression: Interaction by yr_rnd break",
x = " (free meals percentage CENTERED)",
y = "api00")
# Install the package if you do not have it
# install.packages("interactions")
# Plot the interaction
library(interactions)
interact_plot(m.mealsxyr_rnd, pred = meals, modx = yr_rnd)
m.ellyr_rnd <- lm(api00~ell+yr_rnd, data=df)
summary(m.ellyr_rnd)
m.ellxyr_rnd <- lm(api00~ell*yr_rnd, data=df)
summary(m.ellxyr_rnd)
df %>%
ggplot(aes(x = ell, y = api00, color = factor(yr_rnd))) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = FALSE) +
scale_color_brewer(palette = "Set1", name = "Cylinders") +
labs(title = "Multiple Regression: Interaction by yr_rnd break",
x = " (ell, english language learner)",
y = "api00")
# ell mean centred 이번 경우는 큰 차이를 보이지 않는다
df$ell_centered <- df$ell - mean(df$ell)
m.ell.centered <- lm(api00 ~ ell_centered * yr_rnd, data = df)
summary(m.ell.centered)
coefs <- summary(m.ellxyr_rnd)$coefficients
coefs
api.br <- coefs[1]
api.nobr <- coefs[1]+coefs[3]
slope.red <- coefs[2]
slope.blue <- coefs[2]+coefs[4]
data.frame(api.br, api.nobr, slope.red, slope.blue)
# br과 nobr에 따라서 ell의 slope가 달라지는것이
# interaction effects
ro.04
> #######################################
> #######################################
> # with a continuous and a category IV
> #######################################
> #######################################
> m.mealsyr_rnd <- lm(api00~meals+yr_rnd, data=df)
> summary(m.mealsyr_rnd)
Call:
lm(formula = api00 ~ meals + yr_rnd, data = df)
Residuals:
Min 1Q Median 3Q Max
-174.882 -41.542 -1.044 39.454 176.007
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 885.6192 6.4712 136.855 < 2e-16 ***
meals -3.7921 0.1036 -36.595 < 2e-16 ***
yr_rndnobr -40.3291 7.8479 -5.139 4.35e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 59.99 on 397 degrees of freedom
Multiple R-squared: 0.823, Adjusted R-squared: 0.8221
F-statistic: 923.2 on 2 and 397 DF, p-value: < 2.2e-16
>
> m.mealsxyr_rnd <- lm(api00~meals*yr_rnd, data=df)
> summary(m.mealsxyr_rnd)
Call:
lm(formula = api00 ~ meals * yr_rnd, data = df)
Residuals:
Min 1Q Median 3Q Max
-175.78 -41.96 -0.76 39.06 175.49
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 885.0046 6.7647 130.827 <2e-16 ***
meals -3.7805 0.1100 -34.354 <2e-16 ***
yr_rndnobr -31.8737 27.9104 -1.142 0.254
meals:yr_rndnobr -0.1041 0.3299 -0.316 0.752
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 60.06 on 396 degrees of freedom
Multiple R-squared: 0.8231, Adjusted R-squared: 0.8217
F-statistic: 614.1 on 3 and 396 DF, p-value: < 2.2e-16
>
> # 중요: 위에서 yr_rnd의 설명력이 t test를 보면
> # 사라짐. 왜?
> # interaction을 추가하면서 yr_rnd의 설명력은
> # (방학이 있고 없음의 설명력은) meals의 percentage가
> # 0 이 될때의 것으로 설정된다. 0이 되는 데이터가
> # 극단치이고 이 경우에 nobr인 학교도 없으므로 계산된
> # se값이 극대화되어 t값이 작아지게 된다. 이를 바로
> # 잡으려면 meals의 0이 중간에 가도록 한 후에
> # regression을 하면 meals가 0 일때가 극단치가 되지
> # 않으므로 결과가 옳게 나온다
>
> df %>%
+ ggplot(aes(x = meals, y = api00, color = factor(yr_rnd))) +
+ geom_point(size = 3) +
+ geom_smooth(method = "lm", se = FALSE) +
+ scale_color_brewer(palette = "Set1", name = "yr_rnd") +
+ labs(title = "Multiple Regression: Interaction by yr_rnd break",
+ x = " (free meals percentage)",
+ y = "api00")
`geom_smooth()` using formula = 'y ~ x'
>
>
> # 1. 평균을 빼준 값을 새로운 변인으로 저장
> df$meals_centered <-
+ df$meals - mean(df$meals)
>
> # 2. Run the model again using the centered variable
> m_centered <- lm(api00 ~ meals_centered * yr_rnd,
+ data = df)
>
> # 3. Check the new summary
> summary(m_centered)
Call:
lm(formula = api00 ~ meals_centered * yr_rnd, data = df)
Residuals:
Min 1Q Median 3Q Max
-175.78 -41.96 -0.76 39.06 175.49
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 656.9827 3.5150 186.910 < 2e-16 ***
meals_centered -3.7805 0.1100 -34.354 < 2e-16 ***
yr_rndnobr -38.1550 10.4473 -3.652 0.000295 ***
meals_centered:yr_rndnobr -0.1041 0.3299 -0.316 0.752386
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 60.06 on 396 degrees of freedom
Multiple R-squared: 0.8231, Adjusted R-squared: 0.8217
F-statistic: 614.1 on 3 and 396 DF, p-value: < 2.2e-16
>
> df %>%
+ ggplot(aes(x = meals_centered, y = api00, color = factor(yr_rnd))) +
+ geom_point(size = 3) +
+ geom_smooth(method = "lm", se = FALSE) +
+ scale_color_brewer(palette = "Set1", name = "yr_rnd") +
+ labs(title = "Multiple Regression: Interaction by yr_rnd break",
+ x = " (free meals percentage CENTERED)",
+ y = "api00")
`geom_smooth()` using formula = 'y ~ x'
>
>
> # Install the package if you do not have it
> # install.packages("interactions")
>
> # Plot the interaction
> library(interactions)
> interact_plot(m.mealsxyr_rnd, pred = meals, modx = yr_rnd)
>
>
>
> m.ellyr_rnd <- lm(api00~ell+yr_rnd, data=df)
> summary(m.ellyr_rnd)
Call:
lm(formula = api00 ~ ell + yr_rnd, data = df)
Residuals:
Min 1Q Median 3Q Max
-265.843 -55.349 0.334 70.673 195.778
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 784.3978 7.2878 107.63 < 2e-16 ***
ell -4.0427 0.2094 -19.31 < 2e-16 ***
yr_rndnobr -41.8420 12.3441 -3.39 0.00077 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 90.1 on 397 degrees of freedom
Multiple R-squared: 0.6008, Adjusted R-squared: 0.5988
F-statistic: 298.8 on 2 and 397 DF, p-value: < 2.2e-16
> m.ellxyr_rnd <- lm(api00~ell*yr_rnd, data=df)
> summary(m.ellxyr_rnd)
Call:
lm(formula = api00 ~ ell * yr_rnd, data = df)
Residuals:
Min 1Q Median 3Q Max
-270.514 -57.800 5.994 65.845 206.275
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 794.2576 7.8422 101.280 < 2e-16 ***
ell -4.4418 0.2420 -18.354 < 2e-16 ***
yr_rndnobr -110.5779 24.7935 -4.460 1.07e-05 ***
ell:yr_rndnobr 1.4884 0.4673 3.185 0.00156 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 89.08 on 396 degrees of freedom
Multiple R-squared: 0.6108, Adjusted R-squared: 0.6078
F-statistic: 207.1 on 3 and 396 DF, p-value: < 2.2e-16
> df %>%
+ ggplot(aes(x = ell, y = api00, color = factor(yr_rnd))) +
+ geom_point(size = 3) +
+ geom_smooth(method = "lm", se = FALSE) +
+ scale_color_brewer(palette = "Set1", name = "Cylinders") +
+ labs(title = "Multiple Regression: Interaction by yr_rnd break",
+ x = " (ell, english language learner)",
+ y = "api00")
`geom_smooth()` using formula = 'y ~ x'
>
>
> # ell mean centred 이번 경우는 큰 차이를 보이지 않는다
> df$ell_centered <- df$ell - mean(df$ell)
> m.ell.centered <- lm(api00 ~ ell_centered * yr_rnd, data = df)
> summary(m.ell.centered)
Call:
lm(formula = api00 ~ ell_centered * yr_rnd, data = df)
Residuals:
Min 1Q Median 3Q Max
-270.514 -57.800 5.994 65.845 206.275
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 654.5514 5.3323 122.752 < 2e-16 ***
ell_centered -4.4418 0.2420 -18.354 < 2e-16 ***
yr_rndnobr -63.7652 14.0117 -4.551 7.12e-06 ***
ell_centered:yr_rndnobr 1.4884 0.4673 3.185 0.00156 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 89.08 on 396 degrees of freedom
Multiple R-squared: 0.6108, Adjusted R-squared: 0.6078
F-statistic: 207.1 on 3 and 396 DF, p-value: < 2.2e-16
>
>
> coefs <- summary(m.ellxyr_rnd)$coefficients
> coefs
Estimate Std. Error t value Pr(>|t|)
(Intercept) 794.257647 7.8421894 101.280090 3.235236e-285
ell -4.441819 0.2420092 -18.353922 6.903303e-55
yr_rndnobr -110.577884 24.7935271 -4.459950 1.069371e-05
ell:yr_rndnobr 1.488362 0.4673174 3.184906 1.562618e-03
> api.br <- coefs[1]
> api.nobr <- coefs[1]+coefs[3]
> slope.red <- coefs[2]
> slope.blue <- coefs[2]+coefs[4]
> data.frame(api.br, api.nobr, slope.red, slope.blue)
api.br api.nobr slope.red slope.blue
1 794.2576 683.6798 -4.441819 -2.953456
> # br과 nobr에 따라서 ell의 slope가 달라지는것이
> # interaction effects
>
Regression with a categorical and a continuous IV: e.g. 2
rs.06
###############################
# 다른 예
###############################
m.ellmealcat <- lm(api00~ell+mealcat, data=df)
summary(m.ellmealcat)
# install.packages("emmeans")
# library(emmeans)
# Calculate estimated marginal means for each group
gmeans <- emmeans(m.ellmealcat, specs = ~ mealcat)
# Perform pairwise comparisons using Tukey's adjustment (Default)
pairs(gmeans, adjust = "tukey")
m.ellxmealcat <- lm(api00~ell*mealcat, data=df)
summary(m.ellxmealcat)
df %>%
ggplot(aes(x = ell, y = api00, color = factor(mealcat))) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = FALSE) +
scale_color_brewer(palette = "Set1", name = "mealcat") +
labs(title = "Multiple Regression: Interaction by meal cat",
x = " (ell, english language learner)",
y = "api00")
mealcat1 <- summary(m.ellxmealcat)$coefficients[[1]]
mealcat2 <- summary(m.ellxmealcat)$coefficients[[1]] +
summary(m.ellxmealcat)$coefficients[[3]]
mealcat3 <- summary(m.ellxmealcat)$coefficients[[1]] +
summary(m.ellxmealcat)$coefficients[[4]]
data.frame(mealcat1, mealcat2, mealcat3)
ell.slope.mealcat1 <- summary(m.ellxmealcat)$coefficients[[2]]
ell.slope.mealcat2 <- summary(m.ellxmealcat)$coefficients[[2]] +
summary(m.ellxmealcat)$coefficients[[5]]
ell.slope.mealcat3 <- summary(m.ellxmealcat)$coefficients[[2]] +
summary(m.ellxmealcat)$coefficients[[6]]
data.frame(ell.slope.mealcat1,
ell.slope.mealcat2,
ell.slope.mealcat3)
ro.06
> ###############################
> # 다른 예
> ###############################
> m.ellmealcat <- lm(api00~ell+mealcat, data=df)
> summary(m.ellmealcat)
Call:
lm(formula = api00 ~ ell + mealcat, data = df)
Residuals:
Min 1Q Median 3Q Max
-224.96 -42.70 -0.32 48.93 179.92
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 819.654 6.052 135.426 < 2e-16 ***
ell -1.546 0.203 -7.616 1.94e-13 ***
mealcatto80 -134.493 9.153 -14.693 < 2e-16 ***
mealcatto100 -230.737 12.290 -18.774 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 66.03 on 396 degrees of freedom
Multiple R-squared: 0.7861, Adjusted R-squared: 0.7845
F-statistic: 485.2 on 3 and 396 DF, p-value: < 2.2e-16
> # install.packages("emmeans")
> # library(emmeans)
> # Calculate estimated marginal means for each group
> gmeans <- emmeans(m.ellmealcat, specs = ~ mealcat)
> # Perform pairwise comparisons using Tukey's adjustment (Default)
> pairs(gmeans, adjust = "tukey")
contrast estimate SE df t.ratio p.value
to46 - to80 134.5 9.15 396 14.693 <.0001
to46 - to100 230.7 12.30 396 18.774 <.0001
to80 - to100 96.2 9.53 396 10.102 <.0001
P value adjustment: tukey method for comparing a family of 3 estimates
>
> m.ellxmealcat <- lm(api00~ell*mealcat, data=df)
> summary(m.ellxmealcat)
Call:
lm(formula = api00 ~ ell * mealcat, data = df)
Residuals:
Min 1Q Median 3Q Max
-221.854 -43.244 1.437 47.443 181.471
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 836.7724 8.8620 94.422 < 2e-16 ***
ell -3.4447 0.7508 -4.588 6.03e-06 ***
mealcatto80 -146.6127 13.9085 -10.541 < 2e-16 ***
mealcatto100 -270.8356 18.7941 -14.411 < 2e-16 ***
ell:mealcatto80 1.7300 0.8111 2.133 0.0335 *
ell:mealcatto100 2.3190 0.8032 2.887 0.0041 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 65.47 on 394 degrees of freedom
Multiple R-squared: 0.7909, Adjusted R-squared: 0.7882
F-statistic: 298 on 5 and 394 DF, p-value: < 2.2e-16
> df %>%
+ ggplot(aes(x = ell, y = api00, color = factor(mealcat))) +
+ geom_point(size = 3) +
+ geom_smooth(method = "lm", se = FALSE) +
+ scale_color_brewer(palette = "Set1", name = "mealcat") +
+ labs(title = "Multiple Regression: Interaction by meal cat",
+ x = " (ell, english language learner)",
+ y = "api00")
`geom_smooth()` using formula = 'y ~ x'
>
> mealcat1 <- summary(m.ellxmealcat)$coefficients[[1]]
> mealcat2 <- summary(m.ellxmealcat)$coefficients[[1]] +
+ summary(m.ellxmealcat)$coefficients[[3]]
> mealcat3 <- summary(m.ellxmealcat)$coefficients[[1]] +
+ summary(m.ellxmealcat)$coefficients[[4]]
> data.frame(mealcat1, mealcat2, mealcat3)
mealcat1 mealcat2 mealcat3
1 836.7724 690.1597 565.9368
>
> ell.slope.mealcat1 <- summary(m.ellxmealcat)$coefficients[[2]]
> ell.slope.mealcat2 <- summary(m.ellxmealcat)$coefficients[[2]] +
+ summary(m.ellxmealcat)$coefficients[[5]]
> ell.slope.mealcat3 <- summary(m.ellxmealcat)$coefficients[[2]] +
+ summary(m.ellxmealcat)$coefficients[[6]]
> data.frame(ell.slope.mealcat1,
+ ell.slope.mealcat2,
+ ell.slope.mealcat3)
ell.slope.mealcat1 ell.slope.mealcat2 ell.slope.mealcat3
1 -3.444691 -1.714707 -1.125645
>
Regression with two Catogorical IVs
rs.07
########################################## # 카테고리 iv 2개 ########################################## m.yrrndmealcat <- lm(api00~yr_rnd+mealcat, data=df) summary(m.yrrndmealcat) # 해석. coefs <- summary(m.yrrndmealcat)$coefficients coefs br.to46 <- coefs[1] br.to80 <- coefs[1]+coefs[3] br.to100 <- coefs[1]+coefs[4] nobr.to46 <- coefs[1]+coefs[2] nobr.to80 <- coefs[1]+coefs[2]+coefs[3] nobr.to100 <- coefs[1]+coefs[2]+coefs[4] cat(br.to46, br.to80, br.to100) cat(nobr.to46, nobr.to80, nobr.to100) # 해석. interaction m.yrrndxmealcat <- lm(api00~yr_rnd*mealcat, data=df) summary(m.yrrndxmealcat) coefs <- summary(m.yrrndxmealcat)$coefficients coefs br.to46 <- coefs[1] br.to80 <- coefs[1]+coefs[3] br.to100 <- coefs[1]+coefs[4] nobr.to46 <- coefs[1]+coefs[2] nobr.to80 <- coefs[1]+coefs[2]+coefs[3]+coefs[5] nobr.to100 <- coefs[1]+coefs[2]+coefs[4]+coefs[6] cat(br.to46, br.to80, br.to100) cat(nobr.to46, nobr.to80, nobr.to100)
ro.07
> ##########################################
> # 카테고리 iv 2개
> ##########################################
> m.yrrndmealcat <- lm(api00~yr_rnd+mealcat, data=df)
> summary(m.yrrndmealcat)
Call:
lm(formula = api00 ~ yr_rnd + mealcat, data = df)
Residuals:
Min 1Q Median 3Q Max
-215.32 -49.50 1.65 49.17 183.63
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 808.013 6.040 133.777 < 2e-16 ***
yr_rndnobr -42.960 9.362 -4.589 5.99e-06 ***
mealcatto80 -163.737 8.515 -19.229 < 2e-16 ***
mealcatto100 -281.683 9.446 -29.821 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 68.89 on 396 degrees of freedom
Multiple R-squared: 0.7672, Adjusted R-squared: 0.7654
F-statistic: 435 on 3 and 396 DF, p-value: < 2.2e-16
>
> # 해석.
> coefs <- summary(m.yrrndmealcat)$coefficients
> coefs
Estimate Std. Error t value Pr(>|t|)
(Intercept) 808.01313 6.039984 133.777362 0.000000e+00
yr_rndnobr -42.96006 9.361761 -4.588886 5.990293e-06
mealcatto80 -163.73737 8.515015 -19.229252 1.128686e-58
mealcatto100 -281.68318 9.445676 -29.821388 2.767252e-103
> br.to46 <- coefs[1]
> br.to80 <- coefs[1]+coefs[3]
> br.to100 <- coefs[1]+coefs[4]
> nobr.to46 <- coefs[1]+coefs[2]
> nobr.to80 <- coefs[1]+coefs[2]+coefs[3]
> nobr.to100 <- coefs[1]+coefs[2]+coefs[4]
> cat(br.to46, br.to80, br.to100)
808.0131 644.2758 526.33
> cat(nobr.to46, nobr.to80, nobr.to100)
765.0531 601.3157 483.3699>
예측식은 아래와 같다.
y hat = 808.013 + -42.960*(yr_rndno_break) + -163.737(mealcat47-80) + -281.683(mealcat81-100) yr_rnd: break = 방학있음 no_break = 방학없음 mealcat: 0-46% free meals 47-80% 81-100%
이에 대한 해석은 각각의 독립변인의 종류 수인 2개와 3개를 곱한 6개의 경우로 나누어서 생각할 수 있다. 즉,
y hat = 808.013 + -42.960*(yr_rndno_break) + -163.737(mealcat47-80) + -281.683(mealcat81-100)
을 바탕으로 각각의 조건을 고려하여 y hat를 계산하면 아래와 같다.
TABLE. Two dummy variables
| mealcat0-46 | mealcat47-80 | mealcat81-100 | |
|---|---|---|---|
| yr_rndbreak | yr_rndbreak = 1 yr_rndno_break = 0 mealcat0-46 = 1 mealcat47-80 = 0 mealcat81-100 = 0 경우 y hat = 808.013 | yr_rndbreak = 1 yr_rndno_break = 0 mealcat0-46 = 0 mealcat47-80 = 1 mealcat81-100 = 0 경우 y hat = 808.013 - 163.737 = 644.276 | yr_rndbreak = 1 yr_rndno_break = 0 mealcat0-46 = 0 mealcat47-80 = 0 mealcat81-100 = 1 경우 y hat = 808.013 - 281.683 = 526.33 |
| yr_rndno_break | yr_rndbreak = 0 yr_rndno_break = 1 mealcat0-46 = 1 mealcat47-80 = 0 mealcat81-100 = 0 경우 y hat = 808.013 - 42.960 = 765.053 | yr_rndbreak = 0 yr_rndno_break = 1 mealcat0-46 = 0 mealcat47-80 = 1 mealcat81-100 = 0 경우 y hat = 808.013 - 42.960 - 163.737 = 601.316 | yr_rndbreak = 0 yr_rndno_break = 1 mealcat0-46 = 0 mealcat47-80 = 0 mealcat81-100 = 1 경우 y hat = 808.013 - 42.960 - 281.683 = 483.37 |
> # 해석. interaction
> m.yrrndxmealcat <- lm(api00~yr_rnd*mealcat, data=df)
> summary(m.yrrndxmealcat)
Call:
lm(formula = api00 ~ yr_rnd * mealcat, data = df)
Residuals:
Min 1Q Median 3Q Max
-207.533 -50.764 -1.843 48.874 179.000
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 809.685 6.185 130.911 < 2e-16 ***
yr_rndnobr -74.257 26.756 -2.775 0.00578 **
mealcatto80 -164.412 8.877 -18.522 < 2e-16 ***
mealcatto100 -288.193 10.443 -27.597 < 2e-16 ***
yr_rndnobr:mealcatto80 22.517 32.752 0.687 0.49217
yr_rndnobr:mealcatto100 40.764 29.231 1.395 0.16394
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 68.87 on 394 degrees of freedom
Multiple R-squared: 0.7685, Adjusted R-squared: 0.7656
F-statistic: 261.6 on 5 and 394 DF, p-value: < 2.2e-16
>
> coefs <- summary(m.yrrndxmealcat)$coefficients
> coefs
Estimate Std. Error t value Pr(>|t|)
(Intercept) 809.68548 6.184993 130.9113019 0.000000e+00
yr_rndnobr -74.25691 26.756287 -2.7753071 5.777728e-03
mealcatto80 -164.41198 8.876767 -18.5216067 1.529685e-55
mealcatto100 -288.19295 10.442837 -27.5971894 4.297367e-94
yr_rndnobr:mealcatto80 22.51674 32.751732 0.6874977 4.921736e-01
yr_rndnobr:mealcatto100 40.76438 29.231183 1.3945510 1.639371e-01
> br.to46 <- coefs[1]
> br.to80 <- coefs[1]+coefs[3]
> br.to100 <- coefs[1]+coefs[4]
> nobr.to46 <- coefs[1]+coefs[2]
> nobr.to80 <- coefs[1]+coefs[2]+coefs[3]+coefs[5]
> nobr.to100 <- coefs[1]+coefs[2]+coefs[4]+coefs[6]
> cat(br.to46, br.to80, br.to100)
809.6855 645.2735 521.4925> cat(nobr.to46, nobr.to80, nobr.to100)
735.4286 593.5333 488
>
위의 테스트는 두 개의 독립변인이 모두 종류이고 종속변인이 숫자일 때의 조건을 만족하니 factorial anova를 해도 된다. 아래는 그 결과이다.
> mod4 <- lm(api00 ~ yr_rnd + mealcat + yr_rnd:mealcat, data=datavar)
> summary(mod4)
Call:
lm(formula = api00 ~ yr_rnd + mealcat + yr_rnd:mealcat, data = datavar)
Residuals:
Min 1Q Median 3Q Max
-207.533 -50.764 -1.843 48.874 179.000
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 809.685 6.185 130.911 < 2e-16 ***
yr_rndno_break -74.257 26.756 -2.775 0.00578 **
mealcat47-80 -164.412 8.877 -18.522 < 2e-16 ***
mealcat81-100 -288.193 10.443 -27.597 < 2e-16 ***
yr_rndno_break:mealcat47-80 22.517 32.752 0.687 0.49217
yr_rndno_break:mealcat81-100 40.764 29.231 1.395 0.16394
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 68.87 on 394 degrees of freedom
Multiple R-squared: 0.7685, Adjusted R-squared: 0.7656
F-statistic: 261.6 on 5 and 394 DF, p-value: < 2.2e-16
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 809.685 6.185 130.911 < 2e-16 ***
yr_rndno_break -74.257 26.756 -2.775 0.00578 **
mealcat47-80 -164.412 8.877 -18.522 < 2e-16 ***
mealcat81-100 -288.193 10.443 -27.597 < 2e-16 ***
yr_rndno_break:mealcat47-80 22.517 32.752 0.687 0.49217
yr_rndno_break:mealcat81-100 40.764 29.231 1.395 0.16394
---
이전 식
y hat = 808.013 + -42.960*(yr_rndno_break) + -163.737(mealcat47-80) + -281.683(mealcat81-100)
위의 식
y hat = 809.685 + -74.257*(yr_rndno_break) +
-164.412*(mealcat47-80) +
-288.193*(mealcat81-100) +
22.517*(yr_rndno_break:mealcat47-80) + --> aaaaa case
40.764*(yr_rndno_break:mealcat81-100) --> bbbbb case
yr_rnd:
break = 방학있음
no_break = 방학없음
mealcat:
0-46% free meals
47-80%
81-100%
| mealcat0-46 | mealcat47-80 | mealcat81-100 | |
|---|---|---|---|
| yr_rndbreak | 베이스라인 yr_rndno_break = 0 mealcat47-80 = 0 mealcat81-100 = 0 경우 y hat = 809.685 | yr_rndno_break = 0 mealcat0-46 = 0 mealcat81-100 = 0 경우 y hat = 809.685 - | yr_rndno_break = 0 mealcat0-46 = 0 mealcat47-80 = 0 경우 y hat = 809.685 - |
| yr_rndno_break | yr_rndbreak = 0 mealcat47-80 = 0 mealcat81-100 = 0 경우 y hat = 809.685 - | aaaaa yr_rndbreak = 0 mealcat0-46 = 0 mealcat81-100 = 0 경우 y hat = 809.685 - | bbbbb yr_rndbreak = 0 mealcat0-46 = 0 mealcat47-80 = 0 경우 y hat = 809.685 - |
마지막 두 케이스를 보면 no_break학교 중에서 밀카테고리 2와 3에서 떨어지는 정도가 어느 정도 완화되는 경향을 보이지만 통계학적으로 significant하지는 않다.






