User Tools

Site Tools


using_dummy_variables

This is an old revision of the document!


Categorical variables

2 groups

rscr01

rm(list=ls())

library(dplyr)
library(ggplot2)

ss <- function(x) {
  sum((x-mean(x))^2)
}
sp <- function(x, y) {
  sum((x-mean(x))*(y-mean(y)))
}

# 1. Generate numerical variable
set.seed(12) 
n <- 30
n.1 <- round(rnorm(n, mean = 50, sd = 20),0) # Continuous variable
n.2 = round(rnorm(n, 55, 20),0)
n.3 = round(rnorm(n, 60, 22),0)
sc <- data.frame(n.1,n.2,n.3)
sc.s <- stack(sc)
sc.s <- sc.s[-2]
head(sc.s)

c.1 <- (rep("lo", n))
c.2 <- (rep("mid", n))
c.3 <- rep("hi", n)
ca <- data.frame(c.1,c.2,c.3)
head(ca)
ca.s <- stack(ca)
ca.s <- ca.s[-2]
head(ca.s)

df <- data.frame(sc.s, ca.s)
colnames(df) <-c("score", "group")
df$group <- factor(df$group)
str(df)

head(df)
amean <- aggregate(df[, 1], list(df$group), mean)
colnames(amean)=c("group", "mean")
amean
boxplot(df$score~df$group)

# 세 그룹 간의 차이를 보려면 
# aov 테스트를 (F-test) 하면
# 된다. 
fm.df <- aov(score~group, data=df)
summary(fm.df)

# 이것을 regression을 해도 된다
m.df <- lm(score~group, data=df)
summary(m.df)
anova(m.df)
summary(fm.df)

boxplot(df$score~df$group)
amean

# 2. Manually create dummy variables (k - 1 = 3 variables needed)
df$lo <- ifelse(df$group == "lo", 1, 0)
df$mid <- ifelse(df$group == "mid", 1, 0)
df$hi <- ifelse(df$group == "hi", 1, 0)

head(df)
m.tmp <- lm(score~lo+mid, data=df)
summary(m.tmp)
summary(m.df)
m.tmp2 <- lm(score~mid+hi, data=df)
summary(m.tmp2)

b.hi <- m.tmp$coefficients[1]
b.lo <- m.tmp$coefficients[2]
b.mid <- m.tmp$coefficients[3]
b.lo 
b.mid
b.hi

mean.hi <- b.hi
mean.mid <- mean.hi+b.mid
mean.lo <- mean.hi+b.lo
data.frame(mean.hi, mean.lo, mean.mid)
amean

am.df <- aov(score~group, df)
summary(am.df)
posthoc.am.df <- TukeyHSD(am.df)
posthoc.am.df

# t-test의 의미 = baseline과 (절편) 차이가 있는가 테스트
# basedline의 t-test는 baseline이 0과 다른가?
# 다른 조건과 차이를 테스트하는 것은 아님 (주의)
summary(m.df)
# 위에서 grouplo의 significance는 lo 그룹과 hi 그룹이 
# (절편, baseline) 차이가 있는가를 테스트한 것 
# 마찬가지로 groupmid의 t-test는 baseline인 grouphi와
# 차이가 있는지 본 것으로 not significant
# 여기서 보지 못한 것은 mid와 lo그룹 간의 차이가 있는지를
# 보는 것이다
# 이를 위해서는 아래를 보면 된다
summary(m.tmp2)
# 위의 결과는 baseline인 lo가 mid와는 차이가 없는 것을 
# 나타낸다. lo와 hi는 존재
# 위는 두개의 regression을 해서 그룹 간 차이를 
# post hoc test처럼 한 것이고 이를 한 번에 처리하는 
# 방법은 

# install.packages("emmeans")
library(emmeans)
# Calculate estimated marginal means for each group
gmeans <- emmeans(m.df, specs = ~ group)
# Perform pairwise comparisons using Tukey's adjustment (Default)
pairs(gmeans, adjust = "tukey")

rout01

> rm(list=ls())
> 
> library(dplyr)
> library(ggplot2)
> 
> ss <- function(x) {
+   sum((x-mean(x))^2)
+ }
> sp <- function(x, y) {
+   sum((x-mean(x))*(y-mean(y)))
+ }
> 
> # 1. Generate numerical variable
> set.seed(12) 
> n <- 30
> n.1 <- round(rnorm(n, mean = 50, sd = 20),0) # Continuous variable
> n.2 = round(rnorm(n, 55, 20),0)
> n.3 = round(rnorm(n, 60, 22),0)
> sc <- data.frame(n.1,n.2,n.3)
> sc.s <- stack(sc)
> sc.s <- sc.s[-2]
> head(sc.s)
  values
1     20
2     82
3     31
4     32
5     10
6     45
> 
> c.1 <- (rep("lo", n))
> c.2 <- (rep("mid", n))
> c.3 <- rep("hi", n)
> ca <- data.frame(c.1,c.2,c.3)
> head(ca)
  c.1 c.2 c.3
1  lo mid  hi
2  lo mid  hi
3  lo mid  hi
4  lo mid  hi
5  lo mid  hi
6  lo mid  hi
> ca.s <- stack(ca)
> ca.s <- ca.s[-2]
> head(ca.s)
  values
1     lo
2     lo
3     lo
4     lo
5     lo
6     lo
> 
> df <- data.frame(sc.s, ca.s)
> colnames(df) <-c("score", "group")
> df$group <- factor(df$group)
> str(df)
'data.frame':	90 obs. of  2 variables:
 $ score: num  20 82 31 32 10 45 44 37 48 59 ...
 $ group: Factor w/ 3 levels "hi","lo","mid": 2 2 2 2 2 2 2 2 2 2 ...
> 
> head(df)
  score group
1    20    lo
2    82    lo
3    31    lo
4    32    lo
5    10    lo
6    45    lo
> amean <- aggregate(df[, 1], list(df$group), mean)
> colnames(amean)=c("group", "mean")
> amean
  group     mean
1    hi 62.33333
2    lo 46.96667
3   mid 54.10000
> boxplot(df$score~df$group)
> 
> # 세 그룹 간의 차이를 보려면 
> # aov 테스트를 (F-test) 하면
> # 된다. 
> fm.df <- aov(score~group, data=df)
> summary(fm.df)
            Df Sum Sq Mean Sq F value  Pr(>F)   
group        2   3548  1774.0   5.278 0.00686 **
Residuals   87  29242   336.1                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 
> # 이것을 regression을 해도 된다
> m.df <- lm(score~group, data=df)
> summary(m.df)

Call:
lm(formula = score ~ group, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-49.333 -11.242  -1.533  12.000  43.033 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   62.333      3.347  18.622  < 2e-16 ***
grouplo      -15.367      4.734  -3.246  0.00166 ** 
groupmid      -8.233      4.734  -1.739  0.08552 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared:  0.1082,	Adjusted R-squared:  0.0877 
F-statistic: 5.278 on 2 and 87 DF,  p-value: 0.006863

> anova(m.df)
Analysis of Variance Table

Response: score
          Df  Sum Sq Mean Sq F value   Pr(>F)   
group      2  3548.1 1774.03   5.278 0.006863 **
Residuals 87 29242.3  336.12                    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> summary(fm.df)
            Df Sum Sq Mean Sq F value  Pr(>F)   
group        2   3548  1774.0   5.278 0.00686 **
Residuals   87  29242   336.1                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 
> boxplot(df$score~df$group)
> amean
  group     mean
1    hi 62.33333
2    lo 46.96667
3   mid 54.10000
> 
> # 2. Manually create dummy variables (k - 1 = 3 variables needed)
> df$lo <- ifelse(df$group == "lo", 1, 0)
> df$mid <- ifelse(df$group == "mid", 1, 0)
> df$hi <- ifelse(df$group == "hi", 1, 0)
> 
> head(df)
  score group lo mid hi
1    20    lo  1   0  0
2    82    lo  1   0  0
3    31    lo  1   0  0
4    32    lo  1   0  0
5    10    lo  1   0  0
6    45    lo  1   0  0
> m.tmp <- lm(score~lo+mid, data=df)
> summary(m.tmp)

Call:
lm(formula = score ~ lo + mid, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-49.333 -11.242  -1.533  12.000  43.033 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   62.333      3.347  18.622  < 2e-16 ***
lo           -15.367      4.734  -3.246  0.00166 ** 
mid           -8.233      4.734  -1.739  0.08552 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared:  0.1082,	Adjusted R-squared:  0.0877 
F-statistic: 5.278 on 2 and 87 DF,  p-value: 0.006863

> summary(m.df)

Call:
lm(formula = score ~ group, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-49.333 -11.242  -1.533  12.000  43.033 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   62.333      3.347  18.622  < 2e-16 ***
grouplo      -15.367      4.734  -3.246  0.00166 ** 
groupmid      -8.233      4.734  -1.739  0.08552 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared:  0.1082,	Adjusted R-squared:  0.0877 
F-statistic: 5.278 on 2 and 87 DF,  p-value: 0.006863

> m.tmp2 <- lm(score~mid+hi, data=df)
> summary(m.tmp2)

Call:
lm(formula = score ~ mid + hi, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-49.333 -11.242  -1.533  12.000  43.033 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   46.967      3.347  14.031  < 2e-16 ***
mid            7.133      4.734   1.507  0.13545    
hi            15.367      4.734   3.246  0.00166 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared:  0.1082,	Adjusted R-squared:  0.0877 
F-statistic: 5.278 on 2 and 87 DF,  p-value: 0.006863

> 
> b.hi <- m.tmp$coefficients[1]
> b.lo <- m.tmp$coefficients[2]
> b.mid <- m.tmp$coefficients[3]
> b.lo 
       lo 
-15.36667 
> b.mid
      mid 
-8.233333 
> b.hi
(Intercept) 
   62.33333 
> 
> mean.hi <- b.hi
> mean.mid <- mean.hi+b.mid
> mean.lo <- mean.hi+b.lo
> data.frame(mean.hi, mean.lo, mean.mid)
             mean.hi  mean.lo mean.mid
(Intercept) 62.33333 46.96667     54.1
> amean
  group     mean
1    hi 62.33333
2    lo 46.96667
3   mid 54.10000
> 
> am.df <- aov(score~group, df)
> summary(am.df)
            Df Sum Sq Mean Sq F value  Pr(>F)   
group        2   3548  1774.0   5.278 0.00686 **
Residuals   87  29242   336.1                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> posthoc.am.df <- TukeyHSD(am.df)
> posthoc.am.df
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = score ~ group, data = df)

$group
             diff        lwr       upr     p adj
lo-hi  -15.366667 -26.654078 -4.079255 0.0046872
mid-hi  -8.233333 -19.520745  3.054078 0.1965275
mid-lo   7.133333  -4.154078 18.420745 0.2926865

> 
> # t-test의 의미 = baseline과 (절편) 차이가 있는가 테스트
> # basedline의 t-test는 baseline이 0과 다른가?
> # 다른 조건과 차이를 테스트하는 것은 아님 (주의)
> summary(m.df)

Call:
lm(formula = score ~ group, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-49.333 -11.242  -1.533  12.000  43.033 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   62.333      3.347  18.622  < 2e-16 ***
grouplo      -15.367      4.734  -3.246  0.00166 ** 
groupmid      -8.233      4.734  -1.739  0.08552 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared:  0.1082,	Adjusted R-squared:  0.0877 
F-statistic: 5.278 on 2 and 87 DF,  p-value: 0.006863

> # 위에서 grouplo의 significance는 lo 그룹과 hi 그룹이 
> # (절편, baseline) 차이가 있는가를 테스트한 것 
> # 마찬가지로 groupmid의 t-test는 baseline인 grouphi와
> # 차이가 있는지 본 것으로 not significant
> # 여기서 보지 못한 것은 mid와 lo그룹 간의 차이가 있는지를
> # 보는 것이다
> # 이를 위해서는 아래를 보면 된다
> summary(m.tmp2)

Call:
lm(formula = score ~ mid + hi, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-49.333 -11.242  -1.533  12.000  43.033 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   46.967      3.347  14.031  < 2e-16 ***
mid            7.133      4.734   1.507  0.13545    
hi            15.367      4.734   3.246  0.00166 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.33 on 87 degrees of freedom
Multiple R-squared:  0.1082,	Adjusted R-squared:  0.0877 
F-statistic: 5.278 on 2 and 87 DF,  p-value: 0.006863

> # 위의 결과는 baseline인 lo가 mid와는 차이가 없는 것을 
> # 나타낸다. lo와 hi는 존재
> # 위는 두개의 regression을 해서 그룹 간 차이를 
> # post hoc test처럼 한 것이고 이를 한 번에 처리하는 
> # 방법은 
> 
> # install.packages("emmeans")
> library(emmeans)
> # Calculate estimated marginal means for each group
> gmeans <- emmeans(m.df, specs = ~ group)
> # Perform pairwise comparisons using Tukey's adjustment (Default)
> pairs(gmeans, adjust = "tukey")
 contrast estimate   SE df t.ratio p.value
 hi - lo     15.37 4.73 87   3.246  0.0047
 hi - mid     8.23 4.73 87   1.739  0.1965
 lo - mid    -7.13 4.73 87  -1.507  0.2927

P value adjustment: tukey method for comparing a family of 3 estimates 

2 group: e.g. 2

data: for spss
elemapi2.sav
elemapi2_categories.sps
data: for r
elemapi2.csv

rscrp.02

####################
# 다른 예
####################
rm(list=ls())

# library(dplyr)
library(ggplot2)

ss <- function(x) {
  sum((x-mean(x))^2)
}
sp <- function(x, y) {
  sum((x-mean(x))*(y-mean(y)))
}

datavar <- read.csv("http://commres.net/_media/r/elemapi2.csv")
head(datavar)

rout.02

> ####################
> # 다른 예
> ####################
> rm(list=ls())
> 
> # library(dplyr)
> library(ggplot2)
> 
> ss <- function(x) {
+   sum((x-mean(x))^2)
+ }
> sp <- function(x, y) {
+   sum((x-mean(x))*(y-mean(y)))
+ }
> 
> datavar <- read.csv("http://commres.net/_media/r/elemapi2.csv")
> head(datavar)
  snum dnum api00 api99 growth meals ell yr_rnd mobility acs_k3 acs_46 not_hsg hsg some_col
1  906   41   693   600     93    67   9      0       11     16     22       0   0        0
2  889   41   570   501     69    92  21      0       33     15     32       0   0        0
3  887   41   546   472     74    97  29      0       36     17     25       0   0        0
4  876   41   571   487     84    90  27      0       27     20     30      36  45        9
5  888   41   478   425     53    89  30      0       44     18     31      50  50        0
6 4284   98   858   844     14    10   3      0       10     20     33       1   8       24
  col_grad grad_sch avg_ed full emer enroll mealcat collcat
1        0        0     NA   76   24    247       2       1
2        0        0     NA   79   19    463       3       1
3        0        0     NA   68   29    395       3       1
4        9        0   1.91   87   11    418       3       1
5        0        0   1.50   87   13    520       3       1
6       36       31   3.89  100    0    343       1       2
> 

datavar data set의 변인 설명

	Variable Labels
Variable	Position	Label
snum	1	school number
dnum	2	district number
api00	3	api 2000
api99	4	api 1999
growth	5	growth 1999 to 2000
meals	6	pct free meals
ell	7	english language learners
yr_rnd	8	year round school 
mobility	9	pct 1st year in school
acs_k3	10	avg class size k-3
acs_46	11	avg class size 4-6
not_hsg	12	parent not hsg
hsg	13	parent hsg
some_col	14	parent some college
col_grad	15	parent college grad
grad_sch	16	parent grad school
avg_ed	17	avg parent ed
full	18	pct full credential
emer	19	pct emer credential
enroll	20	number of students
mealcat	21	Percentage free meals in 3 categories
collcat	22	<none>
Variables in the working file

yr_rnd:
0 = 방학있음 
1 = 방학없음

mealcat: 
1 = 0-46% free meals
2 = 47-80
3 = 81-100
  • pct (%) of emer (emergency) credentials: Some states will grant an emergency certificate or permit at the request of a school district that has posted and failed to find a qualified candidate for a teacher vacancy. It typically allows the candidate to serve in a temporary capacity for the duration of a school year.
  • pct (%) of full credentials: To be fully certified, it generally means that a teacher must have graduated from an accredited college, completed an approved teacher credential program and passed a test of their academic skills.

위의 변인들 중에서 “무방학학교”가 성적에 어떤 영향을 미칠 것인가를 알아 보기 위해서 regression 테스트를 시행하였다. 아래는 그 결과이다.

rscrp.03

# data 정리 
df <- datavar[,c(1,3,6,7,8,9,18,21)]
head(df)
str(df)
df$yr_rnd <- factor(df$yr_rnd, levels=c(0,1), labels=c("break", "nobr"))
df$mealcat <- factor(df$mealcat, levels=c(1:3), labels=c("to46", "to80", "to100"))
str(df)

# regression
m.br <- lm(api00~yr_rnd, data=df)
summary(m.br)

# making dummy variables
df$nobr <- ifelse(df$yr_rnd == "nobr", 1, 0)
df$br<- ifelse(df$yr_rnd == "break", 1, 0)
head(df)

m.br2 <- lm(api00~nobr,data=df)
summary(m.br2)

m.br3 <- lm(api00~br, data=df)
summary(m.br3)

# baseline 변화
df$yr_rnd <- relevel(factor(df$yr_rnd), ref = "nobr")
m.br4 <- lm(api00 ~ yr_rnd, data = df)
summary(m.br4)
# 원래로 복원
df$yr_rnd <- relevel(factor(df$yr_rnd), ref = "break")

head(df)
tmp <- aggregate(df[, 2], list(df$yr_rnd), mean)
mean.br <- tmp[[2]][1]
mean.nobr <- tmp[[2]][2]
data.frame(mean.br, mean.nobr)

boxplot(df$api00~df$yr_rnd)

t.out <- t.test(df$api00~df$yr_rnd, var.eq=T)
t.out
t.out$estimate
# str(t.out)
t.value <- t.out$statistic

f.out <- aov(df$api00~df$yr_rnd, data=df)
summary(f.out)
f.value <- summary(f.out)[[1]][1, "F value"]

summary(m.br)
f.value.lm <- summary(m.br)$fstatistic[1]
t.value.lm <- summary(m.br)$coefficients["yr_rndnobr", "t value"]

data.frame(t.value, t.value^2, f.value, f.value.lm, t.value.lm)

plot(df$yr_rnd,df$api00)

df %>%
  ggplot(aes(x = yr_rnd, y = api00, color = factor(yr_rnd))) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Set1", name = "breaks") +
  labs(title = "Regression: with yr_rnd break",
       x = " (yr_rnd)",
       y = "api00")

summary(m.br)

rout.03

> # data 정리 
> df <- datavar[,c(1,3,6,7,8,9,18,21)]
> head(df)
  snum api00 meals ell yr_rnd mobility full mealcat
1  906   693    67   9      0       11   76       2
2  889   570    92  21      0       33   79       3
3  887   546    97  29      0       36   68       3
4  876   571    90  27      0       27   87       3
5  888   478    89  30      0       44   87       3
6 4284   858    10   3      0       10  100       1
> str(df)
'data.frame':	400 obs. of  8 variables:
 $ snum    : int  906 889 887 876 888 4284 4271 2910 2899 2887 ...
 $ api00   : int  693 570 546 571 478 858 918 831 860 737 ...
 $ meals   : int  67 92 97 90 89 10 5 2 5 29 ...
 $ ell     : int  9 21 29 27 30 3 2 3 6 15 ...
 $ yr_rnd  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ mobility: int  11 33 36 27 44 10 16 44 10 17 ...
 $ full    : int  76 79 68 87 87 100 100 96 100 96 ...
 $ mealcat : int  2 3 3 3 3 1 1 1 1 1 ...
> df$yr_rnd <- factor(df$yr_rnd, levels=c(0,1), labels=c("break", "nobr"))
> df$mealcat <- factor(df$mealcat, levels=c(1:3), labels=c("to46", "to80", "to100"))
> str(df)
'data.frame':	400 obs. of  8 variables:
 $ snum    : int  906 889 887 876 888 4284 4271 2910 2899 2887 ...
 $ api00   : int  693 570 546 571 478 858 918 831 860 737 ...
 $ meals   : int  67 92 97 90 89 10 5 2 5 29 ...
 $ ell     : int  9 21 29 27 30 3 2 3 6 15 ...
 $ yr_rnd  : Factor w/ 2 levels "break","nobr": 1 1 1 1 1 1 1 1 1 1 ...
 $ mobility: int  11 33 36 27 44 10 16 44 10 17 ...
 $ full    : int  76 79 68 87 87 100 100 96 100 96 ...
 $ mealcat : Factor w/ 3 levels "to46","to80",..: 2 3 3 3 3 1 1 1 1 1 ...
> 
> # regression
> m.br <- lm(api00~yr_rnd, data=df)
> summary(m.br)

Call:
lm(formula = api00 ~ yr_rnd, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-273.539  -95.662    0.967  103.341  297.967 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   684.54       7.14   95.88   <2e-16 ***
yr_rndnobr   -160.51      14.89  -10.78   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared:  0.226,	Adjusted R-squared:  0.2241 
F-statistic: 116.2 on 1 and 398 DF,  p-value: < 2.2e-16

> 
> # making dummy variables
> df$nobr <- ifelse(df$yr_rnd == "nobr", 1, 0)
> df$br<- ifelse(df$yr_rnd == "break", 1, 0)
> head(df)
  snum api00 meals ell yr_rnd mobility full mealcat nobr br
1  906   693    67   9  break       11   76    to80    0  1
2  889   570    92  21  break       33   79   to100    0  1
3  887   546    97  29  break       36   68   to100    0  1
4  876   571    90  27  break       27   87   to100    0  1
5  888   478    89  30  break       44   87   to100    0  1
6 4284   858    10   3  break       10  100    to46    0  1
> 
> m.br2 <- lm(api00~nobr,data=df)
> summary(m.br2)

Call:
lm(formula = api00 ~ nobr, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-273.539  -95.662    0.967  103.341  297.967 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   684.54       7.14   95.88   <2e-16 ***
nobr         -160.51      14.89  -10.78   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared:  0.226,	Adjusted R-squared:  0.2241 
F-statistic: 116.2 on 1 and 398 DF,  p-value: < 2.2e-16

> 
> m.br3 <- lm(api00~br, data=df)
> summary(m.br3)

Call:
lm(formula = api00 ~ br, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-273.539  -95.662    0.967  103.341  297.967 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   524.03      13.06   40.11   <2e-16 ***
br            160.51      14.89   10.78   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared:  0.226,	Adjusted R-squared:  0.2241 
F-statistic: 116.2 on 1 and 398 DF,  p-value: < 2.2e-16

> 
> # baseline 변화
> df$yr_rnd <- relevel(factor(df$yr_rnd), ref = "nobr")
> m.br4 <- lm(api00 ~ yr_rnd, data = df)
> summary(m.br4)

Call:
lm(formula = api00 ~ yr_rnd, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-273.539  -95.662    0.967  103.341  297.967 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   524.03      13.06   40.11   <2e-16 ***
yr_rndbreak   160.51      14.89   10.78   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared:  0.226,	Adjusted R-squared:  0.2241 
F-statistic: 116.2 on 1 and 398 DF,  p-value: < 2.2e-16

> # 원래로 복원
> df$yr_rnd <- relevel(factor(df$yr_rnd), ref = "break")
> 
> head(df)
  snum api00 meals ell yr_rnd mobility full mealcat nobr br
1  906   693    67   9  break       11   76    to80    0  1
2  889   570    92  21  break       33   79   to100    0  1
3  887   546    97  29  break       36   68   to100    0  1
4  876   571    90  27  break       27   87   to100    0  1
5  888   478    89  30  break       44   87   to100    0  1
6 4284   858    10   3  break       10  100    to46    0  1
> tmp <- aggregate(df[, 2], list(df$yr_rnd), mean)
> mean.br <- tmp[[2]][1]
> mean.nobr <- tmp[[2]][2]
> data.frame(mean.br, mean.nobr)
  mean.br mean.nobr
1 684.539  524.0326
> 
> boxplot(df$api00~df$yr_rnd)
> 
> t.out <- t.test(df$api00~df$yr_rnd, var.eq=T)
> t.out

	Two Sample t-test

data:  df$api00 by df$yr_rnd
t = 10.782, df = 398, p-value < 2.2e-16
alternative hypothesis: true difference in means between group break and group nobr is not equal to 0
95 percent confidence interval:
 131.2390 189.7737
sample estimates:
mean in group break  mean in group nobr 
           684.5390            524.0326 

> t.out$estimate
mean in group break  mean in group nobr 
           684.5390            524.0326 
> # str(t.out)
> t.value <- t.out$statistic
> 
> f.out <- aov(df$api00~df$yr_rnd, data=df)
> summary(f.out)
             Df  Sum Sq Mean Sq F value Pr(>F)    
df$yr_rnd     1 1825001 1825001   116.2 <2e-16 ***
Residuals   398 6248671   15700                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> f.value <- summary(f.out)[[1]][1, "F value"]
> 
> summary(m.br)

Call:
lm(formula = api00 ~ yr_rnd, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-273.539  -95.662    0.967  103.341  297.967 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   684.54       7.14   95.88   <2e-16 ***
yr_rndnobr   -160.51      14.89  -10.78   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared:  0.226,	Adjusted R-squared:  0.2241 
F-statistic: 116.2 on 1 and 398 DF,  p-value: < 2.2e-16

> f.value.lm <- summary(m.br)$fstatistic[1]
> t.value.lm <- summary(m.br)$coefficients["yr_rndnobr", "t value"]
> 
> data.frame(t.value, t.value^2, f.value, f.value.lm, t.value.lm)
  t.value t.value.2  f.value f.value.lm t.value.lm
t 10.7815  116.2407 116.2407   116.2407   -10.7815
> 
> plot(df$yr_rnd,df$api00)
> 
> df %>%
+   ggplot(aes(x = yr_rnd, y = api00, color = factor(yr_rnd))) +
+   geom_point(size = 3) +
+   geom_smooth(method = "lm", se = FALSE) +
+   scale_color_brewer(palette = "Set1", name = "breaks") +
+   labs(title = "Regression: with yr_rnd break",
+        x = " (yr_rnd)",
+        y = "api00")
`geom_smooth()` using formula = 'y ~ x'
> 
> summary(m.br)

Call:
lm(formula = api00 ~ yr_rnd, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-273.539  -95.662    0.967  103.341  297.967 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   684.54       7.14   95.88   <2e-16 ***
yr_rndnobr   -160.51      14.89  -10.78   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 125.3 on 398 degrees of freedom
Multiple R-squared:  0.226,	Adjusted R-squared:  0.2241 
F-statistic: 116.2 on 1 and 398 DF,  p-value: < 2.2e-16

>



위의 아웃풋을 살펴 보면, 학생들의 성적이 가지는 총 변량의 (Sum of Square Total) 약 22.6% 를 방학이 있고 없고로 구분되는 yr_rnd 변인이 설명을 하고 있으며, 이는 통계적으로 유의미한 것이다 (F(1, 398) = 116.241, p < .001). 위의 regression output은 yr_rnd 변인 중에서 방학이 있는 특성이 baseline이 되어 있으며, 이를 염두에 두고 regression식을 적어 보면 아래와 같다.

y hat = 684.54 - 160.51 * (yr_rndno_break)

  • yr_rndno_break: yr_rndno_break = 1
    • y hat = 684.54 - 160.51 * (1)
    • y hat = 524.03
  • yr_rndbreak: yr_rndnobreak = 0
    • y hat = 684.54 - 160.51 * (0)
    • y hat = 684.54

위 회귀식에서 r은
y hat = a + bX 의 형식에서 Xno_break를 썼음을 (yr_rndno_break) 알 수 있다. 이 경우에는 yr_rndno_break를 1로 넣어서 해석을 한다. 즉, no break일 경우에는 y hat = 684.54 - 160.51(1) 이라는 것이다. 반대로 break일 경우에는 뒤쪽 부분이 해당이 안되므로 0으로 대체한다. 따라서 이 경우의 회귀식은 y hat = 684.54이다. 이 둘을 비교해보면 no break일 경우에는
y hat = 684.54 - 160.51(1) = 524.03 이고
break일 경우에는
y hat = 684.54 - 160.51(0) = 684.54 라는 것이다. 다시 이야기하면 break가 없는 학교의 평균 api점수는 524.03점인 반면에 break가 있는 학교의 평균은 684.54 이다. 이 점수의 차이는 두 집단의 평균을 비교하는 것과 같은 형태를 (형식) 갖는다. 즉, t-test를 하는 것과 마찬가지이다. 위에서 얻은

  • t.value
  • F.value
  • t.value.lm
  • F.value.lm 은 모두 같은 원리로 두 그룹을 (집단의 api 평균을) 테스트 한 것이다.

regressioncategory.jpg

위의 그래프는 두 그룹의 (break, no break 학교그룹) api 점수 분포를 나타내 주는 것이다. 그리고 직선은 두 그룹 평균을 연결한 선으로 $\hat{Y} = 684.539 - 160.506 X$ 으로 표현된다.

이와 같이 종류변인(category, nominal)을 가지고서도 regression 테스트를 할 수 있으며, 사실 이는 t-test나 F-test와 다르지 않다. 위에서 주의해야 할 점은 두 변인의 종류를 손으로 coding할 때, 1과 2가 아닌, 0과 1로 하였다는 점이다. 이렇게 하는 이유는 해석하기에 편하기 때문이며, 이것이 보통의 방법이다. 그러나, 1과 2로 coding 데이터를 이용해도 크게 다른지 않은 결과를 구하게 된다. 다른 점이라면, 절편에 해당되는 상수값이 다르게 되며, coefficient값은 위의 분석과 동일한 값을 갖게 된다. 그리고 일반적으로 r에서는 dummy variables을 얻기 위한 coding을 하지 않아도 r이 스스로 구하여 계산을 한다. 즉, dummy variable 을 구하지 않은 채로 lm(api00 ~ yr_rnd, data=df) 만으로도 계산이 된다.

Regression with a categorical IV with 3 attributes

rs.3att

m.mealcat <- lm(api00 ~ mealcat, data=df) 
summary(m.mealcat)

ro.3att

> #######################################
> # categorical IV with 3 or more attributes
> #######################################
> m.mealcat <- lm(api00 ~ mealcat, data=df) 
> summary(m.mealcat)

Call:
lm(formula = api00 ~ mealcat, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-253.394  -47.883    0.282   52.282  185.620 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   805.718      6.169  130.60   <2e-16 ***
mealcatto80  -166.324      8.708  -19.10   <2e-16 ***
mealcatto100 -301.338      8.629  -34.92   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 70.61 on 397 degrees of freedom
Multiple R-squared:  0.7548,	Adjusted R-squared:  0.7536 
F-statistic: 611.1 on 2 and 397 DF,  p-value: < 2.2e-16

> 
y hat = 805.718 - 166.324 * to80 - 301.338 * to100 
mealcat0-46 (to46 으로 대체)
mealcat47-80 (to80 으로 대체)
maelcat81-100 (to100 으로 대체)

이에 대한 해석도 앞에서의 것과 마찬가지이다.

  • y hat = 805.718 - 166.324*mg2 - 301.338*mg3
  • mg1 = 1, mg2 = 0, mg3 = 0 일 경우
    • y hat = 805.718 - 166.324*(0) - 301.338*(0)
    • y hat = 805.718
  • mg1 = 0, mg2 = 1, mg3 = 0 일 경우
    • y hat = 805.718 - 166.324*(1) - 301.338*(0)
    • y hat = 805.718 - 166.324
    • y hat = 639.394
  • mg1 = 0, mg2 = 0, mg3 = 1 일 경우
    • y hat = 805.718 - 166.324*(0) - 301.338*(1)
    • y hat = 805.718 - 301.338
    • y hat = 504.38
  • 즉, 무료급식의 퍼센티지가 높을 수록 api점수가 낮음을 알 수 있다. 이렇게 무료급식 퍼센티지를 독립변인으로 종속변인인 api00점수를 (학력점수) 봤을 때, 그 설명력이 통계학적으로 유효한가는 regression output에서 (summary(mod2))
    • F-value 와 p-value를 가지고 판단한다.
      • (F (2, 397) = 611.1; p-value < 2.2e-16)
      • 위에서 2, 397 은 각각 between degrees of freedom 과 within degrees of freedom 이다. 이를 보고도 우리는
      • 총 400개의 학교가 데이터에 참여했음을 알 수 있고 (2 + 397 에 1을 더한 값),
      • 독립변인의 종류가 3가지 (df = 2 이므로) 임을 알 수 있다.
    • R square value 는 설명력의 크기를 알려준다.
      • 0.7548 즉, 75.48% 를 독립변인이 종속변인을 설명한다 (상당한 크기임을 알 수 있다).

Regression with a continuous and a categorical IV

rs.04

#######################################
#######################################
# with a continuous and a category IV
#######################################
#######################################
m.mealsyr_rnd <- lm(api00~meals+yr_rnd, data=df)
summary(m.mealsyr_rnd)

m.mealsxyr_rnd <- lm(api00~meals*yr_rnd, data=df)
summary(m.mealsxyr_rnd)

# 중요: 위에서 yr_rnd의 설명력이 t test를 보면 
# 사라짐. 왜?
# interaction을 추가하면서 yr_rnd의 설명력은 
# (방학이 있고 없음의 설명력은) meals의 percentage가
# 0 이 될때의 것으로 설정된다. 0이 되는 데이터가
# 극단치이고 이 경우에 nobr인 학교도 없으므로 계산된
# se값이 극대화되어 t값이 작아지게 된다. 이를 바로
# 잡으려면 meals의 0이 중간에 가도록 한 후에 
# regression을 하면 meals가 0 일때가 극단치가 되지
# 않으므로 결과가 옳게 나온다

df %>%
  ggplot(aes(x = meals, y = api00, color = factor(yr_rnd))) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Set1", name = "yr_rnd") +
  labs(title = "Multiple Regression: Interaction by yr_rnd break",
       x = " (free meals percentage)",
       y = "api00")


# 1. 평균을 빼준 값을 새로운 변인으로 저장
df$meals_centered <- 
  df$meals - mean(df$meals)

# 2. Run the model again using the centered variable
m_centered <- lm(api00 ~ meals_centered * yr_rnd, 
                     data = df)

# 3. Check the new summary
summary(m_centered)

df %>%
  ggplot(aes(x = meals_centered, y = api00, color = factor(yr_rnd))) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Set1", name = "yr_rnd") +
  labs(title = "Multiple Regression: Interaction by yr_rnd break",
       x = " (free meals percentage CENTERED)",
       y = "api00")


# Install the package if you do not have it
# install.packages("interactions")

# Plot the interaction
library(interactions)
interact_plot(m.mealsxyr_rnd, pred = meals, modx = yr_rnd)



m.ellyr_rnd <- lm(api00~ell+yr_rnd, data=df)
summary(m.ellyr_rnd)
m.ellxyr_rnd <- lm(api00~ell*yr_rnd, data=df)
summary(m.ellxyr_rnd)
df %>%
  ggplot(aes(x = ell, y = api00, color = factor(yr_rnd))) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Set1", name = "Cylinders") +
  labs(title = "Multiple Regression: Interaction by yr_rnd break",
       x = " (ell, english language learner)",
       y = "api00")

# ell mean centred 이번 경우는 큰 차이를 보이지 않는다
df$ell_centered <- df$ell - mean(df$ell)
m.ell.centered <- lm(api00 ~ ell_centered * yr_rnd, data = df)
summary(m.ell.centered)

coefs <- summary(m.ellxyr_rnd)$coefficients
coefs 
api.br <- coefs[1]
api.nobr <- coefs[1]+coefs[3]
slope.red <- coefs[2]
slope.blue <- coefs[2]+coefs[4]
data.frame(api.br, api.nobr, slope.red, slope.blue)
# br과 nobr에 따라서 ell의 slope가 달라지는것이
# interaction effects

ro.04

> #######################################
> #######################################
> # with a continuous and a category IV
> #######################################
> #######################################
> m.mealsyr_rnd <- lm(api00~meals+yr_rnd, data=df)
> summary(m.mealsyr_rnd)

Call:
lm(formula = api00 ~ meals + yr_rnd, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-174.882  -41.542   -1.044   39.454  176.007 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 885.6192     6.4712 136.855  < 2e-16 ***
meals        -3.7921     0.1036 -36.595  < 2e-16 ***
yr_rndnobr  -40.3291     7.8479  -5.139 4.35e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 59.99 on 397 degrees of freedom
Multiple R-squared:  0.823,	Adjusted R-squared:  0.8221 
F-statistic: 923.2 on 2 and 397 DF,  p-value: < 2.2e-16

> 
> m.mealsxyr_rnd <- lm(api00~meals*yr_rnd, data=df)
> summary(m.mealsxyr_rnd)

Call:
lm(formula = api00 ~ meals * yr_rnd, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-175.78  -41.96   -0.76   39.06  175.49 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      885.0046     6.7647 130.827   <2e-16 ***
meals             -3.7805     0.1100 -34.354   <2e-16 ***
yr_rndnobr       -31.8737    27.9104  -1.142    0.254    
meals:yr_rndnobr  -0.1041     0.3299  -0.316    0.752    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 60.06 on 396 degrees of freedom
Multiple R-squared:  0.8231,	Adjusted R-squared:  0.8217 
F-statistic: 614.1 on 3 and 396 DF,  p-value: < 2.2e-16

> 
> # 중요: 위에서 yr_rnd의 설명력이 t test를 보면 
> # 사라짐. 왜?
> # interaction을 추가하면서 yr_rnd의 설명력은 
> # (방학이 있고 없음의 설명력은) meals의 percentage가
> # 0 이 될때의 것으로 설정된다. 0이 되는 데이터가
> # 극단치이고 이 경우에 nobr인 학교도 없으므로 계산된
> # se값이 극대화되어 t값이 작아지게 된다. 이를 바로
> # 잡으려면 meals의 0이 중간에 가도록 한 후에 
> # regression을 하면 meals가 0 일때가 극단치가 되지
> # 않으므로 결과가 옳게 나온다
> 
> df %>%
+   ggplot(aes(x = meals, y = api00, color = factor(yr_rnd))) +
+   geom_point(size = 3) +
+   geom_smooth(method = "lm", se = FALSE) +
+   scale_color_brewer(palette = "Set1", name = "yr_rnd") +
+   labs(title = "Multiple Regression: Interaction by yr_rnd break",
+        x = " (free meals percentage)",
+        y = "api00")
`geom_smooth()` using formula = 'y ~ x'
> 
> 
> # 1. 평균을 빼준 값을 새로운 변인으로 저장
> df$meals_centered <- 
+   df$meals - mean(df$meals)
> 
> # 2. Run the model again using the centered variable
> m_centered <- lm(api00 ~ meals_centered * yr_rnd, 
+                      data = df)
> 
> # 3. Check the new summary
> summary(m_centered)

Call:
lm(formula = api00 ~ meals_centered * yr_rnd, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-175.78  -41.96   -0.76   39.06  175.49 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)               656.9827     3.5150 186.910  < 2e-16 ***
meals_centered             -3.7805     0.1100 -34.354  < 2e-16 ***
yr_rndnobr                -38.1550    10.4473  -3.652 0.000295 ***
meals_centered:yr_rndnobr  -0.1041     0.3299  -0.316 0.752386    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 60.06 on 396 degrees of freedom
Multiple R-squared:  0.8231,	Adjusted R-squared:  0.8217 
F-statistic: 614.1 on 3 and 396 DF,  p-value: < 2.2e-16

> 
> df %>%
+   ggplot(aes(x = meals_centered, y = api00, color = factor(yr_rnd))) +
+   geom_point(size = 3) +
+   geom_smooth(method = "lm", se = FALSE) +
+   scale_color_brewer(palette = "Set1", name = "yr_rnd") +
+   labs(title = "Multiple Regression: Interaction by yr_rnd break",
+        x = " (free meals percentage CENTERED)",
+        y = "api00")
`geom_smooth()` using formula = 'y ~ x'
> 
> 
> # Install the package if you do not have it
> # install.packages("interactions")
> 
> # Plot the interaction
> library(interactions)
> interact_plot(m.mealsxyr_rnd, pred = meals, modx = yr_rnd)
> 
> 
> 
> m.ellyr_rnd <- lm(api00~ell+yr_rnd, data=df)
> summary(m.ellyr_rnd)

Call:
lm(formula = api00 ~ ell + yr_rnd, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-265.843  -55.349    0.334   70.673  195.778 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 784.3978     7.2878  107.63  < 2e-16 ***
ell          -4.0427     0.2094  -19.31  < 2e-16 ***
yr_rndnobr  -41.8420    12.3441   -3.39  0.00077 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 90.1 on 397 degrees of freedom
Multiple R-squared:  0.6008,	Adjusted R-squared:  0.5988 
F-statistic: 298.8 on 2 and 397 DF,  p-value: < 2.2e-16

> m.ellxyr_rnd <- lm(api00~ell*yr_rnd, data=df)
> summary(m.ellxyr_rnd)

Call:
lm(formula = api00 ~ ell * yr_rnd, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-270.514  -57.800    5.994   65.845  206.275 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     794.2576     7.8422 101.280  < 2e-16 ***
ell              -4.4418     0.2420 -18.354  < 2e-16 ***
yr_rndnobr     -110.5779    24.7935  -4.460 1.07e-05 ***
ell:yr_rndnobr    1.4884     0.4673   3.185  0.00156 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 89.08 on 396 degrees of freedom
Multiple R-squared:  0.6108,	Adjusted R-squared:  0.6078 
F-statistic: 207.1 on 3 and 396 DF,  p-value: < 2.2e-16

> df %>%
+   ggplot(aes(x = ell, y = api00, color = factor(yr_rnd))) +
+   geom_point(size = 3) +
+   geom_smooth(method = "lm", se = FALSE) +
+   scale_color_brewer(palette = "Set1", name = "Cylinders") +
+   labs(title = "Multiple Regression: Interaction by yr_rnd break",
+        x = " (ell, english language learner)",
+        y = "api00")
`geom_smooth()` using formula = 'y ~ x'
>
>
> # ell mean centred 이번 경우는 큰 차이를 보이지 않는다
> df$ell_centered <- df$ell - mean(df$ell)
> m.ell.centered <- lm(api00 ~ ell_centered * yr_rnd, data = df)
> summary(m.ell.centered)

Call:
lm(formula = api00 ~ ell_centered * yr_rnd, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-270.514  -57.800    5.994   65.845  206.275 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)             654.5514     5.3323 122.752  < 2e-16 ***
ell_centered             -4.4418     0.2420 -18.354  < 2e-16 ***
yr_rndnobr              -63.7652    14.0117  -4.551 7.12e-06 ***
ell_centered:yr_rndnobr   1.4884     0.4673   3.185  0.00156 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 89.08 on 396 degrees of freedom
Multiple R-squared:  0.6108,	Adjusted R-squared:  0.6078 
F-statistic: 207.1 on 3 and 396 DF,  p-value: < 2.2e-16
>
>
> coefs <- summary(m.ellxyr_rnd)$coefficients
> coefs 
                  Estimate Std. Error    t value      Pr(>|t|)
(Intercept)     794.257647  7.8421894 101.280090 3.235236e-285
ell              -4.441819  0.2420092 -18.353922  6.903303e-55
yr_rndnobr     -110.577884 24.7935271  -4.459950  1.069371e-05
ell:yr_rndnobr    1.488362  0.4673174   3.184906  1.562618e-03
> api.br <- coefs[1]
> api.nobr <- coefs[1]+coefs[3]
> slope.red <- coefs[2]
> slope.blue <- coefs[2]+coefs[4]
> data.frame(api.br, api.nobr, slope.red, slope.blue)
    api.br api.nobr slope.red slope.blue
1 794.2576 683.6798 -4.441819  -2.953456
> # br과 nobr에 따라서 ell의 slope가 달라지는것이
> # interaction effects
> 




Regression with a categorical and a continuous IV: e.g. 2

rs.06

###############################
# 다른 예
###############################
m.ellmealcat <- lm(api00~ell+mealcat, data=df)
summary(m.ellmealcat)
# install.packages("emmeans")
# library(emmeans)
# Calculate estimated marginal means for each group
gmeans <- emmeans(m.ellmealcat, specs = ~ mealcat)
# Perform pairwise comparisons using Tukey's adjustment (Default)
pairs(gmeans, adjust = "tukey")

m.ellxmealcat <- lm(api00~ell*mealcat, data=df)
summary(m.ellxmealcat)
df %>%
  ggplot(aes(x = ell, y = api00, color = factor(mealcat))) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Set1", name = "mealcat") +
  labs(title = "Multiple Regression: Interaction by meal cat",
       x = " (ell, english language learner)",
       y = "api00")

mealcat1 <- summary(m.ellxmealcat)$coefficients[[1]]
mealcat2 <- summary(m.ellxmealcat)$coefficients[[1]] + 
  summary(m.ellxmealcat)$coefficients[[3]]
mealcat3 <- summary(m.ellxmealcat)$coefficients[[1]] +
  summary(m.ellxmealcat)$coefficients[[4]]
data.frame(mealcat1, mealcat2, mealcat3)

ell.slope.mealcat1 <- summary(m.ellxmealcat)$coefficients[[2]]
ell.slope.mealcat2 <- summary(m.ellxmealcat)$coefficients[[2]] +
  summary(m.ellxmealcat)$coefficients[[5]]
ell.slope.mealcat3 <- summary(m.ellxmealcat)$coefficients[[2]] +
  summary(m.ellxmealcat)$coefficients[[6]]
data.frame(ell.slope.mealcat1,
           ell.slope.mealcat2,
           ell.slope.mealcat3)

ro.06

> ###############################
> # 다른 예
> ###############################
> m.ellmealcat <- lm(api00~ell+mealcat, data=df)
> summary(m.ellmealcat)

Call:
lm(formula = api00 ~ ell + mealcat, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-224.96  -42.70   -0.32   48.93  179.92 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   819.654      6.052 135.426  < 2e-16 ***
ell            -1.546      0.203  -7.616 1.94e-13 ***
mealcatto80  -134.493      9.153 -14.693  < 2e-16 ***
mealcatto100 -230.737     12.290 -18.774  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 66.03 on 396 degrees of freedom
Multiple R-squared:  0.7861,	Adjusted R-squared:  0.7845 
F-statistic: 485.2 on 3 and 396 DF,  p-value: < 2.2e-16

> # install.packages("emmeans")
> # library(emmeans)
> # Calculate estimated marginal means for each group
> gmeans <- emmeans(m.ellmealcat, specs = ~ mealcat)
> # Perform pairwise comparisons using Tukey's adjustment (Default)
> pairs(gmeans, adjust = "tukey")
 contrast     estimate    SE  df t.ratio p.value
 to46 - to80     134.5  9.15 396  14.693  <.0001
 to46 - to100    230.7 12.30 396  18.774  <.0001
 to80 - to100     96.2  9.53 396  10.102  <.0001

P value adjustment: tukey method for comparing a family of 3 estimates 
> 
> m.ellxmealcat <- lm(api00~ell*mealcat, data=df)
> summary(m.ellxmealcat)

Call:
lm(formula = api00 ~ ell * mealcat, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-221.854  -43.244    1.437   47.443  181.471 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       836.7724     8.8620  94.422  < 2e-16 ***
ell                -3.4447     0.7508  -4.588 6.03e-06 ***
mealcatto80      -146.6127    13.9085 -10.541  < 2e-16 ***
mealcatto100     -270.8356    18.7941 -14.411  < 2e-16 ***
ell:mealcatto80     1.7300     0.8111   2.133   0.0335 *  
ell:mealcatto100    2.3190     0.8032   2.887   0.0041 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 65.47 on 394 degrees of freedom
Multiple R-squared:  0.7909,	Adjusted R-squared:  0.7882 
F-statistic:   298 on 5 and 394 DF,  p-value: < 2.2e-16

> df %>%
+   ggplot(aes(x = ell, y = api00, color = factor(mealcat))) +
+   geom_point(size = 3) +
+   geom_smooth(method = "lm", se = FALSE) +
+   scale_color_brewer(palette = "Set1", name = "mealcat") +
+   labs(title = "Multiple Regression: Interaction by meal cat",
+        x = " (ell, english language learner)",
+        y = "api00")
`geom_smooth()` using formula = 'y ~ x'
> 
> mealcat1 <- summary(m.ellxmealcat)$coefficients[[1]]
> mealcat2 <- summary(m.ellxmealcat)$coefficients[[1]] + 
+   summary(m.ellxmealcat)$coefficients[[3]]
> mealcat3 <- summary(m.ellxmealcat)$coefficients[[1]] +
+   summary(m.ellxmealcat)$coefficients[[4]]
> data.frame(mealcat1, mealcat2, mealcat3)
  mealcat1 mealcat2 mealcat3
1 836.7724 690.1597 565.9368
> 
> ell.slope.mealcat1 <- summary(m.ellxmealcat)$coefficients[[2]]
> ell.slope.mealcat2 <- summary(m.ellxmealcat)$coefficients[[2]] +
+   summary(m.ellxmealcat)$coefficients[[5]]
> ell.slope.mealcat3 <- summary(m.ellxmealcat)$coefficients[[2]] +
+   summary(m.ellxmealcat)$coefficients[[6]]
> data.frame(ell.slope.mealcat1,
+            ell.slope.mealcat2,
+            ell.slope.mealcat3)
  ell.slope.mealcat1 ell.slope.mealcat2 ell.slope.mealcat3
1          -3.444691          -1.714707          -1.125645
> 

Regression with two Catogorical IVs

rs.07

##########################################
# 카테고리 iv 2개
##########################################
m.yrrndmealcat <- lm(api00~yr_rnd+mealcat, data=df)
summary(m.yrrndmealcat)

# 해석. 
coefs <- summary(m.yrrndmealcat)$coefficients
coefs
br.to46 <- coefs[1]
br.to80 <- coefs[1]+coefs[3] 
br.to100 <- coefs[1]+coefs[4]
nobr.to46 <- coefs[1]+coefs[2]
nobr.to80 <- coefs[1]+coefs[2]+coefs[3]
nobr.to100 <- coefs[1]+coefs[2]+coefs[4]
cat(br.to46, br.to80, br.to100)
cat(nobr.to46, nobr.to80, nobr.to100)

# 해석. interaction
m.yrrndxmealcat <- lm(api00~yr_rnd*mealcat, data=df)
summary(m.yrrndxmealcat)

coefs <- summary(m.yrrndxmealcat)$coefficients
coefs
br.to46 <- coefs[1]
br.to80 <- coefs[1]+coefs[3] 
br.to100 <- coefs[1]+coefs[4]
nobr.to46 <- coefs[1]+coefs[2]
nobr.to80 <- coefs[1]+coefs[2]+coefs[3]+coefs[5]
nobr.to100 <- coefs[1]+coefs[2]+coefs[4]+coefs[6]
cat(br.to46, br.to80, br.to100)
cat(nobr.to46, nobr.to80, nobr.to100)

ro.07

> ##########################################
> # 카테고리 iv 2개
> ##########################################
> m.yrrndmealcat <- lm(api00~yr_rnd+mealcat, data=df)
> summary(m.yrrndmealcat)

Call:
lm(formula = api00 ~ yr_rnd + mealcat, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-215.32  -49.50    1.65   49.17  183.63 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   808.013      6.040 133.777  < 2e-16 ***
yr_rndnobr    -42.960      9.362  -4.589 5.99e-06 ***
mealcatto80  -163.737      8.515 -19.229  < 2e-16 ***
mealcatto100 -281.683      9.446 -29.821  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 68.89 on 396 degrees of freedom
Multiple R-squared:  0.7672,	Adjusted R-squared:  0.7654 
F-statistic:   435 on 3 and 396 DF,  p-value: < 2.2e-16

> 
> # 해석. 
> coefs <- summary(m.yrrndmealcat)$coefficients
> coefs
               Estimate Std. Error    t value      Pr(>|t|)
(Intercept)   808.01313   6.039984 133.777362  0.000000e+00
yr_rndnobr    -42.96006   9.361761  -4.588886  5.990293e-06
mealcatto80  -163.73737   8.515015 -19.229252  1.128686e-58
mealcatto100 -281.68318   9.445676 -29.821388 2.767252e-103
> br.to46 <- coefs[1]
> br.to80 <- coefs[1]+coefs[3] 
> br.to100 <- coefs[1]+coefs[4]
> nobr.to46 <- coefs[1]+coefs[2]
> nobr.to80 <- coefs[1]+coefs[2]+coefs[3]
> nobr.to100 <- coefs[1]+coefs[2]+coefs[4]
> cat(br.to46, br.to80, br.to100)
808.0131 644.2758 526.33
> cat(nobr.to46, nobr.to80, nobr.to100)
765.0531 601.3157 483.3699> 

예측식은 아래와 같다.

y hat = 808.013 + -42.960*(yr_rndno_break) + -163.737(mealcat47-80) + -281.683(mealcat81-100)

yr_rnd:
break = 방학있음 
no_break = 방학없음

mealcat: 
0-46% free meals
47-80%
81-100%

이에 대한 해석은 각각의 독립변인의 종류 수인 2개와 3개를 곱한 6개의 경우로 나누어서 생각할 수 있다. 즉,
y hat = 808.013 + -42.960*(yr_rndno_break) + -163.737(mealcat47-80) + -281.683(mealcat81-100)
을 바탕으로 각각의 조건을 고려하여 y hat를 계산하면 아래와 같다.


TABLE. Two dummy variables

mealcat0-46 mealcat47-80 mealcat81-100
yr_rndbreak yr_rndbreak = 1
yr_rndno_break = 0
mealcat0-46 = 1
mealcat47-80 = 0
mealcat81-100 = 0 경우
y hat = 808.013
yr_rndbreak = 1
yr_rndno_break = 0
mealcat0-46 = 0
mealcat47-80 = 1
mealcat81-100 = 0 경우
y hat = 808.013 - 163.737 = 644.276
yr_rndbreak = 1
yr_rndno_break = 0
mealcat0-46 = 0
mealcat47-80 = 0
mealcat81-100 = 1 경우
y hat = 808.013 - 281.683 = 526.33
yr_rndno_break yr_rndbreak = 0
yr_rndno_break = 1
mealcat0-46 = 1
mealcat47-80 = 0
mealcat81-100 = 0 경우
y hat = 808.013 - 42.960 = 765.053
yr_rndbreak = 0
yr_rndno_break = 1
mealcat0-46 = 0
mealcat47-80 = 1
mealcat81-100 = 0 경우
y hat = 808.013 - 42.960 - 163.737 = 601.316
yr_rndbreak = 0
yr_rndno_break = 1
mealcat0-46 = 0
mealcat47-80 = 0
mealcat81-100 = 1 경우
y hat = 808.013 - 42.960 - 281.683 = 483.37
> # 해석. interaction
> m.yrrndxmealcat <- lm(api00~yr_rnd*mealcat, data=df)
> summary(m.yrrndxmealcat)

Call:
lm(formula = api00 ~ yr_rnd * mealcat, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-207.533  -50.764   -1.843   48.874  179.000 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)              809.685      6.185 130.911  < 2e-16 ***
yr_rndnobr               -74.257     26.756  -2.775  0.00578 ** 
mealcatto80             -164.412      8.877 -18.522  < 2e-16 ***
mealcatto100            -288.193     10.443 -27.597  < 2e-16 ***
yr_rndnobr:mealcatto80    22.517     32.752   0.687  0.49217    
yr_rndnobr:mealcatto100   40.764     29.231   1.395  0.16394    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 68.87 on 394 degrees of freedom
Multiple R-squared:  0.7685,	Adjusted R-squared:  0.7656 
F-statistic: 261.6 on 5 and 394 DF,  p-value: < 2.2e-16

> 
> coefs <- summary(m.yrrndxmealcat)$coefficients
> coefs
                          Estimate Std. Error     t value     Pr(>|t|)
(Intercept)              809.68548   6.184993 130.9113019 0.000000e+00
yr_rndnobr               -74.25691  26.756287  -2.7753071 5.777728e-03
mealcatto80             -164.41198   8.876767 -18.5216067 1.529685e-55
mealcatto100            -288.19295  10.442837 -27.5971894 4.297367e-94
yr_rndnobr:mealcatto80    22.51674  32.751732   0.6874977 4.921736e-01
yr_rndnobr:mealcatto100   40.76438  29.231183   1.3945510 1.639371e-01
> br.to46 <- coefs[1]
> br.to80 <- coefs[1]+coefs[3] 
> br.to100 <- coefs[1]+coefs[4]
> nobr.to46 <- coefs[1]+coefs[2]
> nobr.to80 <- coefs[1]+coefs[2]+coefs[3]+coefs[5]
> nobr.to100 <- coefs[1]+coefs[2]+coefs[4]+coefs[6]
> cat(br.to46, br.to80, br.to100)
809.6855 645.2735 521.4925> cat(nobr.to46, nobr.to80, nobr.to100)
735.4286 593.5333 488
> 

위의 테스트는 두 개의 독립변인이 모두 종류이고 종속변인이 숫자일 때의 조건을 만족하니 factorial anova를 해도 된다. 아래는 그 결과이다.

> mod4 <- lm(api00 ~ yr_rnd + mealcat + yr_rnd:mealcat, data=datavar) 
> summary(mod4)

Call:
lm(formula = api00 ~ yr_rnd + mealcat + yr_rnd:mealcat, data = datavar)

Residuals:
     Min       1Q   Median       3Q      Max 
-207.533  -50.764   -1.843   48.874  179.000 

Coefficients:
                             Estimate Std. Error t value Pr(>|t|)    
(Intercept)                   809.685      6.185 130.911  < 2e-16 ***
yr_rndno_break                -74.257     26.756  -2.775  0.00578 ** 
mealcat47-80                 -164.412      8.877 -18.522  < 2e-16 ***
mealcat81-100                -288.193     10.443 -27.597  < 2e-16 ***
yr_rndno_break:mealcat47-80    22.517     32.752   0.687  0.49217    
yr_rndno_break:mealcat81-100   40.764     29.231   1.395  0.16394    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 68.87 on 394 degrees of freedom
Multiple R-squared:  0.7685,	Adjusted R-squared:  0.7656 
F-statistic: 261.6 on 5 and 394 DF,  p-value: < 2.2e-16
Coefficients:
                             Estimate Std. Error t value Pr(>|t|)    
(Intercept)                   809.685      6.185 130.911  < 2e-16 ***
yr_rndno_break                -74.257     26.756  -2.775  0.00578 ** 
mealcat47-80                 -164.412      8.877 -18.522  < 2e-16 ***
mealcat81-100                -288.193     10.443 -27.597  < 2e-16 ***
yr_rndno_break:mealcat47-80    22.517     32.752   0.687  0.49217    
yr_rndno_break:mealcat81-100   40.764     29.231   1.395  0.16394    
---

이전 식
y hat = 808.013 + -42.960*(yr_rndno_break) + -163.737(mealcat47-80) + -281.683(mealcat81-100)
위의 식
y hat = 809.685 +  -74.257*(yr_rndno_break) + 
                  -164.412*(mealcat47-80) +  
                  -288.193*(mealcat81-100) +
                   22.517*(yr_rndno_break:mealcat47-80) +   --> aaaaa case
                   40.764*(yr_rndno_break:mealcat81-100)    --> bbbbb case

yr_rnd:
break = 방학있음 
no_break = 방학없음

mealcat: 
0-46% free meals
47-80%
81-100%
mealcat0-46 mealcat47-80 mealcat81-100
yr_rndbreak 베이스라인
yr_rndno_break = 0
mealcat47-80 = 0
mealcat81-100 = 0 경우
y hat = 809.685
yr_rndno_break = 0
mealcat0-46 = 0
mealcat81-100 = 0 경우
y hat = 809.685 -
163.737
= 645.9


yr_rndno_break = 0
mealcat0-46 = 0
mealcat47-80 = 0 경우
y hat = 809.685 -
281.683
= 528

yr_rndno_break
yr_rndbreak = 0
mealcat47-80 = 0
mealcat81-100 = 0 경우
y hat = 809.685 -
74.257
= 735.4

aaaaa
yr_rndbreak = 0
mealcat0-46 = 0
mealcat81-100 = 0 경우
y hat = 809.685 -
74.257 -
164.412 +
22.517
= 593.5

bbbbb
yr_rndbreak = 0
mealcat0-46 = 0
mealcat47-80 = 0 경우
y hat = 809.685 -
74.257 -
288.193 +
40.764
= 488

마지막 두 케이스를 보면 no_break학교 중에서 밀카테고리 2와 3에서 떨어지는 정도가 어느 정도 완화되는 경향을 보이지만 통계학적으로 significant하지는 않다.

using_dummy_variables.1781048379.txt.gz · Last modified: by hkimscil

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki