Oneway ANOVA

data

see https://github.com/hkimscil/ms/blob/main/anova.R

(온도조건)x1	50.5	52.1	51.9	52.4	50.6	51.4	51.2	52.2	51.5	50.8
(온도조건)x2	47.5	47.7	46.6	47.1	47.2	47.8	45.2	47.4	45.0	47.9
(온도조건)x3	46.0	47.1	45.6	47.1	47.2	46.4	45.9	47.1	44.9	46.2

x1 <- c(50.5, 52.1, 51.9, 52.4, 50.6, 51.4, 51.2, 52.2, 51.5, 50.8)
x2 <- c(47.5, 47.7, 46.6, 47.1, 47.2, 47.8, 45.2, 47.4, 45.0, 47.9)
x3 <- c(46.0, 47.1, 45.6, 47.1, 47.2, 46.4, 45.9, 47.1, 44.9, 46.2)

x1
x2
x3

> xc <- data.frame(x1,x2,x3)
> xc
     x1   x2   x3
1  50.5 47.5 46.0
2  52.1 47.7 47.1
3  51.9 46.6 45.6
4  52.4 47.1 47.1
5  50.6 47.2 47.2
6  51.4 47.8 46.4
7  51.2 45.2 45.9
8  52.2 47.4 47.1
9  51.5 45.0 44.9
10 50.8 47.9 46.2

> xs <- stack(xc)
> xs
   values ind
1    50.5  x1
2    52.1  x1
3    51.9  x1
4    52.4  x1
5    50.6  x1
6    51.4  x1
7    51.2  x1
8    52.2  x1
9    51.5  x1
10   50.8  x1
11   47.5  x2
12   47.7  x2
13   46.6  x2
14   47.1  x2
15   47.2  x2
16   47.8  x2
17   45.2  x2
18   47.4  x2
19   45.0  x2
20   47.9  x2
21   46.0  x3
22   47.1  x3
23   45.6  x3
24   47.1  x3
25   47.2  x3
26   46.4  x3
27   45.9  x3
28   47.1  x3
29   44.9  x3
30   46.2  x3

> aggregate(values~ind, data=xs, mean)

  ind values
1  x1  51.46
2  x2  46.94
3  x3  46.35

# or use, tapply function as shown the below

변인 이름과 levels 정리
변인이름

score ← 점수 (performance)
temp ← 교실의 온도
즉 교실의 온도에 따라서 점수가 다르게 나옴

temp의 level 정리

x1 - low 낮은온도
x2 - mid 중간온도
x3 - hi 높은온도

colnames(xs) <- c("score", "temp")
xs
levels(xs$temp) <- c("low", "mid", "hi")
xs

> colnames(xs) <- c("score", "temp")
> xs
   score temp
1   50.5   x1
2   52.1   x1
3   51.9   x1
4   52.4   x1
5   50.6   x1
6   51.4   x1
7   51.2   x1
8   52.2   x1
9   51.5   x1
10  50.8   x1
11  47.5   x2
12  47.7   x2
13  46.6   x2
14  47.1   x2
15  47.2   x2
16  47.8   x2
17  45.2   x2
18  47.4   x2
19  45.0   x2
20  47.9   x2
21  46.0   x3
22  47.1   x3
23  45.6   x3
24  47.1   x3
25  47.2   x3
26  46.4   x3
27  45.9   x3
28  47.1   x3
29  44.9   x3
30  46.2   x3
> str(xs)
'data.frame':	30 obs. of  2 variables:
 $ score: num  50.5 52.1 51.9 52.4 50.6 51.4 51.2 52.2 51.5 50.8 ...
 $ temp : Factor w/ 3 levels "x1","x2","x3": 1 1 1 1 1 1 1 1 1 1 ...
> levels(xs$temp) <- c("low", "mid", "hi")
> xs
   score temp
1   50.5  low
2   52.1  low
3   51.9  low
4   52.4  low
5   50.6  low
6   51.4  low
7   51.2  low
8   52.2  low
9   51.5  low
10  50.8  low
11  47.5  mid
12  47.7  mid
13  46.6  mid
14  47.1  mid
15  47.2  mid
16  47.8  mid
17  45.2  mid
18  47.4  mid
19  45.0  mid
20  47.9  mid
21  46.0   hi
22  47.1   hi
23  45.6   hi
24  47.1   hi
25  47.2   hi
26  46.4   hi
27  45.9   hi
28  47.1   hi
29  44.9   hi
30  46.2   hi

ANOVA by hand

mean.by.group.xs <- tapply(xs$score, xs$temp, mean)
var.by.group.xs <- tapply(xs$score, xs$temp, var)
n.by.group.xs <- tapply(xs$score, xs$temp, length)
df.by.group.xs <- n.xs-1

mean.by.group.xs
var.by.group.xs
n.by.group.xs
df.by.group.xs

> mean.by.group.xs <- tapply(xs$score, xs$temp, mean)
> var.by.group.xs <- tapply(xs$score, xs$temp, var)
> n.by.group.xs <- tapply(xs$score, xs$temp, length)
> df.by.group.xs <- n.xs-1
> ss.within <- sum(var.by.group.xs * df.by.group.xs)
> 
> mean.by.group.xs
  low   mid    hi 
51.46 46.94 46.35 
> var.by.group.xs
      low       mid        hi 
0.4671111 1.0848889 0.6027778 
> n.by.group.xs
low mid  hi 
 10  10  10 
> df.by.group.xs
low mid  hi 
  9   9   9 
>

mean.xs <- mean(xs$score)
n.total <- length(xs$score)
df.total <- n.total-1
n.group.xs <- 3
df.between <- n.group.xs -1
df.within <- sum(df.by.group.xs)
n.total
df.total
df.between
df.within

ss.total <- var(xs$score) * (length(xs$score)-1)
ss.total <- var(xs$score) * df.total
ss.between <- sum(n.by.group.xs * (mean.by.group.xs - mean.xs)^2)
ss.within <- sum(var.by.group.xs * df.by.group.xs)
ss.total
ss.between
ss.within
ss.total 
ss.between + ss.within

ms.between <- ss.between/df.between
ms.within <- ss.within/df.within
ms.total <- ss.total/df.total

ms.total
ms.between
ms.within

f.calculated <- ms.between/ms.within
f.calculated
var(xs$score)

> mean.xs <- mean(xs$score)
> n.total <- length(xs$score)
> df.total <- n.total-1
> n.group.xs <- 3
> df.between <- n.group.xs -1
> df.within <- sum(df.by.group.xs)
> n.total
[1] 30
> df.total
[1] 29
> df.between
[1] 2
> df.within
[1] 27
> 
> ss.total <- var(xs$score) * (length(xs$score)-1)
> ss.total <- var(xs$score) * df.total
> ss.between <- sum(n.by.group.xs * (mean.by.group.xs - mean.xs)^2)
> ss.within <- sum(var.by.group.xs * df.by.group.xs)
> ss.total
[1] 175.695
> ss.between
[1] 156.302
> ss.within
[1] 19.393
> ss.total 
[1] 175.695
> ss.between + ss.within
[1] 175.695
> 
> ms.between <- ss.between/df.between
> ms.within <- ss.within/df.within
> ms.total <- ss.total/df.total
> 
> ms.total
[1] 6.058448
> ms.between
[1] 78.151
> ms.within
[1] 0.7182593
> 
> f.calculated <- ms.between/ms.within
> f.calculated
[1] 108.8061
> var(xs$score)
[1] 6.058448
>

ANOVA function (aov)

x.mod <- aov(score~temp, data=xs)
x.mod
summary(x.mod)
TukeyHSD(x.mod)

> x.mod <- aov(score~temp, data=xs)
> x.mod
Call:
   aov(formula = score ~ temp, data = xs)

Terms:
                   temp Residuals
Sum of Squares  156.302    19.393
Deg. of Freedom       2        27

Residual standard error: 0.8475018
Estimated effects may be unbalanced
> summary(x.mod)
            Df Sum Sq Mean Sq F value  Pr(>F)    
temp         2 156.30   78.15   108.8 1.2e-13 ***
Residuals   27  19.39    0.72                    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 

> TukeyHSD(x.mod)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = score ~ temp, data = xs)

$temp
         diff       lwr        upr     p adj
mid-low -4.52 -5.459735 -3.5802652 0.0000000
hi-low  -5.11 -6.049735 -4.1702652 0.0000000
hi-mid  -0.59 -1.529735  0.3497348 0.2813795

>

E.g. 1

> data(InsectSprays)
> str(InsectSprays)
'data.frame':   72 obs. of  2 variables:
 $ count: num  10 7 20 14 14 12 10 23 17 20 ...
 $ spray: Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
 > summary(InsectSprays)
     count       spray 
 Min.   : 0.00   A:12  
 1st Qu.: 3.00   B:12  
 Median : 7.00   C:12  
 Mean   : 9.50   D:12  
 3rd Qu.:14.25   E:12  
 Max.   :26.00   F:12

> attach(InsectSprays)
> tapply(count, spray, mean)
        A         B         C         D         E         F 
14.500000 15.333333  2.083333  4.916667  3.500000 16.666667 
> tapply(count, spray, var)
        A         B         C         D         E         F 
22.272727 18.242424  3.901515  6.265152  3.000000 38.606061 
> tapply(count, spray, length)              # okay, as there are no missing values
 A  B  C  D  E  F 
12 12 12 12 12 12

> boxplot(count ~ spray)          # boxplot(count~spray, data=InsectSprays) if not attached

> oneway.test(count ~ spray)      # oneway.test(count~spray, data=InsectSprays) if not attached

        One-way analysis of means (not assuming equal variances)

data:  count and spray 
F = 36.0654, num df = 5.000, denom df = 30.043, p-value = 8e-12

By default, oneway.test() applies a Welch-like correction for nonhomogeneity. To turn off it, set the “var.equal=” option to TRUE.

ANOVA 그룹간 homogeneity를 요구하는데, oneway.test는 이를 보정하여 F값을 구한다.

> aov.out = aov(count ~ spray, data=InsectSprays)
> summary(aov.out)
            Df  Sum Sq Mean Sq F value    Pr(>F)    
spray        5 2668.83  533.77  34.702  <2.2e-16 ***
Residuals   66 1015.17   15.38                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

if we use var.eq=TRUE option, the result would be the same.

> oneway.test(count ~ spray, data=InsectSprays, var.eq=T)

	One-way analysis of means

data:  count and spray
F = 34.7023, num df = 5, denom df = 66, p-value < 2.2e-16

> pairwise.t.test(InsectSprays$count, InsectSprays$spray)

	Pairwise comparisons using t tests with pooled SD 

data:  InsectSprays$count and InsectSprays$spray 

  A       B       C       D       E      
B 1.00    -       -       -       -      
C 8.7e-10 1.2e-10 -       -       -      
D 6.9e-07 9.7e-08 0.49    -       -      
E 2.5e-08 3.6e-09 1.00    1.00    -      
F 0.90    1.00    4.2e-12 4.0e-09 1.4e-10

P value adjustment method: holm 
>

> TukeyHSD(aov.out)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = count ~ spray, data = InsectSprays)

$spray
           diff        lwr       upr     p adj
B-A   0.8333333  -3.866075  5.532742 0.9951810
C-A -12.4166667 -17.116075 -7.717258 0.0000000
D-A  -9.5833333 -14.282742 -4.883925 0.0000014
E-A -11.0000000 -15.699409 -6.300591 0.0000000
F-A   2.1666667  -2.532742  6.866075 0.7542147
C-B -13.2500000 -17.949409 -8.550591 0.0000000
D-B -10.4166667 -15.116075 -5.717258 0.0000002
E-B -11.8333333 -16.532742 -7.133925 0.0000000
F-B   1.3333333  -3.366075  6.032742 0.9603075
D-C   2.8333333  -1.866075  7.532742 0.4920707
E-C   1.4166667  -3.282742  6.116075 0.9488669
F-C  14.5833333   9.883925 19.282742 0.0000000
E-D  -1.4166667  -6.116075  3.282742 0.9488669
F-D  11.7500000   7.050591 16.449409 0.0000000
F-E  13.1666667   8.467258 17.866075 0.0000000

Ex 1

약의 종류에 따른 효과의 차이

Drug A 4 5 4 3 2 4 3 4 4
Drug B 6 8 4 5 4 6 5 8 6
Drug C 6 7 6 6 7 5 6 5 5

A <- c(4, 5, 4, 3, 2, 4, 3, 4, 4)
B <- c(6, 8, 4, 5, 4, 6, 5, 8, 6)
C <- c(6, 7, 6, 6, 7, 5, 6, 5, 5)

Ex 2

download dataset_anova_comparisonMethods.csv data file

dataset_ANOVA_OneWayComparisons.csv

Treatment	StressReduction
mental	2
mental	2
mental	3
mental	4
mental	4
mental	5
mental	3
mental	4
mental	4
mental	4
physical	4
physical	4
physical	3
physical	5
physical	4
physical	1
physical	1
physical	2
physical	3
physical	3
medical	1
medical	2
medical	2
medical	2
medical	3
medical	2
medical	3
medical	1
medical	3
medical	1

dd <- read.csv("http://commres.net/wiki/_media/dataset_anova_comparisonmethods.csv", sep=",")

In class exercise
위의 데이터를 이용하여 아래를 수행하시오.
1. F test 가설을 세우시오
2. 각 그룹의 평균과 표준편차 그리고 분산값을 구하시오.
3. F test를 하시오
4. 그룹간 비교를 하시오 (post hoc)
5. 분석결과를 정리하여 설명하시오.

Table of Contents