====== Oneway ANOVA ======
===== data =====
see https://github.com/hkimscil/ms/blob/main/anova.R
| (온도조건)x1 | 50.5 | 52.1 | 51.9 | 52.4 | 50.6 | 51.4 | 51.2 | 52.2 | 51.5 | 50.8 |
| (온도조건)x2 | 47.5 | 47.7 | 46.6 | 47.1 | 47.2 | 47.8 | 45.2 | 47.4 | 45.0 | 47.9 |
| (온도조건)x3 | 46.0 | 47.1 | 45.6 | 47.1 | 47.2 | 46.4 | 45.9 | 47.1 | 44.9 | 46.2 |
x1 <- c(50.5, 52.1, 51.9, 52.4, 50.6, 51.4, 51.2, 52.2, 51.5, 50.8)
x2 <- c(47.5, 47.7, 46.6, 47.1, 47.2, 47.8, 45.2, 47.4, 45.0, 47.9)
x3 <- c(46.0, 47.1, 45.6, 47.1, 47.2, 46.4, 45.9, 47.1, 44.9, 46.2)
x1
x2
x3
> xc <- data.frame(x1,x2,x3)
> xc
x1 x2 x3
1 50.5 47.5 46.0
2 52.1 47.7 47.1
3 51.9 46.6 45.6
4 52.4 47.1 47.1
5 50.6 47.2 47.2
6 51.4 47.8 46.4
7 51.2 45.2 45.9
8 52.2 47.4 47.1
9 51.5 45.0 44.9
10 50.8 47.9 46.2
> xs <- stack(xc)
> xs
values ind
1 50.5 x1
2 52.1 x1
3 51.9 x1
4 52.4 x1
5 50.6 x1
6 51.4 x1
7 51.2 x1
8 52.2 x1
9 51.5 x1
10 50.8 x1
11 47.5 x2
12 47.7 x2
13 46.6 x2
14 47.1 x2
15 47.2 x2
16 47.8 x2
17 45.2 x2
18 47.4 x2
19 45.0 x2
20 47.9 x2
21 46.0 x3
22 47.1 x3
23 45.6 x3
24 47.1 x3
25 47.2 x3
26 46.4 x3
27 45.9 x3
28 47.1 x3
29 44.9 x3
30 46.2 x3
> aggregate(values~ind, data=xs, mean)
ind values
1 x1 51.46
2 x2 46.94
3 x3 46.35
# or use, tapply function as shown the below
변인 이름과 levels 정리
변인이름
* score <- 점수 (performance)
* temp <- 교실의 온도
* 즉 교실의 온도에 따라서 점수가 다르게 나옴
temp의 level 정리
* x1 - low 낮은온도
* x2 - mid 중간온도
* x3 - hi 높은온도
colnames(xs) <- c("score", "temp")
xs
levels(xs$temp) <- c("low", "mid", "hi")
xs
> colnames(xs) <- c("score", "temp")
> xs
score temp
1 50.5 x1
2 52.1 x1
3 51.9 x1
4 52.4 x1
5 50.6 x1
6 51.4 x1
7 51.2 x1
8 52.2 x1
9 51.5 x1
10 50.8 x1
11 47.5 x2
12 47.7 x2
13 46.6 x2
14 47.1 x2
15 47.2 x2
16 47.8 x2
17 45.2 x2
18 47.4 x2
19 45.0 x2
20 47.9 x2
21 46.0 x3
22 47.1 x3
23 45.6 x3
24 47.1 x3
25 47.2 x3
26 46.4 x3
27 45.9 x3
28 47.1 x3
29 44.9 x3
30 46.2 x3
> str(xs)
'data.frame': 30 obs. of 2 variables:
$ score: num 50.5 52.1 51.9 52.4 50.6 51.4 51.2 52.2 51.5 50.8 ...
$ temp : Factor w/ 3 levels "x1","x2","x3": 1 1 1 1 1 1 1 1 1 1 ...
> levels(xs$temp) <- c("low", "mid", "hi")
> xs
score temp
1 50.5 low
2 52.1 low
3 51.9 low
4 52.4 low
5 50.6 low
6 51.4 low
7 51.2 low
8 52.2 low
9 51.5 low
10 50.8 low
11 47.5 mid
12 47.7 mid
13 46.6 mid
14 47.1 mid
15 47.2 mid
16 47.8 mid
17 45.2 mid
18 47.4 mid
19 45.0 mid
20 47.9 mid
21 46.0 hi
22 47.1 hi
23 45.6 hi
24 47.1 hi
25 47.2 hi
26 46.4 hi
27 45.9 hi
28 47.1 hi
29 44.9 hi
30 46.2 hi
===== ANOVA by hand =====
mean.by.group.xs <- tapply(xs$score, xs$temp, mean)
var.by.group.xs <- tapply(xs$score, xs$temp, var)
n.by.group.xs <- tapply(xs$score, xs$temp, length)
df.by.group.xs <- n.xs-1
mean.by.group.xs
var.by.group.xs
n.by.group.xs
df.by.group.xs
> mean.by.group.xs <- tapply(xs$score, xs$temp, mean)
> var.by.group.xs <- tapply(xs$score, xs$temp, var)
> n.by.group.xs <- tapply(xs$score, xs$temp, length)
> df.by.group.xs <- n.xs-1
> ss.within <- sum(var.by.group.xs * df.by.group.xs)
>
> mean.by.group.xs
low mid hi
51.46 46.94 46.35
> var.by.group.xs
low mid hi
0.4671111 1.0848889 0.6027778
> n.by.group.xs
low mid hi
10 10 10
> df.by.group.xs
low mid hi
9 9 9
>
mean.xs <- mean(xs$score)
n.total <- length(xs$score)
df.total <- n.total-1
n.group.xs <- 3
df.between <- n.group.xs -1
df.within <- sum(df.by.group.xs)
n.total
df.total
df.between
df.within
ss.total <- var(xs$score) * (length(xs$score)-1)
ss.total <- var(xs$score) * df.total
ss.between <- sum(n.by.group.xs * (mean.by.group.xs - mean.xs)^2)
ss.within <- sum(var.by.group.xs * df.by.group.xs)
ss.total
ss.between
ss.within
ss.total
ss.between + ss.within
ms.between <- ss.between/df.between
ms.within <- ss.within/df.within
ms.total <- ss.total/df.total
ms.total
ms.between
ms.within
f.calculated <- ms.between/ms.within
f.calculated
var(xs$score)
> mean.xs <- mean(xs$score)
> n.total <- length(xs$score)
> df.total <- n.total-1
> n.group.xs <- 3
> df.between <- n.group.xs -1
> df.within <- sum(df.by.group.xs)
> n.total
[1] 30
> df.total
[1] 29
> df.between
[1] 2
> df.within
[1] 27
>
> ss.total <- var(xs$score) * (length(xs$score)-1)
> ss.total <- var(xs$score) * df.total
> ss.between <- sum(n.by.group.xs * (mean.by.group.xs - mean.xs)^2)
> ss.within <- sum(var.by.group.xs * df.by.group.xs)
> ss.total
[1] 175.695
> ss.between
[1] 156.302
> ss.within
[1] 19.393
> ss.total
[1] 175.695
> ss.between + ss.within
[1] 175.695
>
> ms.between <- ss.between/df.between
> ms.within <- ss.within/df.within
> ms.total <- ss.total/df.total
>
> ms.total
[1] 6.058448
> ms.between
[1] 78.151
> ms.within
[1] 0.7182593
>
> f.calculated <- ms.between/ms.within
> f.calculated
[1] 108.8061
> var(xs$score)
[1] 6.058448
>
===== ANOVA function (aov) =====
x.mod <- aov(score~temp, data=xs)
x.mod
summary(x.mod)
TukeyHSD(x.mod)
> x.mod <- aov(score~temp, data=xs)
> x.mod
Call:
aov(formula = score ~ temp, data = xs)
Terms:
temp Residuals
Sum of Squares 156.302 19.393
Deg. of Freedom 2 27
Residual standard error: 0.8475018
Estimated effects may be unbalanced
> summary(x.mod)
Df Sum Sq Mean Sq F value Pr(>F)
temp 2 156.30 78.15 108.8 1.2e-13 ***
Residuals 27 19.39 0.72
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> TukeyHSD(x.mod)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = score ~ temp, data = xs)
$temp
diff lwr upr p adj
mid-low -4.52 -5.459735 -3.5802652 0.0000000
hi-low -5.11 -6.049735 -4.1702652 0.0000000
hi-mid -0.59 -1.529735 0.3497348 0.2813795
>
====== E.g. 1 ======
> data(InsectSprays)
> str(InsectSprays)
'data.frame': 72 obs. of 2 variables:
$ count: num 10 7 20 14 14 12 10 23 17 20 ...
$ spray: Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
> summary(InsectSprays)
count spray
Min. : 0.00 A:12
1st Qu.: 3.00 B:12
Median : 7.00 C:12
Mean : 9.50 D:12
3rd Qu.:14.25 E:12
Max. :26.00 F:12
> attach(InsectSprays)
> tapply(count, spray, mean)
A B C D E F
14.500000 15.333333 2.083333 4.916667 3.500000 16.666667
> tapply(count, spray, var)
A B C D E F
22.272727 18.242424 3.901515 6.265152 3.000000 38.606061
> tapply(count, spray, length) # okay, as there are no missing values
A B C D E F
12 12 12 12 12 12
> boxplot(count ~ spray) # boxplot(count~spray, data=InsectSprays) if not attached
> oneway.test(count ~ spray) # oneway.test(count~spray, data=InsectSprays) if not attached
One-way analysis of means (not assuming equal variances)
data: count and spray
F = 36.0654, num df = 5.000, denom df = 30.043, p-value = 8e-12
By default, oneway.test() applies a **Welch-like correction for nonhomogeneity**. To turn off it, set the "var.equal=" option to TRUE.
ANOVA 그룹간 homogeneity를 요구하는데, oneway.test는 이를 보정하여 F값을 구한다.
> aov.out = aov(count ~ spray, data=InsectSprays)
> summary(aov.out)
Df Sum Sq Mean Sq F value Pr(>F)
spray 5 2668.83 533.77 34.702 <2.2e-16 ***
Residuals 66 1015.17 15.38
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
if we use var.eq=TRUE option, the result would be the same.
> oneway.test(count ~ spray, data=InsectSprays, var.eq=T)
One-way analysis of means
data: count and spray
F = 34.7023, num df = 5, denom df = 66, p-value < 2.2e-16
> pairwise.t.test(InsectSprays$count, InsectSprays$spray)
Pairwise comparisons using t tests with pooled SD
data: InsectSprays$count and InsectSprays$spray
A B C D E
B 1.00 - - - -
C 8.7e-10 1.2e-10 - - -
D 6.9e-07 9.7e-08 0.49 - -
E 2.5e-08 3.6e-09 1.00 1.00 -
F 0.90 1.00 4.2e-12 4.0e-09 1.4e-10
P value adjustment method: holm
>
> TukeyHSD(aov.out)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = count ~ spray, data = InsectSprays)
$spray
diff lwr upr p adj
B-A 0.8333333 -3.866075 5.532742 0.9951810
C-A -12.4166667 -17.116075 -7.717258 0.0000000
D-A -9.5833333 -14.282742 -4.883925 0.0000014
E-A -11.0000000 -15.699409 -6.300591 0.0000000
F-A 2.1666667 -2.532742 6.866075 0.7542147
C-B -13.2500000 -17.949409 -8.550591 0.0000000
D-B -10.4166667 -15.116075 -5.717258 0.0000002
E-B -11.8333333 -16.532742 -7.133925 0.0000000
F-B 1.3333333 -3.366075 6.032742 0.9603075
D-C 2.8333333 -1.866075 7.532742 0.4920707
E-C 1.4166667 -3.282742 6.116075 0.9488669
F-C 14.5833333 9.883925 19.282742 0.0000000
E-D -1.4166667 -6.116075 3.282742 0.9488669
F-D 11.7500000 7.050591 16.449409 0.0000000
F-E 13.1666667 8.467258 17.866075 0.0000000
====== Ex 1 ======
약의 종류에 따른 효과의 차이
Drug A 4 5 4 3 2 4 3 4 4
Drug B 6 8 4 5 4 6 5 8 6
Drug C 6 7 6 6 7 5 6 5 5
A <- c(4, 5, 4, 3, 2, 4, 3, 4, 4)
B <- c(6, 8, 4, 5, 4, 6, 5, 8, 6)
C <- c(6, 7, 6, 6, 7, 5, 6, 5, 5)
====== Ex 2 ======
download {{:dataset_anova_comparisonMethods.csv|dataset_anova_comparisonMethods.csv data file}}
Treatment StressReduction
mental 2
mental 2
mental 3
mental 4
mental 4
mental 5
mental 3
mental 4
mental 4
mental 4
physical 4
physical 4
physical 3
physical 5
physical 4
physical 1
physical 1
physical 2
physical 3
physical 3
medical 1
medical 2
medical 2
medical 2
medical 3
medical 2
medical 3
medical 1
medical 3
medical 1
dd <- read.csv("http://commres.net/wiki/_media/dataset_anova_comparisonmethods.csv", sep=",")
In class exercise
위의 데이터를 이용하여 아래를 수행하시오.
1. F test 가설을 세우시오
2. 각 그룹의 평균과 표준편차 그리고 분산값을 구하시오.
3. F test를 하시오
4. 그룹간 비교를 하시오 (post hoc)
5. 분석결과를 정리하여 설명하시오.