see https://github.com/hkimscil/ms/blob/main/anova.R
(온도조건)x1 | 50.5 | 52.1 | 51.9 | 52.4 | 50.6 | 51.4 | 51.2 | 52.2 | 51.5 | 50.8 |
(온도조건)x2 | 47.5 | 47.7 | 46.6 | 47.1 | 47.2 | 47.8 | 45.2 | 47.4 | 45.0 | 47.9 |
(온도조건)x3 | 46.0 | 47.1 | 45.6 | 47.1 | 47.2 | 46.4 | 45.9 | 47.1 | 44.9 | 46.2 |
x1 <- c(50.5, 52.1, 51.9, 52.4, 50.6, 51.4, 51.2, 52.2, 51.5, 50.8) x2 <- c(47.5, 47.7, 46.6, 47.1, 47.2, 47.8, 45.2, 47.4, 45.0, 47.9) x3 <- c(46.0, 47.1, 45.6, 47.1, 47.2, 46.4, 45.9, 47.1, 44.9, 46.2) x1 x2 x3
> xc <- data.frame(x1,x2,x3) > xc x1 x2 x3 1 50.5 47.5 46.0 2 52.1 47.7 47.1 3 51.9 46.6 45.6 4 52.4 47.1 47.1 5 50.6 47.2 47.2 6 51.4 47.8 46.4 7 51.2 45.2 45.9 8 52.2 47.4 47.1 9 51.5 45.0 44.9 10 50.8 47.9 46.2 > xs <- stack(xc) > xs values ind 1 50.5 x1 2 52.1 x1 3 51.9 x1 4 52.4 x1 5 50.6 x1 6 51.4 x1 7 51.2 x1 8 52.2 x1 9 51.5 x1 10 50.8 x1 11 47.5 x2 12 47.7 x2 13 46.6 x2 14 47.1 x2 15 47.2 x2 16 47.8 x2 17 45.2 x2 18 47.4 x2 19 45.0 x2 20 47.9 x2 21 46.0 x3 22 47.1 x3 23 45.6 x3 24 47.1 x3 25 47.2 x3 26 46.4 x3 27 45.9 x3 28 47.1 x3 29 44.9 x3 30 46.2 x3
> aggregate(values~ind, data=xs, mean) ind values 1 x1 51.46 2 x2 46.94 3 x3 46.35 # or use, tapply function as shown the below
변인 이름과 levels 정리
변인이름
temp의 level 정리
colnames(xs) <- c("score", "temp") xs levels(xs$temp) <- c("low", "mid", "hi") xs
> colnames(xs) <- c("score", "temp") > xs score temp 1 50.5 x1 2 52.1 x1 3 51.9 x1 4 52.4 x1 5 50.6 x1 6 51.4 x1 7 51.2 x1 8 52.2 x1 9 51.5 x1 10 50.8 x1 11 47.5 x2 12 47.7 x2 13 46.6 x2 14 47.1 x2 15 47.2 x2 16 47.8 x2 17 45.2 x2 18 47.4 x2 19 45.0 x2 20 47.9 x2 21 46.0 x3 22 47.1 x3 23 45.6 x3 24 47.1 x3 25 47.2 x3 26 46.4 x3 27 45.9 x3 28 47.1 x3 29 44.9 x3 30 46.2 x3 > str(xs) 'data.frame': 30 obs. of 2 variables: $ score: num 50.5 52.1 51.9 52.4 50.6 51.4 51.2 52.2 51.5 50.8 ... $ temp : Factor w/ 3 levels "x1","x2","x3": 1 1 1 1 1 1 1 1 1 1 ... > levels(xs$temp) <- c("low", "mid", "hi") > xs score temp 1 50.5 low 2 52.1 low 3 51.9 low 4 52.4 low 5 50.6 low 6 51.4 low 7 51.2 low 8 52.2 low 9 51.5 low 10 50.8 low 11 47.5 mid 12 47.7 mid 13 46.6 mid 14 47.1 mid 15 47.2 mid 16 47.8 mid 17 45.2 mid 18 47.4 mid 19 45.0 mid 20 47.9 mid 21 46.0 hi 22 47.1 hi 23 45.6 hi 24 47.1 hi 25 47.2 hi 26 46.4 hi 27 45.9 hi 28 47.1 hi 29 44.9 hi 30 46.2 hi
mean.by.group.xs <- tapply(xs$score, xs$temp, mean) var.by.group.xs <- tapply(xs$score, xs$temp, var) n.by.group.xs <- tapply(xs$score, xs$temp, length) df.by.group.xs <- n.xs-1 mean.by.group.xs var.by.group.xs n.by.group.xs df.by.group.xs
> mean.by.group.xs <- tapply(xs$score, xs$temp, mean) > var.by.group.xs <- tapply(xs$score, xs$temp, var) > n.by.group.xs <- tapply(xs$score, xs$temp, length) > df.by.group.xs <- n.xs-1 > ss.within <- sum(var.by.group.xs * df.by.group.xs) > > mean.by.group.xs low mid hi 51.46 46.94 46.35 > var.by.group.xs low mid hi 0.4671111 1.0848889 0.6027778 > n.by.group.xs low mid hi 10 10 10 > df.by.group.xs low mid hi 9 9 9 >
mean.xs <- mean(xs$score) n.total <- length(xs$score) df.total <- n.total-1 n.group.xs <- 3 df.between <- n.group.xs -1 df.within <- sum(df.by.group.xs) n.total df.total df.between df.within ss.total <- var(xs$score) * (length(xs$score)-1) ss.total <- var(xs$score) * df.total ss.between <- sum(n.by.group.xs * (mean.by.group.xs - mean.xs)^2) ss.within <- sum(var.by.group.xs * df.by.group.xs) ss.total ss.between ss.within ss.total ss.between + ss.within ms.between <- ss.between/df.between ms.within <- ss.within/df.within ms.total <- ss.total/df.total ms.total ms.between ms.within f.calculated <- ms.between/ms.within f.calculated var(xs$score)
> mean.xs <- mean(xs$score) > n.total <- length(xs$score) > df.total <- n.total-1 > n.group.xs <- 3 > df.between <- n.group.xs -1 > df.within <- sum(df.by.group.xs) > n.total [1] 30 > df.total [1] 29 > df.between [1] 2 > df.within [1] 27 > > ss.total <- var(xs$score) * (length(xs$score)-1) > ss.total <- var(xs$score) * df.total > ss.between <- sum(n.by.group.xs * (mean.by.group.xs - mean.xs)^2) > ss.within <- sum(var.by.group.xs * df.by.group.xs) > ss.total [1] 175.695 > ss.between [1] 156.302 > ss.within [1] 19.393 > ss.total [1] 175.695 > ss.between + ss.within [1] 175.695 > > ms.between <- ss.between/df.between > ms.within <- ss.within/df.within > ms.total <- ss.total/df.total > > ms.total [1] 6.058448 > ms.between [1] 78.151 > ms.within [1] 0.7182593 > > f.calculated <- ms.between/ms.within > f.calculated [1] 108.8061 > var(xs$score) [1] 6.058448 >
x.mod <- aov(score~temp, data=xs) x.mod summary(x.mod) TukeyHSD(x.mod)
> x.mod <- aov(score~temp, data=xs) > x.mod Call: aov(formula = score ~ temp, data = xs) Terms: temp Residuals Sum of Squares 156.302 19.393 Deg. of Freedom 2 27 Residual standard error: 0.8475018 Estimated effects may be unbalanced > summary(x.mod) Df Sum Sq Mean Sq F value Pr(>F) temp 2 156.30 78.15 108.8 1.2e-13 *** Residuals 27 19.39 0.72 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > > TukeyHSD(x.mod) Tukey multiple comparisons of means 95% family-wise confidence level Fit: aov(formula = score ~ temp, data = xs) $temp diff lwr upr p adj mid-low -4.52 -5.459735 -3.5802652 0.0000000 hi-low -5.11 -6.049735 -4.1702652 0.0000000 hi-mid -0.59 -1.529735 0.3497348 0.2813795 >
> data(InsectSprays) > str(InsectSprays) 'data.frame': 72 obs. of 2 variables: $ count: num 10 7 20 14 14 12 10 23 17 20 ... $ spray: Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ... > summary(InsectSprays) count spray Min. : 0.00 A:12 1st Qu.: 3.00 B:12 Median : 7.00 C:12 Mean : 9.50 D:12 3rd Qu.:14.25 E:12 Max. :26.00 F:12
> attach(InsectSprays) > tapply(count, spray, mean) A B C D E F 14.500000 15.333333 2.083333 4.916667 3.500000 16.666667 > tapply(count, spray, var) A B C D E F 22.272727 18.242424 3.901515 6.265152 3.000000 38.606061 > tapply(count, spray, length) # okay, as there are no missing values A B C D E F 12 12 12 12 12 12
> boxplot(count ~ spray) # boxplot(count~spray, data=InsectSprays) if not attached
> oneway.test(count ~ spray) # oneway.test(count~spray, data=InsectSprays) if not attached One-way analysis of means (not assuming equal variances) data: count and spray F = 36.0654, num df = 5.000, denom df = 30.043, p-value = 8e-12
By default, oneway.test() applies a Welch-like correction for nonhomogeneity. To turn off it, set the “var.equal=” option to TRUE.
ANOVA 그룹간 homogeneity를 요구하는데, oneway.test는 이를 보정하여 F값을 구한다.
> aov.out = aov(count ~ spray, data=InsectSprays) > summary(aov.out) Df Sum Sq Mean Sq F value Pr(>F) spray 5 2668.83 533.77 34.702 <2.2e-16 *** Residuals 66 1015.17 15.38 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
if we use var.eq=TRUE option, the result would be the same.
> oneway.test(count ~ spray, data=InsectSprays, var.eq=T) One-way analysis of means data: count and spray F = 34.7023, num df = 5, denom df = 66, p-value < 2.2e-16
> pairwise.t.test(InsectSprays$count, InsectSprays$spray) Pairwise comparisons using t tests with pooled SD data: InsectSprays$count and InsectSprays$spray A B C D E B 1.00 - - - - C 8.7e-10 1.2e-10 - - - D 6.9e-07 9.7e-08 0.49 - - E 2.5e-08 3.6e-09 1.00 1.00 - F 0.90 1.00 4.2e-12 4.0e-09 1.4e-10 P value adjustment method: holm >
> TukeyHSD(aov.out) Tukey multiple comparisons of means 95% family-wise confidence level Fit: aov(formula = count ~ spray, data = InsectSprays) $spray diff lwr upr p adj B-A 0.8333333 -3.866075 5.532742 0.9951810 C-A -12.4166667 -17.116075 -7.717258 0.0000000 D-A -9.5833333 -14.282742 -4.883925 0.0000014 E-A -11.0000000 -15.699409 -6.300591 0.0000000 F-A 2.1666667 -2.532742 6.866075 0.7542147 C-B -13.2500000 -17.949409 -8.550591 0.0000000 D-B -10.4166667 -15.116075 -5.717258 0.0000002 E-B -11.8333333 -16.532742 -7.133925 0.0000000 F-B 1.3333333 -3.366075 6.032742 0.9603075 D-C 2.8333333 -1.866075 7.532742 0.4920707 E-C 1.4166667 -3.282742 6.116075 0.9488669 F-C 14.5833333 9.883925 19.282742 0.0000000 E-D -1.4166667 -6.116075 3.282742 0.9488669 F-D 11.7500000 7.050591 16.449409 0.0000000 F-E 13.1666667 8.467258 17.866075 0.0000000
약의 종류에 따른 효과의 차이
Drug A 4 5 4 3 2 4 3 4 4 Drug B 6 8 4 5 4 6 5 8 6 Drug C 6 7 6 6 7 5 6 5 5
A <- c(4, 5, 4, 3, 2, 4, 3, 4, 4) B <- c(6, 8, 4, 5, 4, 6, 5, 8, 6) C <- c(6, 7, 6, 6, 7, 5, 6, 5, 5)
download dataset_anova_comparisonMethods.csv data file
Treatment StressReduction mental 2 mental 2 mental 3 mental 4 mental 4 mental 5 mental 3 mental 4 mental 4 mental 4 physical 4 physical 4 physical 3 physical 5 physical 4 physical 1 physical 1 physical 2 physical 3 physical 3 medical 1 medical 2 medical 2 medical 2 medical 3 medical 2 medical 3 medical 1 medical 3 medical 1
dd <- read.csv("http://commres.net/wiki/_media/dataset_anova_comparisonmethods.csv", sep=",")
In class exercise
위의 데이터를 이용하여 아래를 수행하시오.
1. F test 가설을 세우시오
2. 각 그룹의 평균과 표준편차 그리고 분산값을 구하시오.
3. F test를 하시오
4. 그룹간 비교를 하시오 (post hoc)
5. 분석결과를 정리하여 설명하시오.