====== Oneway ANOVA ====== ===== data ===== see https://github.com/hkimscil/ms/blob/main/anova.R | (온도조건)x1 | 50.5 | 52.1 | 51.9 | 52.4 | 50.6 | 51.4 | 51.2 | 52.2 | 51.5 | 50.8 | | (온도조건)x2 | 47.5 | 47.7 | 46.6 | 47.1 | 47.2 | 47.8 | 45.2 | 47.4 | 45.0 | 47.9 | | (온도조건)x3 | 46.0 | 47.1 | 45.6 | 47.1 | 47.2 | 46.4 | 45.9 | 47.1 | 44.9 | 46.2 | x1 <- c(50.5, 52.1, 51.9, 52.4, 50.6, 51.4, 51.2, 52.2, 51.5, 50.8) x2 <- c(47.5, 47.7, 46.6, 47.1, 47.2, 47.8, 45.2, 47.4, 45.0, 47.9) x3 <- c(46.0, 47.1, 45.6, 47.1, 47.2, 46.4, 45.9, 47.1, 44.9, 46.2) x1 x2 x3 > xc <- data.frame(x1,x2,x3) > xc x1 x2 x3 1 50.5 47.5 46.0 2 52.1 47.7 47.1 3 51.9 46.6 45.6 4 52.4 47.1 47.1 5 50.6 47.2 47.2 6 51.4 47.8 46.4 7 51.2 45.2 45.9 8 52.2 47.4 47.1 9 51.5 45.0 44.9 10 50.8 47.9 46.2 > xs <- stack(xc) > xs values ind 1 50.5 x1 2 52.1 x1 3 51.9 x1 4 52.4 x1 5 50.6 x1 6 51.4 x1 7 51.2 x1 8 52.2 x1 9 51.5 x1 10 50.8 x1 11 47.5 x2 12 47.7 x2 13 46.6 x2 14 47.1 x2 15 47.2 x2 16 47.8 x2 17 45.2 x2 18 47.4 x2 19 45.0 x2 20 47.9 x2 21 46.0 x3 22 47.1 x3 23 45.6 x3 24 47.1 x3 25 47.2 x3 26 46.4 x3 27 45.9 x3 28 47.1 x3 29 44.9 x3 30 46.2 x3 > aggregate(values~ind, data=xs, mean) ind values 1 x1 51.46 2 x2 46.94 3 x3 46.35 # or use, tapply function as shown the below 변인 이름과 levels 정리 변인이름 * score <- 점수 (performance) * temp <- 교실의 온도 * 즉 교실의 온도에 따라서 점수가 다르게 나옴 temp의 level 정리 * x1 - low 낮은온도 * x2 - mid 중간온도 * x3 - hi 높은온도 colnames(xs) <- c("score", "temp") xs levels(xs$temp) <- c("low", "mid", "hi") xs > colnames(xs) <- c("score", "temp") > xs score temp 1 50.5 x1 2 52.1 x1 3 51.9 x1 4 52.4 x1 5 50.6 x1 6 51.4 x1 7 51.2 x1 8 52.2 x1 9 51.5 x1 10 50.8 x1 11 47.5 x2 12 47.7 x2 13 46.6 x2 14 47.1 x2 15 47.2 x2 16 47.8 x2 17 45.2 x2 18 47.4 x2 19 45.0 x2 20 47.9 x2 21 46.0 x3 22 47.1 x3 23 45.6 x3 24 47.1 x3 25 47.2 x3 26 46.4 x3 27 45.9 x3 28 47.1 x3 29 44.9 x3 30 46.2 x3 > str(xs) 'data.frame': 30 obs. of 2 variables: $ score: num 50.5 52.1 51.9 52.4 50.6 51.4 51.2 52.2 51.5 50.8 ... $ temp : Factor w/ 3 levels "x1","x2","x3": 1 1 1 1 1 1 1 1 1 1 ... > levels(xs$temp) <- c("low", "mid", "hi") > xs score temp 1 50.5 low 2 52.1 low 3 51.9 low 4 52.4 low 5 50.6 low 6 51.4 low 7 51.2 low 8 52.2 low 9 51.5 low 10 50.8 low 11 47.5 mid 12 47.7 mid 13 46.6 mid 14 47.1 mid 15 47.2 mid 16 47.8 mid 17 45.2 mid 18 47.4 mid 19 45.0 mid 20 47.9 mid 21 46.0 hi 22 47.1 hi 23 45.6 hi 24 47.1 hi 25 47.2 hi 26 46.4 hi 27 45.9 hi 28 47.1 hi 29 44.9 hi 30 46.2 hi ===== ANOVA by hand ===== mean.by.group.xs <- tapply(xs$score, xs$temp, mean) var.by.group.xs <- tapply(xs$score, xs$temp, var) n.by.group.xs <- tapply(xs$score, xs$temp, length) df.by.group.xs <- n.xs-1 mean.by.group.xs var.by.group.xs n.by.group.xs df.by.group.xs > mean.by.group.xs <- tapply(xs$score, xs$temp, mean) > var.by.group.xs <- tapply(xs$score, xs$temp, var) > n.by.group.xs <- tapply(xs$score, xs$temp, length) > df.by.group.xs <- n.xs-1 > ss.within <- sum(var.by.group.xs * df.by.group.xs) > > mean.by.group.xs low mid hi 51.46 46.94 46.35 > var.by.group.xs low mid hi 0.4671111 1.0848889 0.6027778 > n.by.group.xs low mid hi 10 10 10 > df.by.group.xs low mid hi 9 9 9 > mean.xs <- mean(xs$score) n.total <- length(xs$score) df.total <- n.total-1 n.group.xs <- 3 df.between <- n.group.xs -1 df.within <- sum(df.by.group.xs) n.total df.total df.between df.within ss.total <- var(xs$score) * (length(xs$score)-1) ss.total <- var(xs$score) * df.total ss.between <- sum(n.by.group.xs * (mean.by.group.xs - mean.xs)^2) ss.within <- sum(var.by.group.xs * df.by.group.xs) ss.total ss.between ss.within ss.total ss.between + ss.within ms.between <- ss.between/df.between ms.within <- ss.within/df.within ms.total <- ss.total/df.total ms.total ms.between ms.within f.calculated <- ms.between/ms.within f.calculated var(xs$score) > mean.xs <- mean(xs$score) > n.total <- length(xs$score) > df.total <- n.total-1 > n.group.xs <- 3 > df.between <- n.group.xs -1 > df.within <- sum(df.by.group.xs) > n.total [1] 30 > df.total [1] 29 > df.between [1] 2 > df.within [1] 27 > > ss.total <- var(xs$score) * (length(xs$score)-1) > ss.total <- var(xs$score) * df.total > ss.between <- sum(n.by.group.xs * (mean.by.group.xs - mean.xs)^2) > ss.within <- sum(var.by.group.xs * df.by.group.xs) > ss.total [1] 175.695 > ss.between [1] 156.302 > ss.within [1] 19.393 > ss.total [1] 175.695 > ss.between + ss.within [1] 175.695 > > ms.between <- ss.between/df.between > ms.within <- ss.within/df.within > ms.total <- ss.total/df.total > > ms.total [1] 6.058448 > ms.between [1] 78.151 > ms.within [1] 0.7182593 > > f.calculated <- ms.between/ms.within > f.calculated [1] 108.8061 > var(xs$score) [1] 6.058448 > ===== ANOVA function (aov) ===== x.mod <- aov(score~temp, data=xs) x.mod summary(x.mod) TukeyHSD(x.mod) > x.mod <- aov(score~temp, data=xs) > x.mod Call: aov(formula = score ~ temp, data = xs) Terms: temp Residuals Sum of Squares 156.302 19.393 Deg. of Freedom 2 27 Residual standard error: 0.8475018 Estimated effects may be unbalanced > summary(x.mod) Df Sum Sq Mean Sq F value Pr(>F) temp 2 156.30 78.15 108.8 1.2e-13 *** Residuals 27 19.39 0.72 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > > TukeyHSD(x.mod) Tukey multiple comparisons of means 95% family-wise confidence level Fit: aov(formula = score ~ temp, data = xs) $temp diff lwr upr p adj mid-low -4.52 -5.459735 -3.5802652 0.0000000 hi-low -5.11 -6.049735 -4.1702652 0.0000000 hi-mid -0.59 -1.529735 0.3497348 0.2813795 > ====== E.g. 1 ====== > data(InsectSprays) > str(InsectSprays) 'data.frame': 72 obs. of 2 variables: $ count: num 10 7 20 14 14 12 10 23 17 20 ... $ spray: Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ... > summary(InsectSprays) count spray Min. : 0.00 A:12 1st Qu.: 3.00 B:12 Median : 7.00 C:12 Mean : 9.50 D:12 3rd Qu.:14.25 E:12 Max. :26.00 F:12 > attach(InsectSprays) > tapply(count, spray, mean) A B C D E F 14.500000 15.333333 2.083333 4.916667 3.500000 16.666667 > tapply(count, spray, var) A B C D E F 22.272727 18.242424 3.901515 6.265152 3.000000 38.606061 > tapply(count, spray, length) # okay, as there are no missing values A B C D E F 12 12 12 12 12 12 > boxplot(count ~ spray) # boxplot(count~spray, data=InsectSprays) if not attached > oneway.test(count ~ spray) # oneway.test(count~spray, data=InsectSprays) if not attached One-way analysis of means (not assuming equal variances) data: count and spray F = 36.0654, num df = 5.000, denom df = 30.043, p-value = 8e-12 By default, oneway.test() applies a **Welch-like correction for nonhomogeneity**. To turn off it, set the "var.equal=" option to TRUE. ANOVA 그룹간 homogeneity를 요구하는데, oneway.test는 이를 보정하여 F값을 구한다. > aov.out = aov(count ~ spray, data=InsectSprays) > summary(aov.out) Df Sum Sq Mean Sq F value Pr(>F) spray 5 2668.83 533.77 34.702 <2.2e-16 *** Residuals 66 1015.17 15.38 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 if we use var.eq=TRUE option, the result would be the same. > oneway.test(count ~ spray, data=InsectSprays, var.eq=T) One-way analysis of means data: count and spray F = 34.7023, num df = 5, denom df = 66, p-value < 2.2e-16 > pairwise.t.test(InsectSprays$count, InsectSprays$spray) Pairwise comparisons using t tests with pooled SD data: InsectSprays$count and InsectSprays$spray A B C D E B 1.00 - - - - C 8.7e-10 1.2e-10 - - - D 6.9e-07 9.7e-08 0.49 - - E 2.5e-08 3.6e-09 1.00 1.00 - F 0.90 1.00 4.2e-12 4.0e-09 1.4e-10 P value adjustment method: holm > > TukeyHSD(aov.out) Tukey multiple comparisons of means 95% family-wise confidence level Fit: aov(formula = count ~ spray, data = InsectSprays) $spray diff lwr upr p adj B-A 0.8333333 -3.866075 5.532742 0.9951810 C-A -12.4166667 -17.116075 -7.717258 0.0000000 D-A -9.5833333 -14.282742 -4.883925 0.0000014 E-A -11.0000000 -15.699409 -6.300591 0.0000000 F-A 2.1666667 -2.532742 6.866075 0.7542147 C-B -13.2500000 -17.949409 -8.550591 0.0000000 D-B -10.4166667 -15.116075 -5.717258 0.0000002 E-B -11.8333333 -16.532742 -7.133925 0.0000000 F-B 1.3333333 -3.366075 6.032742 0.9603075 D-C 2.8333333 -1.866075 7.532742 0.4920707 E-C 1.4166667 -3.282742 6.116075 0.9488669 F-C 14.5833333 9.883925 19.282742 0.0000000 E-D -1.4166667 -6.116075 3.282742 0.9488669 F-D 11.7500000 7.050591 16.449409 0.0000000 F-E 13.1666667 8.467258 17.866075 0.0000000 ====== Ex 1 ====== 약의 종류에 따른 효과의 차이 Drug A 4 5 4 3 2 4 3 4 4 Drug B 6 8 4 5 4 6 5 8 6 Drug C 6 7 6 6 7 5 6 5 5 A <- c(4, 5, 4, 3, 2, 4, 3, 4, 4) B <- c(6, 8, 4, 5, 4, 6, 5, 8, 6) C <- c(6, 7, 6, 6, 7, 5, 6, 5, 5) ====== Ex 2 ====== download {{:dataset_anova_comparisonMethods.csv|dataset_anova_comparisonMethods.csv data file}} Treatment StressReduction mental 2 mental 2 mental 3 mental 4 mental 4 mental 5 mental 3 mental 4 mental 4 mental 4 physical 4 physical 4 physical 3 physical 5 physical 4 physical 1 physical 1 physical 2 physical 3 physical 3 medical 1 medical 2 medical 2 medical 2 medical 3 medical 2 medical 3 medical 1 medical 3 medical 1 dd <- read.csv("http://commres.net/wiki/_media/dataset_anova_comparisonmethods.csv", sep=",") In class exercise 위의 데이터를 이용하여 아래를 수행하시오. 1. F test 가설을 세우시오 2. 각 그룹의 평균과 표준편차 그리고 분산값을 구하시오. 3. F test를 하시오 4. 그룹간 비교를 하시오 (post hoc) 5. 분석결과를 정리하여 설명하시오.