Differences

This shows you the differences between two versions of the page.

--- c:ms:2025:w07_anova_note [2025/04/10 15:00] – [ANOVA in R] hkimscil
+++ c:ms:2025:w07_anova_note [2025/04/16 10:40] (current) – [R square: output] hkimscil
@@ Line 1: / Line 1: @@
 ====== ANOVA in R ======
+이 문서는 http://commres.net/wiki/r/anova 의 내용을 변형해서
+다시 만든 문서입니다. http://commres.net/wiki/r/anova 은
+세 그룹 간의 차이를 보는 것이고 이 문서는 4그룹 간의 차이를 보는
+문서입니다.
 <code>
 #
@@ Line 17: / Line 21: @@
 mean.d <- 22
-A <- rnorm2(na, mean.a, sqrt(1160/(na-1)))
+A <- rnorm2(na, mean.a, sqrt(900/(na-1)))
-B <- rnorm2(nb, mean.b, sqrt(1000/(nb-1)))
+B <- rnorm2(nb, mean.b, sqrt(900/(nb-1)))
-C <- rnorm2(nc, mean.c, sqrt(1000/(nc-1)))
+C <- rnorm2(nc, mean.c, sqrt(900/(nc-1)))
-D <- rnorm2(nd, mean.d, sqrt(1000/(nd-1)))
+D <- rnorm2(nd, mean.d, sqrt(900/(nd-1)))
 # A combined group with group A and B
@@ Line 231: / Line 235: @@
 f.calculated
 f.calculated.pvalue
+# how much IV explains the DV
+# in terms of SS?
+r.square <- ss.between / ss.total
+eta  <- r.square
+eta
+lm.res <- lm(dat$values~dat$group, data = dat)
+summary(lm.res)
+summary(a.res)
 </code>
 ===== ANOVA in R: Output =====
 <code>
+> #
+> # ANOVA test with 4 levels in IV
 > #
-> # 3 샘플 종류 추출
+> rm(list=ls())
-> # A, B, C 학년에 따라서 욕하는 정도가 달라질것이라는
+> set.seed(101)
-> # 가설
-> set.seed(201)
 > rnorm2 <- function(n,mean,sd){ mean+sd*scale(rnorm(n)) }
-> A <- rnorm2(16, 26, sqrt(600/15))
+> n <- 30
-> B <- rnorm2(16, 24, sqrt(750/15))
+> na <- n
-> C <- rnorm2(16, 19, sqrt(900/15))
+> nb <- n
+> nc <- n
+> nd <- n
+> mean.a <- 26
+> mean.b <- 25
+> mean.c <- 23
+> mean.d <- 22
 >
-> A <- c(A)
+> A <- rnorm2(na, mean.a, sqrt(900/(na-1)))
-> B <- c(B)
+> B <- rnorm2(nb, mean.b, sqrt(900/(nb-1)))
-> C <- c(C)
+> C <- rnorm2(nc, mean.c, sqrt(900/(nc-1)))
+> D <- rnorm2(nd, mean.d, sqrt(900/(nd-1)))
 >
-> # 평균구하기
+> # A combined group with group A and B
-> mean(A)
+> # We call it group total
-[1] 26
+> # we can obtain its mean, variance, ss, df, etc.
-> mean(B)
+> #
+> A
+          [,1]
+ [1,] 24.37871
+ [2,] 30.23619
+ [3,] 22.05233
+ [4,] 27.98186
+ [5,] 28.62468
+ [6,] 34.38014
+ [7,] 30.67844
+ [8,] 25.80092
+ [9,] 32.66698
+[10,] 25.06399
+[11,] 30.06274
+[12,] 21.25288
+[13,] 36.07231
+[14,] 16.77241
+[15,] 24.97448
+[16,] 25.26349
+[17,] 20.88676
+[18,] 26.94242
+[19,] 21.10069
+[20,] 12.88194
+[21,] 25.46073
+[22,] 31.27674
+[23,] 24.76580
+[24,] 16.79173
+[25,] 31.51620
+[26,] 17.14866
+[27,] 29.66682
+[28,] 25.75701
+[29,] 29.66796
+[30,] 29.87397
+attr(,"scaled:center")
+[1] -0.08287722
+attr(,"scaled:scale")
+[1] 0.8355107
+> B
+          [,1]
+ [1,] 30.62357
+ [2,] 27.27939
+ [3,] 31.23686
+ [4,] 14.50486
+ [5,] 32.22519
+ [6,] 21.82949
+ [7,] 26.67567
+ [8,] 30.76150
+ [9,] 16.68531
+[10,] 28.19891
+[11,] 28.38350
+[12,] 29.88106
+[13,] 13.16769
+[14,] 23.26793
+[15,] 19.76032
+[16,] 27.95159
+[17,] 28.85313
+[18,] 21.92882
+[19,] 24.18798
+[20,] 17.70481
+[21,] 19.51664
+[22,] 24.27280
+[23,] 28.90183
+[24,] 18.17715
+[25,] 29.83134
+[26,] 20.05465
+[27,] 26.66153
+[28,] 31.89910
+[29,] 32.13759
+[30,] 23.43977
+attr(,"scaled:center")
+[1] -0.1405676
+attr(,"scaled:scale")
+[1] 1.025799
+> C
+          [,1]
+ [1,] 21.04268
+ [2,] 14.58761
+ [3,] 18.90352
+ [4,] 23.12972
+ [5,] 24.86854
+ [6,] 24.66800
+ [7,] 18.64315
+ [8,] 23.33405
+ [9,] 22.17603
+[10,] 22.07975
+[11,] 30.96436
+[12,] 31.58129
+[13,] 28.96433
+[14,] 22.06416
+[15,] 12.30153
+[16,] 16.68289
+[17,] 24.19514
+[18,] 15.33454
+[19,] 23.27483
+[20,] 22.21340
+[21,] 32.88316
+[22,] 28.73176
+[23,] 19.63225
+[24,] 19.45001
+[25,] 12.80615
+[26,] 25.13846
+[27,] 22.52944
+[28,] 30.05695
+[29,] 26.55883
+[30,] 31.20348
+attr(,"scaled:center")
+[1] 0.08931913
+attr(,"scaled:scale")
+[1] 0.9936574
+> D
+           [,1]
+ [1,] 29.117527
+ [2,] 21.287702
+ [3,] 19.406172
+ [4,] 17.338050
+ [5,] 23.108952
+ [6,] 16.932891
+ [7,] 18.923097
+ [8,] 29.345116
+ [9,] 24.349527
+[10,] 16.795435
+[11,] 23.028628
+[12,] 18.074871
+[13,] 33.770366
+[14,] 28.238105
+[15,] 25.785121
+[16,] 20.157663
+[17,] 21.990432
+[18,]  8.910282
+[19,] 18.797986
+[20,] 31.193353
+[21,] 18.214726
+[22,] 21.215850
+[23,] 20.581063
+[24,] 30.711279
+[25,] 25.911186
+[26,] 17.041703
+[27,] 17.853327
+[28,] 16.703969
+[29,] 18.081180
+[30,] 27.134442
+attr(,"scaled:center")
+[1] 0.0894333
+attr(,"scaled:scale")
+[1] 0.9674409
+> comb <- data.frame(A, B, C, D)
+> dat <- stack(comb)
+> head(dat)
+    values ind
+24.37871   A
+30.23619   A
+22.05233   A
+27.98186   A
+28.62468   A
+34.38014   A
+> colnames(dat)[1] <- "values"
+> colnames(dat)[2] <- "group"
+> head(dat)
+    values group
+24.37871     A
+30.23619     A
+22.05233     A
+27.98186     A
+28.62468     A
+34.38014     A
+>
+> mean.total <- mean(dat$values)
+> var.total <- var(dat$values)
+> # variance를 ms라고 부르기도 한다
+> ms.total <- var.total
+>
+> df.total <- length(dat$values) - 1
+> ss.total <- var.total * df.total
+> ss.total.check <- sum(
++   (dat$values - mean(dat$values))^2
++ )
+>
+> mean.total
 [1] 24
-> mean(C)
+> var.total
-[1] 19
+[1] 32.77311
+> ms.total
+[1] 32.77311
+> df.total
+[1] 119
+> ss.total
+[1] 3900
+> ss.total.check
+[1] 3900
 >
-> # 3 샘플을 합치기
+> # Now for each group
-> # 두번재 컬럼 = group A, B, C 가 되도록
+> mean.a <- mean(A)
-> comb3 <- stack(list(a=A, b=B, c=C))
+> mean.b <- mean(B)
-> comb3
+> mean.c <- mean(C)
-      values ind
+> mean.d <- mean(D)
-  28.956987   a
+> mean.a
-  24.588298   a
+[1] 26
-  29.232040   a
+> mean.b
-  15.221730   a
+[1] 25
-  18.069720   a
+> mean.c
-  26.455426   a
+[1] 23
-  28.824505   a
+> mean.d
-  27.362726   a
+[1] 22
-  11.013442   a
-28.284603   a
-31.778215   a
-28.525725   a
-29.735641   a
-33.996331   a
-22.487061   a
-31.467550   a
-26.638215   b
-25.537955   b
-30.194374   b
-22.161849   b
-23.164369   b
-17.436099   b
-34.602104   b
-12.091427   b
-36.147829   b
-21.460025   b
-30.381254   b
-24.367170   b
-17.691419   b
-26.351577   b
-11.330276   b
-24.444055   b
-10.842903   c
-20.414861   c
-11.872108   c
-29.195858   c
-26.074787   c
-18.262033   c
-18.863621   c
-25.906536   c
-  4.379624   c
-  2.980549   c
-20.825446   c
-22.038722   c
-24.175933   c
-25.745030   c
-23.796968   c
-18.625020   c
-> colnames(comb3)[1] <- "values"
-> colnames(comb3)[2] <- "group"
-> comb3
-      values group
-  28.956987     a
-  24.588298     a
-  29.232040     a
-  15.221730     a
-  18.069720     a
-  26.455426     a
-  28.824505     a
-  27.362726     a
-  11.013442     a
-28.284603     a
-31.778215     a
-28.525725     a
-29.735641     a
-33.996331     a
-22.487061     a
-31.467550     a
-26.638215     b
- 25.537955     b
-30.194374     b
-22.161849     b
- 23.164369     b
-17.436099     b
-34.602104     b
-12.091427     b
-36.147829     b
-21.460025     b
-30.381254     b
-24.367170     b
-17.691419     b
-26.351577     b
-11.330276     b
-24.444055     b
-10.842903     c
-20.414861     c
-11.872108     c
-29.195858     c
-26.074787     c
-18.262033     c
-18.863621     c
-25.906536     c
-  4.379624     c
-  2.980549     c
-20.825446     c
- 22.038722     c
-24.175933     c
-25.745030     c
-23.796968     c
-18.625020     c
 >
-> # 전체구성원을 하나로 보고 분산값을 구해본다
+> # 그룹 간의 차이에서 나타나는 분산
-> # ms.tot = ss.tot / df.tot
+> # 수업시간에 설명을 잘 들을 것
-> # ss.tot = sum(xi-mean.tot)^2
+> hist(dat$values)
-> # df.tot = N - 1
+> abline(v = mean(dat$values), lty=2, lwd=3, col="red")
-> # attach(comb3)
+> abline(v = mean.a, lty=2, lwd=3, col="blue")
-> mean.tot <- mean(comb3$values)
+> abline(v = mean.b, lty=2, lwd=3, col="darkgreen")
-> ss.tot <- sum((comb3$values-mean.tot)^2)
+> abline(v = mean.c, lty=2, lwd=3, col="black")
-> df.tot <- 48-1
+> abline(v = mean.d, lty=2, lwd=3, col="orange")
-> ms.tot <- ss.tot/df.tot
-> ss.tot
-[1] 2666
-> df.tot
-[1] 47
-> ms.tot
-[1] 56.7234
-> var(comb3$values)
-[1] 56.7234
 >
-> # A, B, C 평균
+> # mean.total 에서 그룹a의 평균까지의 차이를 구한 후
-> m.a <- mean(A)
+> # 이를 제곱하여 그룹 A 멤버의 숫자만큼 더한다 =
-> m.b <- mean(B)
+> # 즉, SS를 구하는 방법.
-> m.c <- mean(C)
+> # 전체평균에서 그룹평균을 뺀 것의 제곱을
-> ms.a <- var(A)
+> # 그룹 구성원 숫자만큼 더하는 것
-> ms.b <- var(B)
+> # 그리고 이들을 다시 모두 더하여
-> ms.c <- var(C)
+> # ss.between에 저장
-> df.a <- 15
-> df.b <- 15
-> df.c <- 15
-> # 사실 우리는 아래 각 그룹의 SS가
-> # 600, 750, 900 인것을 알고 있다.
-> ss.a <- ms.a*df.a
-> ss.b <- ms.b*df.b
-> ss.c <- ms.c*df.c
 >
-> ss.within <- ss.a + ss.b + ss.c
+> length(A) * ((mean.total - mean.a)^2)
-> ss.within <- ss.within
+[1] 120
+> length(B) * ((mean.total - mean.b)^2)
+[1] 30
+> length(C) * ((mean.total - mean.c)^2)
+[1] 30
+> length(D) * ((mean.total - mean.d)^2)
+[1] 120
 >
-> 16*((m.a-mean.tot)^2)
-[1] 144
-> 16*((m.b-mean.tot)^2)
-[1] 16
-> 16*((m.c-mean.tot)^2)
-[1] 256
-> ss.bet <- 16*((m.a-mean.tot)^2) + 16*((m.b-mean.tot)^2) + 16*((m.c-mean.tot)^2)
 >
-> ss.tot
+> ss.between <-
-[1] 2666
++   length(A) * ((mean.total - mean.a)^2) +
-> ss.bet
++   length(B) * ((mean.total - mean.b)^2) +
-[1] 416
++   length(C) * ((mean.total - mean.c)^2) +
-> ss.within
++   length(D) * ((mean.total - mean.d)^2)
-[1] 2250
-> ss.bet + ss.within
-[1] 2666
 >
-> df.tot
+> ss.between
-[1] 47
+[1] 300
-> df.within <- df.a + df.b + df.c
+> # df between group은 연구에 사용된
-> df.within
+> # 그룹의 숫자에서 1을 뺀 숫자
-[1] 45
+> n.groups <- nlevels(dat$group)
-> df.bet <- 3-1
+> df.between <- n.groups - 1
-> df.bet
+> # 이 그룹 간 차이에 기인하는 분산 값은
-[1] 2
+> ms.between <- ss.between / df.between
-> df.bet + df.within
-[1] 47
 >
-> ms.bet <- ss.bet/df.bet
+> # 한편 ss.a 와 ss.b는 각 그룹 내의
-> ms.within <- ss.within /df.within
+> # 분산을 알아보기 위한 방법
-> f.calculated <- ms.bet/ms.within
+> df.a <- length(A) - 1
-> ms.bet
+> df.b <- length(B) - 1
-[1] 208
+> df.c <- length(C) - 1
-> ms.within
+> df.d <- length(D) - 1
-[1] 50
+> ss.a <- var(A) * df.a
-> f.calculated
+> ss.b <- var(B) * df.b
-[1] 4.16
+> ss.c <- var(C) * df.c
-> # 이 점수에서의 F critical value =
+> ss.d <- var(D) * df.d
-> fp.value <- pf(f.calculated, df1=2, df2=45, lower.tail = F)
+> ss.within <- ss.a + ss.b + ss.c + ss.d
-> fp.value
+> df.within <- df.a + df.b + df.c + df.d
-[1] 0.02199143
+> ms.within <- ss.within / df.within
 >
-> # f.calculated 와 fp.value로 판단
+> # 여기까지 우리는
+> # 전체분산
+> # 그룹간분산
+> # 그룹내분산 값을
+> # 구한 것
+>
+> # ms.between은 그룹의 차이때문에 생긴
+> # 분산으로 IV 혹은 treatment 때문에 생기는
+> # 차이에 기인하는 분산이고
+>
+> # ms.within은 각 그룹 내부에서 일어나는 분산이므로
+> # (variation이므로) 연구자의 관심사와는 상관이 없이
+> # 나타나는 random한 분산이라고 하면
+>
+> # t test 때와 마찬가지로
+> # 그룹의 차이 / 랜덤 차이를 (에러 -> 분산은 에러라고도 했다)
+> # 구해볼 수 있다.
+>
+> # 즉, 그룹갑분산은 사실 = diff (between groups)
+> # 그리고 그룹내 분산은 사실 = re
+> # 따라서 우리는 위 둘 간의 비율을 t test와 같이
+> # 살펴볼 수 있다
+>
+>
+> # 이것을 f.calculated 이라고 하고
+> f.calculated <- ms.between / ms.within
+> # 이 값을 출력해 본다
 > f.calculated
-[1] 4.16
+         [,1]
-> fp.value
+[1,] 3.222222
-[1] 0.02199143
+>
+> # 컴퓨터 계산이 쉬워지기 전에는 아래처럼 0.5 level
+> # 에서의 f값을 구한 후 이것과 계산된 f값을 비교해봤었다.
+> qf(.05, df1 = df.between, df2 = df.within, lower.tail = FALSE)
+[1] 2.682809
+> f.calculated
+         [,1]
+[1,] 3.222222
+> # 위에서 f.calculated > qf값이므로
+> # f.calculated 값으로 영가설을 부정하고
+> # 연구가설을 채택하면 판단이 잘못일 확률이
+> # 0.05보다 작다는 것을 안다.
+> # 그러나 컴퓨터계산이 용이해지고서는 qf대신에
+> # pf를 써서 f.cal 값에 해당하는 prob. level을
+> # 알아본다.
+>
+> # percentage of f distribution with
+> # df1 and df2 option
+> # 이는 그림의 왼쪽을 나타내므로
+> # 차이가 점점 커지게 되는 오른쪽을
+> # 계산하기 위해서는 1-x를 취한다
+> f.calculated.pvalue <- 1-pf(f.calculated, df1=df.between, df2=df.within)
+> f.calculated.pvalue
+           [,1]
+[1,] 0.02527283
+> f.calculated
+         [,1]
+[1,] 3.222222
+>
+> # graph 로 이해
+> x <- rf(500000, df1 = df.between, df2 = df.within)
+> y.max <- max(df(x,df1=df.between, df2=df.within))
+>
+> hist(x,
++      breaks = "Scott",
++      freq = FALSE,
++      xlim = c(0, f.calculated + 1),
++      ylim = c(0, y.max + .3),
++      xlab = "",
++      main = paste("Histogram for
++      a F-distribution with
++      df1 = ", df.between,
++                   "and df2 = ", df.within,
++                   "F calculated value = ", round(f.calculated,3),
++                   "p value = ", f.calculated.pvalue),
++      cex.main = 0.9
++ )
+> curve(df(x, df1 = df.between, df2 = df.within),
++       from = 0, to = f.calculated + 1, n = 5000,
++       col = "red", lwd = 2,
++       add = T)
+> abline(v=f.calculated, col="blue", lwd=2, lty="dotted")
+>
+> f.calculated
+         [,1]
+[1,] 3.222222
+> f.calculated.pvalue
+           [,1]
+[1,] 0.02527283
+> 1 - f.calculated.pvalue
+          [,1]
+[1,] 0.9747272
+>
+>
+>
+> # Now check this
+> ss.total
+[1] 3900
+> ss.between
+[1] 300
+> ss.within
+     [,1]
+[1,] 3600
+> ss.total
+[1] 3900
+> ss.between + ss.within
+     [,1]
+[1,] 3900
+>
+> # 한편 df는
+> # df.total  30 - 1
+> df.total
+[1] 119
+> df.between
+[1] 3
+> df.within
+[1] 116
+> df.total
+[1] 119
+> df.between + df.within
+[1] 119
 >
 >
-> a.res <- aov(values ~ group, data=comb3)
+> ##################################################
+> a.res <- aov(values ~ group, data=dat)
 > a.res.sum <- summary(a.res)
 > a.res.sum
-            Df Sum Sq Mean Sq F value Pr(>F)
+             Df Sum Sq Mean Sq F value Pr(>F)
-group        2    416     208    4.16  0.022 *
+group         3    300  100.00   3.222 0.0253 *
-Residuals   45   2250      50
+Residuals   116   3600   31.03
 ---
-Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+Signif. codes:
-> # 위에서
+‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
-> # ssd라는 function을 만들면
->
-> ssd <- function(x) {
-+     var(x) * (length(x)-1) }
-> ss.a1 <- ssd(A)
-> ss.b2 <- ssd(B)
-> ss.c1 <- ssd(C)
->
-> ss.a == ss.a1
-[1] TRUE
-> ss.b == ss.b1
-[1] TRUE
-> ss.c == ss.c1
-[1] TRUE
 > # 그러나 정확히 어떤 그룹에서 차이가 나는지는 판단해주지 않음
-> pairwise.t.test(comb3$values, comb3$group, p.adj = "none")
+> pairwise.t.test(dat$values, dat$group, p.adj = "none")
 	Pairwise comparisons using t tests with pooled SD
-data:  comb3$values and comb3$group
+data:  dat$values and dat$group
-  a      b
+  A      B      C
-b 0.4279 -
+B 0.4883 -      -
-c 0.0075 0.0516
+C 0.0392 0.1671 -
+D 0.0063 0.0392 0.4883
 P value adjustment method: none
 > # OR
-> pairwise.t.test(comb3$values, comb3$group, p.adj = "bonf")
+> pairwise.t.test(dat$values, dat$group, p.adj = "bonf")
 	Pairwise comparisons using t tests with pooled SD
-data:  comb3$values and comb3$group
+data:  dat$values and dat$group
-  a     b
+  A     B     C
-b 1.000 -
+B 1.000 -     -
-c 0.023 0.155
+C 0.235 1.000 -
+D 0.038 0.235 1.000
 P value adjustment method: bonferroni
-> pairwise.t.test(comb3$values, comb3$group, p.adj = "holm")
+> pairwise.t.test(dat$values, dat$group, p.adj = "holm")
 	Pairwise comparisons using t tests with pooled SD
-data:  comb3$values and comb3$group
+data:  dat$values and dat$group
-  a     b
+  A     B     C
-b 0.428 -
+B 0.977 -     -
-c 0.023 0.103
+C 0.196 0.501 -
+D 0.038 0.196 0.977
 P value adjustment method: holm
@@ Line 513: / Line 710: @@
 % family-wise confidence level
-Fit: aov(formula = values ~ group, data = comb3)
+Fit: aov(formula = values ~ group, data = dat)
 $group
-    diff        lwr       upr     p adj
+    diff       lwr        upr     p adj
-b-a   -2  -8.059034  4.059034 0.7049466
+B-A   -1 -4.749401  2.7494006 0.8987322
-c-a   -7 -13.059034 -0.940966 0.0201250
+C-A   -3 -6.749401  0.7494006 0.1638896
-c-b   -5 -11.059034  1.059034 0.1238770
+D-A   -4 -7.749401 -0.2505994 0.0316929
+C-B   -2 -5.749401  1.7494006 0.5078699
+D-B   -3 -6.749401  0.7494006 0.1638896
+D-C   -1 -4.749401  2.7494006 0.8987322
 >
+> boxplot(dat$values~dat$group)
+> f.calculated
+         [,1]
+[1,] 3.222222
+> f.calculated.pvalue
+           [,1]
+[1,] 0.02527283
 >
 >
-</code>
+> # how much IV explains the DV
+> # in terms of SS?
+> r.square <- ss.between / ss.total
+> eta  <- r.square
+> eta
+[1] 0.07692308
+> lm.res <- lm(dat$values~dat$group, data = dat)
+> summary(lm.res)
+Call:
+lm(formula = dat$values ~ dat$group, data = dat)
+Residuals:
+     Min       1Q   Median       3Q      Max
+-13.1181  -3.9308  -0.3568   3.9491  11.7704
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)
+(Intercept)   26.000      1.017  25.563  < 2e-16 ***
+dat$groupB    -1.000      1.438  -0.695  0.48831
+dat$groupC    -3.000      1.438  -2.086  0.03920 *
+dat$groupD    -4.000      1.438  -2.781  0.00633 **
+---
+Signif. codes:
+‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+Residual standard error: 5.571 on 116 degrees of freedom
+Multiple R-squared:  0.07692,	Adjusted R-squared:  0.05305
+F-statistic: 3.222 on 3 and 116 DF,  p-value: 0.02527
+> summary(a.res)
+             Df Sum Sq Mean Sq F value Pr(>F)
+group         3    300  100.00   3.222 0.0253 *
+Residuals   116   3600   31.03
+---
+Signif. codes:
+‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+>
+>
+></code>
+{{:c:ms:2025:pasted:20250414-084011.png}}
+{{:c:ms:2025:pasted:20250414-084020.png}}
+{{:c:ms:2025:pasted:20250414-084030.png}}
 ====== Post hoc test ======
 [[:post hoc test]]
 <code>
-m.a
+mean.a
-m.b
+mean.b
-m.c
+mean.c
+mean.d
-d.ab <- m.a - m.b
+d.ab <- mean.a - mean.b
-d.ac <- m.a - m.c
+d.ac <- mean.a - mean.c
-d.bc <- m.b - m.c
+d.ad <- mean.a - mean.d
+d.bc <- mean.b - mean.c
+d.bd <- mean.b - mean.d
+d.cd <- mean.c - mean.d
 d.ab
 d.ac
+d.ad
 d.bc
+d.bd
+d.cd
 # mse (ms within) from the a.res.sum output
-# a.res.sum == summary(aov(values ~ group, data=comb3))
+# a.res.sum == summary(aov(values ~ group, data=dat))
 a.res.sum
 # mse = 50
-mse <- 50
+mse <- 35.86
-# 혹은 fansy way from comb3 data.frame
+# 혹은 fansy way from dat data.frame
-# 15 는 각 그룹의 df
+# df.a 는 각 그룹의 df (모든 그룹의 df값이 같으므로 df.a를 사용)
-sse.ch <- sum(tapply(comb3$values, comb3$group, var)*15)
+tapply(dat$values, dat$group, var) # 각 그룹의 분산
+tapply(dat$values, dat$group, var) * df.a # 각 그룹의 SS
+# 이 분산값에 df값을 곱하면 각 그룹의 SS값
+# 이 값들을 sum 하면 총 ss.within 값
+sse.ch <- sum(tapply(dat$values, dat$group, var) * df.a)
 sse.ch
-mse.ch <- sse.ch/45
+mse.ch <- sse.ch / (df.a*4) # 이 값에 df.a * 4 하면 ms.within값
 mse.ch
+mse <- mse.ch
 se <- sqrt(mse/length(A))
@@ Line 558: / Line 818: @@
 t.ab <- d.ab / se
 t.ac <- d.ac / se
+t.ad <- d.ad / se
 t.bc <- d.bc / se
+t.bd <- d.bd / se
+t.cd <- d.cd / se
 t.ab
 t.ac
+t.ad
 t.bc
+t.bd
+t.cd
 # 이제 위의 점수를 .05 레벨에서 비교할 점수를 찾아 비교한다
 # qtukey 펑션을 이용한다
-t.crit <- qtukey( .95, nmeans = 3, df = 45)
+t.crit <- qtukey( .95, nmeans = 4, df = 30 * 4)
 t.crit
-# t.ac만이 큰 것을 알 수 있다.
-# 따라서 a 와 c 가 서로 다른 그룹
-# 즉, 1학년과 3학년이 서로 다른 그룹
 # 혹은 R이 보통 제시한 거꾸로 방법으로 보면
-ptukey(t.ab, nmeans=3, df=45, lower.tail = F)
+ptukey(t.ab, nmeans=4, df=df.within, lower.tail = F)
-ptukey(t.ac, nmeans=3, df=45, lower.tail = F)
+ptukey(t.ac, nmeans=4, df=df.within, lower.tail = F)
-ptukey(t.bc, nmeans=3, df=45, lower.tail = F)
+ptukey(t.ad, nmeans=4, df=df.within, lower.tail = F)
+ptukey(t.bc, nmeans=4, df=df.within, lower.tail = F)
+ptukey(t.bd, nmeans=4, df=df.within, lower.tail = F)
+ptukey(t.cd, nmeans=4, df=df.within, lower.tail = F)
 TukeyHSD(a.res, conf.level=.95)
@@ Line 586: / Line 851: @@
 <code>
-pairwise.t.test(comb3$values, comb3$group, p.adj = "bonf")
+pairwise.t.test(dat$values, dat$group, p.adj = "bonf")
 </code>
 ===== post hoc test: output =====
 <code>
-> m.a
+> mean.a
 [1] 26
-> m.b
+> mean.b
-[1] 24
+[1] 25
-> m.c
+> mean.c
-[1] 19
+[1] 22
+> mean.d
+[1] 20
 >
-> d.ab <- m.a - m.b
+> d.ab <- mean.a - mean.b
-> d.ac <- m.a - m.c
+> d.ac <- mean.a - mean.c
-> d.bc <- m.b - m.c
+> d.ad <- mean.a - mean.d
+> d.bc <- mean.b - mean.c
+> d.bd <- mean.b - mean.d
+> d.cd <- mean.c - mean.d
 >
 > d.ab
-[1] 2
+[1] 1
 > d.ac
-[1] 7
+[1] 4
+> d.ad
+[1] 6
 > d.bc
+[1] 3
+> d.bd
 [1] 5
+> d.cd
+[1] 2
 >
-> # se
+> # mse (ms within) from the a.res.sum output
-> # from the a.res.sum output
+> # a.res.sum == summary(aov(values ~ group, data=dat))
 > a.res.sum
-            Df Sum Sq Mean Sq F value Pr(>F)
+             Df Sum Sq Mean Sq F value   Pr(>F)
-group        2    416     208    4.16  0.022 *
+group         3    682  227.50   6.344 0.000508 ***
-Residuals   45   2250      50
+Residuals   116   4160   35.86
 ---
-Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+Signif. codes:
+‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 > # mse = 50
-> mse <- 50
+> mse <- 35.86
->
+> # 혹은 fansy way from dat data.frame
+> # df.a 는 각 그룹의 df (모든 그룹의 df값이 같으므로 df.a를 사용)
+> tapply(dat$values, dat$group, var) # 각 그룹의 분산
+       A        B        C        D
+.00000 34.48276 31.03448 37.93103
+> tapply(dat$values, dat$group, var) * df.a # 각 그룹의 SS
+   A    B    C    D
+1000  900 1100
+> # 이 분산값에 df값을 곱하면 각 그룹의 SS값
+> # 이 값들을 sum 하면 총 ss.within 값
+> sse.ch <- sum(tapply(dat$values, dat$group, var) * df.a)
+> sse.ch
+[1] 4160
+> mse.ch <- sse.ch / (df.a*4) # 이 값에 df.a * 4 하면 ms.within값
+> mse.ch
+[1] 35.86207
+> mse <- mse.ch
 > se <- sqrt(mse/length(A))
 >
@@ Line 624: / Line 917: @@
 > t.ab <- d.ab / se
 > t.ac <- d.ac / se
+> t.ad <- d.ad / se
 > t.bc <- d.bc / se
+> t.bd <- d.bd / se
+> t.cd <- d.cd / se
 >
 > t.ab
-[1] 1.131371
+[1] 0.9146248
 > t.ac
-[1] 3.959798
+[1] 3.658499
+> t.ad
+[1] 5.487749
 > t.bc
-[1] 2.828427
+[1] 2.743874
+> t.bd
+[1] 4.573124
+> t.cd
+[1] 1.82925
 >
 > # 이제 위의 점수를 .05 레벨에서 비교할 점수를 찾아 비교한다
 > # qtukey 펑션을 이용한다
-> t.crit <- qtukey( .95, nmeans = 3, df = 45)
+> t.crit <- qtukey( .95, nmeans = 4, df = 30 * 4)
 > t.crit
-[1] 3.427507
+[1] 3.684589
->
-> # t.ac만이 큰 것을 알 수 있다.
-> # 따라서 a 와 c 가 서로 다른 그룹
-> # 즉, 1학년과 3학년이 서로 다른 그룹
 >
 > # 혹은 R이 보통 제시한 거꾸로 방법으로 보면
-> ptukey(t.ab, nmeans=3, df=45, lower.tail = F)
+> ptukey(t.ab, nmeans=4, df=df.within, lower.tail = F)
-[1] 0.7049466
+[1] 0.9164944
-> ptukey(t.ac, nmeans=3, df=45, lower.tail = F)
+> ptukey(t.ac, nmeans=4, df=df.within, lower.tail = F)
-[1] 0.02012498
+[1] 0.05255324
-> ptukey(t.bc, nmeans=3, df=45, lower.tail = F)
+> ptukey(t.ad, nmeans=4, df=df.within, lower.tail = F)
-[1] 0.123877
+[1] 0.0009861641
+> ptukey(t.bc, nmeans=4, df=df.within, lower.tail = F)
+[1] 0.2171272
+> ptukey(t.bd, nmeans=4, df=df.within, lower.tail = F)
+[1] 0.008539966
+> ptukey(t.cd, nmeans=4, df=df.within, lower.tail = F)
+[1] 0.5689321
 >
 > TukeyHSD(a.res, conf.level=.95)
@@ Line 655: / Line 959: @@
 % family-wise confidence level
-Fit: aov(formula = values ~ group, data = comb3)
+Fit: aov(formula = values ~ group, data = dat)
 $group
-    diff        lwr       upr     p adj
+    diff        lwr        upr     p adj
-b-a   -2  -8.059034  4.059034 0.7049466
+B-A   -1  -5.030484  3.0304845 0.9164944
-c-a   -7 -13.059034 -0.940966 0.0201250
+C-A   -4  -8.030484  0.0304845 0.0525532
-c-b   -5 -11.059034  1.059034 0.1238770
+D-A   -6 -10.030484 -1.9695155 0.0009862
+C-B   -3  -7.030484  1.0304845 0.2171272
+D-B   -5  -9.030484 -0.9695155 0.0085400
+D-C   -2  -6.030484  2.0304845 0.5689321
+> plot(TukeyHSD(a.res, conf.level=.95), las = 2)
+> pairwise.t.test(dat$values, dat$group, p.adj = "bonf")
+	Pairwise comparisons using t tests with pooled SD
+data:  dat$values and dat$group
+  A      B      C
+B 1.0000 -      -
+C 0.0655 0.3287 -
+D 0.0010 0.0096 1.0000
+P value adjustment method: bonferroni
+>
+>
 </code>
-{{:c:ms:2023:pasted:20230418-223608.png}}
+{{:c:ms:2025:pasted:20250410-154903.png}}
 ====== R square or Eta square ======
 SS toal
@@ Line 691: / Line 1014: @@
 <code>
-ss.tot
+ss.total
-ss.bet
+ss.between
-r.sq <- ss.bet / ss.tot
+r.sq <- ss.between / ss.total
 r.sq
 # then . . . .
-lm.res <- lm(values ~ group, data = comb3)
+lm.res <- lm(values ~ group, data = dat)
 summary(lm.res)
 anova(lm.res)
@@ Line 705: / Line 1028: @@
 ===== R square: output =====
 <code>
-> ss.tot
+> ss.total
-[1] 2666
+[1] 3900
-> ss.bet
+> ss.between
-[1] 416
+[1] 300
-> r.sq <- ss.bet / ss.tot
+> r.sq <- ss.between / ss.total
 > r.sq
-[1] 0.156039
+[1] 0.07692308
 >
 > # then . . . .
 >
-> lm.res <- lm(values ~ group, data = comb3)
+> lm.res <- lm(values ~ group, data = dat)
 > summary(lm.res)
 Call:
-lm(formula = values ~ group, data = comb3)
+lm(formula = values ~ group, data = dat)
 Residuals:
-    Min      1Q  Median      3Q     Max
+     Min       1Q   Median       3Q      Max
--16.020  -2.783   1.476   4.892  12.148
+-13.1181  -3.9308  -0.3568   3.9491  11.7704
 Coefficients:
             Estimate Std. Error t value Pr(>|t|)
-(Intercept)   26.000      1.768   14.71   <2e-16 ***
+(Intercept)   26.000      1.017  25.563  < 2e-16 ***
-groupb        -2.000      2.500   -0.80   0.4279
+groupB        -1.000      1.438  -0.695  0.48831
-groupc        -7.000      2.500   -2.80   0.0075 **
+groupC        -3.000      1.438  -2.086  0.03920 *
+groupD        -4.000      1.438  -2.781  0.00633 **
 ---
-Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+Signif. codes:
+‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
-Residual standard error: 7.071 on 45 degrees of freedom
+Residual standard error: 5.571 on 116 degrees of freedom
-Multiple R-squared:  0.156,	Adjusted R-squared:  0.1185
+Multiple R-squared:  0.07692,	Adjusted R-squared:  0.05305
-F-statistic:  4.16 on 2 and 45 DF,  p-value: 0.02199
+F-statistic: 3.222 on 3 and 116 DF,  p-value: 0.02527
 > anova(lm.res)
@@ Line 741: / Line 1066: @@
 Response: values
-          Df Sum Sq Mean Sq F value  Pr(>F)
+           Df Sum Sq Mean Sq F value  Pr(>F)
-group      2    416     208    4.16 0.02199 *
+group       3    300 100.000  3.2222 0.02527 *
-Residuals 45   2250      50
+Residuals 116   3600  31.034
 ---
-Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+Signif. codes:
->
+‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 >
 </code>