Differences

This shows you the differences between two versions of the page.

--- c:ms:2023:schedule:week06_t-test_and_anova_note [2023/04/11 23:02] – [R] hkimscil
+++ c:ms:2023:schedule:week06_t-test_and_anova_note [2024/04/07 23:42] (current) – [output] hkimscil
@@ Line 73: / Line 73: @@
 df.b <- n.b - 1
-pooled.var <- (SSa+SSb)/(df.a+df.b)
+# we know that we are testing the difference
+# between two independent sample means.
+# Hence, we need to use the pooled variance of
+# the two groups. See
+# http://commres.net/wiki/t-test#t-test_%EB%B9%84%EA%B5%90
+pooled.var <- (SSa + SSb) / (df.a + df.b)
 se <- sqrt(pooled.var/n.a + pooled.var/n.b)
-t.calculated <- diff/se
+# Remember t test calculation is based on
+# diff / random error
+t.calculated <- diff / se
 pooled.var
+diff
 se
 t.calculated
+# Now use the t.test function for a two-group
+# (independent samples) t-test
+# with the assumption that the variances of
+# the two groups are the same.
 t.result <- t.test(A, B, var.equal = T)
 t.result
+# t.result$statistic = t.calculated
+# t.result$p.value = probability level of
+# wrong decision with the t calculated value
+str(t.result)
 t.result$statistic
 t.result$p.value
-p.value <- 2*pt(-t.result$statistic, df=df.a+df.b)
+# the above p.value can be obtained with
+# pt function
+p.value <- 2*pt(-t.result$statistic, df = df.a + df.b)
 p.value
-str(t.result)
 t.result$p.value
@@ Line 95: / Line 113: @@
 #
+# A combined group with group A and B
+# We call it group total
+# we can obtain its mean, variance, ss, df, etc.
+#
 A
 B
@@ Line 100: / Line 122: @@
 dat
+mean.total <- mean(dat)
 var.total <- var(dat)
+# variance를 ms라고 부르기도 한다
+ms.total <- var.total
 df.total <- length(dat)-1
 ss.total <- var.total*df.total
 ss.total.check <- sum((dat-mean(dat))^2)
+mean.total
+var.total
+ms.total
+df.total
 ss.total
 ss.total.check
-mean.total <- mean(dat)
-mean.total
+# Now for each group
 mean.a <- mean(A)
 mean.b <- mean(B)
+mean.a
+mean.b
+# 그룹 간의 차이에서 나타나는 분산
+# 수업시간에 설명을 잘 들을 것
 # mean.total 에서 그룹a의 평균까지의 차이를 구한 후
-# 이를 제곱하여 A의 숫자만큼 더한다 =
+# 이를 제곱하여 그룹 A 멤버의 숫자만큼 더한다 =
 # 즉, SS를 구하는 방법.
 # 전체평균에서 그룹평균을 뺀 것의 제곱을
 # 그룹 구성원 숫자만큼 더하는 것
+# 그리고 이들을 다시 모두 더하여
+# ss.between에 저장
+length(A) * ((mean.total - mean.a)^2)
+length(B) * ((mean.total - mean.b)^2)
-length(A)*((mean.total - mean.a)^2)
-length(B)*((mean.total - mean.b)^2)
 ss.between <-
   length(A)*((mean.total - mean.a)^2) +
   length(B)*((mean.total - mean.b)^2)
 ss.between
+# df between group은 연구에 사용된
+# 그룹의 숫자에서 1을 뺀 숫자
+df.between <- 2 - 1
+# 이 그룹 간 차이에 기인하는 분산 값은
+ms.between <- ss.between / df.between
 # 한편 ss.a 와 ss.b는 각 그룹 내의
@@ Line 132: / Line 173: @@
 ss.b <- var(B) * df.b
 ss.within <- ss.a + ss.b
+df.a <- length(A)-1
-# Now check this
+df.b <- length(B)-1
-ss.total
-ss.between
-ss.within
-ss.total == ss.between + ss.within
-# 한편 df는
-# df.total  30 - 1
-df.between <- 2-1  # 그룹숫자 - 1
-df.a <- length(A)-1 # a 구성원 - 1
-df.b <- length(B)-1 # b 구성원 - 1
 df.within <- df.a + df.b
+ms.within <- ss.within / df.within
-df.total
+# 여기까지 우리는
-df.between
+# 전체분산
-df.within
+# 그룹간분산
-df.total == df.between + df.within
+# 그룹내분산 값을
+# 구한 것
-# 분산을 구하는 방법은 SS/df 이므로
-# 분산을 ms 라고 표기하면 우리는
-# ms.total, ms.between, ms.within을 구할 수 있다
-ms.total <- ss.total / df.total
-ms.between <- ss.between / df.between
-ms.within <- ss.within / df.within
-# 위에서 ms.between은 그룹의 차이때문에 생긴
+# ms.between은 그룹의 차이때문에 생긴
 # 분산으로 IV 혹은 treatment 때문에 생기는
 # 차이에 기인하는 분산이고
@@ Line 169: / Line 194: @@
 # t test 때와 마찬가지로
 # 그룹의 차이 / 랜덤 차이를 (에러 -> 분산은 에러라고도 했다)
-# 구해볼 수 있다. 이것을 f.calculated 이라고 하고
+# 구해볼 수 있다.
-# 이를 프린트아웃 한다
+# 즉, 그룹간분산은 사실 = diff (between groups)
+# 그리고 그룹내 분산은 사실 = re
+# 따라서 우리는 위 둘 간의 비율을 t test와 같이
+# 살펴볼 수 있다
+# 이것을 f.calculated 이라고 하고
 f.calculated <- ms.between / ms.within
+# 이 값을 출력해 본다
 f.calculated
+# 이 계산은 차이와 랜덤에러의 비율이
+# df에 따라서 얼마나 되어야 그 차이가
+# 충분히 큰 것인지를 판단하기 위해서
+# 쓰인다. 여기서 df에는 두 가지 종류가
+# 있다. df.between 그리고 df.within
+# percentage of f distribution with
+# df1 and df2 option
+# 이는 그림의 왼쪽을 나타내므로
+# 차이가 점점 커지게 되는 오른쪽을
+# 계산하기 위해서는 1-x를 취한다
+f.calculated.pvalue <- 1-pf(f.calculated, df1=df.between, df2=df.within)
+f.calculated.pvalue
 # 한편,  t test를 했었을 때 (A, B 그룹을 가지고 independent
 # samples t-test를) 아웃 풋은
 t.result
+# 그리고 f 계산에서의 p value는 t test에서의 p.value와 같다
+f.calculated.pvalue
+t.result$p.value
+# 또한
 # 여기에서 t 값은 t.result$statistic 으로 프린트아웃할 수 있다
 # 이 값이 2.33333 이었다
@@ Line 194: / Line 243: @@
 t.calculated
+# Now check this
+ss.total
+ss.between
+ss.within
+ss.total
+ss.between + ss.within
+# 한편 df는
+# df.total = 32 - 1
+df.total
+df.between
+df.within
+df.total
+df.between + df.within
 # 한 편
@@ Line 229: / Line 292: @@
 sqrt(a.res.sum[[1]][1,4])
 t.result$statistic
 </code>
 ====== output ======
 <code>
+> # from the quiz questions
+> # stu should understand the logic of the ttest
+>
+> set.seed(101)
+> rnorm2 <- function(n,mean,sd){ mean+sd*scale(rnorm(n)) }
+> A <- rnorm2(16, 26, sqrt(1160/15))
+> B <- rnorm2(16, 19, sqrt(1000/15))
+> A <- c(A)
+> B <- c(B)
+> # we know sqrt(1160/15) is A's sdev
+> # hence, A's var is sqrt(1160/15)^2
+> # hence, A's SS is sqrt(1160/15)^2 * 15
+> # this is 1160
+>
+> # from the above,
+> # the difference between the A and B means
+> # remember we try to find
+> # difference due to the treatment /
+> # / random chance of error
+> diff <- 26 - 19
+>
+> # for se
+> # we know that the situation refers to
+> # #2 se two independent samples t-test
+> # which is sqrt(pooled.var/na + pooled.var/nb)
+>
+> SSa <- 1160
+> SSb <- 1000
+> n.a <- 16
+> n.b <- 16
+> df.a <- n.a - 1
+> df.b <- n.b - 1
+>
+> # we know that we are testing the difference
+> # between two independent sample means.
+> # Hence, we need to use the pooled variance of
+> # the two groups. See
+> # http://commres.net/wiki/t-test#t-test_%EB%B9%84%EA%B5%90
+> pooled.var <- (SSa + SSb) / (df.a + df.b)
+> se <- sqrt(pooled.var/n.a + pooled.var/n.b)
+> # Remember t test calculation is based on
+> # diff / random error
+> t.calculated <- diff / se
+> pooled.var
+[1] 72
+> diff
+[1] 7
+> se
+[1] 3
+> t.calculated
+[1] 2.333333
+>
+> # Now use the t.test function for a two-group
+> # (independent samples) t-test
+> # with the assumption that the variances of
+> # the two groups are the same.
+> t.result <- t.test(A, B, var.equal = T)
+> t.result
+	Two Sample t-test
+data:  A and B
+t = 2.3333, df = 30, p-value = 0.02652
+alternative hypothesis: true difference in means is not equal to 0
+95 percent confidence interval:
+  0.8731826 13.1268174
+sample estimates:
+mean of x mean of y
+       26        19
+>
+> # t.result$statistic = t.calculated
+> # t.result$p.value = probability level of
+> # wrong decision with the t calculated value
+> str(t.result)
+List of 10
+ $ statistic  : Named num 2.33
+  ..- attr(*, "names")= chr "t"
+ $ parameter  : Named num 30
+  ..- attr(*, "names")= chr "df"
+ $ p.value    : num 0.0265
+ $ conf.int   : num [1:2] 0.873 13.127
+  ..- attr(*, "conf.level")= num 0.95
+ $ estimate   : Named num [1:2] 26 19
+  ..- attr(*, "names")= chr [1:2] "mean of x" "mean of y"
+ $ null.value : Named num 0
+  ..- attr(*, "names")= chr "difference in means"
+ $ stderr     : num 3
+ $ alternative: chr "two.sided"
+ $ method     : chr " Two Sample t-test"
+ $ data.name  : chr "A and B"
+ - attr(*, "class")= chr "htest"
+> t.result$statistic
+       t
+2.333333
+> t.result$p.value
+[1] 0.02652366
+>
+> # the above p.value can be obtained with
+> # pt function
+> p.value <- 2*pt(-t.result$statistic, df = df.a + df.b)
+> p.value
+         t
+0.02652366
+> t.result$p.value
+[1] 0.02652366
+>
 > ##
 >
@@ Line 238: / Line 411: @@
 > #
 >
+> # A combined group with group A and B
+> # We call it group total
+> # we can obtain its mean, variance, ss, df, etc.
+> #
 > A
- [1] 20.994218 31.148068 16.961481 27.240217 28.354539 38.331534
+ [1] 20.994218 31.148068 16.961481 27.240217 28.354539
- [7] 31.914700 23.459605 35.361796 22.182136 30.847396 15.575648
+ [6] 38.331534 31.914700 23.459605 35.361796 22.182136
-[13] 41.264878  7.808831 22.026979 22.527973
+[11] 30.847396 15.575648 41.264878  7.808831 22.026979
+[16] 22.527973
 > B
- [1] 12.941146 21.270062 13.235378  1.931364 19.232163 27.231465
+ [1] 12.941146 21.270062 13.235378  1.931364 19.232163
- [7] 18.276359  7.308871 27.560815  7.799787 25.017185 19.639663
+ [6] 27.231465 18.276359  7.308871 27.560815  7.799787
-[13] 25.018756 25.302096 28.941002 23.293888
+[11] 25.017185 19.639663 25.018756 25.302096 28.941002
+[16] 23.293888
 > dat <- c(A,B)
 > dat
- [1] 20.994218 31.148068 16.961481 27.240217 28.354539 38.331534
+ [1] 20.994218 31.148068 16.961481 27.240217 28.354539
- [7] 31.914700 23.459605 35.361796 22.182136 30.847396 15.575648
+ [6] 38.331534 31.914700 23.459605 35.361796 22.182136
-[13] 41.264878  7.808831 22.026979 22.527973 12.941146 21.270062
+[11] 30.847396 15.575648 41.264878  7.808831 22.026979
-[19] 13.235378  1.931364 19.232163 27.231465 18.276359  7.308871
+[16] 22.527973 12.941146 21.270062 13.235378  1.931364
-[25] 27.560815  7.799787 25.017185 19.639663 25.018756 25.302096
+[21] 19.232163 27.231465 18.276359  7.308871 27.560815
+[26]  7.799787 25.017185 19.639663 25.018756 25.302096
 [31] 28.941002 23.293888
 >
+> mean.total <- mean(dat)
 > var.total <- var(dat)
+> # variance를 ms라고 부르기도 한다
+> ms.total <- var.total
+>
 > df.total <- length(dat)-1
 > ss.total <- var.total*df.total
 > ss.total.check <- sum((dat-mean(dat))^2)
+>
+> mean.total
+[1] 22.5
+> var.total
+[1] 82.32258
+> ms.total
+[1] 82.32258
+> df.total
+[1] 31
 > ss.total
 [1] 2552
 > ss.total.check
 [1] 2552
-> mean.total <- mean(dat)
-> mean.total
-[1] 22.5
 >
+> # Now for each group
 > mean.a <- mean(A)
 > mean.b <- mean(B)
+> mean.a
+[1] 26
+> mean.b
+[1] 19
+>
+> # 그룹 간의 차이에서 나타나는 분산
+> # 수업시간에 설명을 잘 들을 것
 >
 > # mean.total 에서 그룹a의 평균까지의 차이를 구한 후
-> # 이를 제곱하여 A의 숫자만큼 더한다 =
+> # 이를 제곱하여 그룹 A 멤버의 숫자만큼 더한다 =
 > # 즉, SS를 구하는 방법.
 > # 전체평균에서 그룹평균을 뺀 것의 제곱을
 > # 그룹 구성원 숫자만큼 더하는 것
+> # 그리고 이들을 다시 모두 더하여
+> # ss.between에 저장
 >
->
+> length(A) * ((mean.total - mean.a)^2)
-> length(A)*((mean.total - mean.a)^2)
 [1] 196
-> length(B)*((mean.total - mean.b)^2)
+> length(B) * ((mean.total - mean.b)^2)
 [1] 196
+>
 > ss.between <-
 +   length(A)*((mean.total - mean.a)^2) +
 +   length(B)*((mean.total - mean.b)^2)
->
 > ss.between
+[1] 392
+> # df between group은 연구에 사용된
+> # 그룹의 숫자에서 1을 뺀 숫자
+> df.between <- 2 - 1
+> # 이 그룹 간 차이에 기인하는 분산 값은
+> ms.between <- ss.between / df.between
+> ms.between
 [1] 392
 >
@@ Line 293: / Line 499: @@
 > ss.b <- var(B) * df.b
 > ss.within <- ss.a + ss.b
->
+> df.a <- length(A)-1
-> # Now check this
+> df.b <- length(B)-1
-> ss.total
-[1] 2552
-> ss.between
-[1] 392
-> ss.within
-[1] 2160
-> ss.total == ss.between + ss.within
-[1] FALSE
->
-> # 한편 df는
-> # df.total  30 - 1
-> df.between <- 2-1  # 그룹숫자 - 1
-> df.a <- length(A)-1 # a 구성원 - 1
-> df.b <- length(B)-1 # b 구성원 - 1
 > df.within <- df.a + df.b
+> ms.within <- ss.within / df.within
 >
-> df.total
+> ms.within
-[1] 31
+[1] 72
-> df.between
-[1] 1
-> df.within
-[1] 30
-> df.total == df.between + df.within
-[1] TRUE
 >
-> # 분산을 구하는 방법은 SS/df 이므로
+> # 여기까지 우리는
-> # 분산을 ms 라고 표기하면 우리는
+> # 전체분산
-> # ms.total, ms.between, ms.within을 구할 수 있다
+> # 그룹간분산
->
+> # 그룹내분산 값을
-> ms.total <- ss.total / df.total
+> # 구한 것
-> ms.between <- ss.between / df.between
-> ms.within <- ss.within / df.within
 >
-> # 위에서 ms.between은 그룹의 차이때문에 생긴
+> # ms.between은 그룹의 차이때문에 생긴
 > # 분산으로 IV 혹은 treatment 때문에 생기는
 > # 차이에 기인하는 분산이고
@@ Line 338: / Line 523: @@
 > # t test 때와 마찬가지로
 > # 그룹의 차이 / 랜덤 차이를 (에러 -> 분산은 에러라고도 했다)
-> # 구해볼 수 있다. 이것을 f.calculated 이라고 하고
+> # 구해볼 수 있다.
-> # 이를 프린트아웃 한다
 >
+> # 즉, 그룹간분산은 사실 = diff (between groups)
+> # 그리고 그룹내 분산은 사실 = re
+> # 따라서 우리는 위 둘 간의 비율을 t test와 같이
+> # 살펴볼 수 있다
+>
+>
+> # 이것을 f.calculated 이라고 하고
 > f.calculated <- ms.between / ms.within
+> # 이 값을 출력해 본다
 > f.calculated
 [1] 5.444444
+> # 이 계산은 차이와 랜덤에러의 비율이
+> # df에 따라서 얼마나 되어야 그 차이가
+> # 충분히 큰 것인지를 판단하기 위해서
+> # 쓰인다. 여기서 df에는 두 가지 종류가
+> # 있다. df.between 그리고 df.within
 >
+> # percentage of f distribution with
+> # df1 and df2 option
+> # 이는 그림의 왼쪽을 나타내므로
+> # 차이가 점점 커지게 되는 오른쪽을
+> # 계산하기 위해서는 1-x를 취한다
+> f.calculated.pvalue <- 1-pf(f.calculated, df1=df.between, df2=df.within)
+> f.calculated.pvalue
+[1] 0.02652366
 > # 한편,  t test를 했었을 때 (A, B 그룹을 가지고 independent
 > # samples t-test를) 아웃 풋은
@@ Line 361: / Line 566: @@
 >
+> # 그리고 f 계산에서의 p value는 t test에서의 p.value와 같다
+> f.calculated.pvalue
+[1] 0.02652366
+> t.result$p.value
+[1] 0.02652366
+>
+> # 또한
 > # 여기에서 t 값은 t.result$statistic 으로 프린트아웃할 수 있다
 > # 이 값이 2.33333 이었다
@@ Line 383: / Line 595: @@
 [1] 2.333333
 >
+> # Now check this
+> ss.total
+[1] 2552
+> ss.between
+[1] 392
+> ss.within
+[1] 2160
+> ss.total
+[1] 2552
+> ss.between + ss.within
+[1] 2552
+>
+> # 한편 df는
+> # df.total = 32 - 1
+> df.total
+[1] 31
+> df.between
+[1] 1
+> df.within
+[1] 30
+> df.total
+[1] 31
+> df.between + df.within
+[1] 31
 >
 > # 한 편
 >
 > A
- [1] 20.994218 31.148068 16.961481 27.240217 28.354539 38.331534
+ [1] 20.994218 31.148068 16.961481 27.240217 28.354539
- [7] 31.914700 23.459605 35.361796 22.182136 30.847396 15.575648
+ [6] 38.331534 31.914700 23.459605 35.361796 22.182136
-[13] 41.264878  7.808831 22.026979 22.527973
+[11] 30.847396 15.575648 41.264878  7.808831 22.026979
+[16] 22.527973
 > B
- [1] 12.941146 21.270062 13.235378  1.931364 19.232163 27.231465
+ [1] 12.941146 21.270062 13.235378  1.931364 19.232163
- [7] 18.276359  7.308871 27.560815  7.799787 25.017185 19.639663
+ [6] 27.231465 18.276359  7.308871 27.560815  7.799787
-[13] 25.018756 25.302096 28.941002 23.293888
+[11] 25.017185 19.639663 25.018756 25.302096 28.941002
+[16] 23.293888
 > comb <- stack(list(a=A, b=B))
 > comb
@@ Line 473: / Line 711: @@
 Residuals   30   2160      72
 ---
-Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+Signif. codes:
+0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 > # 위에서 F value는 5.444
 > # 그리고 전체적인 아웃풋을 보면
@@ Line 511: / Line 750: @@
        t
 2.333333
->
 </code>