Differences

This shows you the differences between two versions of the page.

--- c:ms:2023:schedule:week06_t-test_and_anova_note [2023/04/12 09:05] – [output] hkimscil
+++ c:ms:2023:schedule:week06_t-test_and_anova_note [2024/04/08 08:42] (current) – [output] hkimscil
@@ Line 73: / Line 73: @@
 df.b <- n.b - 1
-pooled.var <- (SSa+SSb)/(df.a+df.b)
+# we know that we are testing the difference
+# between two independent sample means.
+# Hence, we need to use the pooled variance of
+# the two groups. See
+# http://commres.net/wiki/t-test#t-test_%EB%B9%84%EA%B5%90
+pooled.var <- (SSa + SSb) / (df.a + df.b)
 se <- sqrt(pooled.var/n.a + pooled.var/n.b)
-t.calculated <- diff/se
+# Remember t test calculation is based on
+# diff / random error
+t.calculated <- diff / se
 pooled.var
+diff
 se
 t.calculated
+# Now use the t.test function for a two-group
+# (independent samples) t-test
+# with the assumption that the variances of
+# the two groups are the same.
 t.result <- t.test(A, B, var.equal = T)
 t.result
+# t.result$statistic = t.calculated
+# t.result$p.value = probability level of
+# wrong decision with the t calculated value
+str(t.result)
 t.result$statistic
 t.result$p.value
-p.value <- 2*pt(-t.result$statistic, df=df.a+df.b)
+# the above p.value can be obtained with
+# pt function
+p.value <- 2*pt(-t.result$statistic, df = df.a + df.b)
 p.value
-str(t.result)
 t.result$p.value
@@ Line 95: / Line 113: @@
 #
+# A combined group with group A and B
+# We call it group total
+# we can obtain its mean, variance, ss, df, etc.
+#
 A
 B
@@ Line 100: / Line 122: @@
 dat
+mean.total <- mean(dat)
 var.total <- var(dat)
+# variance를 ms라고 부르기도 한다
+ms.total <- var.total
 df.total <- length(dat)-1
 ss.total <- var.total*df.total
 ss.total.check <- sum((dat-mean(dat))^2)
+mean.total
+var.total
+ms.total
+df.total
 ss.total
 ss.total.check
-mean.total <- mean(dat)
-mean.total
+# Now for each group
 mean.a <- mean(A)
 mean.b <- mean(B)
+mean.a
+mean.b
+# 그룹 간의 차이에서 나타나는 분산
+# 수업시간에 설명을 잘 들을 것
 # mean.total 에서 그룹a의 평균까지의 차이를 구한 후
-# 이를 제곱하여 A의 숫자만큼 더한다 =
+# 이를 제곱하여 그룹 A 멤버의 숫자만큼 더한다 =
 # 즉, SS를 구하는 방법.
 # 전체평균에서 그룹평균을 뺀 것의 제곱을
 # 그룹 구성원 숫자만큼 더하는 것
+# 그리고 이들을 다시 모두 더하여
+# ss.between에 저장
+length(A) * ((mean.total - mean.a)^2)
+length(B) * ((mean.total - mean.b)^2)
-length(A)*((mean.total - mean.a)^2)
-length(B)*((mean.total - mean.b)^2)
 ss.between <-
   length(A)*((mean.total - mean.a)^2) +
   length(B)*((mean.total - mean.b)^2)
 ss.between
+# df between group은 연구에 사용된
+# 그룹의 숫자에서 1을 뺀 숫자
+df.between <- 2 - 1
+# 이 그룹 간 차이에 기인하는 분산 값은
+ms.between <- ss.between / df.between
 # 한편 ss.a 와 ss.b는 각 그룹 내의
@@ Line 132: / Line 173: @@
 ss.b <- var(B) * df.b
 ss.within <- ss.a + ss.b
+df.a <- length(A)-1
-# Now check this
+df.b <- length(B)-1
-ss.total
-ss.between
-ss.within
-ss.total == ss.between + ss.within
-# 한편 df는
-# df.total  30 - 1
-df.between <- 2-1  # 그룹숫자 - 1
-df.a <- length(A)-1 # a 구성원 - 1
-df.b <- length(B)-1 # b 구성원 - 1
 df.within <- df.a + df.b
+ms.within <- ss.within / df.within
-df.total
+# 여기까지 우리는
-df.between
+# 전체분산
-df.within
+# 그룹간분산
-df.total == df.between + df.within
+# 그룹내분산 값을
+# 구한 것
-# 분산을 구하는 방법은 SS/df 이므로
-# 분산을 ms 라고 표기하면 우리는
-# ms.total, ms.between, ms.within을 구할 수 있다
-ms.total <- ss.total / df.total
-ms.between <- ss.between / df.between
-ms.within <- ss.within / df.within
-# 위에서 ms.between은 그룹의 차이때문에 생긴
+# ms.between은 그룹의 차이때문에 생긴
 # 분산으로 IV 혹은 treatment 때문에 생기는
 # 차이에 기인하는 분산이고
@@ Line 169: / Line 194: @@
 # t test 때와 마찬가지로
 # 그룹의 차이 / 랜덤 차이를 (에러 -> 분산은 에러라고도 했다)
-# 구해볼 수 있다. 이것을 f.calculated 이라고 하고
+# 구해볼 수 있다.
-# 이를 프린트아웃 한다
+# 즉, 그룹간분산은 사실 = diff (between groups)
+# 그리고 그룹내 분산은 사실 = re
+# 따라서 우리는 위 둘 간의 비율을 t test와 같이
+# 살펴볼 수 있다
+# 이것을 f.calculated 이라고 하고
 f.calculated <- ms.between / ms.within
+# 이 값을 출력해 본다
 f.calculated
+# 이 계산은 차이와 랜덤에러의 비율이
+# df에 따라서 얼마나 되어야 그 차이가
+# 충분히 큰 것인지를 판단하기 위해서
+# 쓰인다. 여기서 df에는 두 가지 종류가
+# 있다. df.between 그리고 df.within
+# percentage of f distribution with
+# df1 and df2 option
+# 이는 그림의 왼쪽을 나타내므로
+# 차이가 점점 커지게 되는 오른쪽을
+# 계산하기 위해서는 1-x를 취한다
+f.calculated.pvalue <- 1-pf(f.calculated, df1=df.between, df2=df.within)
+f.calculated.pvalue
 # 한편,  t test를 했었을 때 (A, B 그룹을 가지고 independent
 # samples t-test를) 아웃 풋은
 t.result
+# 그리고 f 계산에서의 p value는 t test에서의 p.value와 같다
+f.calculated.pvalue
+t.result$p.value
+# 또한
 # 여기에서 t 값은 t.result$statistic 으로 프린트아웃할 수 있다
 # 이 값이 2.33333 이었다
@@ Line 194: / Line 243: @@
 t.calculated
+# Now check this
+ss.total
+ss.between
+ss.within
+ss.total
+ss.between + ss.within
+# 한편 df는
+# df.total  30 - 1
+df.total
+df.between
+df.within
+df.total
+df.between + df.within
 # 한 편
@@ Line 229: / Line 292: @@
 sqrt(a.res.sum[[1]][1,4])
 t.result$statistic
 </code>
 ====== output ======
+<code>
 > # from the quiz questions
 > # stu should understand the logic of the ttest
@@ Line 424: / Line 491: @@
 > # 이 그룹 간 차이에 기인하는 분산 값은
 > ms.between <- ss.between / df.between
+> ms.between
+[1] 392
 >
 > # 한편 ss.a 와 ss.b는 각 그룹 내의
@@ Line 434: / Line 503: @@
 > df.within <- df.a + df.b
 > ms.within <- ss.within / df.within
+>
+> ms.within
+[1] 72
 >
 > # 여기까지 우리는