====== R ======


# see http://commres.net/wiki/r/t-test 문서 for detailed info
# ?t.test for help
# t.test(A, mu) # one sample t-test
# t.test(A, B, var.equal = T) # two sample t-test 
# t.test(pre, post, paired=TRUE) # paired sample t-test (repeated measure)


# t-test 이해 확인
# pre-medicine; post-medicine 
# Hypothesis: There will be a difference between the pre scores and post scores.
pre <- c(3,0,6,7,4,3,2,8,4)
post <- c(5,2,5,7,10,9,7,11,8)
mean(pre)
mean(post)
sd(pre)
sd(post)

diff.prepost <- pre-post
mean.diff <- mean(diff.prepost)
mean.diff 
sd.diff <- sd(diff.prepost)
sd.diff

#
# remember t test = diff / rand error
#
se <- sd.diff/sqrt(length(diff.prepost))
se                          
t.value <- mean.diff/se
t.value
df.value <- length(diff.prepost)-1
df.value

# pt function is for getting percentage of 
# t score with df value
pt(t.value, df=df.value)
2*pt(t.value, df=df.value)

qt(.975, df=df.value)
qt(.025, df=df.value)
test.output <- t.test(pre,post, paired=T)
test.output
# for the reference
# test.output$
str(test.output)



# from the quiz questions
# stu should understand the logic of the ttest 

set.seed(101)
rnorm2 <- function(n,mean,sd){ mean+sd*scale(rnorm(n)) }
A <- rnorm2(16, 26, sqrt(1160/15))
B <- rnorm2(16, 19, sqrt(1000/15))
A <- c(A)
B <- c(B)
# we know sqrt(1160/15) is A's sdev
# hence, A's var is sqrt(1160/15)^2
# hence, A's SS is sqrt(1160/15)^2 * 15
# this is 1160

# from the above,
# the difference between the A and B means
# remember we try to find
# difference due to the treatment / 
# / random chance of error
diff <- 26 - 19

# for se
# we know that the situation refers to
# #2 se two independent samples t-test
# which is sqrt(pooled.var/na + pooled.var/nb)

SSa <- 1160
SSb <- 1000
n.a <- 16
n.b <- 16
df.a <- n.a - 1 
df.b <- n.b - 1

# we know that we are testing the difference 
# between two independent sample means.
# Hence, we need to use poole variance between 
# the two group. See
# http://commres.net/wiki/t-test#t-test_%EB%B9%84%EA%B5%90
pooled.var <- (SSa + SSb) / (df.a + df.b)
se <- sqrt(pooled.var/n.a + pooled.var/n.b)
# Remember t test calculation is based on 
# diff / random error 
t.calculated <- diff / se
pooled.var
diff
se
t.calculated

# Now use t.test function for two group
# (independent sample) t-test
# with an assumption that variances of 
# the two gorup are the same.
t.result <- t.test(A, B, var.equal = T)
t.result

# t.result$statistic = t.calculated
# t.result$p.value = probability level of 
# wrong decision with the t calculated value
str(t.result)
t.result$statistic
t.result$p.value

# the above p.value can be obtained with 
# pt function
p.value <- 2*pt(-t.result$statistic, df = df.a + df.b)
p.value 
t.result$p.value

##

## different approach
# 

# A combined group with group A and B
# We call it group total
# we can obtain its mean, variance, ss, df, etc.
# 
A
B
dat <- c(A,B)
dat

mean.total <- mean(dat)
var.total <- var(dat)
# variance를 ms라고 부르기도 한다
ms.total <- var.total

df.total <- length(dat)-1
ss.total <- var.total*df.total
ss.total.check <- sum((dat-mean(dat))^2)

mean.total
var.total
ms.total
df.total 
ss.total
ss.total.check

# Now for each group
mean.a <- mean(A)
mean.b <- mean(B)
mean.a
mean.b

# 그룹 간의 차이에서 나타나는 분산
# 수업시간에 설명을 잘 들을 것

# mean.total 에서 그룹a의 평균까지의 차이를 구한 후
# 이를 제곱하여 그룹 A 멤버의 숫자만큼 더한다 = 
# 즉, SS를 구하는 방법. 
# 전체평균에서 그룹평균을 뺀 것의 제곱을 
# 그룹 구성원 숫자만큼 더하는 것
# 그리고 이들을 다시 모두 더하여 
# ss.between에 저장

length(A) * ((mean.total - mean.a)^2)
length(B) * ((mean.total - mean.b)^2)

ss.between <- 
  length(A)*((mean.total - mean.a)^2) + 
  length(B)*((mean.total - mean.b)^2)
ss.between
# df between group은 연구에 사용된 
# 그룹의 숫자에서 1을 뺀 숫자
df.between <- 2 - 1
# 이 그룹 간 차이에 기인하는 분산 값은
ms.between <- ss.between / df.between

# 한편 ss.a 와 ss.b는 각 그룹 내의 
# 분산을 알아보기 위한 방법
ss.a <- var(A) * df.a
ss.b <- var(B) * df.b
ss.within <- ss.a + ss.b
df.a <- length(A)-1
df.b <- length(B)-1
df.within <- df.a + df.b
ms.within <- ss.within / df.within

# 여기까지 우리는 
# 전체분산
# 그룹간분산
# 그룹내분산 값을 
# 구한 것

# ms.between은 그룹의 차이때문에 생긴
# 분산으로 IV 혹은 treatment 때문에 생기는
# 차이에 기인하는 분산이고

# ms.within은 각 그룹 내부에서 일어나는 분산이므로
# (variation이므로) 연구자의 관심사와는 상관이 없이
# 나타나는 random한 분산이라고 하면 

# t test 때와 마찬가지로 
# 그룹의 차이 / 랜덤 차이를 (에러 -> 분산은 에러라고도 했다)
# 구해볼 수 있다. 

# 즉, 그룹갑분산은 사실 = diff (between groups) 
# 그리고 그룹내 분산은 사실 = re
# 따라서 우리는 위 둘 간의 비율을 t test와 같이 
# 살펴볼 수 있다


# 이것을 f.calculated 이라고 하고
f.calculated <- ms.between / ms.within
# 이 값을 출력해 본다
f.calculated
# 이 계산은 차이와 랜덤에러의 비율이
# df에 따라서 얼마나 되어야 그 차이가
# 충분히 큰 것인지를 판단하기 위해서
# 쓰인다. 여기서 df에는 두 가지 종류가
# 있다. df.between 그리고 df.within

# percentage of f distribution with
# df1 and df2 option
# 이는 그림의 왼쪽을 나타내므로 
# 차이가 점점 커지게 되는 오른쪽을 
# 계산하기 위해서는 1-x를 취한다
f.calculated.pvalue <- 1-pf(f.calculated, df1=df.between, df2=df.within)
f.calculated.pvalue
# 한편,  t test를 했었을 때 (A, B 그룹을 가지고 independent 
# samples t-test를) 아웃 풋은 
t.result 

# 그리고 f 계산에서의 p value는 t test에서의 p.value와 같다
f.calculated.pvalue
t.result$p.value

# 또한
# 여기엣 t 값은 t.result$statistic 으로 프린트아웃할 수 있다
# 이 값이 2.33333 이었다
t.result$statistic
# 혹은 우리가 계산한 값이었던 
# t.calculated
t.calculated

# 그런데 위의 값을 제곱한 값이 바로 f.calculated 값
f.calculated
t.calculated^2

# 혹은 f.calculated 값을 제곱근한 값이 t.calculated
sqrt(f.calculated) 
t.calculated

# Now check this
ss.total
ss.between
ss.within
ss.total 
ss.between + ss.within

# 한편 df는 
# df.total  30 - 1
df.total 
df.between
df.within
df.total 
df.between + df.within

# 한 편

A 
B
comb <- stack(list(a=A, b=B))
comb
colnames(comb)[1] <- "values"
colnames(comb)[2] <- "group"
comb

a.res <- aov(values ~ group, data=comb)
a.res.sum <- summary(a.res)
a.res.sum
# 위에서 F value는 5.444 
# 그리고 전체적인 아웃풋을 보면 
# Df group 과 Df Residuals
# Sum Sq group 과 Residuals
# Mean Sq (MS) group 과 MS residuals

# MS.group = 
ms.between

# MS.within = 
ms.within

# F value
ms.between / ms.within 
f.calculated

# 아래는 기존의 아웃풋에서 확인하는 것
str(a.res.sum)
a.res.sum[[1]][1,4]
sqrt(a.res.sum[[1]][1,4])
t.result$statistic

====== output ======


> # from the quiz questions
> # stu should understand the logic of the ttest 
> 
> set.seed(101)
> rnorm2 <- function(n,mean,sd){ mean+sd*scale(rnorm(n)) }
> A <- rnorm2(16, 26, sqrt(1160/15))
> B <- rnorm2(16, 19, sqrt(1000/15))
> A <- c(A)
> B <- c(B)
> # we know sqrt(1160/15) is A's sdev
> # hence, A's var is sqrt(1160/15)^2
> # hence, A's SS is sqrt(1160/15)^2 * 15
> # this is 1160
> 
> # from the above,
> # the difference between the A and B means
> # remember we try to find
> # difference due to the treatment / 
> # / random chance of error
> diff <- 26 - 19
> 
> # for se
> # we know that the situation refers to
> # #2 se two independent samples t-test
> # which is sqrt(pooled.var/na + pooled.var/nb)
> 
> SSa <- 1160
> SSb <- 1000
> n.a <- 16
> n.b <- 16
> df.a <- n.a - 1 
> df.b <- n.b - 1
> 
> # we know that we are testing the difference 
> # between two independent sample means.
> # Hence, we need to use poole variance between 
> # the two group. See
> # http://commres.net/wiki/t-test#t-test_%EB%B9%84%EA%B5%90
> pooled.var <- (SSa + SSb) / (df.a + df.b)
> se <- sqrt(pooled.var/n.a + pooled.var/n.b)
> # Remember t test calculation is based on 
> # diff / random error 
> t.calculated <- diff / se
> pooled.var
[1] 72
> diff
[1] 7
> se
[1] 3
> t.calculated
[1] 2.333333
> 
> # Now use t.test function for two group
> # (independent sample) t-test
> # with an assumption that variances of 
> # the two gorup are the same.
> t.result <- t.test(A, B, var.equal = T)
> t.result

	Two Sample t-test

data:  A and B
t = 2.3333, df = 30, p-value = 0.02652
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
  0.8731826 13.1268174
sample estimates:
mean of x mean of y 
       26        19 

> 
> # t.result$statistic = t.calculated
> # t.result$p.value = probability level of 
> # wrong decision with the t calculated value
> str(t.result)
List of 10
 $ statistic  : Named num 2.33
  ..- attr(*, "names")= chr "t"
 $ parameter  : Named num 30
  ..- attr(*, "names")= chr "df"
 $ p.value    : num 0.0265
 $ conf.int   : num [1:2] 0.873 13.127
  ..- attr(*, "conf.level")= num 0.95
 $ estimate   : Named num [1:2] 26 19
  ..- attr(*, "names")= chr [1:2] "mean of x" "mean of y"
 $ null.value : Named num 0
  ..- attr(*, "names")= chr "difference in means"
 $ stderr     : num 3
 $ alternative: chr "two.sided"
 $ method     : chr " Two Sample t-test"
 $ data.name  : chr "A and B"
 - attr(*, "class")= chr "htest"
> t.result$statistic
       t 
2.333333 
> t.result$p.value
[1] 0.02652366
> 
> # the above p.value can be obtained with 
> # pt function
> p.value <- 2*pt(-t.result$statistic, df = df.a + df.b)
> p.value 
         t 
0.02652366 
> t.result$p.value
[1] 0.02652366
> 
> ##
> 
> ## different approach
> # 
> 
> # A combined group with group A and B
> # We call it group total
> # we can obtain its mean, variance, ss, df, etc.
> # 
> A
 [1] 20.994218 31.148068 16.961481 27.240217 28.354539
 [6] 38.331534 31.914700 23.459605 35.361796 22.182136
[11] 30.847396 15.575648 41.264878  7.808831 22.026979
[16] 22.527973
> B
 [1] 12.941146 21.270062 13.235378  1.931364 19.232163
 [6] 27.231465 18.276359  7.308871 27.560815  7.799787
[11] 25.017185 19.639663 25.018756 25.302096 28.941002
[16] 23.293888
> dat <- c(A,B)
> dat
 [1] 20.994218 31.148068 16.961481 27.240217 28.354539
 [6] 38.331534 31.914700 23.459605 35.361796 22.182136
[11] 30.847396 15.575648 41.264878  7.808831 22.026979
[16] 22.527973 12.941146 21.270062 13.235378  1.931364
[21] 19.232163 27.231465 18.276359  7.308871 27.560815
[26]  7.799787 25.017185 19.639663 25.018756 25.302096
[31] 28.941002 23.293888
> 
> mean.total <- mean(dat)
> var.total <- var(dat)
> # variance를 ms라고 부르기도 한다
> ms.total <- var.total
> 
> df.total <- length(dat)-1
> ss.total <- var.total*df.total
> ss.total.check <- sum((dat-mean(dat))^2)
> 
> mean.total
[1] 22.5
> var.total
[1] 82.32258
> ms.total
[1] 82.32258
> df.total 
[1] 31
> ss.total
[1] 2552
> ss.total.check
[1] 2552
> 
> # Now for each group
> mean.a <- mean(A)
> mean.b <- mean(B)
> mean.a
[1] 26
> mean.b
[1] 19
> 
> # 그룹 간의 차이에서 나타나는 분산
> # 수업시간에 설명을 잘 들을 것
> 
> # mean.total 에서 그룹a의 평균까지의 차이를 구한 후
> # 이를 제곱하여 그룹 A 멤버의 숫자만큼 더한다 = 
> # 즉, SS를 구하는 방법. 
> # 전체평균에서 그룹평균을 뺀 것의 제곱을 
> # 그룹 구성원 숫자만큼 더하는 것
> # 그리고 이들을 다시 모두 더하여 
> # ss.between에 저장
> 
> length(A) * ((mean.total - mean.a)^2)
[1] 196
> length(B) * ((mean.total - mean.b)^2)
[1] 196
> 
> ss.between <- 
+   length(A)*((mean.total - mean.a)^2) + 
+   length(B)*((mean.total - mean.b)^2)
> ss.between
[1] 392
> # df between group은 연구에 사용된 
> # 그룹의 숫자에서 1을 뺀 숫자
> df.between <- 2 - 1
> # 이 그룹 간 차이에 기인하는 분산 값은
> ms.between <- ss.between / df.between
> ms.between
[1] 392
> 
> # 한편 ss.a 와 ss.b는 각 그룹 내의 
> # 분산을 알아보기 위한 방법
> ss.a <- var(A) * df.a
> ss.b <- var(B) * df.b
> ss.within <- ss.a + ss.b
> df.a <- length(A)-1
> df.b <- length(B)-1
> df.within <- df.a + df.b
> ms.within <- ss.within / df.within
> 
> ms.within
[1] 72
> 
> # 여기까지 우리는 
> # 전체분산
> # 그룹간분산
> # 그룹내분산 값을 
> # 구한 것
> 
> # ms.between은 그룹의 차이때문에 생긴
> # 분산으로 IV 혹은 treatment 때문에 생기는
> # 차이에 기인하는 분산이고
> 
> # ms.within은 각 그룹 내부에서 일어나는 분산이므로
> # (variation이므로) 연구자의 관심사와는 상관이 없이
> # 나타나는 random한 분산이라고 하면 
> 
> # t test 때와 마찬가지로 
> # 그룹의 차이 / 랜덤 차이를 (에러 -> 분산은 에러라고도 했다)
> # 구해볼 수 있다. 
> 
> # 즉, 그룹갑분산은 사실 = diff (between groups) 
> # 그리고 그룹내 분산은 사실 = re
> # 따라서 우리는 위 둘 간의 비율을 t test와 같이 
> # 살펴볼 수 있다
> 
> 
> # 이것을 f.calculated 이라고 하고
> f.calculated <- ms.between / ms.within
> # 이 값을 출력해 본다
> f.calculated
[1] 5.444444
> # 이 계산은 차이와 랜덤에러의 비율이
> # df에 따라서 얼마나 되어야 그 차이가
> # 충분히 큰 것인지를 판단하기 위해서
> # 쓰인다. 여기서 df에는 두 가지 종류가
> # 있다. df.between 그리고 df.within
> 
> # percentage of f distribution with
> # df1 and df2 option
> # 이는 그림의 왼쪽을 나타내므로 
> # 차이가 점점 커지게 되는 오른쪽을 
> # 계산하기 위해서는 1-x를 취한다
> f.calculated.pvalue <- 1-pf(f.calculated, df1=df.between, df2=df.within)
> f.calculated.pvalue
[1] 0.02652366
> # 한편,  t test를 했었을 때 (A, B 그룹을 가지고 independent 
> # samples t-test를) 아웃 풋은 
> t.result 

	Two Sample t-test

data:  A and B
t = 2.3333, df = 30, p-value = 0.02652
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
  0.8731826 13.1268174
sample estimates:
mean of x mean of y 
       26        19 

> 
> # 그리고 f 계산에서의 p value는 t test에서의 p.value와 같다
> f.calculated.pvalue
[1] 0.02652366
> t.result$p.value
[1] 0.02652366
> 
> # 또한
> # 여기엣 t 값은 t.result$statistic 으로 프린트아웃할 수 있다
> # 이 값이 2.33333 이었다
> t.result$statistic
       t 
2.333333 
> # 혹은 우리가 계산한 값이었던 
> # t.calculated
> t.calculated
[1] 2.333333
> 
> # 그런데 위의 값을 제곱한 값이 바로 f.calculated 값
> f.calculated
[1] 5.444444
> t.calculated^2
[1] 5.444444
> 
> # 혹은 f.calculated 값을 제곱근한 값이 t.calculated
> sqrt(f.calculated) 
[1] 2.333333
> t.calculated
[1] 2.333333
> 
> # Now check this
> ss.total
[1] 2552
> ss.between
[1] 392
> ss.within
[1] 2160
> ss.total 
[1] 2552
> ss.between + ss.within
[1] 2552
> 
> # 한편 df는 
> # df.total  30 - 1
> df.total 
[1] 31
> df.between
[1] 1
> df.within
[1] 30
> df.total 
[1] 31
> df.between + df.within
[1] 31
> 
> # 한 편
> 
> A 
 [1] 20.994218 31.148068 16.961481 27.240217 28.354539
 [6] 38.331534 31.914700 23.459605 35.361796 22.182136
[11] 30.847396 15.575648 41.264878  7.808831 22.026979
[16] 22.527973
> B
 [1] 12.941146 21.270062 13.235378  1.931364 19.232163
 [6] 27.231465 18.276359  7.308871 27.560815  7.799787
[11] 25.017185 19.639663 25.018756 25.302096 28.941002
[16] 23.293888
> comb <- stack(list(a=A, b=B))
> comb
      values ind
1  20.994218   a
2  31.148068   a
3  16.961481   a
4  27.240217   a
5  28.354539   a
6  38.331534   a
7  31.914700   a
8  23.459605   a
9  35.361796   a
10 22.182136   a
11 30.847396   a
12 15.575648   a
13 41.264878   a
14  7.808831   a
15 22.026979   a
16 22.527973   a
17 12.941146   b
18 21.270062   b
19 13.235378   b
20  1.931364   b
21 19.232163   b
22 27.231465   b
23 18.276359   b
24  7.308871   b
25 27.560815   b
26  7.799787   b
27 25.017185   b
28 19.639663   b
29 25.018756   b
30 25.302096   b
31 28.941002   b
32 23.293888   b
> colnames(comb)[1] <- "values"
> colnames(comb)[2] <- "group"
> comb
      values group
1  20.994218     a
2  31.148068     a
3  16.961481     a
4  27.240217     a
5  28.354539     a
6  38.331534     a
7  31.914700     a
8  23.459605     a
9  35.361796     a
10 22.182136     a
11 30.847396     a
12 15.575648     a
13 41.264878     a
14  7.808831     a
15 22.026979     a
16 22.527973     a
17 12.941146     b
18 21.270062     b
19 13.235378     b
20  1.931364     b
21 19.232163     b
22 27.231465     b
23 18.276359     b
24  7.308871     b
25 27.560815     b
26  7.799787     b
27 25.017185     b
28 19.639663     b
29 25.018756     b
30 25.302096     b
31 28.941002     b
32 23.293888     b
> 
> a.res <- aov(values ~ group, data=comb)
> a.res.sum <- summary(a.res)
> a.res.sum
            Df Sum Sq Mean Sq F value Pr(>F)  
group        1    392     392   5.444 0.0265 *
Residuals   30   2160      72                 
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> # 위에서 F value는 5.444 
> # 그리고 전체적인 아웃풋을 보면 
> # Df group 과 Df Residuals
> # Sum Sq group 과 Residuals
> # Mean Sq (MS) group 과 MS residuals
> 
> # MS.group = 
> ms.between
[1] 392
> 
> # MS.within = 
> ms.within
[1] 72
> 
> # F value
> ms.between / ms.within 
[1] 5.444444
> f.calculated
[1] 5.444444
> 
> # 아래는 기존의 아웃풋에서 확인하는 것
> str(a.res.sum)
List of 1
 $ :Classes ‘anova’ and 'data.frame':	2 obs. of  5 variables:
  ..$ Df     : num [1:2] 1 30
  ..$ Sum Sq : num [1:2] 392 2160
  ..$ Mean Sq: num [1:2] 392 72
  ..$ F value: num [1:2] 5.44 NA
  ..$ Pr(>F) : num [1:2] 0.0265 NA
 - attr(*, "class")= chr [1:2] "summary.aov" "listof"
> a.res.sum[[1]][1,4]
[1] 5.444444
> sqrt(a.res.sum[[1]][1,4])
[1] 2.333333
> t.result$statistic
       t 
2.333333