t-test summing up

# NOTE(review): clearing the workspace with rm(list = ls()) inside a script
# is generally discouraged (it wipes the user's session); kept as-is here.
rm(list=ls())
rnorm2 <- function(n, mean, sd) {
  # Generate n draws whose sample mean and sd are *exactly* the requested
  # values: standardize a normal sample with scale(), then shift/rescale.
  # scale() returns a 1-column matrix; as.numeric() drops the matrix class
  # so downstream code gets a plain vector (the original leaked a matrix,
  # forcing the `se.z <- c(se.z)` workaround later in this script and the
  # 1x1-matrix print of sqrt(var(p1)/s.size)).
  as.numeric(mean + sd * scale(rnorm(n)))
}
se <- function(sample) {
  # Standard error of the mean: sample sd divided by sqrt(n).
  n <- length(sample)
  sd(sample) / sqrt(n)
}
ss <- function(x) {
  # Sum of squared deviations from the mean (the SS used in ANOVA below).
  dev <- x - mean(x)
  sum(dev * dev)
}

# Population parameters: one million values, mean 100, sd 10.
N.p <- 1000000
m.p <- 100
sd.p <- 10


# p1: population with mean 100. mean()/sd() confirm the parameters are
# exact, not approximate, because rnorm2 standardizes its draws.
p1 <- rnorm2(N.p, m.p, sd.p)
mean(p1)
sd(p1)

# p2: second population shifted +10 (mean 110), same sd.
p2 <- rnorm2(N.p, m.p+10, sd.p)
mean(p2)
sd(p2)

# Overlaid histograms of the two populations, with vertical lines at
# each population mean.
hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5),
     main = "histogram of p1 and p2",)
abline(v=mean(p1), col="black", lwd=3)
hist(p2, breaks=50, add=TRUE, col = rgb(0, 0, 1, 0.5))
abline(v=mean(p2), col="red", lwd=3)

# One-sample z test: draw a sample of 100 from p2 and test it against the
# p1 population mean, using the known population variance of p1.
s.size <- 100
s2 <- sample(p2, s.size)
mean(s2)
sd(s2)

# Standard error of the mean under H0 (population variance known).
se.z <- sqrt(var(p1)/s.size)
se.z <- c(se.z)  # strip any matrix attributes so se.z is a plain scalar
se.z.range <- c(-2*se.z, 2*se.z)
se.z.range

# Approximate 95% band around the H0 mean vs. the observed sample mean.
mean(p1)+se.z.range
mean(s2)

z.cal <- (mean(s2) - mean(p1)) / se.z
z.cal
# Two-tailed p value. abs() makes this correct even if the sample mean
# falls below the population mean (the original assumed z.cal > 0);
# TRUE/FALSE spelled out instead of the reassignable F.
pnorm(abs(z.cal), lower.tail = FALSE) * 2

z.cal

# principles . . .
# Distribution of sample means: repeatedly sample from p1 and record the
# sample mean, to visualize the sampling distribution under H0.
iter <- 10000
# replicate() avoids growing a vector with append() inside a loop,
# which copied the whole vector on every iteration (O(n^2)).
means <- replicate(iter, mean(sample(p1, s.size)))

hist(means, 
     xlim = c(mean(means)-3*sd(means), mean(s2)+5), 
     col = rgb(1, 0, 0, 0.5))
abline(v=mean(p1), col="black", lwd=3)
abline(v=mean(s2), 
       col="blue", lwd=3)
# 1-SE (green) and 2-SE (brown) bands around the H0 mean.
lo1 <- mean(p1)-se.z
hi1 <- mean(p1)+se.z
lo2 <- mean(p1)-2*se.z
hi2 <- mean(p1)+2*se.z
abline(v=c(lo1, hi1, lo2, hi2), 
       col=c("green","green", "brown", "brown"), 
       lwd=2)
se.z
c(lo2, hi2)
# Two-tailed p; abs() guards against a negative z (consistent with the
# one-sample t computation later in the file).
pnorm(abs(z.cal), lower.tail = FALSE) * 2


# Note that we use sqrt(var(s2)/s.size)
# as our se value instead of
# sqrt(var(p1)/s.size).
# This is a common practice in R;
# in fact, some z.test implementations (e.g. BSDA's,
# used below) use the sample-based se rather than
# the population-based one.

# Population-based se (matches se.z above).
sqrt(var(p1)/s.size)
se.z

# Sample-based se (matches the se() helper).
sqrt(var(s2)/s.size)
se(s2)

# One-sample t statistic (sample-based se) vs. z statistic
# (population-based se) for the same mean difference.
t.cal.os <- (mean(s2) - mean(p1))/ se(s2)
z.cal <- (mean(s2) - mean(p1))/ se.z
t.cal.os
z.cal 

# Two-tailed p from the t distribution with n - 1 degrees of freedom,
# then the built-in t.test for comparison.
df.s2 <- length(s2)-1
df.s2
p.t.os <- pt(abs(t.cal.os), df.s2, lower.tail = F) * 2
p.t.os
t.out <- t.test(s2, mu=mean(p1))

library(BSDA)
# BSDA's z.test uses the supplied sigmas; here we pass the sample sd of
# s2 and the population sd of p1.
z.out <- z.test(s2, p1, sigma.x = sd(s2), sigma.y = sd(p1))

z.out$statistic # z computed with se(s2) instead of se.z
z.cal # z computed with se.z (i.e., sqrt(var(p1)/s.size))

t.out$statistic # t from t.test(), with se(s2) in the denominator
t.cal.os # t computed by hand using se(s2)

# But, after all, we use the t.test method regardless of 
# which variance estimate is used.

# Same sampling-distribution histogram as above, re-drawn for reference.
hist(means, 
     xlim = c(mean(means)-3*sd(means), mean(s2)+5), 
     col = rgb(1, 0, 0, 0.5))
abline(v=mean(p1), col="black", lwd=3)
abline(v=mean(s2), 
       col="blue", lwd=3)
lo1 <- mean(p1)-se.z
hi1 <- mean(p1)+se.z
lo2 <- mean(p1)-2*se.z
hi2 <- mean(p1)+2*se.z
abline(v=c(lo1, hi1, lo2, hi2), 
       col=c("green","green", "brown", "brown"), 
       lwd=2)

# The t value is the observed difference (black vs. blue line)
# divided by
# se(s2) (= the random, chance-level difference).
# t.value
mean(s2)
mean(p1)
diff <- mean(s2)-mean(p1)
diff
se(s2)
diff/se(s2)

t.cal.os

########################
# 2 sample t-test
########################
# Assumption: the population parameters of the two
# samples drawn below are unknown.
s1 <- sample(p1, s.size)
s2 <- sample(p2, s.size)

mean(s1)
mean(s2)
ss(s1)
ss(s2)
df.s1 <- length(s1)-1
df.s2 <- length(s2)-1
df.s1
df.s2

# Pooled variance: combined sum of squares over combined df.
pooled.variance <- (ss(s1)+ss(s2))/(df.s1+df.s2)
pooled.variance
# Standard error of the difference between two independent means.
se.ts <- sqrt((pooled.variance/length(s1))+(pooled.variance/length(s2)))
se.ts
t.cal.ts <- (mean(s1)-mean(s2))/se.ts
t.cal.ts
# TRUE/FALSE spelled out (T/F are ordinary reassignable variables).
p.val.ts <- pt(abs(t.cal.ts), df=df.s1+df.s2, lower.tail = FALSE) * 2
p.val.ts

# Built-in equal-variance t-test should reproduce the hand computation.
t.test(s1, s2, var.equal = TRUE)

# Sample-level standard errors and rough 2-SE intervals for each mean,
# compared against the true population means.
se(s1)
se(s2)

mean(s1)+c(-se(s1)*2, se(s1)*2)
mean(s2)+c(-se(s2)*2, se(s2)*2)

mean(p1)
mean(p2)

# Overlaid sample histograms with each sample mean marked.
hist(s1, breaks=50,
     col = rgb(1, 0, 0, 0.5))
hist(s2, breaks=50, add=T, col=rgb(0,0,1,1))
abline(v=mean(s1), col="green", lwd=3)
# hist(s2, breaks=50, add=TRUE, col = rgb(0, 0, 1, 0.5))
abline(v=mean(s2), col="lightblue", lwd=3)

# The t value again: mean difference over its standard error.
diff <- mean(s1)-mean(s2)
se.ts
diff/se.ts

####
# Repeated-measures (paired) t-test.
# We reuse the case above: population parameters are
# unknown, and the two samples are treated as two
# consecutive measurements of the same subjects.

t1 <- s1
t2 <- s2
mean(t1)
mean(t2)
# Per-subject difference scores; the paired t-test is a one-sample
# t-test of these differences against 0.
diff.s <- t1 - t2
diff.s
t.cal.rm <- mean(diff.s)/se(diff.s)
t.cal.rm
# TRUE/FALSE spelled out instead of the reassignable F/T.
p.val.rm <- pt(abs(t.cal.rm), length(s1)-1, lower.tail = FALSE) * 2
p.val.rm
t.test(s1, s2, paired = TRUE)

# create multiple histogram
# Pool both samples and plot them together with the grand mean (red)
# and the two group means (black, green).
s.all <- c(s1,s2)
mean(s.all)
hist(s1, col='grey', breaks=50, xlim=c(50, 150))
hist(s2, col='darkgreen', breaks=50, add=TRUE)
abline(v=c(mean(s.all)), 
       col=c("red"), lwd=3)
abline(v=c(mean(s1), mean(s2)), 
       col=c("black", "green"), lwd=3)

# Long format: a `values` column plus an `ind` factor (s1/s2),
# the layout aov() expects below.
comb <- data.frame(s1,s2)
dat <- stack(comb)
head(dat)

# Grand mean and the two group means.
m.tot <- mean(s.all)
m.s1 <- mean(s1)
m.s2 <- mean(s2)

# Sums of squares: total, and within each group.
ss.tot <- ss(s.all)
ss.s1 <- ss(s1)
ss.s2 <- ss(s2)

df.tot <- length(s.all)-1
df.s1 <- length(s1)-1
df.s2 <- length(s2)-1

# Mean square = SS / df; equals var() for each vector.
ms.tot <- var(s.all)
ms.tot
ss.tot/df.tot

var(s1)
ss.s1 / df.s1

var(s2)
ss.s2 / df.s2

# Between-group SS: n * (grand mean - group mean)^2 per group.
# BUG FIX: the second term used m.s1 instead of m.s2. The original still
# produced the correct total only because with two equal-sized groups
# (m.tot - m.s1)^2 == (m.tot - m.s2)^2; with unequal n it would be wrong.
ss.b.s1 <- length(s1) * ((m.tot - m.s1)^2)
ss.b.s2 <- length(s2) * ((m.tot - m.s2)^2)
ss.bet <- ss.b.s1+ss.b.s2
ss.bet

ss.wit <- ss.s1 + ss.s2
ss.wit

# Partition check: SS_between + SS_within == SS_total.
ss.bet + ss.wit
ss.tot

library(dplyr)  # NOTE(review): dplyr looks unused here; nlevels() is base R
# df.bet <- length(unique(dat)) - 1
# Degrees of freedom: between = number of groups - 1; within = sum of
# the per-group dfs. Together they must add up to df.tot.
df.bet <- nlevels(dat$ind) - 1
df.wit <- df.s1+df.s2
df.bet
df.wit
df.bet+df.wit
df.tot 

ms.bet <- ss.bet / df.bet
ms.wit <- ss.wit / df.wit
ms.bet
ms.wit

# F ratio and its p value. With two groups this reproduces the
# equal-variance two-sample t-test: F = t^2, identical p.
f.cal <- ms.bet / ms.wit
f.cal
pf(f.cal, df.bet, df.wit, lower.tail = FALSE)


# Use bare column names in the formula together with `data =`;
# `dat$values ~ dat$ind` alongside data = dat is redundant and breaks
# downstream tools such as predict().
f.test <- aov(values ~ ind, data = dat)
summary(f.test)

sqrt(f.cal)
t.cal.ts

# this is anova after all. 

t-test summing up output

> rm(list=ls())
> rnorm2 <- function(n,mean,sd){ 
+   mean+sd*scale(rnorm(n)) 
+ }
> se <- function(sample) {
+   sd(sample)/sqrt(length(sample))
+ }
> ss <- function(x) {
+   sum((x-mean(x))^2)
+ }
> 

EXP.

  • 필요한 펑션들
  • se standard error 값을 구하는 펑션
  • ss sum of square 값을 구하는 펑션
> N.p <- 1000000
> m.p <- 100
> sd.p <- 10
> 
> 
> p1 <- rnorm2(N.p, m.p, sd.p)
> mean(p1)
[1] 100
> sd(p1)
[1] 10
> 
> p2 <- rnorm2(N.p, m.p+10, sd.p)
> mean(p2)
[1] 110
> sd(p2)
[1] 10
> 
> hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5),
+      main = "histogram of p1 and p2",)
> abline(v=mean(p1), col="black", lwd=3)
> hist(p2, breaks=50, add=TRUE, col = rgb(0, 0, 1, 0.5))
> abline(v=mean(p2), col="red", lwd=3)
> 

> s.size <- 100
> s2 <- sample(p2, s.size)
> mean(s2)
[1] 111.4782
> sd(s2)
[1] 9.715675
> 
> se.z <- sqrt(var(p1)/s.size)
> se.z <- c(se.z)
> se.z.range <- c(-2*se.z,2*se.z)
> se.z.range
[1] -2  2
> 

> mean(p1) + se.z.range
[1]  98 102
> mean(s2)
[1] 111.4782
>
 
> z.cal <- (mean(s2) - mean(p1)) / se.z
> z.cal
[1] 11.47817
> pnorm(z.cal, lower.tail = F) * 2
[1] 1.698319e-30
> 
> z.cal 
[1] 11.47817
> 
> # principles . . . 
> # distribution of sample means 
> iter <- 10000
> means <- c()
> for (i in 1:iter) {
+   m.of.s <- mean(sample(p1, s.size))
+   means <- append(means, m.of.s)
+ }
> 
> hist(means, 
+      xlim = c(mean(means)-3*sd(means), mean(s2)+5), 
+      col = rgb(1, 0, 0, 0.5))
> abline(v=mean(p1), col="black", lwd=3)
> abline(v=mean(s2), 
+        col="blue", lwd=3)
> lo1 <- mean(p1)-se.z
> hi1 <- mean(p1)+se.z
> lo2 <- mean(p1)-2*se.z
> hi2 <- mean(p1)+2*se.z
> abline(v=c(lo1, hi1, lo2, hi2), 
+        col=c("green","green", "brown", "brown"), 
+        lwd=2)
> se.z
[1] 1
> c(lo2, hi2)
[1]  98 102
> pnorm(z.cal, lower.tail = F) * 2
[1] 1.698319e-30
> 
> 
> # Note that we use sqrt(var(s2)/s.size)
> # as our se value instead of 
> # sqrt(var(p1)/s.size)
> # This is a common practice for R
> # In fact, some z.test (made by someone)
> # function uses the former rather than
> # latter.
> 
> sqrt(var(p1)/s.size)
     [,1]
[1,]    1
> se.z
[1] 1
> 
> sqrt(var(s2)/s.size)
[1] 0.9715675
> se(s2)
[1] 0.9715675
> 
> t.cal.os <- (mean(s2) - mean(p1))/ se(s2)
> z.cal <- (mean(s2) - mean(p1))/ se.z
> t.cal.os
[1] 11.81408
> z.cal 
[1] 11.47817
> 
> df.s2 <- length(s2)-1
> df.s2
[1] 99
> p.t.os <- pt(abs(t.cal.os), df.s2, lower.tail = F) * 2
> p.t.os
[1] 1.282856e-20
> t.out <- t.test(s2, mu=mean(p1))
> 
> library(BSDA)
> z.out <- z.test(s2, p1, sigma.x = sd(s2), sigma.y = sd(p1))
> 
> z.out$statistic # se.z 대신에 se(s2) 값으로 구한 z 값
       z 
11.81345 
> z.cal # se.z으로 (sqrt(var(p1)/s.size)값) 구한 z 값
[1] 11.47817
> 
> t.out$statistic # se(s2)를 분모로 하여 구한 t 값
       t 
11.81408 
> t.cal.os # se(s2)를 이용하여 손으로 구한 t 값
[1] 11.81408
> 
> # But, after all, we use t.test method regardless of 
> # variation
> 
> hist(means, 
+      xlim = c(mean(means)-3*sd(means), mean(s2)+5), 
+      col = rgb(1, 0, 0, 0.5))
> abline(v=mean(p1), col="black", lwd=3)
> abline(v=mean(s2), 
+        col="blue", lwd=3)
> lo1 <- mean(p1)-se.z
> hi1 <- mean(p1)+se.z
> lo2 <- mean(p1)-2*se.z
> hi2 <- mean(p1)+2*se.z
> abline(v=c(lo1, hi1, lo2, hi2), 
+        col=c("green","green", "brown", "brown"), 
+        lwd=2)
> 
> # difference between black and blue line
> # divided by
> # se(s2) (= random difference)
> # t.value
> mean(s2)
[1] 111.4782
> mean(p1)
[1] 100
> diff <- mean(s2)-mean(p1)
> diff
[1] 11.47817
> se(s2)
[1] 0.9715675
> diff/se(s2)
[1] 11.81408
> 
> t.cal.os
[1] 11.81408
> 
> ########################
> # 2 sample t-test
> ########################
> # 가정. 아래에서 추출하는 두 
> # 샘플의 모집단의 파라미터를
> # 모른다. 
> s1 <- sample(p1, s.size)
> s2 <- sample(p2, s.size)
> 
> mean(s1)
[1] 99.76674
> mean(s2)
[1] 109.8031
> ss(s1)
[1] 9450.294
> ss(s2)
[1] 9666.344
> df.s1 <- length(s1)-1
> df.s2 <- length(s2)-1
> df.s1
[1] 99
> df.s2
[1] 99
> 
> pooled.variance <- (ss(s1)+ss(s2))/(df.s1+df.s2)
> pooled.variance
[1] 96.54867
> se.ts <- sqrt((pooled.variance/length(s1))+(pooled.variance/length(s2)))
> se.ts
[1] 1.389595
> t.cal.ts <- (mean(s1)-mean(s2))/se.ts
> t.cal.ts
[1] -7.22252
> p.val.ts <- pt(abs(t.cal.ts), df=df.s1+df.s2, lower.tail = F) * 2
> p.val.ts
[1] 1.074129e-11
> 
> t.test(s1, s2, var.equal = T)

	Two Sample t-test

data:  s1 and s2
t = -7.2225, df = 198, p-value = 1.074e-11
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -12.776681  -7.296071
sample estimates:
mean of x mean of y 
 99.76674 109.80312 

> 
> se(s1)
[1] 0.9770236
> se(s2)
[1] 0.9881287
> 
> mean(s1)+c(-se(s1)*2, se(s1)*2)
[1]  97.8127 101.7208
> mean(s2)+c(-se(s2)*2, se(s2)*2)
[1] 107.8269 111.7794
> 
> mean(p1)
[1] 100
> mean(p2)
[1] 110
> 
> hist(s1, breaks=50,
+      col = rgb(1, 0, 0, 0.5))
> hist(s2, breaks=50, add=T, col=rgb(0,0,1,1))
> abline(v=mean(s1), col="green", lwd=3)
> # hist(s2, breaks=50, add=TRUE, col = rgb(0, 0, 1, 0.5))
> abline(v=mean(s2), col="lightblue", lwd=3)
> 
> diff <- mean(s1)-mean(s2)
> se.ts
[1] 1.389595
> diff/se.ts
[1] -7.22252
> 
> ####
> # repeated measure t-test
> # we can use the above case 
> # pop parameter unknown
> # two consecutive measurement 
> # for the same sample 
> 
> t1 <- s1
> t2 <- s2
> mean(t1)
[1] 99.76674
> mean(t2)
[1] 109.8031
> diff.s <- t1 - t2
> diff.s
  [1] -12.3185215  -5.7888766 -20.4663333  -7.6935641 -15.8599368  -3.8038147 -14.6440996
  [8] -11.3187618  -8.0302577 -13.4860776  -8.1687444 -11.0823726 -22.9605048 -12.0964467
 [15]   2.5297067  -6.8128211 -17.1935822  -1.9813741 -14.3537710 -17.6430691  -7.9493800
 [22] -28.6689697 -33.5474957 -12.9600175  10.3033820 -15.8633058 -26.1378302  -2.9662922
 [29] -19.6455177  10.2363599  -5.7970308 -22.8465012 -20.4341559   4.2259746 -21.0398539
 [36] -20.3535296   6.1674969 -20.7826608  -8.1970613 -10.4180147  24.1827392  15.9046864
 [43]  19.1082815 -30.8755695 -21.5129671  -4.8939917  -1.1518824 -39.6666501  -3.3716806
 [50]  -2.0099206 -22.5440102  -0.5860960  -1.5608043 -13.4030543  -9.8508005 -10.0521662
 [57]  16.1389067  -8.0517121  -2.4607125 -21.1944560 -24.8550535  15.1671317 -10.0196995
 [64]  -8.3894542 -17.5945751 -15.9720408   7.9406532  11.7115683 -33.0230910 -10.2871065
 [71]  -4.9529100  17.7070205   0.2334945 -11.8526240   0.9691715   9.9437888  -4.5845758
 [78]   1.7267771 -22.6725639 -11.4846042 -15.9595932 -28.0616511   2.9437361 -15.2163948
 [85] -11.7369206   1.7589732 -22.5446572 -26.0919775 -12.6457662 -17.2080987   1.6991409
 [92] -16.7465922 -22.9570187 -30.2519337 -23.0903846 -29.1119243  15.1849488 -20.7395068
 [99]  -9.3179223 -25.5558689
> t.cal.rm <- mean(diff.s)/se(diff.s)
> t.cal.rm
[1] -7.637322
> p.val.rm <- pt(abs(t.cal.rm), length(s1)-1, lower.tail = F) * 2
> p.val.rm
[1] 1.423701e-11
> t.test(s1, s2, paired = T)

	Paired t-test

data:  s1 and s2
t = -7.6373, df = 99, p-value = 1.424e-11
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -12.643880  -7.428872
sample estimates:
mean difference 
      -10.03638 

> 
> # create multiple histogram
> s.all <- c(s1,s2)
> mean(s.all)
[1] 104.7849
> hist(s1, col='grey', breaks=50, xlim=c(50, 150))
> hist(s2, col='darkgreen', breaks=50, add=TRUE)
> abline(v=c(mean(s.all)), 
+        col=c("red"), lwd=3)
> abline(v=c(mean(s1), mean(s2)), 
+        col=c("black", "green"), lwd=3)
> 
> comb <- data.frame(s1,s2)
> dat <- stack(comb)
> head(dat)
     values ind
1 103.18507  s1
2  92.73934  s1
3 110.40121  s1
4  99.40216  s1
5 103.37853  s1
6 106.80234  s1
> 
> m.tot <- mean(s.all)
> m.s1 <- mean(s1)
> m.s2 <- mean(s2)
> 
> ss.tot <- ss(s.all)
> ss.s1 <- ss(s1)
> ss.s2 <- ss(s2)
> 
> df.tot <- length(s.all)-1
> df.s1 <- length(s1)-1
> df.s2 <- length(s2)-1
> 
> ms.tot <- var(s.all)
> ms.tot
[1] 121.3723
> ss.tot/df.tot
[1] 121.3723
> 
> var(s1)
[1] 95.45751
> ss.s1 / df.s1
[1] 95.45751
> 
> var(s2)
[1] 97.63983
> ss.s2 / df.s2
[1] 97.63983
> 
> ss.b.s1 <- length(s1) * ((m.tot - m.s1)^2)
> ss.b.s2 <- length(s2) * ((m.tot - m.s1)^2)
> ss.bet <- ss.b.s1+ss.b.s2
> ss.bet
[1] 5036.442
> 
> ss.wit <- ss.s1 + ss.s2
> ss.wit
[1] 19116.64
> 
> ss.bet + ss.wit
[1] 24153.08
> ss.tot
[1] 24153.08
> 
> library(dplyr)
> # df.bet <- length(unique(dat)) - 1
> df.bet <- nlevels(dat$ind) - 1
> df.wit <- df.s1+df.s2
> df.bet
[1] 1
> df.wit
[1] 198
> df.bet+df.wit
[1] 199
> df.tot 
[1] 199
> 
> ms.bet <- ss.bet / df.bet
> ms.wit <- ss.wit / df.wit
> ms.bet
[1] 5036.442
> ms.wit
[1] 96.54867
> 
> f.cal <- ms.bet / ms.wit
> f.cal
[1] 52.1648
> pf(f.cal, df.bet, df.wit, lower.tail = F)
[1] 1.074129e-11
> 
> 
> f.test <-  aov(dat$values~ dat$ind, data = dat)
> summary(f.test)
             Df Sum Sq Mean Sq F value   Pr(>F)    
dat$ind       1   5036    5036   52.16 1.07e-11 ***
Residuals   198  19117      97                     
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 
> sqrt(f.cal)
[1] 7.22252
> t.cal.ts
[1] -7.22252
> 
> # this is anova after all. 
> 
>