====== t-test summing up ======
# Clear the workspace so the walkthrough starts from a clean slate.
# NOTE(review): rm(list = ls()) wipes the caller's environment when this
# script is sourced from other work; consider removing it.
rm(list=ls())
rnorm2 <- function(n, mean, sd) {
  # Draw n normal deviates and rescale them so the SAMPLE mean and sd are
  # EXACTLY `mean` and `sd` (scale() centers to mean 0, sd 1).
  #
  # Fix: scale() returns an n x 1 matrix, so the original leaked matrix
  # dims into downstream code (e.g. var(p1) became a 1x1 matrix, forcing
  # the later c(se.z) workaround). as.numeric() drops the dims so callers
  # get a plain numeric vector.
  as.numeric(mean + sd * scale(rnorm(n)))
}
se <- function(sample) {
  # Standard error of the mean: sample sd divided by sqrt(n).
  n <- length(sample)
  sd(sample) / sqrt(n)
}
ss <- function(x) {
  # Sum of squared deviations about the mean (sum of squares).
  dev <- x - mean(x)
  sum(dev^2)
}
# Population parameters: one million scores, mean 100, sd 10.
N.p <- 1000000
m.p <- 100
sd.p <- 10
# Population 1: rnorm2 forces the mean/sd to be exactly 100 / 10.
p1 <- rnorm2(N.p, m.p, sd.p)
mean(p1)
sd(p1)
# Population 2: same sd, mean shifted up by 10 (a real effect exists).
p2 <- rnorm2(N.p, m.p+10, sd.p)
mean(p2)
sd(p2)
# Overlay the two population histograms and mark each mean with a
# vertical line. (Fix: removed the stray trailing comma in the hist()
# call, which created an empty argument.)
hist(p1, breaks = 50, col = rgb(1, 0, 0, 0.5),
     main = "histogram of p1 and p2")
abline(v = mean(p1), col = "black", lwd = 3)
hist(p2, breaks = 50, add = TRUE, col = rgb(0, 0, 1, 0.5))
abline(v = mean(p2), col = "red", lwd = 3)
# Draw one sample of 100 scores from population 2.
s.size <- 100
s2 <- sample(p2, s.size)
mean(s2)
sd(s2)
# Standard error of the mean under H0 using the KNOWN population
# variance -- the z-test situation.
se.z <- sqrt(var(p1)/s.size)
se.z <- c(se.z)  # drop the 1x1 matrix dims that scale() can leave behind
se.z.range <- c(-2*se.z, 2*se.z)
se.z.range
mean(p1)+se.z.range  # rough 95% band around the population mean
mean(s2)
# z statistic: how many SEs the sample mean lies from the population mean.
z.cal <- (mean(s2) - mean(p1)) / se.z
z.cal
# Two-tailed p-value. (Fix: spelled out FALSE; T/F are ordinary
# reassignable variables in R.)
pnorm(z.cal, lower.tail = FALSE) * 2
z.cal
# principles . . .
# Distribution of sample means: draw 10,000 samples of size s.size from
# p1 and keep each sample mean.
iter <- 10000
# Fix: the original grew `means` with append() inside a loop, which is
# O(n^2) from repeated copying. vapply preallocates the result and makes
# the same sample() calls in the same order, so the values are identical.
means <- vapply(seq_len(iter),
                function(i) mean(sample(p1, s.size)),
                numeric(1))
# Plot the sampling distribution; mark the population mean (black), the
# observed sample mean (blue), +/-1 SE (green) and +/-2 SE (brown).
hist(means,
     xlim = c(mean(means)-3*sd(means), mean(s2)+5),
     col = rgb(1, 0, 0, 0.5))
abline(v=mean(p1), col="black", lwd=3)
abline(v=mean(s2),
       col="blue", lwd=3)
lo1 <- mean(p1)-se.z
hi1 <- mean(p1)+se.z
lo2 <- mean(p1)-2*se.z
hi2 <- mean(p1)+2*se.z
abline(v=c(lo1, hi1, lo2, hi2),
       col=c("green","green", "brown", "brown"),
       lwd=2)
se.z
c(lo2, hi2)
# Two-tailed p-value again (F -> FALSE).
pnorm(z.cal, lower.tail = FALSE) * 2
# Note: in practice the SE estimated from the sample,
# sqrt(var(s2)/s.size) = se(s2), is commonly used in place of the
# population-based sqrt(var(p1)/s.size); some user-written z.test
# functions do the same.
sqrt(var(p1)/s.size)  # SE from the (known) population variance
se.z
sqrt(var(s2)/s.size)  # SE estimated from the sample itself
se(s2)
# One-sample t: same mean difference, but divided by the SAMPLE SE.
t.cal.os <- (mean(s2) - mean(p1))/ se(s2)
z.cal <- (mean(s2) - mean(p1))/ se.z
t.cal.os
z.cal
df.s2 <- length(s2)-1  # one-sample t has n - 1 degrees of freedom
df.s2
# Two-tailed p-value from the t distribution (F -> FALSE).
p.t.os <- pt(abs(t.cal.os), df.s2, lower.tail = FALSE) * 2
p.t.os
# Built-in one-sample t-test of s2 against the population mean.
t.out <- t.test(s2, mu=mean(p1))
library(BSDA)
z.out <- z.test(s2, p1, sigma.x = sd(s2), sigma.y = sd(p1))
z.out$statistic # z computed with se(s2) instead of se.z
z.cal # z computed with se.z (the sqrt(var(p1)/s.size) value)
t.out$statistic # t from t.test, with se(s2) as the denominator
t.cal.os # t computed by hand using se(s2)
# But, after all, we use t.test method regardless of
# variation
# Re-draw the sampling distribution with the same reference lines:
# population mean (black), sample mean (blue), +/-1 SE (green),
# +/-2 SE (brown).
hist(means,
xlim = c(mean(means)-3*sd(means), mean(s2)+5),
col = rgb(1, 0, 0, 0.5))
abline(v=mean(p1), col="black", lwd=3)
abline(v=mean(s2),
col="blue", lwd=3)
lo1 <- mean(p1)-se.z
hi1 <- mean(p1)+se.z
lo2 <- mean(p1)-2*se.z
hi2 <- mean(p1)+2*se.z
abline(v=c(lo1, hi1, lo2, hi2),
col=c("green","green", "brown", "brown"),
lwd=2)
# difference between black and blue line
# divided by
# se(s2) (= random difference)
# t.value
mean(s2)
mean(p1)
# The t value is just the mean difference scaled by its standard error.
diff <- mean(s2)-mean(p1)
diff
se(s2)
diff/se(s2)
t.cal.os
########################
# 2 sample t-test
########################
# Assumption: the population parameters behind the two samples drawn
# below are unknown.
s1 <- sample(p1, s.size)
s2 <- sample(p2, s.size)
mean(s1)
mean(s2)
ss(s1)
ss(s2)
df.s1 <- length(s1)-1
df.s2 <- length(s2)-1
df.s1
df.s2
# Pooled variance: combined sums of squares over combined df.
pooled.variance <- (ss(s1)+ss(s2))/(df.s1+df.s2)
pooled.variance
# SE of the difference between two independent sample means.
se.ts <- sqrt((pooled.variance/length(s1))+(pooled.variance/length(s2)))
se.ts
t.cal.ts <- (mean(s1)-mean(s2))/se.ts
t.cal.ts
# Two-tailed p-value (F -> FALSE).
p.val.ts <- pt(abs(t.cal.ts), df=df.s1+df.s2, lower.tail = FALSE) * 2
p.val.ts
# Built-in equal-variance t-test; should match the hand calculation
# above. (T -> TRUE.)
t.test(s1, s2, var.equal = TRUE)
se(s1)
se(s2)
# Rough 95% intervals around each sample mean.
mean(s1)+c(-se(s1)*2, se(s1)*2)
mean(s2)+c(-se(s2)*2, se(s2)*2)
mean(p1)
mean(p2)
hist(s1, breaks=50,
     col = rgb(1, 0, 0, 0.5))
hist(s2, breaks=50, add=TRUE, col=rgb(0,0,1,1))
abline(v=mean(s1), col="green", lwd=3)
# hist(s2, breaks=50, add=TRUE, col = rgb(0, 0, 1, 0.5))
abline(v=mean(s2), col="lightblue", lwd=3)
diff <- mean(s1)-mean(s2)
se.ts
diff/se.ts
####
# Repeated-measures (paired) t-test.
# Reuse the two samples above as if they were two consecutive
# measurements of the SAME subjects (population parameters unknown).
t1 <- s1
t2 <- s2
mean(t1)
mean(t2)
# Per-subject difference scores; the paired t-test is a one-sample
# t-test of these differences against 0.
diff.s <- t1 - t2
diff.s
t.cal.rm <- mean(diff.s)/se(diff.s)
t.cal.rm
# Two-tailed p-value with n - 1 df (F -> FALSE).
p.val.rm <- pt(abs(t.cal.rm), length(s1)-1, lower.tail = FALSE) * 2
p.val.rm
# Built-in paired test; should match the hand calculation. (T -> TRUE.)
t.test(s1, s2, paired = TRUE)
# create multiple histogram
s.all <- c(s1,s2)
mean(s.all)
hist(s1, col='grey', breaks=50, xlim=c(50, 150))
hist(s2, col='darkgreen', breaks=50, add=TRUE)
abline(v=c(mean(s.all)),
       col=c("red"), lwd=3)
abline(v=c(mean(s1), mean(s2)),
       col=c("black", "green"), lwd=3)
# Long format: a `values` column plus a group indicator factor `ind`.
comb <- data.frame(s1,s2)
dat <- stack(comb)
head(dat)
m.tot <- mean(s.all)
m.s1 <- mean(s1)
m.s2 <- mean(s2)
ss.tot <- ss(s.all)
ss.s1 <- ss(s1)
ss.s2 <- ss(s2)
df.tot <- length(s.all)-1
df.s1 <- length(s1)-1
df.s2 <- length(s2)-1
# Total mean square is just the overall variance.
ms.tot <- var(s.all)
ms.tot
ss.tot/df.tot
var(s1)
ss.s1 / df.s1
var(s2)
ss.s2 / df.s2
# Between-group SS: group size times squared distance of each group mean
# from the grand mean.
ss.b.s1 <- length(s1) * ((m.tot - m.s1)^2)
# BUG FIX: the second term must use m.s2, not m.s1. With two equal-sized
# groups the grand mean lies midway between the group means, so the two
# squared distances coincide and the error was invisible in the printed
# output -- but with unequal group sizes it would be wrong.
ss.b.s2 <- length(s2) * ((m.tot - m.s2)^2)
ss.bet <- ss.b.s1+ss.b.s2
ss.bet
# Within-group SS: sum of each group's own sum of squares.
ss.wit <- ss.s1 + ss.s2
ss.wit
# Partition check: SS.between + SS.within == SS.total.
ss.bet + ss.wit
ss.tot
library(dplyr)  # NOTE(review): nothing below uses dplyr; safe to drop
# df.bet <- length(unique(dat)) - 1
# Between df = number of groups - 1; within df = sum of group dfs.
df.bet <- nlevels(dat$ind) - 1
df.wit <- df.s1+df.s2
df.bet
df.wit
# df check: between + within == total.
df.bet+df.wit
df.tot
# Mean squares: SS divided by df.
ms.bet <- ss.bet / df.bet
ms.wit <- ss.wit / df.wit
ms.bet
ms.wit
# F = between-group MS over within-group MS.
f.cal <- ms.bet / ms.wit
f.cal
# Upper-tail p-value of the F statistic (F -> FALSE).
pf(f.cal, df.bet, df.wit, lower.tail = FALSE)
# Fix: resolve columns through `data =` instead of writing dat$ inside
# the formula; the statistics are identical, only the term labels change.
f.test <- aov(values ~ ind, data = dat)
summary(f.test)
# With two groups, F = t^2, so sqrt(F) equals |t| from the two-sample test.
sqrt(f.cal)
t.cal.ts
# this is anova after all.
====== t-test summing up output ======
> rm(list=ls())
> rnorm2 <- function(n,mean,sd){
+ mean+sd*scale(rnorm(n))
+ }
> se <- function(sample) {
+ sd(sample)/sqrt(length(sample))
+ }
> ss <- function(x) {
+ sum((x-mean(x))^2)
+ }
>
EXP.
* 필요한 펑션들
* se standard error 값을 구하는 펑션
* ss sum of square 값을 구하는 펑션
> N.p <- 1000000
> m.p <- 100
> sd.p <- 10
>
>
> p1 <- rnorm2(N.p, m.p, sd.p)
> mean(p1)
[1] 100
> sd(p1)
[1] 10
>
> p2 <- rnorm2(N.p, m.p+10, sd.p)
> mean(p2)
[1] 110
> sd(p2)
[1] 10
>
> hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5),
+ main = "histogram of p1 and p2",)
> abline(v=mean(p1), col="black", lwd=3)
> hist(p2, breaks=50, add=TRUE, col = rgb(0, 0, 1, 0.5))
> abline(v=mean(p2), col="red", lwd=3)
>
{{:pasted:20250908-080311.png}}
> s.size <- 100
> s2 <- sample(p2, s.size)
> mean(s2)
[1] 111.4782
> sd(s2)
[1] 9.715675
>
> se.z <- sqrt(var(p1)/s.size)
> se.z <- c(se.z)
> se.z.range <- c(-2*se.z,2*se.z)
> se.z.range
[1] -2 2
>
> mean(p1) + se.z.range
[1] 98 102
> mean(s2)
[1] 111.4782
>
> z.cal <- (mean(s2) - mean(p1)) / se.z
> z.cal
[1] 11.47817
> pnorm(z.cal, lower.tail = F) * 2
[1] 1.698319e-30
>
> z.cal
[1] 11.47817
>
> # principles . . .
> # distribution of sample means
> iter <- 10000
> means <- c()
> for (i in 1:iter) {
+ m.of.s <- mean(sample(p1, s.size))
+ means <- append(means, m.of.s)
+ }
>
> hist(means,
+ xlim = c(mean(means)-3*sd(means), mean(s2)+5),
+ col = rgb(1, 0, 0, 0.5))
> abline(v=mean(p1), col="black", lwd=3)
> abline(v=mean(s2),
+ col="blue", lwd=3)
> lo1 <- mean(p1)-se.z
> hi1 <- mean(p1)+se.z
> lo2 <- mean(p1)-2*se.z
> hi2 <- mean(p1)+2*se.z
> abline(v=c(lo1, hi1, lo2, hi2),
+ col=c("green","green", "brown", "brown"),
+ lwd=2)
> se.z
[1] 1
> c(lo2, hi2)
[1] 98 102
> pnorm(z.cal, lower.tail = F) * 2
[1] 1.698319e-30
>
>
> # Note that we use sqrt(var(s2)/s.size)
> # as our se value instead of
> # sqrt(var(p1)/s.size)
> # This is a common practice for R
> # In fact, some z.test (made by someone)
> # function uses the former rather than
> # latter.
>
> sqrt(var(p1)/s.size)
[,1]
[1,] 1
> se.z
[1] 1
>
> sqrt(var(s2)/s.size)
[1] 0.9715675
> se(s2)
[1] 0.9715675
>
> t.cal.os <- (mean(s2) - mean(p1))/ se(s2)
> z.cal <- (mean(s2) - mean(p1))/ se.z
> t.cal.os
[1] 11.81408
> z.cal
[1] 11.47817
>
> df.s2 <- length(s2)-1
> df.s2
[1] 99
> p.t.os <- pt(abs(t.cal.os), df.s2, lower.tail = F) * 2
> p.t.os
[1] 1.282856e-20
> t.out <- t.test(s2, mu=mean(p1))
>
> library(BSDA)
> z.out <- z.test(s2, p1, sigma.x = sd(s2), sigma.y = sd(p1))
>
> z.out$statistic # se.z 대신에 se(s2) 값으로 구한 z 값
z
11.81345
> z.cal # se.z으로 (sqrt(var(p1)/s.size)값) 구한 z 값
[1] 11.47817
>
> t.out$statistic # se(s2)를 분모로 하여 구한 t 값
t
11.81408
> t.cal.os # se(s2)를 이용하여 손으로 구한 t 값
[1] 11.81408
>
> # But, after all, we use t.test method regardless of
> # variation
>
> hist(means,
+ xlim = c(mean(means)-3*sd(means), mean(s2)+5),
+ col = rgb(1, 0, 0, 0.5))
> abline(v=mean(p1), col="black", lwd=3)
> abline(v=mean(s2),
+ col="blue", lwd=3)
> lo1 <- mean(p1)-se.z
> hi1 <- mean(p1)+se.z
> lo2 <- mean(p1)-2*se.z
> hi2 <- mean(p1)+2*se.z
> abline(v=c(lo1, hi1, lo2, hi2),
+ col=c("green","green", "brown", "brown"),
+ lwd=2)
>
> # difference between black and blue line
> # divided by
> # se(s2) (= random difference)
> # t.value
> mean(s2)
[1] 111.4782
> mean(p1)
[1] 100
> diff <- mean(s2)-mean(p1)
> diff
[1] 11.47817
> se(s2)
[1] 0.9715675
> diff/se(s2)
[1] 11.81408
>
> t.cal.os
[1] 11.81408
>
> ########################
> # 2 sample t-test
> ########################
> # 가정. 아래에서 추출하는 두
> # 샘플의 모집단의 파라미터를
> # 모른다.
> s1 <- sample(p1, s.size)
> s2 <- sample(p2, s.size)
>
> mean(s1)
[1] 99.76674
> mean(s2)
[1] 109.8031
> ss(s1)
[1] 9450.294
> ss(s2)
[1] 9666.344
> df.s1 <- length(s1)-1
> df.s2 <- length(s2)-1
> df.s1
[1] 99
> df.s2
[1] 99
>
> pooled.variance <- (ss(s1)+ss(s2))/(df.s1+df.s2)
> pooled.variance
[1] 96.54867
> se.ts <- sqrt((pooled.variance/length(s1))+(pooled.variance/length(s2)))
> se.ts
[1] 1.389595
> t.cal.ts <- (mean(s1)-mean(s2))/se.ts
> t.cal.ts
[1] -7.22252
> p.val.ts <- pt(abs(t.cal.ts), df=df.s1+df.s2, lower.tail = F) * 2
> p.val.ts
[1] 1.074129e-11
>
> t.test(s1, s2, var.equal = T)
Two Sample t-test
data: s1 and s2
t = -7.2225, df = 198, p-value = 1.074e-11
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-12.776681 -7.296071
sample estimates:
mean of x mean of y
99.76674 109.80312
>
> se(s1)
[1] 0.9770236
> se(s2)
[1] 0.9881287
>
> mean(s1)+c(-se(s1)*2, se(s1)*2)
[1] 97.8127 101.7208
> mean(s2)+c(-se(s2)*2, se(s2)*2)
[1] 107.8269 111.7794
>
> mean(p1)
[1] 100
> mean(p2)
[1] 110
>
> hist(s1, breaks=50,
+ col = rgb(1, 0, 0, 0.5))
> hist(s2, breaks=50, add=T, col=rgb(0,0,1,1))
> abline(v=mean(s1), col="green", lwd=3)
> # hist(s2, breaks=50, add=TRUE, col = rgb(0, 0, 1, 0.5))
> abline(v=mean(s2), col="lightblue", lwd=3)
>
> diff <- mean(s1)-mean(s2)
> se.ts
[1] 1.389595
> diff/se.ts
[1] -7.22252
>
> ####
> # repeated measure t-test
> # we can use the above case
> # pop parameter unknown
> # two consecutive measurement
> # for the same sample
>
> t1 <- s1
> t2 <- s2
> mean(t1)
[1] 99.76674
> mean(t2)
[1] 109.8031
> diff.s <- t1 - t2
> diff.s
[1] -12.3185215 -5.7888766 -20.4663333 -7.6935641 -15.8599368 -3.8038147 -14.6440996
[8] -11.3187618 -8.0302577 -13.4860776 -8.1687444 -11.0823726 -22.9605048 -12.0964467
[15] 2.5297067 -6.8128211 -17.1935822 -1.9813741 -14.3537710 -17.6430691 -7.9493800
[22] -28.6689697 -33.5474957 -12.9600175 10.3033820 -15.8633058 -26.1378302 -2.9662922
[29] -19.6455177 10.2363599 -5.7970308 -22.8465012 -20.4341559 4.2259746 -21.0398539
[36] -20.3535296 6.1674969 -20.7826608 -8.1970613 -10.4180147 24.1827392 15.9046864
[43] 19.1082815 -30.8755695 -21.5129671 -4.8939917 -1.1518824 -39.6666501 -3.3716806
[50] -2.0099206 -22.5440102 -0.5860960 -1.5608043 -13.4030543 -9.8508005 -10.0521662
[57] 16.1389067 -8.0517121 -2.4607125 -21.1944560 -24.8550535 15.1671317 -10.0196995
[64] -8.3894542 -17.5945751 -15.9720408 7.9406532 11.7115683 -33.0230910 -10.2871065
[71] -4.9529100 17.7070205 0.2334945 -11.8526240 0.9691715 9.9437888 -4.5845758
[78] 1.7267771 -22.6725639 -11.4846042 -15.9595932 -28.0616511 2.9437361 -15.2163948
[85] -11.7369206 1.7589732 -22.5446572 -26.0919775 -12.6457662 -17.2080987 1.6991409
[92] -16.7465922 -22.9570187 -30.2519337 -23.0903846 -29.1119243 15.1849488 -20.7395068
[99] -9.3179223 -25.5558689
> t.cal.rm <- mean(diff.s)/se(diff.s)
> t.cal.rm
[1] -7.637322
> p.val.rm <- pt(abs(t.cal.rm), length(s1)-1, lower.tail = F) * 2
> p.val.rm
[1] 1.423701e-11
> t.test(s1, s2, paired = T)
Paired t-test
data: s1 and s2
t = -7.6373, df = 99, p-value = 1.424e-11
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-12.643880 -7.428872
sample estimates:
mean difference
-10.03638
>
> # create multiple histogram
> s.all <- c(s1,s2)
> mean(s.all)
[1] 104.7849
> hist(s1, col='grey', breaks=50, xlim=c(50, 150))
> hist(s2, col='darkgreen', breaks=50, add=TRUE)
> abline(v=c(mean(s.all)),
+ col=c("red"), lwd=3)
> abline(v=c(mean(s1), mean(s2)),
+ col=c("black", "green"), lwd=3)
>
> comb <- data.frame(s1,s2)
> dat <- stack(comb)
> head(dat)
values ind
1 103.18507 s1
2 92.73934 s1
3 110.40121 s1
4 99.40216 s1
5 103.37853 s1
6 106.80234 s1
>
> m.tot <- mean(s.all)
> m.s1 <- mean(s1)
> m.s2 <- mean(s2)
>
> ss.tot <- ss(s.all)
> ss.s1 <- ss(s1)
> ss.s2 <- ss(s2)
>
> df.tot <- length(s.all)-1
> df.s1 <- length(s1)-1
> df.s2 <- length(s2)-1
>
> ms.tot <- var(s.all)
> ms.tot
[1] 121.3723
> ss.tot/df.tot
[1] 121.3723
>
> var(s1)
[1] 95.45751
> ss.s1 / df.s1
[1] 95.45751
>
> var(s2)
[1] 97.63983
> ss.s2 / df.s2
[1] 97.63983
>
> ss.b.s1 <- length(s1) * ((m.tot - m.s1)^2)
> ss.b.s2 <- length(s2) * ((m.tot - m.s1)^2)
> ss.bet <- ss.b.s1+ss.b.s2
> ss.bet
[1] 5036.442
>
> ss.wit <- ss.s1 + ss.s2
> ss.wit
[1] 19116.64
>
> ss.bet + ss.wit
[1] 24153.08
> ss.tot
[1] 24153.08
>
> library(dplyr)
> # df.bet <- length(unique(dat)) - 1
> df.bet <- nlevels(dat$ind) - 1
> df.wit <- df.s1+df.s2
> df.bet
[1] 1
> df.wit
[1] 198
> df.bet+df.wit
[1] 199
> df.tot
[1] 199
>
> ms.bet <- ss.bet / df.bet
> ms.wit <- ss.wit / df.wit
> ms.bet
[1] 5036.442
> ms.wit
[1] 96.54867
>
> f.cal <- ms.bet / ms.wit
> f.cal
[1] 52.1648
> pf(f.cal, df.bet, df.wit, lower.tail = F)
[1] 1.074129e-11
>
>
> f.test <- aov(dat$values~ dat$ind, data = dat)
> summary(f.test)
Df Sum Sq Mean Sq F value Pr(>F)
dat$ind 1 5036 5036 52.16 1.07e-11 ***
Residuals 198 19117 97
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> sqrt(f.cal)
[1] 7.22252
> t.cal.ts
[1] -7.22252
>
> # this is anova after all.
>
>