# rs.two.sample.t-test
# NOTE(review): rm(list = ls()) wipes the caller's global environment as a
# side effect every time this script is sourced. This is discouraged in
# scripts — prefer running in a fresh R session instead.
rm(list=ls())
# Draw n values from a normal distribution rescaled so the SAMPLE mean and
# sd equal the requested values exactly (scale() standardizes the draws,
# unlike plain rnorm() whose sample statistics only approximate the targets).
#
# Args:
#   n:    number of draws
#   mean: exact sample mean of the result
#   sd:   exact sample standard deviation of the result
# Returns: a plain numeric vector of length n. (The original returned the
#   n x 1 matrix produced by scale(), which made downstream results such as
#   var(p1)/sz1 print as 1x1 matrices — see the `[,1]` rows in the log.)
rnorm2 <- function(n, mean, sd) {
  as.vector(mean + sd * scale(rnorm(n)))
}
# Sum of squared deviations from the sample mean ("SS"), the numerator of
# the usual variance estimate: var(x) = ss(x) / (length(x) - 1).
ss <- function(x) {
  deviations <- x - mean(x)
  sum(deviations * deviations)
}
# --- Population setup -------------------------------------------------
# Two populations of 1,000,000 each, sd = 10; p2's mean is 10 points
# higher than p1's (so the true mean difference p1 - p2 is -10).
N.p <- 1000000
m.p <- 100
sd.p <- 10

set.seed(101)
p1 <- rnorm2(N.p, m.p, sd.p)
mean(p1)
sd(p1)

p2 <- rnorm2(N.p, m.p+10, sd.p)
mean(p2)
sd(p2)

# Sample sizes and degrees of freedom for the two-sample t-test.
sz1 <- sz2 <- 50
df1 <- sz1 - 1
df2 <- sz2 - 1
df.tot <- df1 + df2

# Simulate the sampling distribution of mean(s1) - mean(s2):
# draw 100,000 pairs of samples and record each mean difference.
iter <- 100000
mdiffs <- rep(NA, iter)
means.s1 <- rep(NA, iter)
means.s2 <- rep(NA, iter)
tail(mdiffs)

for (i in seq_len(iter)) {
  s1 <- sample(p1, sz1, replace = TRUE)
  s2 <- sample(p2, sz2, replace = TRUE)
  means.s1[i] <- mean(s1)
  means.s2[i] <- mean(s2)
  mdiffs[i] <- mean(s1) - mean(s2)
}
# Calculation from theory (theorem/proof):
mu <- mean(p1) - mean(p2)
# var(x1bar-x2bar) = var(x1bar) + var(x2bar)
# var(x1bar) = var(x1)/n, n = sample size
ms <- var(p1)/sz1 + var(p2)/sz2
se <- sqrt(ms)
mu
ms
se
# The distribution obtained by simulation:
# mdiffs
m.diff <- mean(mdiffs)
var.diff <- var(mdiffs)
sd.diff <- sd(mdiffs)
# Simulated mean/var/sd should closely match the theoretical mu/ms/se above.
m.diff
var.diff
sd.diff
# Variance of the simulated sample means matches var(pop)/n for each group.
var(means.s1)
var(p1)/sz1
var(means.s2)
var(p2)/sz2
# Variance of the difference, and its expansion with the covariance term.
var(means.s1-means.s2)
var(means.s1) + var(means.s2) - 2 * cov(means.s1, means.s2)
# Since cov = 0 when the two samples are completely independent:
var(p1)/sz1 + var(p2)/sz2
var.diff <- (var(p1)/sz1) + (var(p2)/sz2)
var.diff
sqrt(var.diff)
se.diff <- sqrt(var.diff)
se.diff
# Plotting this as a graph:
hist(mdiffs, breaks=50)
# Vertical line at the center of the simulated distribution.
abline(v=mean(mdiffs),
col="black", lwd=2)
se.diff
# Two-sided t critical values for 68%, 95%, and 99% coverage.
one <- qt(1-(.32/2), df=df.tot)
two <- qt(1-(.05/2), df=df.tot)
thr <- qt(1-(.01/2), df=df.tot)
# Half-widths of the corresponding intervals around m.diff.
ci68 <- se.diff*one
ci95 <- se.diff*two
ci99 <- se.diff*thr
# Draw all six interval boundaries (colors recycle: green/blue/black).
abline(v=c(m.diff-ci68, m.diff-ci95, m.diff-ci99,
m.diff+ci68, m.diff+ci95, m.diff+ci99),
col=c("green", "blue", "black"), lwd=2)
# Label the 95% boundaries (the blue lines).
text(x=m.diff-ci95, y=iter/12,
labels=paste(round(m.diff-ci95,3)),
col="blue", pos = 4)
text(x=m.diff+ci95, y=iter/12,
labels=paste(round(m.diff+ci95,3)),
col="blue", pos = 4)
# --- One concrete experiment: manual two-sample t-test ----------------
s1 <- sample(p1, sz1, replace = TRUE)
s2 <- sample(p2, sz2, replace = TRUE)
m.diff <- mean(s1) - mean(s2)
m.diff # about 95 times out of 100 this falls inside the blue lines above

# Pooled variance: combined SS over combined df (equal-variance t-test).
pv <- (ss(s1) + ss(s2))/(df1 + df2)
pv

# With sz1 == sz2 the pooled variance equals the plain average of the
# two per-sample mean squares.
ms1 <- ss(s1)/df1
ms2 <- ss(s2)/df2
ms1
ms2
(ms1 + ms2)/2

# Standard error of the mean difference from the pooled variance.
se.z <- sqrt(pv/sz1 + pv/sz2)
se.z

diff <- mean(s1) - mean(s2)
t.cal <- diff / se.z

# Compare the hand calculation with R's built-in equal-variance t-test:
# t.cal, p.val, and the CI below should reproduce its output.
t.test(s1, s2, var.equal = TRUE)
t.cal
df.tot
print(p.val <- pt(abs(t.cal), df.tot, lower.tail = FALSE)*2)
print(mean.diff <- mean(s1) - mean(s2))

# 95% confidence interval around the observed mean difference
# (two is negative, hence the c(lo2, -lo2) ordering).
two <- qt(.05/2, df.tot)
two
lo2 <- se.z * two
lo2
mean.diff+c(lo2,-lo2)
# Standardize the simulated mean differences (z-scores) to plot on the
# t-statistic scale.
zdiffs <- scale(mdiffs)
# NOTE(review): this overwrites the theoretical se.diff computed earlier
# with the simulated sd — intentional here since both are ~2.
se.diff <- sd.diff
hist(zdiffs, breaks=50,
xlim =c(-10,10))
two
# Blue lines: center and the two-sided 5% critical values (two < 0).
abline(v=c(0, 0-two, 0+two), col="blue")
text(x=two,y=iter*.03,labels=round(two,3), pos=3)
text(x=two,y=iter*.02, labels=.05,pos=3)
# Red lines: the observed t statistic and its mirror image.
abline(v=c(t.cal,-t.cal), col="red")
text(x=t.cal,y=iter*.06,labels=round(t.cal,3),pos=3)
text(x=t.cal, y=iter*.05,labels=round(p.val,7),pos=3)
p.val
###
# What if s3 and s4 are drawn from the SAME population?
# Then the distribution of the sample mean difference should be
# normal with mu = 0 and var = var(p1)/sz1 + var(p1)/sz2.
iter <- 100000
mdiffs <- rep(NA, iter)
means.s3 <- rep(NA, iter)
means.s4 <- rep(NA, iter)
tail(mdiffs)

# Simulate the null sampling distribution: both samples come from p1.
for (i in seq_len(iter)) {
  s3 <- sample(p1, sz1, replace = TRUE)
  s4 <- sample(p1, sz2, replace = TRUE)
  means.s3[i] <- mean(s3)
  means.s4[i] <- mean(s4)
  mdiffs[i] <- mean(s3) - mean(s4)
}
# Theoretical values under the null: both samples from p1, so the true
# mean difference is 0 and the se is the same as before (~2).
mu <- mean(p1) - mean(p1)
ms <- var(p1)/sz1 + var(p1)/sz2
se <- sqrt(ms)
mu
ms
se

# Simulated values — should match the theory above.
m.diff <- mean(mdiffs)
var.diff <- var(mdiffs)
sd.diff <- sd(mdiffs)
m.diff
var.diff
sd.diff

# One concrete experiment under the null, checked against t.test().
s3 <- sample(p1, sz1, replace = TRUE)
s4 <- sample(p1, sz2, replace = TRUE)
t.test(s3, s4, var.equal = TRUE)
print(m.diff <- mean(s3) - mean(s4))
# The value above lies between about -4 and +4 (centered on 0) with 95%
# probability. Precisely: mean difference = p1 - p1 = 0, se = 2, and the
# 95% bound is `two` * se.
print(two <- qt(.05/2, df.tot))
# ss(s3)/ss(s4) are used below instead of ss(p1) because we assume no
# information about the population is available.
pv <- (ss(s3) + ss(s4))/(df1 + df2)
se.diff <- sqrt(pv/sz1 + pv/sz2)
se.diff
# 95% interval for the mean difference under the null (two is negative).
lo <- two * se.diff
hi <- -lo
c(lo, hi)
# Let's see: under the null, about 5% of simulated mean differences
# should fall outside roughly +/- 4 (i.e., +/- ~2 standard errors).
iter <- 1000
means.s3 <- means.s4 <- rep(NA, iter)
mdiffs <- rep(NA, iter)

for (i in seq_len(iter)) {
  s3 <- sample(p1, sz1, replace = TRUE)
  s4 <- sample(p1, sz2, replace = TRUE)
  means.s3[i] <- mean(s3)
  means.s4[i] <- mean(s4)
  mdiffs[i] <- mean(s3) - mean(s4)
}

# TRUE count / iter approximates the 5% type-I error rate.
table(mdiffs < -4 | mdiffs > 4)
# repeated measure = to be deleted
# --- Paired (repeated-measures) t-test: manual vs t.test() ------------
sz <- 50
t5 <- rnorm2(sz, 75, 10)
t6 <- rnorm2(sz, 82, 10)

t.test(t5, t6, paired = TRUE)

# Manual paired t-test: a one-sample t-test on the pairwise differences.
# NOTE(review): `diff` and `df` shadow base R functions of the same name;
# consider renaming (e.g. pair.diff, df.pair) in a future cleanup.
diff <- t5 - t6
sd.diff <- sd(diff)
se.diff <- sd.diff/sqrt(sz)
m.diff <- mean(diff)

t.cal <- m.diff/se.diff
p.val <- pt(abs(t.cal), df = sz - 1, lower.tail = FALSE)*2
df <- sz - 1

# These should reproduce t.test()'s t, df, and p-value above.
t.cal
df
p.val

m.diff
se.diff

# 95% confidence interval for the mean pairwise difference (two < 0).
two <- qt(.05/2, df = sz - 1)
two
lo2 <- se.diff*two
lo2
m.diff + lo2
m.diff - lo2
# rs.two.sample.t-test — console transcript: session output of the script above.
> rm(list=ls())
> rnorm2 <- function(n,mean,sd){
+ mean+sd*scale(rnorm(n))
+ }
> ss <- function(x) {
+ sum((x-mean(x))^2)
+ }
>
> N.p <- 1000000
> m.p <- 100
> sd.p <- 10
>
> set.seed(101)
> p1 <- rnorm2(N.p, m.p, sd.p)
> mean(p1)
[1] 100
> sd(p1)
[1] 10
>
> p2 <- rnorm2(N.p, m.p+10, sd.p)
> mean(p2)
[1] 110
> sd(p2)
[1] 10
>
> sz1 <- sz2 <- 50
> df1 <- sz1 - 1
> df2 <- sz2 - 1
> df.tot <- df1 + df2
>
> iter <- 100000
> mdiffs <- rep(NA, iter)
> means.s1 <- rep(NA, iter)
> means.s2 <- rep(NA, iter)
> tail(mdiffs)
[1] NA NA NA NA NA NA
>
> for (i in 1:iter) {
+ # means <- append(means, mean(sample(p1, s.size, replace = T)))
+ s1 <- sample(p1, sz1, replace = T)
+ s2 <- sample(p2, sz2, replace = T)
+ means.s1[i] <- mean(s1)
+ means.s2[i] <- mean(s2)
+ mdiffs[i] <- mean(s1)-mean(s2)
+ }
> # 정리, 증명에 의한 계산
> mu <- mean(p1) - mean(p2)
> # var(x1bar-x2bar) = var(x1bar) + var(x2bar)
> # var(x1bar) = var(x1)/n, n = sample size
> ms <- var(p1)/sz1 + var(p2)/sz2
> se <- sqrt(ms)
>
> mu
[1] -10
> ms
[,1]
[1,] 4
> se
[,1]
[1,] 2
>
> # 시뮬레이션에 의한 집합 (distribution)
> # mdiffs
> m.diff <- mean(mdiffs)
> var.diff <- var(mdiffs)
> sd.diff <- sd(mdiffs)
> m.diff
[1] -9.988058
> var.diff
[1] 4.023723
> sd.diff
[1] 2.005922
>
> var(means.s1)
[1] 2.002125
> var(p1)/sz1
[,1]
[1,] 2
> var(means.s2)
[1] 2.014368
> var(p2)/sz2
[,1]
[1,] 2
> var(means.s1-means.s2)
[1] 4.023723
> var(means.s1) + var(means.s2) - 2 * cov(means.s1, means.s2)
[1] 4.023723
> # 두 집합이 완전히 독립적일 때 cov = 0 이므로
> var(p1)/sz1 + var(p2)/sz2
[,1]
[1,] 4
>
> var.diff <- (var(p1)/sz1) + (var(p2)/sz2)
> var.diff
[,1]
[1,] 4
> sqrt(var.diff)
[,1]
[1,] 2
> se.diff <- sqrt(var.diff)
> se.diff
[,1]
[1,] 2
>
> # 이것을 그래프로 그려보면
> hist(mdiffs, breaks=50)
> abline(v=mean(mdiffs),
+ col="black", lwd=2)
>
> se.diff
[,1]
[1,] 2
> one <- qt(1-(.32/2), df=df.tot)
> two <- qt(1-(.05/2), df=df.tot)
> thr <- qt(1-(.01/2), df=df.tot)
>
> ci68 <- se.diff*one
> ci95 <- se.diff*two
> ci99 <- se.diff*thr
>
> abline(v=c(m.diff-ci68, m.diff-ci95, m.diff-ci99,
+ m.diff+ci68, m.diff+ci95, m.diff+ci99),
+ col=c("green", "blue", "black"), lwd=2)
> text(x=m.diff-ci95, y=iter/12,
+ labels=paste(round(m.diff-ci95,3)),
+ col="blue", pos = 4)
> text(x=m.diff+ci95, y=iter/12,
+ labels=paste(round(m.diff+ci95,3)),
+ col="blue", pos = 4)
>
> s1 <- sample(p1, sz1, replace = T)
> s2 <- sample(p2, sz2, replace = T)
> m.diff <- mean(s1)-mean(s2)
> m.diff # <- 100 번 중 95번은 위의 블루라인 안쪽에서
[1] -6.959851
>
> pv <- (ss(s1) + ss(s2))/(df1 + df2)
> pv
[1] 106.6359
>
> ms1 <- ss(s1)/df1
> ms2 <- ss(s2)/df2
> ms1
[1] 105.3544
> ms2
[1] 107.9175
> (ms1 + ms2)/2
[1] 106.6359
>
> # se <- sqrt(ms.a/sz1 + ms.b/sz2)
> # se
> se.z <- sqrt(pv/sz1 + pv/sz2)
> se.z
[1] 2.065293
>
> diff <- mean(s1)-mean(s2)
> t.cal <- diff / se.z
>
> t.test(s1,s2, var.equal = T)
Two Sample t-test
data: s1 and s2
t = -3.3699, df = 98, p-value = 0.001077
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-11.058359 -2.861343
sample estimates:
mean of x mean of y
101.6366 108.5965
>
> t.cal
[1] -3.369909
> df.tot
[1] 98
> print(p.val <- pt(abs(t.cal), df.tot, lower.tail = F)*2)
[1] 0.001076634
> print(mean.diff <- mean(s1)-mean(s2))
[1] -6.959851
> two <- qt(.05/2, df.tot)
> two
[1] -1.984467
> # two <- -2
> lo2 <- se.z * two
> lo2
[1] -4.098508
> mean.diff+c(lo2,-lo2)
[1] -11.058359 -2.861343
>
> zdiffs <- scale(mdiffs)
> se.diff <- sd.diff
> hist(zdiffs, breaks=50,
+ xlim =c(-10,10))
> two
[1] -1.984467
> abline(v=c(0, 0-two, 0+two), col="blue")
> text(x=two,y=iter*.03,labels=round(two,3), pos=3)
> text(x=two,y=iter*.02, labels=.05,pos=3)
> abline(v=c(t.cal,-t.cal), col="red")
> text(x=t.cal,y=iter*.06,labels=round(t.cal,3),pos=3)
> text(x=t.cal, y=iter*.05,labels=round(p.val,7),pos=3)
> p.val
[1] 0.001076634
>
> ###
> # what if s1 and s2 are from
> # the same pop?
> # the distribution of sample mean
> # difference should be
> # normal, mu = 0, var = var.p1/s.size + var.p2/s.size
>
> iter <- 100000
> # means <- c()
> mdiffs <- rep(NA, iter)
> means.s3 <- rep(NA, iter)
> means.s4 <- rep(NA, iter)
> tail(mdiffs)
[1] NA NA NA NA NA NA
>
> for (i in 1:iter) {
+ # means <- append(means, mean(sample(p1, s.size, replace = T)))
+ s3 <- sample(p1, sz1, replace = T)
+ s4 <- sample(p1, sz2, replace = T)
+ means.s3[i] <- mean(s3)
+ means.s4[i] <- mean(s4)
+ mdiffs[i] <- mean(s3)-mean(s4)
+ }
>
> mu <- mean(p1) - mean(p1)
> ms <- var(p1)/sz1 + var(p1)/sz2
> se <- sqrt(ms)
>
> mu
[1] 0
> ms
[,1]
[1,] 4
> se
[,1]
[1,] 2
>
> m.diff <- mean(mdiffs)
> var.diff <- var(mdiffs)
> sd.diff <- sd(mdiffs)
> m.diff
[1] -0.00273072
> var.diff
[1] 3.997207
> sd.diff
[1] 1.999302
>
> s3 <- sample(p1, sz1, replace=T)
> s4 <- sample(p1, sz2, replace=T)
> t.test(s3, s4, var.equal=T)
Two Sample t-test
data: s3 and s4
t = 1.7165, df = 98, p-value = 0.08924
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.5535058 7.6433427
sample estimates:
mean of x mean of y
103.04431 99.49939
> print(m.diff <- mean(s3)-mean(s4))
[1] 3.544918
> # 위의 value는 0을 중심으로 -4 +4 사이에 있을 확률이 95퍼센트이다.
> # 정확히는 mean difference = p1 - p1 = 0 se = 2 95% Interval = two
> print(two <- qt(.05/2,df.tot))
[1] -1.984467
> # 아래에 ss(p1) 대신에 ss(s3)를 사용한 것은
> # 모집단의 정보가 없음을 가정하기 때문에
> pv <- (ss(s3)+ss(s4))/(df1+df2)
> se.diff <- sqrt(pv/sz1 + pv/sz2)
> se.diff
[1] 2.065251
> lo <- two*se.diff
> hi <- -lo
> c(lo, hi)
[1] -4.098424 4.098424
>
>
> # let's see
>
> iter <- 1000
> means.s3 <- means.s4 <- rep(NA, iter)
> mdiffs <- rep(NA, iter)
>
> for (i in 1:iter) {
+ # means <- append(means, mean(sample(p1, s.size, replace = T)))
+ s3 <- sample(p1, sz1, replace = T)
+ s4 <- sample(p1, sz2, replace = T)
+ means.s3[i] <- mean(s3)
+ means.s4[i] <- mean(s4)
+ mdiffs[i] <- mean(s3)-mean(s4)
+ }
>
> table(mdiffs < -4 | mdiffs > 4)
FALSE TRUE
951 49
>
> # repeated measure = to be deleted
> sz <- 50
> t5 <- rnorm2(sz, 75, 10)
> t6 <- rnorm2(sz, 82, 10)
>
> t.test(t5, t6, paired=T)
Paired t-test
data: t5 and t6
t = -3.3984, df = 49, p-value = 0.001355
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-11.13931 -2.86069
sample estimates:
mean difference
-7
>
> diff <- t5-t6
> sd.diff <- sd(diff)
> se.diff <- sd.diff/sqrt(sz)
> m.diff <- mean(diff)
>
> t.cal <- m.diff/se.diff
> p.val <- pt(abs(t.cal), df=sz-1, lower.tail = F)*2
> df <- sz-1
>
> t.cal
[1] -3.398399
> df
[1] 49
> p.val
[1] 0.001354538
>
> m.diff
[1] -7
> se.diff
[1] 2.059793
> two <- qt(.05/2, df=sz-1)
> two
[1] -2.009575
> lo2 <- se.diff*two
> lo2
[1] -4.13931
> m.diff + lo2
[1] -11.13931
> m.diff - lo2
[1] -2.86069
>