====== t-test summing up ======
rm(list = ls())  # clear workspace (tutorial convenience; avoid in production scripts)

# Generate n values rescaled to have EXACTLY the requested mean and sd
# (scale() standardizes to mean 0 / sd 1, then we stretch and shift).
# Note: scale() returns a 1-column matrix, so the result carries matrix
# attributes.
rnorm2 <- function(n, mean, sd) {
  mean + sd * scale(rnorm(n))
}

# Standard error of the mean: sd / sqrt(n).
se <- function(sample) {
  sd(sample) / sqrt(length(sample))
}

# Sum of squared deviations from the mean (SS).
ss <- function(x) {
  sum((x - mean(x))^2)
}

N.p <- 1000000   # population size
m.p <- 100       # population mean
sd.p <- 10       # population sd

p1 <- rnorm2(N.p, m.p, sd.p)
mean(p1)
sd(p1)

p2 <- rnorm2(N.p, m.p + 10, sd.p)   # second population, mean shifted by +10
mean(p2)
sd(p2)

hist(p1, breaks = 50, col = rgb(1, 0, 0, 0.5),
     main = "histogram of p1 and p2")
abline(v = mean(p1), col = "black", lwd = 3)
hist(p2, breaks = 50, add = TRUE, col = rgb(0, 0, 1, 0.5))
abline(v = mean(p2), col = "red", lwd = 3)

s.size <- 100
s2 <- sample(p2, s.size)   # one sample of 100 drawn from p2
mean(s2)
sd(s2)

# Standard error of the sampling distribution using the KNOWN
# population variance: sigma / sqrt(n).
se.z <- sqrt(var(p1) / s.size)
se.z <- c(se.z)   # c() strips the matrix attributes var() returns here
se.z.range <- c(-2 * se.z, 2 * se.z)
se.z.range
mean(p1) + se.z.range
mean(s2)

# z statistic: how many standard errors the sample mean sits
# from the population mean.
z.cal <- (mean(s2) - mean(p1)) / se.z
z.cal
pnorm(z.cal, lower.tail = FALSE) * 2   # two-tailed p-value
z.cal

# principles . . .
# distribution of sample means: draw many samples from p1 and
# record each sample mean (preallocated via vapply instead of
# growing a vector with append()).
iter <- 10000
means <- vapply(seq_len(iter),
                function(i) mean(sample(p1, s.size)),
                numeric(1))

hist(means,
     xlim = c(mean(means) - 3 * sd(means), mean(s2) + 5),
     col = rgb(1, 0, 0, 0.5))
abline(v = mean(p1), col = "black", lwd = 3)
abline(v = mean(s2), col = "blue", lwd = 3)
# +/- 1 se (green) and +/- 2 se (brown) bands around mean(p1)
lo1 <- mean(p1) - se.z
hi1 <- mean(p1) + se.z
lo2 <- mean(p1) - 2 * se.z
hi2 <- mean(p1) + 2 * se.z
abline(v = c(lo1, hi1, lo2, hi2),
       col = c("green", "green", "brown", "brown"),
       lwd = 2)
se.z
c(lo2, hi2)
pnorm(z.cal, lower.tail = FALSE) * 2

# Note that we use sqrt(var(s2)/s.size)
# as our se value instead of
# sqrt(var(p1)/s.size)
# This is a common practice for R
# In fact, some z.test (made by someone)
# function uses the former rather than
# latter.
sqrt(var(p1) / s.size)
se.z
sqrt(var(s2) / s.size)
se(s2)

# One-sample t statistic: same numerator as z.cal, but the
# denominator is the SAMPLE-based standard error se(s2) rather
# than the population-based se.z.
t.cal.os <- (mean(s2) - mean(p1)) / se(s2)
z.cal <- (mean(s2) - mean(p1)) / se.z
t.cal.os
z.cal

df.s2 <- length(s2) - 1
df.s2
p.t.os <- pt(abs(t.cal.os), df.s2, lower.tail = FALSE) * 2   # two-tailed p
p.t.os
t.out <- t.test(s2, mu = mean(p1))

library(BSDA)
z.out <- z.test(s2, p1, sigma.x = sd(s2), sigma.y = sd(p1))

z.out$statistic   # z value computed with se(s2) instead of se.z
z.cal             # z value computed with se.z (= sqrt(var(p1)/s.size))

t.out$statistic   # t value with se(s2) as the denominator
t.cal.os          # t value computed by hand using se(s2)

# But, after all, we use t.test method regardless of
# variation

hist(means,
     xlim = c(mean(means) - 3 * sd(means), mean(s2) + 5),
     col = rgb(1, 0, 0, 0.5))
abline(v = mean(p1), col = "black", lwd = 3)
abline(v = mean(s2), col = "blue", lwd = 3)
lo1 <- mean(p1) - se.z
hi1 <- mean(p1) + se.z
lo2 <- mean(p1) - 2 * se.z
hi2 <- mean(p1) + 2 * se.z
abline(v = c(lo1, hi1, lo2, hi2),
       col = c("green", "green", "brown", "brown"),
       lwd = 2)

# difference between black and blue line
# divided by
# se(s2) (= random difference)
# t.value
mean(s2)
mean(p1)
diff <- mean(s2) - mean(p1)
diff
se(s2)
diff / se(s2)
t.cal.os

########################
# 2 sample t-test
########################
# Assumption: the population parameters of the
# two samples drawn below are unknown.
s1 <- sample(p1, s.size)
s2 <- sample(p2, s.size)

mean(s1)
mean(s2)
ss(s1)
ss(s2)
df.s1 <- length(s1) - 1
df.s2 <- length(s2) - 1
df.s1
df.s2

# Pooled variance: combined SS over combined df.
pooled.variance <- (ss(s1) + ss(s2)) / (df.s1 + df.s2)
pooled.variance
# Standard error of the difference between two means.
se.ts <- sqrt((pooled.variance / length(s1)) + (pooled.variance / length(s2)))
se.ts
t.cal.ts <- (mean(s1) - mean(s2)) / se.ts
t.cal.ts
p.val.ts <- pt(abs(t.cal.ts), df = df.s1 + df.s2, lower.tail = FALSE) * 2
p.val.ts

t.test(s1, s2, var.equal = TRUE)   # should match the hand calculation above

se(s1)
se(s2)
mean(s1) + c(-se(s1) * 2, se(s1) * 2)
mean(s2) + c(-se(s2) * 2, se(s2) * 2)
mean(p1)
mean(p2)

hist(s1, breaks = 50, col = rgb(1, 0, 0, 0.5))
hist(s2, breaks = 50, add = TRUE, col = rgb(0, 0, 1, 1))
abline(v = mean(s1), col = "green", lwd = 3)
# hist(s2, breaks=50, add=TRUE, col = rgb(0, 0, 1, 0.5))
abline(v = mean(s2), col = "lightblue", lwd = 3)

diff <- mean(s1) - mean(s2)
se.ts
diff / se.ts

####
# repeated measure t-test
# we can use the above case
# pop parameter unknown
# two consecutive measurements
# for the same sample

t1 <- s1
t2 <- s2
mean(t1)
mean(t2)
diff.s <- t1 - t2
diff.s
# Paired t: mean of the pairwise differences over its standard error.
t.cal.rm <- mean(diff.s) / se(diff.s)
t.cal.rm
p.val.rm <- pt(abs(t.cal.rm), length(s1) - 1, lower.tail = FALSE) * 2
p.val.rm
t.test(s1, s2, paired = TRUE)

# create multiple histogram
s.all <- c(s1, s2)
mean(s.all)
hist(s1, col = 'grey', breaks = 50, xlim = c(50, 150))
hist(s2, col = 'darkgreen', breaks = 50, add = TRUE)
abline(v = c(mean(s.all)), col = c("red"), lwd = 3)
abline(v = c(mean(s1), mean(s2)), col = c("black", "green"), lwd = 3)

# Long format: one 'values' column plus an 'ind' factor (s1/s2).
comb <- data.frame(s1, s2)
dat <- stack(comb)
head(dat)
m.tot <- mean(s.all)
m.s1 <- mean(s1)
m.s2 <- mean(s2)

ss.tot <- ss(s.all)
ss.s1 <- ss(s1)
ss.s2 <- ss(s2)

df.tot <- length(s.all) - 1
df.s1 <- length(s1) - 1
df.s2 <- length(s2) - 1

ms.tot <- var(s.all)
ms.tot
ss.tot / df.tot

var(s1)
ss.s1 / df.s1
var(s2)
ss.s2 / df.s2

# Between-group SS: n times the squared deviation of each group
# mean from the grand mean.
# BUG FIX: the second term previously used m.s1; it must use m.s2.
# (With equal group sizes the grand mean is the midpoint, so the
# two squared deviations happen to be identical and the original
# totals still balanced -- but the formula was wrong.)
ss.b.s1 <- length(s1) * ((m.tot - m.s1)^2)
ss.b.s2 <- length(s2) * ((m.tot - m.s2)^2)
ss.bet <- ss.b.s1 + ss.b.s2
ss.bet

ss.wit <- ss.s1 + ss.s2
ss.wit

# Partition check: SS_between + SS_within == SS_total
ss.bet + ss.wit
ss.tot

library(dplyr)   # loaded for the commented alternative below; not otherwise used
# df.bet <- length(unique(dat)) - 1
df.bet <- nlevels(dat$ind) - 1
df.wit <- 
df.s1+df.s2 df.bet df.wit df.bet+df.wit df.tot ms.bet <- ss.bet / df.bet ms.wit <- ss.wit / df.wit ms.bet ms.wit f.cal <- ms.bet / ms.wit f.cal pf(f.cal, df.bet, df.wit, lower.tail = F) f.test <- aov(dat$values~ dat$ind, data = dat) summary(f.test) sqrt(f.cal) t.cal.ts # this is anova after all. ====== t-test summing up output ====== > rm(list=ls()) > rnorm2 <- function(n,mean,sd){ + mean+sd*scale(rnorm(n)) + } > se <- function(sample) { + sd(sample)/sqrt(length(sample)) + } > ss <- function(x) { + sum((x-mean(x))^2) + } > EXP. * 필요한 펑션들 * se standard error 값을 구하는 펑션 * ss sum of square 값을 구하는 펑션 > N.p <- 1000000 > m.p <- 100 > sd.p <- 10 > > > p1 <- rnorm2(N.p, m.p, sd.p) > mean(p1) [1] 100 > sd(p1) [1] 10 > > p2 <- rnorm2(N.p, m.p+10, sd.p) > mean(p2) [1] 110 > sd(p2) [1] 10 > > hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5), + main = "histogram of p1 and p2",) > abline(v=mean(p1), col="black", lwd=3) > hist(p2, breaks=50, add=TRUE, col = rgb(0, 0, 1, 0.5)) > abline(v=mean(p2), col="red", lwd=3) > {{:pasted:20250908-080311.png}} > s.size <- 100 > s2 <- sample(p2, s.size) > mean(s2) [1] 111.4782 > sd(s2) [1] 9.715675 > > se.z <- sqrt(var(p1)/s.size) > se.z <- c(se.z) > se.z.range <- c(-2*se.z,2*se.z) > se.z.range [1] -2 2 > > mean(p1) + se.z.range [1] 98 102 > mean(s2) [1] 111.4782 > > z.cal <- (mean(s2) - mean(p1)) / se.z > z.cal [1] 11.47817 > pnorm(z.cal, lower.tail = F) * 2 [1] 1.698319e-30 > > z.cal [1] 11.47817 > > # principles . . . 
> # distribution of sample means > iter <- 10000 > means <- c() > for (i in 1:iter) { + m.of.s <- mean(sample(p1, s.size)) + means <- append(means, m.of.s) + } > > hist(means, + xlim = c(mean(means)-3*sd(means), mean(s2)+5), + col = rgb(1, 0, 0, 0.5)) > abline(v=mean(p1), col="black", lwd=3) > abline(v=mean(s2), + col="blue", lwd=3) > lo1 <- mean(p1)-se.z > hi1 <- mean(p1)+se.z > lo2 <- mean(p1)-2*se.z > hi2 <- mean(p1)+2*se.z > abline(v=c(lo1, hi1, lo2, hi2), + col=c("green","green", "brown", "brown"), + lwd=2) > se.z [1] 1 > c(lo2, hi2) [1] 98 102 > pnorm(z.cal, lower.tail = F) * 2 [1] 1.698319e-30 > > > # Note that we use sqrt(var(s2)/s.size) > # as our se value instread of > # sqrt(var(p1)/s.size) > # This is a common practice for R > # In fact, some z.test (made by someone) > # function uses the former rather than > # latter. > > sqrt(var(p1)/s.size) [,1] [1,] 1 > se.z [1] 1 > > sqrt(var(s2)/s.size) [1] 0.9715675 > se(s2) [1] 0.9715675 > > t.cal.os <- (mean(s2) - mean(p1))/ se(s2) > z.cal <- (mean(s2) - mean(p1))/ se.z > t.cal.os [1] 11.81408 > z.cal [1] 11.47817 > > df.s2 <- length(s2)-1 > df.s2 [1] 99 > p.t.os <- pt(abs(t.cal.os), df.s2, lower.tail = F) * 2 > p.t.os [1] 1.282856e-20 > t.out <- t.test(s2, mu=mean(p1)) > > library(BSDA) > z.out <- z.test(s2, p1, sigma.x = sd(s2), sigma.y = sd(p1)) > > z.out$statistic # se.z 대신에 se(s2) 값으로 구한 z 값 z 11.81345 > z.cal # se.z으로 (sqrt(var(p1)/s.size)값) 구한 z 값 [1] 11.47817 > > t.out$statistic # se(s2)를 분모로 하여 구한 t 값 t 11.81408 > t.cal.os # se(s2)를 이용하여 손으로 구한 t 값 [1] 11.81408 > > # But, after all, we use t.test method regardless of > # variation > > hist(means, + xlim = c(mean(means)-3*sd(means), mean(s2)+5), + col = rgb(1, 0, 0, 0.5)) > abline(v=mean(p1), col="black", lwd=3) > abline(v=mean(s2), + col="blue", lwd=3) > lo1 <- mean(p1)-se.z > hi1 <- mean(p1)+se.z > lo2 <- mean(p1)-2*se.z > hi2 <- mean(p1)+2*se.z > abline(v=c(lo1, hi1, lo2, hi2), + col=c("green","green", "brown", "brown"), + lwd=2) > > # difference 
between black and blue line > # divided by > # se(s2) (= random difference) > # t.value > mean(s2) [1] 111.4782 > mean(p1) [1] 100 > diff <- mean(s2)-mean(p1) > diff [1] 11.47817 > se(s2) [1] 0.9715675 > diff/se(s2) [1] 11.81408 > > t.cal.os [1] 11.81408 > > ######################## > # 2 sample t-test > ######################## > # 가정. 아래에서 추출하는 두 > # 샘플의 모집단의 파라미터를 > # 모른다. > s1 <- sample(p1, s.size) > s2 <- sample(p2, s.size) > > mean(s1) [1] 99.76674 > mean(s2) [1] 109.8031 > ss(s1) [1] 9450.294 > ss(s2) [1] 9666.344 > df.s1 <- length(s1)-1 > df.s2 <- length(s2)-1 > df.s1 [1] 99 > df.s2 [1] 99 > > pooled.variance <- (ss(s1)+ss(s2))/(df.s1+df.s2) > pooled.variance [1] 96.54867 > se.ts <- sqrt((pooled.variance/length(s1))+(pooled.variance/length(s2))) > se.ts [1] 1.389595 > t.cal.ts <- (mean(s1)-mean(s2))/se.ts > t.cal.ts [1] -7.22252 > p.val.ts <- pt(abs(t.cal.ts), df=df.s1+df.s2, lower.tail = F) * 2 > p.val.ts [1] 1.074129e-11 > > t.test(s1, s2, var.equal = T) Two Sample t-test data: s1 and s2 t = -7.2225, df = 198, p-value = 1.074e-11 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -12.776681 -7.296071 sample estimates: mean of x mean of y 99.76674 109.80312 > > se(s1) [1] 0.9770236 > se(s2) [1] 0.9881287 > > mean(s1)+c(-se(s1)*2, se(s1)*2) [1] 97.8127 101.7208 > mean(s2)+c(-se(s2)*2, se(s2)*2) [1] 107.8269 111.7794 > > mean(p1) [1] 100 > mean(p2) [1] 110 > > hist(s1, breaks=50, + col = rgb(1, 0, 0, 0.5)) > hist(s2, breaks=50, add=T, col=rgb(0,0,1,1)) > abline(v=mean(s1), col="green", lwd=3) > # hist(s2, breaks=50, add=TRUE, col = rgb(0, 0, 1, 0.5)) > abline(v=mean(s2), col="lightblue", lwd=3) > > diff <- mean(s1)-mean(s2) > se.ts [1] 1.389595 > diff/se.ts [1] -7.22252 > > #### > # repeated measure t-test > # we can use the above case > # pop paramter unknown > # two consecutive measurement > # for the same sample > > t1 <- s1 > t2 <- s2 > mean(t1) [1] 99.76674 > mean(t2) [1] 109.8031 > diff.s <- t1 - t2 > 
diff.s [1] -12.3185215 -5.7888766 -20.4663333 -7.6935641 -15.8599368 -3.8038147 -14.6440996 [8] -11.3187618 -8.0302577 -13.4860776 -8.1687444 -11.0823726 -22.9605048 -12.0964467 [15] 2.5297067 -6.8128211 -17.1935822 -1.9813741 -14.3537710 -17.6430691 -7.9493800 [22] -28.6689697 -33.5474957 -12.9600175 10.3033820 -15.8633058 -26.1378302 -2.9662922 [29] -19.6455177 10.2363599 -5.7970308 -22.8465012 -20.4341559 4.2259746 -21.0398539 [36] -20.3535296 6.1674969 -20.7826608 -8.1970613 -10.4180147 24.1827392 15.9046864 [43] 19.1082815 -30.8755695 -21.5129671 -4.8939917 -1.1518824 -39.6666501 -3.3716806 [50] -2.0099206 -22.5440102 -0.5860960 -1.5608043 -13.4030543 -9.8508005 -10.0521662 [57] 16.1389067 -8.0517121 -2.4607125 -21.1944560 -24.8550535 15.1671317 -10.0196995 [64] -8.3894542 -17.5945751 -15.9720408 7.9406532 11.7115683 -33.0230910 -10.2871065 [71] -4.9529100 17.7070205 0.2334945 -11.8526240 0.9691715 9.9437888 -4.5845758 [78] 1.7267771 -22.6725639 -11.4846042 -15.9595932 -28.0616511 2.9437361 -15.2163948 [85] -11.7369206 1.7589732 -22.5446572 -26.0919775 -12.6457662 -17.2080987 1.6991409 [92] -16.7465922 -22.9570187 -30.2519337 -23.0903846 -29.1119243 15.1849488 -20.7395068 [99] -9.3179223 -25.5558689 > t.cal.rm <- mean(diff.s)/se(diff.s) > t.cal.rm [1] -7.637322 > p.val.rm <- pt(abs(t.cal.rm), length(s1)-1, lower.tail = F) * 2 > p.val.rm [1] 1.423701e-11 > t.test(s1, s2, paired = T) Paired t-test data: s1 and s2 t = -7.6373, df = 99, p-value = 1.424e-11 alternative hypothesis: true mean difference is not equal to 0 95 percent confidence interval: -12.643880 -7.428872 sample estimates: mean difference -10.03638 > > # create multiple histogram > s.all <- c(s1,s2) > mean(s.all) [1] 104.7849 > hist(s1, col='grey', breaks=50, xlim=c(50, 150)) > hist(s2, col='darkgreen', breaks=50, add=TRUE) > abline(v=c(mean(s.all)), + col=c("red"), lwd=3) > abline(v=c(mean(s1), mean(s2)), + col=c("black", "green"), lwd=3) > > comb <- data.frame(s1,s2) > dat <- stack(comb) > 
head(dat) values ind 1 103.18507 s1 2 92.73934 s1 3 110.40121 s1 4 99.40216 s1 5 103.37853 s1 6 106.80234 s1 > > m.tot <- mean(s.all) > m.s1 <- mean(s1) > m.s2 <- mean(s2) > > ss.tot <- ss(s.all) > ss.s1 <- ss(s1) > ss.s2 <- ss(s2) > > df.tot <- length(s.all)-1 > df.s1 <- length(s1)-1 > df.s2 <- length(s2)-1 > > ms.tot <- var(s.all) > ms.tot [1] 121.3723 > ss.tot/df.tot [1] 121.3723 > > var(s1) [1] 95.45751 > ss.s1 / df.s1 [1] 95.45751 > > var(s2) [1] 97.63983 > ss.s2 / df.s2 [1] 97.63983 > > ss.b.s1 <- length(s1) * ((m.tot - m.s1)^2) > ss.b.s2 <- length(s2) * ((m.tot - m.s1)^2) > ss.bet <- ss.b.s1+ss.b.s2 > ss.bet [1] 5036.442 > > ss.wit <- ss.s1 + ss.s2 > ss.wit [1] 19116.64 > > ss.bet + ss.wit [1] 24153.08 > ss.tot [1] 24153.08 > > library(dplyr) > # df.bet <- length(unique(dat)) - 1 > df.bet <- nlevels(dat$ind) - 1 > df.wit <- df.s1+df.s2 > df.bet [1] 1 > df.wit [1] 198 > df.bet+df.wit [1] 199 > df.tot [1] 199 > > ms.bet <- ss.bet / df.bet > ms.wit <- ss.wit / df.wit > ms.bet [1] 5036.442 > ms.wit [1] 96.54867 > > f.cal <- ms.bet / ms.wit > f.cal [1] 52.1648 > pf(f.cal, df.bet, df.wit, lower.tail = F) [1] 1.074129e-11 > > > f.test <- aov(dat$values~ dat$ind, data = dat) > summary(f.test) Df Sum Sq Mean Sq F value Pr(>F) dat$ind 1 5036 5036 52.16 1.07e-11 *** Residuals 198 19117 97 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > > sqrt(f.cal) [1] 7.22252 > t.cal.ts [1] -7.22252 > > # this is anova after all. > >