rm(list=ls()) rnorm2 <- function(n,mean,sd){ mean+sd*scale(rnorm(n)) } ss <- function(x) { sum((x-mean(x))^2) } N.p <- 1000000 m.p <- 100 sd.p <- 10 set.seed(101) p1 <- rnorm2(N.p, m.p, sd.p) mean(p1) sd(p1) p2 <- rnorm2(N.p, m.p+10, sd.p) mean(p2) sd(p2) sz1 <- sz2 <- 50 df1 <- sz1 - 1 df2 <- sz2 - 1 df.tot <- df1 + df2 iter <- 100000 mdiffs <- rep(NA, iter) means.s1 <- rep(NA, iter) means.s2 <- rep(NA, iter) tail(mdiffs) for (i in 1:iter) { # means <- append(means, mean(sample(p1, s.size, replace = T))) s1 <- sample(p1, sz1, replace = T) s2 <- sample(p2, sz2, replace = T) means.s1[i] <- mean(s1) means.s2[i] <- mean(s2) mdiffs[i] <- mean(s1)-mean(s2) } # 정리, 증명에 의한 계산 mu <- mean(p1) - mean(p2) # var(x1bar-x2bar) = var(x1bar) + var(x2bar) # var(x1bar) = var(x1)/n, n = sample size ms <- var(p1)/sz1 + var(p2)/sz2 se <- sqrt(ms) mu ms se # 시뮬레이션에 의한 집합 (distribution) # mdiffs m.diff <- mean(mdiffs) var.diff <- var(mdiffs) sd.diff <- sd(mdiffs) m.diff var.diff sd.diff var(means.s1) var(p1)/sz1 var(means.s2) var(p2)/sz2 var(means.s1-means.s2) var(means.s1) + var(means.s2) - 2 * cov(means.s1, means.s2) # 두 집합이 완전히 독립적일 때 cov = 0 이므로 var(p1)/sz1 + var(p2)/sz2 var.diff <- (var(p1)/sz1) + (var(p2)/sz2) var.diff sqrt(var.diff) se.diff <- sqrt(var.diff) se.diff # 이것을 그래프로 그려보면 hist(mdiffs, breaks=50) abline(v=mean(mdiffs), col="black", lwd=2) se.diff one <- qt(1-(.32/2), df=df.tot) two <- qt(1-(.05/2), df=df.tot) thr <- qt(1-(.01/2), df=df.tot) ci68 <- se.diff*one ci95 <- se.diff*two ci99 <- se.diff*thr abline(v=c(m.diff-ci68, m.diff-ci95, m.diff-ci99, m.diff+ci68, m.diff+ci95, m.diff+ci99), col=c("green", "blue", "black"), lwd=2) text(x=m.diff-ci95, y=iter/12, labels=paste(round(m.diff-ci95,3)), col="blue", pos = 4) text(x=m.diff+ci95, y=iter/12, labels=paste(round(m.diff+ci95,3)), col="blue", pos = 4) s1 <- sample(p1, sz1, replace = T) s2 <- sample(p2, sz2, replace = T) m.diff <- mean(s1)-mean(s2) m.diff # <- 100 번 중 95번은 위의 블루라인 안쪽에서 pv <- (ss(s1) + ss(s2))/(df1 + df2) pv ms1 <- ss(s1)/df1 ms2 <- ss(s2)/df2 ms1 ms2 (ms1 + ms2)/2 # se <- sqrt(ms.a/sz1 + ms.b/sz2) # se se.z <- sqrt(pv/sz1 + pv/sz2) se.z diff <- mean(s1)-mean(s2) t.cal <- diff / se.z t.test(s1,s2, var.equal = T) t.cal df.tot print(p.val <- pt(abs(t.cal), df.tot, lower.tail = F)*2) print(mean.diff <- mean(s1)-mean(s2)) two <- qt(.05/2, df.tot) two # two <- -2 lo2 <- se.z * two lo2 mean.diff+c(lo2,-lo2) zdiffs <- scale(mdiffs) se.diff <- sd.diff hist(zdiffs, breaks=50, xlim =c(-10,10)) two abline(v=c(0, 0-two, 0+two), col="blue") text(x=two,y=iter*.03,labels=round(two,3), pos=3) text(x=two,y=iter*.02, labels=.05,pos=3) abline(v=c(t.cal,-t.cal), col="red") text(x=t.cal,y=iter*.06,labels=round(t.cal,3),pos=3) text(x=t.cal, y=iter*.05,labels=round(p.val,7),pos=3) p.val ### # what if s1 and s2 are from # the same pop? # the distribution of sample mean # difference should be # normal, mu = 0, var = var.p1/s.size + var.p2/s.size iter <- 100000 # means <- c() mdiffs <- rep(NA, iter) means.s3 <- rep(NA, iter) means.s4 <- rep(NA, iter) tail(mdiffs) for (i in 1:iter) { # means <- append(means, mean(sample(p1, s.size, replace = T))) s3 <- sample(p1, sz1, replace = T) s4 <- sample(p1, sz2, replace = T) means.s3[i] <- mean(s3) means.s4[i] <- mean(s4) mdiffs[i] <- mean(s3)-mean(s4) } mu <- mean(p1) - mean(p1) ms <- var(p1)/sz1 + var(p1)/sz2 se <- sqrt(ms) mu ms se m.diff <- mean(mdiffs) var.diff <- var(mdiffs) sd.diff <- sd(mdiffs) m.diff var.diff sd.diff s3 <- sample(p1, sz1, replace=T) s4 <- sample(p1, sz2, replace=T) t.test(s3, s4, var.equal=T) print(m.diff <- mean(s3)-mean(s4)) # 위의 value는 0을 중심으로 -4 +4 사이에 있을 확률이 95퍼센트이다. # 정확히는 mean difference = p1 - p1 = 0 se = 2 95% Interval = two print(two <- qt(.05/2,df.tot)) # 아래에 ss(p1) 대신에 ss(s3)를 사용한 것은 # 모집단의 정보가 없음을 가정하기 때문에 pv <- (ss(s3)+ss(s4))/(df1+df2) se.diff <- sqrt(pv/sz1 + pv/sz2) se.diff lo <- two*se.diff hi <- -lo c(lo, hi) # let's see iter <- 1000 means.s3 <- means.s4 <- rep(NA, iter) mdiffs <- rep(NA, iter) for (i in 1:iter) { # means <- append(means, mean(sample(p1, s.size, replace = T))) s3 <- sample(p1, sz1, replace = T) s4 <- sample(p1, sz2, replace = T) means.s3[i] <- mean(s3) means.s4[i] <- mean(s4) mdiffs[i] <- mean(s3)-mean(s4) } table(mdiffs < -4 | mdiffs > 4) # repeated measure = to be deleted sz <- 50 t5 <- rnorm2(sz, 75, 10) t6 <- rnorm2(sz, 82, 10) t.test(t5, t6, paired=T) diff <- t5-t6 sd.diff <- sd(diff) se.diff <- sd.diff/sqrt(sz) m.diff <- mean(diff) t.cal <- m.diff/se.diff p.val <- pt(abs(t.cal), df=sz-1, lower.tail = F)*2 df <- sz-1 t.cal df p.val m.diff se.diff two <- qt(.05/2, df=sz-1) two lo2 <- se.diff*two lo2 m.diff + lo2 m.diff - lo2 > rm(list=ls()) > rnorm2 <- function(n,mean,sd){ + mean+sd*scale(rnorm(n)) + } > ss <- function(x) { + sum((x-mean(x))^2) + } > > N.p <- 1000000 > m.p <- 100 > sd.p <- 10 > > set.seed(101) > p1 <- rnorm2(N.p, m.p, sd.p) > mean(p1) [1] 100 > sd(p1) [1] 10 > > p2 <- rnorm2(N.p, m.p+10, sd.p) > mean(p2) [1] 110 > sd(p2) [1] 10 > > sz1 <- sz2 <- 50 > df1 <- sz1 - 1 > df2 <- sz2 - 1 > df.tot <- df1 + df2 > > iter <- 100000 > mdiffs <- rep(NA, iter) > means.s1 <- rep(NA, iter) > means.s2 <- rep(NA, iter) > tail(mdiffs) [1] NA NA NA NA NA NA > > for (i in 1:iter) { + # means <- append(means, mean(sample(p1, s.size, replace = T))) + s1 <- sample(p1, sz1, replace = T) + s2 <- sample(p2, sz2, replace = T) + means.s1[i] <- mean(s1) + means.s2[i] <- mean(s2) + mdiffs[i] <- mean(s1)-mean(s2) + } > # 정리, 증명에 의한 계산 > mu <- mean(p1) - mean(p2) > # var(x1bar-x2bar) = var(x1bar) + var(x2bar) > # var(x1bar) = var(x1)/n, n = sample size > ms <- var(p1)/sz1 + var(p2)/sz2 > se <- sqrt(ms) > > mu [1] -10 > ms [,1] [1,] 4 > se [,1] [1,] 2 > > # 시뮬레이션에 의한 집합 (distribution) > # mdiffs > m.diff <- mean(mdiffs) > var.diff <- var(mdiffs) > sd.diff <- sd(mdiffs) > m.diff [1] -9.988058 > var.diff [1] 4.023723 > sd.diff [1] 2.005922 > > var(means.s1) [1] 2.002125 > var(p1)/sz1 [,1] [1,] 2 > var(means.s2) [1] 2.014368 > var(p2)/sz2 [,1] [1,] 2 > var(means.s1-means.s2) [1] 4.023723 > var(means.s1) + var(means.s2) - 2 * cov(means.s1, means.s2) [1] 4.023723 > # 두 집합이 완전히 독립적일 때 cov = 0 이므로 > var(p1)/sz1 + var(p2)/sz2 [,1] [1,] 4 > > var.diff <- (var(p1)/sz1) + (var(p2)/sz2) > var.diff [,1] [1,] 4 > sqrt(var.diff) [,1] [1,] 2 > se.diff <- sqrt(var.diff) > se.diff [,1] [1,] 2 > > # 이것을 그래프로 그려보면 > hist(mdiffs, breaks=50) > abline(v=mean(mdiffs), + col="black", lwd=2) > > se.diff [,1] [1,] 2 > one <- qt(1-(.32/2), df=df.tot) > two <- qt(1-(.05/2), df=df.tot) > thr <- qt(1-(.01/2), df=df.tot) > > ci68 <- se.diff*one > ci95 <- se.diff*two > ci99 <- se.diff*thr > > abline(v=c(m.diff-ci68, m.diff-ci95, m.diff-ci99, + m.diff+ci68, m.diff+ci95, m.diff+ci99), + col=c("green", "blue", "black"), lwd=2) > text(x=m.diff-ci95, y=iter/12, + labels=paste(round(m.diff-ci95,3)), + col="blue", pos = 4) > text(x=m.diff+ci95, y=iter/12, + labels=paste(round(m.diff+ci95,3)), + col="blue", pos = 4) > > s1 <- sample(p1, sz1, replace = T) > s2 <- sample(p2, sz2, replace = T) > m.diff <- mean(s1)-mean(s2) > m.diff # <- 100 번 중 95번은 위의 블루라인 안쪽에서 [1] -6.959851 > > pv <- (ss(s1) + ss(s2))/(df1 + df2) > pv [1] 106.6359 > > ms1 <- ss(s1)/df1 > ms2 <- ss(s2)/df2 > ms1 [1] 105.3544 > ms2 [1] 107.9175 > (ms1 + ms2)/2 [1] 106.6359 > > # se <- sqrt(ms.a/sz1 + ms.b/sz2) > # se > se.z <- sqrt(pv/sz1 + pv/sz2) > se.z [1] 2.065293 > > diff <- mean(s1)-mean(s2) > t.cal <- diff / se.z > > t.test(s1,s2, var.equal = T) Two Sample t-test data: s1 and s2 t = -3.3699, df = 98, p-value = 0.001077 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -11.058359 -2.861343 sample estimates: mean of x mean of y 101.6366 108.5965 > > t.cal [1] -3.369909 > df.tot [1] 98 > print(p.val <- pt(abs(t.cal), df.tot, lower.tail = F)*2) [1] 0.001076634 > print(mean.diff <- mean(s1)-mean(s2)) [1] -6.959851 > two <- qt(.05/2, df.tot) > two [1] -1.984467 > # two <- -2 > lo2 <- se.z * two > lo2 [1] -4.098508 > mean.diff+c(lo2,-lo2) [1] -11.058359 -2.861343 > > zdiffs <- scale(mdiffs) > se.diff <- sd.diff > hist(zdiffs, breaks=50, + xlim =c(-10,10)) > two [1] -1.984467 > abline(v=c(0, 0-two, 0+two), col="blue") > text(x=two,y=iter*.03,labels=round(two,3), pos=3) > text(x=two,y=iter*.02, labels=.05,pos=3) > abline(v=c(t.cal,-t.cal), col="red") > text(x=t.cal,y=iter*.06,labels=round(t.cal,3),pos=3) > text(x=t.cal, y=iter*.05,labels=round(p.val,7),pos=3) > p.val [1] 0.001076634 > > ### > # what if s1 and s2 are from > # the same pop? > # the distribution of sample mean > # difference should be > # normal, mu = 0, var = var.p1/s.size + var.p2/s.size > > iter <- 100000 > # means <- c() > mdiffs <- rep(NA, iter) > means.s3 <- rep(NA, iter) > means.s4 <- rep(NA, iter) > tail(mdiffs) [1] NA NA NA NA NA NA > > for (i in 1:iter) { + # means <- append(means, mean(sample(p1, s.size, replace = T))) + s3 <- sample(p1, sz1, replace = T) + s4 <- sample(p1, sz2, replace = T) + means.s3[i] <- mean(s3) + means.s4[i] <- mean(s4) + mdiffs[i] <- mean(s3)-mean(s4) + } > > mu <- mean(p1) - mean(p1) > ms <- var(p1)/sz1 + var(p1)/sz2 > se <- sqrt(ms) > > mu [1] 0 > ms [,1] [1,] 4 > se [,1] [1,] 2 > > m.diff <- mean(mdiffs) > var.diff <- var(mdiffs) > sd.diff <- sd(mdiffs) > m.diff [1] -0.00273072 > var.diff [1] 3.997207 > sd.diff [1] 1.999302 > > s3 <- sample(p1, sz1, replace=T) > s4 <- sample(p1, sz2, replace=T) > t.test(s3, s4, var.equal=T) Two Sample t-test data: s3 and s4 t = 1.7165, df = 98, p-value = 0.08924 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.5535058 7.6433427 sample estimates: mean of x mean of y 103.04431 99.49939 > print(m.diff <- mean(s3)-mean(s4)) [1] 3.544918 > # 위의 value는 0을 중심으로 -4 +4 사이에 있을 확률이 95퍼센트이다. > # 정확히는 mean difference = p1 - p1 = 0 se = 2 95% Interval = two > print(two <- qt(.05/2,df.tot)) [1] -1.984467 > # 아래에 ss(p1) 대신에 ss(s3)를 사용한 것은 > # 모집단의 정보가 없음을 가정하기 때문에 > pv <- (ss(s3)+ss(s4))/(df1+df2) > se.diff <- sqrt(pv/sz1 + pv/sz2) > se.diff [1] 2.065251 > lo <- two*se.diff > hi <- -lo > c(lo, hi) [1] -4.098424 4.098424 > > > # let's see > > iter <- 1000 > means.s3 <- means.s4 <- rep(NA, iter) > mdiffs <- rep(NA, iter) > > for (i in 1:iter) { + # means <- append(means, mean(sample(p1, s.size, replace = T))) + s3 <- sample(p1, sz1, replace = T) + s4 <- sample(p1, sz2, replace = T) + means.s3[i] <- mean(s3) + means.s4[i] <- mean(s4) + mdiffs[i] <- mean(s3)-mean(s4) + } > > table(mdiffs < -4 | mdiffs > 4) FALSE TRUE 951 49 > > # repeated measure = to be deleted > sz <- 50 > t5 <- rnorm2(sz, 75, 10) > t6 <- rnorm2(sz, 82, 10) > > t.test(t5, t6, paired=T) Paired t-test data: t5 and t6 t = -3.3984, df = 49, p-value = 0.001355 alternative hypothesis: true mean difference is not equal to 0 95 percent confidence interval: -11.13931 -2.86069 sample estimates: mean difference -7 > > diff <- t5-t6 > sd.diff <- sd(diff) > se.diff <- sd.diff/sqrt(sz) > m.diff <- mean(diff) > > t.cal <- m.diff/se.diff > p.val <- pt(abs(t.cal), df=sz-1, lower.tail = F)*2 > df <- sz-1 > > t.cal [1] -3.398399 > df [1] 49 > p.val [1] 0.001354538 > > m.diff [1] -7 > se.diff [1] 2.059793 > two <- qt(.05/2, df=sz-1) > two [1] -2.009575 > lo2 <- se.diff*two > lo2 [1] -4.13931 > m.diff + lo2 [1] -11.13931 > m.diff - lo2 [1] -2.86069 > {{.:pasted:20260405-235733.png}} {{.:pasted:20260405-235744.png}}