note.w02
This is an old revision of the document!
Sampling Distribution and z-test
rm(list=ls()) rnorm2 <- function(n,mean,sd){ mean+sd*scale(rnorm(n)) } se <- function(sample) { sd(sample)/sqrt(length(sample)) } ss <- function(x) { sum((x-mean(x))^2) } ################################ N.p <- 1000000 m.p <- 100 sd.p <- 10 p1 <- rnorm2(N.p, m.p, sd.p) mean(p1) sd(p1) p2 <- rnorm2(N.p, m.p+5, sd.p) mean(p2) sd(p2) m.p1 <- mean(p1) sd.p1 <- sd(p1) var(p1) hist(p1, breaks=50, col = rgb(1, 1, 1, 0.5), main = "histogram of p1 and p2",) abline(v=mean(p1), col="black", lwd=3) hist(p2, add=T, breaks=50, col=rgb(1,1,.5,.5)) abline(v=mean(p2), col="red", lwd=3) hist(p1, breaks=50, col=rgb(0,.5,.5,.5)) abline(v=mean(p1),lwd=2) abline(v=mean(p1)-sd(p1), lwd=2) abline(v=mean(p1)+sd(p1), lwd=2) abline(v=c(m.p1-2*sd.p1, m.p1+2*sd.p1), lwd=2, col='red') abline(v=c(m.p1-3*sd.p1, m.p1+3*sd.p1), lwd=2, col='green') # area bet black = 68% # between red = 95% # between green = 99% pnorm(m.p1+sd.p1, m.p1, sd.p1) pnorm(m.p1-sd.p1, m.p1, sd.p1) pnorm(m.p1+sd.p1, m.p1, sd.p1) - pnorm(m.p1-sd.p1, m.p1, sd.p1) pnorm(m.p1+2*sd.p1, m.p1, sd.p1) - pnorm(m.p1-2*sd.p1, m.p1, sd.p1) pnorm(m.p1+3*sd.p1, m.p1, sd.p1) - pnorm(m.p1-3*sd.p1, m.p1, sd.p1) m.p1 sd.p1 (m.p1-m.p1)/sd.p1 ((m.p1-sd.p1) - m.p1) / sd.p1 (120-100)/10 pnorm(1)-pnorm(-1) pnorm(2)-pnorm(-2) pnorm(3)-pnorm(3) 1-pnorm(-2)*2 pnorm(2)-pnorm(-2) pnorm(120, 100, 10) pnorm(2)-pnorm(-2) zscore <- (120-100)/10 pnorm(zscore)-pnorm(-zscore) zscore # reminder. pnorm(-1) pnorm(1, 0, 1, lower.tail = F) pnorm(110, 100, 10, lower.tail = F) zscore <- (110-100)/10 pnorm(zscore, lower.tail = F) pnorm(118, 100, 10, lower.tail = F) pnorm(18/10, lower.tail = F) z.p1 <- (p1-mean(p1))/sd(p1) mean(z.p1) round(mean(z.p1),10) sd(z.p1) pnorm(1.8)-pnorm(-1.8) hist(z.p1, breaks=50, col=rgb(1,0,0,0)) abline(v=c(m.p1, -1.8, 1.8), col='red') 1-(pnorm(1.8)-pnorm(-1.8)) pnorm(1)-pnorm(-1) 1-(pnorm(-1)*2) pnorm(2)-pnorm(-2) 1-(pnorm(-2)*2) 1-(pnorm(-3)*2) # hist(p1, breaks=50, col=rgb(.9,.9,.9,.9)) abline(v=mean(p1),lwd=2) abline(v=mean(p1)-sd(p1), lwd=2) abline(v=mean(p1)+sd(p1), lwd=2) abline(v=c(m.p1-2*sd.p1, m.p1+2*sd.p1), lwd=2, col='red') abline(v=c(m.p1-3*sd.p1, m.p1+3*sd.p1), lwd=2, col='green') # 68% a <- qnorm(.32/2) b <- qnorm(1-.32/2) c(a, b) c(-1, 1) # note that .32/2 pnorm(-1) qnorm(.32/2) qnorm(pnorm(-1)) # 95% c <- qnorm(.05/2) d <- qnorm(1-.05/2) c(c, d) c(-2,2) # 99% e <- qnorm(.01/2) f <- qnorm(1-.01/2) c(e,f) c(-3,3) pnorm(b)-pnorm(a) c(a, b) pnorm(d)-pnorm(c) c(c, d) pnorm(f)-pnorm(e) c(e, f) qnorm(.5) qnorm(1) ################################ s.size <- 50 means.temp <- c() s1 <- sample(p1, s.size, replace = T) mean(s1) means.temp <- append(means.temp, mean(s1)) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp iter <- 1000000 # means <- c() means <- rep(NA, iter) for (i in 1:iter) { # means <- append(means, mean(sample(p1, s.size, replace = T))) means[i] <- mean(sample(p1, s.size, replace = T)) } length(means) mean(means) sd(means) se.s <- sd(means) hist(means, breaks=50, xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), col=rgb(1, 1, 1, .5)) abline(v=mean(means), col="black", lwd=3) # now we want to get sd of this distribution lo1 <- mean(means)-se.s hi1 <- mean(means)+se.s lo2 <- mean(means)-2*se.s hi2 <- mean(means)+2*se.s lo3 <- mean(means)-3*se.s hi3 <- mean(means)+3*se.s abline(v=mean(means), col="black", lwd=2) # abline(v=mean(p2), colo='darkgreen', lwd=2) abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), col=c("red","red", "blue", "blue", "orange", "orange"), lwd=2) # meanwhile . . . . se.s se.z <- sqrt(var(p1)/s.size) se.z <- c(se.z) se.z se.z.adjusted <- sqrt(var(s1)/s.size) se.z.adjusted # sd of sample means (sd(means)) # = se.s # when iter value goes to # infinite value: # mean(means) = mean(p1) # and # sd(means) = sd(p1) / sqrt(s.size) # that is, se.s = se.z # This is called CLT (Central Limit Theorem) # see http://commres.net/wiki/cetral_limit_theorem mean(means) mean(p1) sd(means) var(p1) # remember we started talking sample size 10 sqrt(var(p1)/s.size) se.z sd(means) se.s se.z # because CLT loz1 <- mean(p1)-se.z hiz1 <- mean(p1)+se.z loz2 <- mean(p1)-2*se.z hiz2 <- mean(p1)+2*se.z loz3 <- mean(p1)-3*se.z hiz3 <- mean(p1)+3*se.z c(lo1, loz1) c(lo2, loz2) c(lo3, loz3) c(hi1, hiz1) c(hi2, hiz2) c(hi3, hiz3) hist(means, breaks=50, xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), col = rgb(1, 1, 1, .5)) abline(v=mean(means), col="black", lwd=3) # abline(v=mean(p2), colo='darkgreen', lwd=3) abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), col=c("darkgreen","darkgreen", "blue", "blue", "orange", "orange"), lwd=2) round(c(lo1, hi1)) round(c(lo2, hi2)) round(c(lo3, hi3)) round(c(loz1, hiz1)) round(c(loz2, hiz2)) round(c(loz3, hiz3)) m.sample.i.got <- mean(means)+ 1.5*sd(means) m.sample.i.got hist(means, breaks=30, xlim = c(mean(means)-7*sd(means), mean(means)+10*sd(means)), col = rgb(1, 1, 1, .5)) abline(v=mean(means), col="black", lwd=3) abline(v=m.sample.i.got, col='darkgreen', lwd=3) # what is the probablity of getting # greater than # m.sample.i.got? m.sample.i.got pnorm(m.sample.i.got, mean(means), sd(means), lower.tail = F) pnorm(m.sample.i.got, mean(p1), se.z, lower.tail = F) # then, what is the probabilty of getting # greater than m.sample.i.got and # less than corresponding value, which is # mean(means) - m.sample.i.got - mean(means) # (green line) tmp <- mean(means) - (m.sample.i.got - mean(means)) abline(v=tmp, col='red', lwd=3) 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) m.sample.i.got ### one more time # this time, with a story mean(p2) sd(p2) s.from.p2 <- sample(p2, s.size) m.s.from.p2 <- mean(s.from.p2) m.s.from.p2 se.s se.z sd(means) tmp <- mean(means) - (m.s.from.p2 - mean(means)) tmp hist(means, breaks=30, xlim = c(tmp-4*sd(means), m.s.from.p2+4*sd(means)), col = rgb(1, 1, 1, .5)) abline(v=mean(means), col="black", lwd=3) abline(v=m.s.from.p2, col='blue', lwd=3) # what is the probablity of getting # greater than # m.sample.i.got? m.s.from.p2 pnorm(m.s.from.p2, mean(p1), se.z, lower.tail = F) # then, what is the probabilty of getting # greater than m.sample.i.got and # less than corresponding value, which is # mean(means) - m.sample.i.got - mean(means) # (green line) abline(v=tmp, col='red', lwd=3) 2 * pnorm(m.s.from.p2, mean(p1), se.z, lower.tail = F) se.z sd(s.from.p2)/sqrt(s.size) se.z.adjusted <- sqrt(var(s.from.p2)/s.size) se.z.adjusted 2 * pnorm(m.s.from.p2, mean(p1), se.z.adjusted, lower.tail = F) z.cal <- (m.s.from.p2 - mean(p1))/se.z.adjusted z.cal pnorm(z.cal, lower.tail = F)*2 pt(z.cal, 49, lower.tail = F)*2 t.test(s.from.p2, mu=mean(p1), var.equal = T)
note.w02.1757554535.txt.gz · Last modified: 2025/09/11 10:35 by hkimscil