r:sampling_distribution
This is an old revision of the document!
rm(list=ls()) rnorm2 <- function(n,mean,sd){ mean+sd*scale(rnorm(n)) } se <- function(sample) { sd(sample)/sqrt(length(sample)) } ss <- function(x) { sum((x-mean(x))^2) } ################################ # reminder. pnorm(-1) pnorm(1, lower.tail = F) 1-(pnorm(-1) + pnorm(1, lower.tail = F)) 1-(pnorm(-1)*2) pnorm(-2) pnorm(2, lower.tail = F) 1-(pnorm(-2)*2) pnorm(-3) pnorm(3, lower.tail = F) 1-(pnorm(-3)*2) # 68% a <- qnorm(.32/2) b <- qnorm(1-.32/2) c(a, b) c(-1, 1) # 95% c <- qnorm(.05/2) d <- qnorm(1-.05/2) c(c, d) c(-2,2) # 99% e <- qnorm(.01/2) f <- qnorm(1-.01/2) c(e,f) c(-3,3) pnorm(b)-pnorm(a) pnorm(d)-pnorm(c) pnorm(f)-pnorm(e) ################################ N.p <- 1000000 m.p <- 100 sd.p <- 10 p1 <- rnorm2(N.p, m.p, sd.p) mean(p1) sd(p1) p2 <- rnorm2(N.p, m.p+30, sd.p) mean(p2) sd(p2) hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5), main = "histogram of p1",) abline(v=mean(p1), col="black", lwd=3) s.size <- 25 means.temp <- c() s1 <- sample(p1, s.size, replace = T) mean(s1) means.temp <- append(means.temp, mean(s1)) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp iter <- 1000000 means <- c() means <- rep(NA, iter) for (i in 1:iter) { # means <- append(means, mean(sample(p1, s.size, replace = T))) means[i] <- mean(sample(p1, s.size, replace = T)) } sd(means) se.s <- sd(means) se.z <- sqrt(var(p1)/s.size) se.z <- c(se.z) se.z # sd of sample means (sd(means)) # is sqrt(var(s1)/s.size) or # sd(s1) / sqrt(s.size) # = se.s # when iter value goes to # unlimited value: # mean(means) = mean(p1) # and # sd(means) = sd(p1) / sqrt(s.size) # that is, sd(means) = se.z # This is called CLT (Central Limit Theorem) mean(means) mean(p1) sd(means) se.z lo1 <- mean(means)-se.s hi1 <- mean(means)+se.s lo2 <- mean(means)-2*se.s hi2 <- mean(means)+2*se.s lo3 <- mean(means)-3*se.s hi3 <- mean(means)+3*se.s # because CLT loz1 <- mean(p1)-se.z hiz1 <- mean(p1)+se.z loz2 <- mean(p1)-2*se.z hiz2 <- mean(p1)+2*se.z loz3 <- mean(p1)-3*se.z hiz3 <- mean(p1)+3*se.z c(lo1, loz1) c(lo2, loz2) c(lo3, loz3) c(hi1, hiz1) c(hi2, hiz2) c(hi3, hiz3) hist(means, xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), col = rgb(1, 0, 0, .5)) abline(v=mean(means), col="black", lwd=2) abline(v=mean(p1), col="red", lwd=2) # abline(v=mean(p2), colo='darkgreen', lwd=3) abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), col=c("green","green", "blue", "blue", "orange", "orange"), lwd=2) abline(v=c(loz1, hiz1, loz2, hiz2, loz3, hiz3), col=c("red","red", "red", "red", "red", "red"), lwd=2) round(c(lo1, hi1)) round(c(lo2, hi2)) round(c(lo3, hi3)) round(c(loz1, hiz1)) round(c(loz2, hiz2)) round(c(loz3, hiz3)) m.sample.i.got <- 101.8 # m.sample.i.got <- mean(sample(p1, s.size)) hist(means, xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), col = rgb(1, 0, 0, .5)) abline(v=mean(means), col="black", lwd=3) abline(v=m.sample.i.got, col='darkgreen', lwd=3) # what is the probablity of getting # greater than # m.sample.i.got? m.sample.i.got pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) # then, what is the probabilty of getting # greater than m.sample.i.got and # less than corresponding value, which is # mean(means) - m.sample.i.got - mean(means) # (green line) tmp <- mean(means) - (m.sample.i.got - mean(means)) abline(v=tmp, col='green', lwd=3) 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)
r/sampling_distribution.1757406706.txt.gz · Last modified: 2025/09/09 17:31 by hkimscil