r:sampling_distribution
This is an old revision of the document!
PS1. week02
# PS1 week02: sampling distribution demo -- population setup and the
# empirical (68/95/99.7) rule.
# NOTE(review): rm(list = ls()) wipes the workspace; kept because this is a
# stand-alone teaching script, but avoid it in reusable code.
rm(list = ls())

# Draw n values whose sample mean and sd are EXACTLY `mean` and `sd`
# (scale() standardizes to mean 0 / sd 1, then we rescale).
# Note: scale() returns a 1-column matrix, so the result is a matrix too.
rnorm2 <- function(n, mean, sd) {
  mean + sd * scale(rnorm(n))
}

# Standard error of the mean of `sample`.
se <- function(sample) {
  sd(sample) / sqrt(length(sample))
}

# Sum of squared deviations around the mean.
ss <- function(x) {
  sum((x - mean(x))^2)
}

################################
N.p <- 1000000   # population size
m.p <- 100       # population mean
sd.p <- 10       # population sd

p1 <- rnorm2(N.p, m.p, sd.p)
mean(p1)
sd(p1)

p2 <- rnorm2(N.p, m.p + 20, sd.p)
mean(p2)
sd(p2)

m.p1 <- mean(p1)
sd.p1 <- sd(p1)
var(p1)

hist(p1, breaks = 100, col = rgb(1, 1, 1, 1))
abline(v = mean(p1), lwd = 2)
abline(v = mean(p1) - sd(p1), lwd = 2)
abline(v = mean(p1) + sd(p1), lwd = 2)
abline(v = c(m.p1 - 2 * sd.p1, m.p1 + 2 * sd.p1), lwd = 2, col = 'red')
abline(v = c(m.p1 - 3 * sd.p1, m.p1 + 3 * sd.p1), lwd = 2, col = 'green')
# area between black lines = 68%
# area between red lines   = 95%
# area between green lines = 99%

pnorm(m.p1 + sd.p1, m.p1, sd.p1)
pnorm(m.p1 - sd.p1, m.p1, sd.p1)
pnorm(m.p1 + sd.p1, m.p1, sd.p1) - pnorm(m.p1 - sd.p1, m.p1, sd.p1)
pnorm(m.p1 + 2 * sd.p1, m.p1, sd.p1) - pnorm(m.p1 - 2 * sd.p1, m.p1, sd.p1)
pnorm(m.p1 + 3 * sd.p1, m.p1, sd.p1) - pnorm(m.p1 - 3 * sd.p1, m.p1, sd.p1)

m.p1
sd.p1
(m.p1 - m.p1) / sd.p1
((m.p1 - sd.p1) - m.p1) / sd.p1
(120 - 100) / 10

pnorm(1) - pnorm(-1)
pnorm(2) - pnorm(-2)
# BUG FIX: original read pnorm(3) - pnorm(3), which is always 0;
# the +/- 3 sd band needs pnorm(3) - pnorm(-3) (~ 0.9973).
pnorm(3) - pnorm(-3)
1 - pnorm(-2) * 2
pnorm(2) - pnorm(-2)
pnorm(120, 100, 10)
pnorm(2) - pnorm(-2)
zscore <- (120 - 100) / 10
pnorm(zscore) - pnorm(-zscore)
zscore
# reminder.
# Tail probabilities: lower.tail = FALSE gives P(X > x).
# (T/F replaced by TRUE/FALSE throughout -- T and F are reassignable.)
pnorm(-1)
pnorm(1, 0, 1, lower.tail = FALSE)
pnorm(110, 100, 10, lower.tail = FALSE)
zscore <- (110 - 100) / 10
pnorm(zscore, lower.tail = FALSE)
pnorm(118, 100, 10, lower.tail = FALSE)
pnorm(18 / 10, lower.tail = FALSE)

# Standardize p1: z-scores have mean 0 and sd 1.
z.p1 <- (p1 - mean(p1)) / sd(p1)
mean(z.p1)
round(mean(z.p1), 10)
sd(z.p1)

pnorm(1.8) - pnorm(-1.8)
hist(z.p1, breaks = 100, col = rgb(0, 0, 0, 0))
abline(v = c(m.p1, -1.8, 1.8), col = 'red')
1 - (pnorm(1.8) - pnorm(-1.8))

pnorm(1) - pnorm(-1)
1 - (pnorm(-1) * 2)
pnorm(2) - pnorm(-2)
1 - (pnorm(-2) * 2)
1 - (pnorm(-3) * 2)

# Redraw the population histogram with the 1/2/3-sd bands.
hist(p1, breaks = 100, col = rgb(1, 1, 1, 1))
abline(v = mean(p1), lwd = 2)
abline(v = mean(p1) - sd(p1), lwd = 2)
abline(v = mean(p1) + sd(p1), lwd = 2)
abline(v = c(m.p1 - 2 * sd.p1, m.p1 + 2 * sd.p1), lwd = 2, col = 'red')
abline(v = c(m.p1 - 3 * sd.p1, m.p1 + 3 * sd.p1), lwd = 2, col = 'green')

# Critical values via qnorm.
# NOTE(review): a-f shadow the base functions c() and f(); calls like
# c(a, b) still dispatch to the function, but avoid such names in real code.
# 68%
a <- qnorm(.32 / 2)
b <- qnorm(1 - .32 / 2)
c(a, b)
c(-1, 1)
# 95%
c <- qnorm(.05 / 2)
d <- qnorm(1 - .05 / 2)
c(c, d)
c(-2, 2)
# 99%
e <- qnorm(.01 / 2)
f <- qnorm(1 - .01 / 2)
c(e, f)
c(-3, 3)

pnorm(b) - pnorm(a)
c(a, b)
pnorm(d) - pnorm(c)
c(c, d)
pnorm(f) - pnorm(e)
c(e, f)
qnorm(.5)
qnorm(1)

################################
# Overlay the two populations (fixed: stray trailing comma after main=).
hist(p1, breaks = 50, col = rgb(1, 0, 0, 0.5),
     main = "histogram of p1 and p2")
abline(v = mean(p1), col = "black", lwd = 3)
hist(p2, add = TRUE, breaks = 50, col = rgb(0, 0, 1, .5))
abline(v = mean(p2), col = "violet", lwd = 3)

s.size <- 10

# Build a few sample means by hand first. append() grows the vector --
# fine for a five-element demo; the real run below preallocates.
means.temp <- c()
s1 <- sample(p1, s.size, replace = TRUE)
mean(s1)
means.temp <- append(means.temp, mean(s1))
means.temp <- append(means.temp, mean(sample(p1, s.size, replace = TRUE)))
means.temp <- append(means.temp, mean(sample(p1, s.size, replace = TRUE)))
means.temp <- append(means.temp, mean(sample(p1, s.size, replace = TRUE)))
means.temp <- append(means.temp, mean(sample(p1, s.size, replace = TRUE)))
means.temp

# Sampling distribution of the mean: preallocate, then fill.
iter <- 1000000
# means <- c()
means <- rep(NA, iter)
for (i in seq_len(iter)) {
  # means <- append(means, mean(sample(p1, s.size, replace = T)))
  means[i] <- mean(sample(p1, s.size, replace = TRUE))
}
length(means)
mean(means)
sd(means)
se.s <- sd(means)

hist(means, breaks = 100, col = rgb(.1, 0, 0, .5))
abline(v = mean(means), col = "red", lwd = 2)
#
# now we want to get the sd of this (sampling) distribution
# NOTE(review): the words "now we want ..." were a comment in the original
# script; the leading '#' was lost in the wiki extraction.
lo1 <- mean(means) - se.s
hi1 <- mean(means) + se.s
lo2 <- mean(means) - 2 * se.s
hi2 <- mean(means) + 2 * se.s
lo3 <- mean(means) - 3 * se.s
hi3 <- mean(means) + 3 * se.s

hist(means,
     xlim = c(mean(means) - 5 * sd(means), mean(means) + 10 * sd(means)),
     col = rgb(1, 0, 0, .5))
abline(v = mean(means), col = "black", lwd = 2)
# abline(v = mean(p2), col = 'darkgreen', lwd = 3)   # fixed typo: was colo=
abline(v = c(lo1, hi1, lo2, hi2, lo3, hi3),
       col = c("green", "green", "blue", "blue", "orange", "orange"),
       lwd = 2)

# meanwhile . . . .
se.s

# Theoretical standard error of the mean: sd(p1) / sqrt(n).
se.z <- sqrt(var(p1) / s.size)
# p1 came from scale(), so var(p1) is a 1x1 matrix; c() drops the dim.
se.z <- c(se.z)
se.z

# sd of the sample means (sd(means)) is approximately
#   sqrt(var(p1) / s.size)  i.e.  sd(p1) / sqrt(s.size)  = se.s
# (the original comment said var(s1)/sd(s1), but the code uses the
#  POPULATION variance var(p1), not one sample's variance)

# As iter goes to infinity:
#   mean(means) -> mean(p1)
#   sd(means)   -> sd(p1) / sqrt(s.size),  i.e. sd(means) = se.z
# This is the CLT (Central Limit Theorem).
mean(means)
mean(p1)
sd(means)
var(p1)
sqrt(var(p1) / s.size)
se.z
sd(means)
se.s
se.z

# because CLT
loz1 <- mean(p1) - se.z
hiz1 <- mean(p1) + se.z
loz2 <- mean(p1) - 2 * se.z
hiz2 <- mean(p1) + 2 * se.z
loz3 <- mean(p1) - 3 * se.z
hiz3 <- mean(p1) + 3 * se.z

# Empirical vs. theoretical band edges -- they should nearly match.
c(lo1, loz1)
c(lo2, loz2)
c(lo3, loz3)
c(hi1, hiz1)
c(hi2, hiz2)
c(hi3, hiz3)

hist(means,
     xlim = c(mean(means) - 5 * sd(means), mean(means) + 10 * sd(means)),
     col = rgb(1, 0, 0, .5))
abline(v = mean(means), col = "black", lwd = 2)
# abline(v = mean(p2), col = 'darkgreen', lwd = 3)
abline(v = c(lo1, hi1, lo2, hi2, lo3, hi3),
       col = c("green", "green", "blue", "blue", "orange", "orange"),
       lwd = 2)

round(c(lo1, hi1))
round(c(lo2, hi2))
round(c(lo3, hi3))
round(c(loz1, hiz1))
round(c(loz2, hiz2))
round(c(loz3, hiz3))

# Pretend we observed a sample mean 1.5 SEs above the center.
m.sample.i.got <- mean(means) + 1.5 * sd(means)
m.sample.i.got

hist(means,
     xlim = c(mean(means) - 10 * sd(means), mean(means) + 10 * sd(means)),
     col = rgb(1, 0, 0, .5))
abline(v = mean(means), col = "black", lwd = 3)
abline(v = m.sample.i.got, col = 'darkgreen', lwd = 3)
# what is the probability of getting
# greater than
# m.sample.i.got?
m.sample.i.got
# P(sample mean > m.sample.i.got), using the empirical SE (sd(means)) and
# the theoretical CLT SE (se.z) -- the two answers should agree closely.
# (lower.tail = F replaced by FALSE -- F is reassignable.)
pnorm(m.sample.i.got, mean(means), sd(means), lower.tail = FALSE)
pnorm(m.sample.i.got, mean(p1), se.z, lower.tail = FALSE)

# then, what is the probability of getting
# greater than m.sample.i.got OR
# less than the mirror value, which is
# mean(means) - (m.sample.i.got - mean(means))
# (green line) -- i.e. the two-tailed probability
tmp <- mean(means) - (m.sample.i.got - mean(means))
abline(v = tmp, col = 'green', lwd = 3)
2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = FALSE)
m.sample.i.got

### one more time, with a sample drawn from p2 (mean 120)
mean(p2)
sd(p2)
sample(p2, s.size)
m.sample.i.got <- mean(sample(p2, s.size))
m.sample.i.got

hist(means,
     xlim = c(mean(means) - 15 * sd(means), mean(means) + 15 * sd(means)),
     col = rgb(1, 0, 0, .5))
abline(v = mean(means), col = "black", lwd = 2)
abline(v = m.sample.i.got, col = 'darkgreen', lwd = 2)

# what is the probability of getting
# greater than
# m.sample.i.got?
m.sample.i.got
pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = FALSE)

# two-tailed again: beyond m.sample.i.got or its mirror,
# mean(means) - (m.sample.i.got - mean(means))
# (green line)
tmp <- mean(means) - (m.sample.i.got - mean(means))
abline(v = tmp, col = 'green', lwd = 2)
2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = FALSE)
output
> rm(list=ls()) > rnorm2 <- function(n,mean,sd){ + mean+sd*scale(rnorm(n)) + } > > se <- function(sample) { + sd(sample)/sqrt(length(sample)) + } > > ss <- function(x) { + sum((x-mean(x))^2) + } > ################################ > > # reminder. > pnorm(-1) [1] 0.1586553 > pnorm(1, lower.tail = F) [1] 0.1586553 > 1-(pnorm(-1) + pnorm(1, lower.tail = F)) [1] 0.6826895 > 1-(pnorm(-1)*2) [1] 0.6826895 > > pnorm(-2) [1] 0.02275013 > pnorm(2, lower.tail = F) [1] 0.02275013 > 1-(pnorm(-2)*2) [1] 0.9544997 > > pnorm(-3) [1] 0.001349898 > pnorm(3, lower.tail = F) [1] 0.001349898 > 1-(pnorm(-3)*2) [1] 0.9973002 > > > # 68% > a <- qnorm(.32/2) > b <- qnorm(1-.32/2) > c(a, b) [1] -0.9944579 0.9944579 > c(-1, 1) [1] -1 1 > # 95% > c <- qnorm(.05/2) > d <- qnorm(1-.05/2) > c(c, d) [1] -1.959964 1.959964 > c(-2,2) [1] -2 2 > # 99% > e <- qnorm(.01/2) > f <- qnorm(1-.01/2) > c(e,f) [1] -2.575829 2.575829 > c(-3,3) [1] -3 3 > > pnorm(b)-pnorm(a) [1] 0.68 > pnorm(d)-pnorm(c) [1] 0.95 > pnorm(f)-pnorm(e) [1] 0.99 > > ################################ > N.p <- 1000000 > m.p <- 100 > sd.p <- 10 > > p1 <- rnorm2(N.p, m.p, sd.p) > mean(p1) [1] 100 > sd(p1) [1] 10 > > p2 <- rnorm2(N.p, m.p+20, sd.p) > mean(p2) [1] 120 > sd(p2) [1] 10 > > hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5), + main = "histogram of p1",) > abline(v=mean(p1), col="black", lwd=3) > > > s.size <- 10 > > means.temp <- c() > s1 <- sample(p1, s.size, replace = T) > mean(s1) [1] 94.66982 > means.temp <- append(means.temp, mean(s1)) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp [1] 94.66982 100.08316 99.93752 99.83882 > > iter <- 1000000 > means <- c() > means <- rep(NA, iter) > for (i in 1:iter) { + # means <- append(means, mean(sample(p1, s.size, replace = T))) + means[i] <- mean(sample(p1, s.size, replace = T)) + } > > mean(means) [1] 
100.0042 > sd(means) [1] 3.163687 > se.s <- sd(means) > > lo1 <- mean(means)-se.s > hi1 <- mean(means)+se.s > lo2 <- mean(means)-2*se.s > hi2 <- mean(means)+2*se.s > lo3 <- mean(means)-3*se.s > hi3 <- mean(means)+3*se.s > > hist(means, + xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=2) > # abline(v=mean(p2), colo='darkgreen', lwd=3) > abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), + col=c("green","green", "blue", "blue", "orange", "orange"), + lwd=2) > > # meanwhile . . . . > se.s [1] 3.163687 > > se.z <- sqrt(var(p1)/s.size) > se.z <- c(se.z) > se.z [1] 3.162278 > > # sd of sample means (sd(means)) > # is sqrt(var(s1)/s.size) or > # sd(s1) / sqrt(s.size) > # = se.s > > # when iter value goes to > # unlimited value: > # mean(means) = mean(p1) > # and > # sd(means) = sd(p1) / sqrt(s.size) > # that is, sd(means) = se.z > # This is called CLT (Central Limit Theorem) > mean(means) [1] 100.0042 > mean(p1) [1] 100 > sd(means) [1] 3.163687 > se.z [1] 3.162278 > > # because CLT > loz1 <- mean(p1)-se.z > hiz1 <- mean(p1)+se.z > loz2 <- mean(p1)-2*se.z > hiz2 <- mean(p1)+2*se.z > loz3 <- mean(p1)-3*se.z > hiz3 <- mean(p1)+3*se.z > > c(lo1, loz1) [1] 96.84055 96.83772 > c(lo2, loz2) [1] 93.67686 93.67544 > c(lo3, loz3) [1] 90.51317 90.51317 > > c(hi1, hiz1) [1] 103.1679 103.1623 > c(hi2, hiz2) [1] 106.3316 106.3246 > c(hi3, hiz3) [1] 109.4953 109.4868 > > > hist(means, + xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=2) > abline(v=mean(p1), col="red", lwd=2) > # abline(v=mean(p2), colo='darkgreen', lwd=3) > abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), + col=c("green","green", "blue", "blue", "orange", "orange"), + lwd=2) > abline(v=c(loz1, hiz1, loz2, hiz2, loz3, hiz3), + col=c("red","red", "red", "red", "red", "red"), + lwd=2) > > > round(c(lo1, hi1)) [1] 97 103 > round(c(lo2, hi2)) [1] 94 106 > round(c(lo3, hi3)) [1] 91 
109 > > round(c(loz1, hiz1)) [1] 97 103 > round(c(loz2, hiz2)) [1] 94 106 > round(c(loz3, hiz3)) [1] 91 109 > > m.sample.i.got <- 101.8 > > hist(means, + xlim = c(mean(means)-10*sd(means), mean(means)+10*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=3) > abline(v=m.sample.i.got, col='darkgreen', lwd=3) > > # what is the probablity of getting > # greater than > # m.sample.i.got? > m.sample.i.got [1] 101.8 > pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 0.2846929 > > # then, what is the probabilty of getting > # greater than m.sample.i.got and > # less than corresponding value, which is > # mean(means) - m.sample.i.got - mean(means) > # (green line) > tmp <- mean(means) - (m.sample.i.got - mean(means)) > abline(v=tmp, col='green', lwd=3) > 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 0.5693857 > > ### one more time > mean(p2) [1] 120 > sd(p2) [1] 10 > sample(p2, s.size) [1] 112.3761 123.0103 124.5613 116.8093 113.7091 113.2657 126.8168 125.4273 123.8021 118.1890 > m.sample.i.got <- mean(sample(p2, s.size)) > m.sample.i.got [1] 115.9053 > > hist(means, + xlim = c(mean(means)-15*sd(means), mean(means)+15*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=2) > abline(v=m.sample.i.got, col='darkgreen', lwd=2) > > # what is the probablity of getting > # greater than > # m.sample.i.got? 
> m.sample.i.got [1] 115.9053 > pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 2.484948e-07 > > # then, what is the probabilty of getting > # greater than m.sample.i.got and > # less than corresponding value, which is > # mean(means) - m.sample.i.got - mean(means) > # (green line) > tmp <- mean(means) - (m.sample.i.got - mean(means)) > abline(v=tmp, col='green', lwd=2) > 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 4.969897e-07 > hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5), + main = "histogram of p1 and p2",) > abline(v=mean(p1), col="black", lwd=3) > hist(p2, breaks=50, col=rgb(0,0,1,.5)) > p1 <- rnorm2(N.p, m.p, sd.p) > mean(p1) [1] 100 > sd(p1) [1] 10 > p2 <- rnorm2(N.p, m.p+20, sd.p) > mean(p2) [1] 120 > sd(p2) [1] 10 > hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5), + main = "histogram of p1 and p2",) > abline(v=mean(p1), col="black", lwd=3) > hist(p2, breaks=50, col=rgb(0,0,1,.5)) > hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5), + main = "histogram of p1 and p2",) > abline(v=mean(p1), col="black", lwd=3) > hist(p2, add=T, breaks=50, col=rgb(0,0,1,.5)) > s.size <- 10 > abline(v=mean(p2), col="violet", lwd=3) > s.size <- 10 > means.temp <- c() > # means.temp <- c() > s1 <- sample(p1, s.size, replace = T) > mean(s1) [1] 98.98075 > means.temp <- append(means.temp, mean(s1)) > means.temp <- c() > s1 <- sample(p1, s.size, replace = T) > mean(s1) [1] 95.23863 > means.temp <- append(means.temp, mean(s1)) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp [1] 95.23863 98.50240 104.56303 89.11020 > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp [1] 95.23863 98.50240 104.56303 89.11020 99.51111 > iter <- 1000000 > # means <- c() > means <- rep(NA, iter) > for (i in 1:iter) { + # means <- append(means, mean(sample(p1, 
s.size, replace = T))) + means[i] <- mean(sample(p1, s.size, replace = T)) + } > > > length(means) [1] 1000000 > length(means) [1] 1000000 > mean(means) [1] 99.99929 > sd(means) [1] 3.162556 > se.s <- sd(means) > hist(means, breaks=100, rgb=(.4,.4,.4,.4)) 에러: 예기치 않은 ','입니다 in "hist(means, breaks=100, rgb=(.4," > hist(means, breaks=100, rgb=(.4, .4, .4, .4)) 에러: 예기치 않은 ','입니다 in "hist(means, breaks=100, rgb=(.4," > hist(means, breaks=100, col=rgb(.1, 0, 0, .5)) > # now we want to get sd of this distribution > lo1 <- mean(means)-se.s > hi1 <- mean(means)+se.s > lo2 <- mean(means)-2*se.s > hi2 <- mean(means)+2*se.s > lo3 <- mean(means)-3*se.s > hi3 <- mean(means)+3*se.s > abline(v=mean(means), col="black", lwd=2) > abline(v=mean(means), col="red", lwd=2) > hist(means, + xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=2) > # abline(v=mean(p2), colo='darkgreen', lwd=3) > abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), + col=c("green","green", "blue", "blue", "orange", "orange"), + lwd=2) > # meanwhile . . . . > se.s [1] 3.162556 > se.z <- sqrt(var(p1)/s.size) > se.z <- c(se.z) > se.z [1] 3.162278 > # when iter value goes to > # unlimited value: > # mean(means) = mean(p1) > # and > # sd(means) = sd(p1) / sqrt(s.size) > # that is, sd(means) = se.z > # This is called CLT (Central Limit Theorem) > mean(means) [1] 99.99929 > rm(list=ls()) > rnorm2 <- function(n,mean,sd){ + mean+sd*scale(rnorm(n)) + } > > se <- function(sample) { + sd(sample)/sqrt(length(sample)) + } > > ss <- function(x) { + sum((x-mean(x))^2) + } > ################################ > > # reminder. 
> pnorm(-1) [1] 0.1586553 > pnorm(1, lower.tail = F) [1] 0.1586553 > 1-(pnorm(-1) + pnorm(1, lower.tail = F)) [1] 0.6826895 > 1-(pnorm(-1)*2) [1] 0.6826895 > > pnorm(-2) [1] 0.02275013 > pnorm(2, lower.tail = F) [1] 0.02275013 > 1-(pnorm(-2)*2) [1] 0.9544997 > > pnorm(-3) [1] 0.001349898 > pnorm(3, lower.tail = F) [1] 0.001349898 > 1-(pnorm(-3)*2) [1] 0.9973002 > > > # 68% > a <- qnorm(.32/2) > b <- qnorm(1-.32/2) > c(a, b) [1] -0.9944579 0.9944579 > c(-1, 1) [1] -1 1 > # 95% > c <- qnorm(.05/2) > d <- qnorm(1-.05/2) > c(c, d) [1] -1.959964 1.959964 > c(-2,2) [1] -2 2 > # 99% > e <- qnorm(.01/2) > f <- qnorm(1-.01/2) > c(e,f) [1] -2.575829 2.575829 > c(-3,3) [1] -3 3 > > pnorm(b)-pnorm(a) [1] 0.68 > pnorm(d)-pnorm(c) [1] 0.95 > pnorm(f)-pnorm(e) [1] 0.99 > > ################################ > N.p <- 1000000 > m.p <- 100 > sd.p <- 10 > > p1 <- rnorm2(N.p, m.p, sd.p) > mean(p1) [1] 100 > sd(p1) [1] 10 > > p2 <- rnorm2(N.p, m.p+20, sd.p) > mean(p2) [1] 120 > sd(p2) [1] 10 > > hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5), + main = "histogram of p1 and p2",) > abline(v=mean(p1), col="black", lwd=3) > hist(p2, add=T, breaks=50, col=rgb(0,0,1,.5)) > abline(v=mean(p2), col="violet", lwd=3) > > s.size <- 10 > > means.temp <- c() > s1 <- sample(p1, s.size, replace = T) > mean(s1) [1] 107.1742 > means.temp <- append(means.temp, mean(s1)) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp [1] 107.17418 101.54022 99.38215 96.19341 100.70208 > > iter <- 1000000 > # means <- c() > means <- rep(NA, iter) > for (i in 1:iter) { + # means <- append(means, mean(sample(p1, s.size, replace = T))) + means[i] <- mean(sample(p1, s.size, replace = T)) + } > length(means) [1] 1000000 > mean(means) [1] 99.99699 > sd(means) [1] 3.160973 > 
se.s <- sd(means) > > hist(means, breaks=100, col=rgb(.1, 0, 0, .5)) > abline(v=mean(means), col="red", lwd=2) > > # now we want to get sd of this distribution > lo1 <- mean(means)-se.s > hi1 <- mean(means)+se.s > lo2 <- mean(means)-2*se.s > hi2 <- mean(means)+2*se.s > lo3 <- mean(means)-3*se.s > hi3 <- mean(means)+3*se.s > > hist(means, + xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=2) > # abline(v=mean(p2), colo='darkgreen', lwd=3) > abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), + col=c("green","green", "blue", "blue", "orange", "orange"), + lwd=2) > > # meanwhile . . . . > se.s [1] 3.160973 > > se.z <- sqrt(var(p1)/s.size) > se.z <- c(se.z) > se.z [1] 3.162278 > > # sd of sample means (sd(means)) > # is sqrt(var(s1)/s.size) or > # sd(s1) / sqrt(s.size) > # = se.s > > # when iter value goes to > # unlimited value: > # mean(means) = mean(p1) > # and > # sd(means) = sd(p1) / sqrt(s.size) > # that is, sd(means) = se.z > # This is called CLT (Central Limit Theorem) > mean(means) [1] 99.99699 > mean(p1) [1] 100 > sd(means) [1] 3.160973 > se.z [1] 3.162278 > > # because CLT > loz1 <- mean(p1)-se.z > hiz1 <- mean(p1)+se.z > loz2 <- mean(p1)-2*se.z > hiz2 <- mean(p1)+2*se.z > loz3 <- mean(p1)-3*se.z > hiz3 <- mean(p1)+3*se.z > > c(lo1, loz1) [1] 96.83602 96.83772 > c(lo2, loz2) [1] 93.67504 93.67544 > c(lo3, loz3) [1] 90.51407 90.51317 > > c(hi1, hiz1) [1] 103.1580 103.1623 > c(hi2, hiz2) [1] 106.3189 106.3246 > c(hi3, hiz3) [1] 109.4799 109.4868 > > > hist(means, + xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=2) > abline(v=mean(p1), col="red", lwd=2) > # abline(v=mean(p2), colo='darkgreen', lwd=3) > abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), + col=c("green","green", "blue", "blue", "orange", "orange"), + lwd=2) > abline(v=c(loz1, hiz1, loz2, hiz2, loz3, hiz3), + col=c("red","red", "red", "red", "red", "red"), + 
lwd=2) > > > round(c(lo1, hi1)) [1] 97 103 > round(c(lo2, hi2)) [1] 94 106 > round(c(lo3, hi3)) [1] 91 109 > > round(c(loz1, hiz1)) [1] 97 103 > round(c(loz2, hiz2)) [1] 94 106 > round(c(loz3, hiz3)) [1] 91 109 > > m.sample.i.got <- 101.8 > > hist(means, + xlim = c(mean(means)-10*sd(means), mean(means)+10*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=3) > abline(v=m.sample.i.got, col='darkgreen', lwd=3) > > # what is the probablity of getting > # greater than > # m.sample.i.got? > m.sample.i.got [1] 101.8 > pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 0.2845272 > > # then, what is the probabilty of getting > # greater than m.sample.i.got and > # less than corresponding value, which is > # mean(means) - m.sample.i.got - mean(means) > # (green line) > tmp <- mean(means) - (m.sample.i.got - mean(means)) > abline(v=tmp, col='green', lwd=3) > 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 0.5690543 > > ### one more time > mean(p2) [1] 120 > sd(p2) [1] 10 > sample(p2, s.size) [1] 115.5752 109.5564 137.8459 115.3346 130.8225 129.1909 118.8858 107.1550 118.8443 109.5731 > m.sample.i.got <- mean(sample(p2, s.size)) > m.sample.i.got [1] 122.4168 > > hist(means, + xlim = c(mean(means)-15*sd(means), mean(means)+15*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=2) > abline(v=m.sample.i.got, col='darkgreen', lwd=2) > > # what is the probablity of getting > # greater than > # m.sample.i.got? > m.sample.i.got [1] 122.4168 > pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 6.621728e-13 > > # then, what is the probabilty of getting > # greater than m.sample.i.got and > # less than corresponding value, which is > # mean(means) - m.sample.i.got - mean(means) > # (green line) > tmp <- mean(means) - (m.sample.i.got - mean(means)) > abline(v=tmp, col='green', lwd=2) > 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 1.324346e-12 > > >
r/sampling_distribution.1757419773.txt.gz · Last modified: 2025/09/09 21:09 by hkimscil