====== Sampling distribution in R e.g. 1 ====== n.ajstu <- 100000 mean.ajstu <- 100 sd.ajstu <- 10 set.seed(1024) ajstu <- rnorm2(n.ajstu, mean=mean.ajstu, sd=sd.ajstu) mean(ajstu) sd(ajstu) var(ajstu) iter <- 10000 # # of sampling n.4 <- 4 means4 <- rep (NA, iter) for(i in 1:iter){ means4[i] = mean(sample(ajstu, n.4)) } n.25 <- 25 means25 <- rep (NA, iter) for(i in 1:iter){ means25[i] = mean(sample(ajstu, n.25)) } n.100 <- 100 means100 <- rep (NA, iter) for(i in 1:iter){ means100[i] = mean(sample(ajstu, n.100)) } n.400 <- 400 means400 <- rep (NA, iter) for(i in 1:iter){ means400[i] = mean(sample(ajstu, n.400)) } n.900 <- 900 means900 <- rep (NA, iter) for(i in 1:iter){ means900[i] = mean(sample(ajstu, n.900)) } n.1600 <- 1600 means1600 <- rep (NA, iter) for(i in 1:iter){ means1600[i] = mean(sample(ajstu, n.1600)) } n.2500 <- 2500 means2500 <- rep (NA, iter) for(i in 1:iter){ means2500[i] = mean(sample(ajstu, n.2500)) } h4 <- hist(means4) h25 <- hist(means25) h100 <- hist(means100) h400 <- hist(means400) h900 <- hist(means900) h1600 <- hist(means1600) h2500 <- hist(means2500) plot(h4, ylim=c(0,3000), col="red") plot(h25, add = T, col="blue") plot(h100, add = T, col="green") plot(h400, add = T, col="grey") plot(h900, add = T, col="yellow") sss <- c(4,25,100,400,900,1600,2500) # sss sample sizes ses <- rep (NA, length(sss)) # std errors for(i in 1:length(sss)){ ses[i] = sqrt(var(ajstu)/sss[i]) } ses se.1 <- ses se.2 <- 2 * ses lower.s2 <- mean(ajstu)-se.2 upper.s2 <- mean(ajstu)+se.2 data.frame(cbind(sss, ses, lower.s2, upper.s2)) # n =1600 일 경우에 # sample의 평균이 100.15보다 작을 # 확률은 어떻게 구해야 할까? # n = 1600 일 경우에 # sampling distribution은 # Xbar ~ N(100, var(ajstu)/n.1600) # 그리고, 위에서 standard error값은 # sqrt(var(ajstu)/n.1600) # 이것을 standard error라고 부른다 # 따라서 se.1600 <- sqrt(var(ajstu)/n.1600) pnorm(100.15, mean(ajstu), se.1600) {{:pasted:20240319-120709.png}} ===== Sampling distribution in proportion in R ===== pop <- rbinom(100000, size = 1, prob = 0.5) par(mfrow=c(2,2)) iter <- 10000 n <- 5 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(pop, n)) } mean(means) hist(means, xlim=c(0,1), main=n) iter <- 10000 n <- 25 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(pop, n)) } mean(means) hist(means, xlim=c(0,1), main=n) iter <- 10000 n <- 100 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(pop, n)) } mean(means) hist(means, xlim=c(0,1), main=n) iter <- 10000 n <- 900 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(pop, n)) } mean(means) sd(means) var(means) hist(means, xlim=c(0,1), main=n) par(mfrow=c(1,1)) set.seed(2020) pop <- rbinom(100000, size = 1, prob = 0.4) par(mfrow=c(2,2)) iter <- 1000 ns <- c(25, 100, 400, 900) l.ns <- length(ns) for (i in 1:l.ns) { for(k in 1:iter) { means[k] = mean(sample(pop, ns[i])) } mean(means) sd(means) hist(means, xlim=c(0,1), main=n) } par(mfrow=c(1,1)) 0.5가 비율인 (proportion) 모집단에 대한 여론 조사를 위해서 900명의 샘플을 취하고 이를 이용하여 모집단의 위치를 추정하자. n <- 900 samp <- sample(pop, n) mean(samp) p <- mean(samp) q <- 1-p ser <- sqrt((p*q)/n) ser2 <- ser * 2 p - ser2 p + ser2