sampling_distribution_in_r
This is an old revision of the document!
Table of Contents
Sampling distribution in R e.g. 1
n.ca <- 100000 mean.ca <- 70 sd.ca <- 15 set.seed(2020) ca <- rnorm(n.ca, mean=mean.ca, sd=sd.ca) ca <- round(ca, 0) hist(ca, xlab="ca", main="ca index", freq=F) curve(dnorm(x, mean=mean(ca), sd=sd(ca)), add=TRUE, col="blue") abline(v=mean.ca,lwd=3,lty=2, col="red") summary(ca) mu <- round(mean(ca)) sigma <- round(sd(ca)) mu sigma
rnorm2 ← function(n,mean,sd) { mean+sd*scale(rnorm(n)) }
n.ca ← 100000
mean.ca ← 70
sd.ca ← 15
set.seed(101)
ca ← rnorm2(n.ca, mean=mean.ca, sd=sd.ca)
hist(ca, xlab=“ca”, main=“ca index”, freq=F)
curve(dnorm(x, mean=mean(ca), sd=sd(ca)), add=TRUE, col=“blue”)
abline(v=mean.ca,lwd=3,lty=2, col=“red”)
summary(ca)
mu ← round(mean(ca))
sigma ← round(sd(ca))
mu
sigma
</code>
> summary(ca) Min. 1st Qu. Median Mean 3rd Qu. Max. 7.00 60.00 70.00 69.96 80.00 132.00 >
최소값 70
최대값 132
대강의 아이디어.
- 위의 점수가 전국 고등학교 2년생의 (모집단) 수학점수라고 가정을 하자. 그리고, 이 모집단의 수학점수 평균은 70, 표준편차는 15임을 알고 있으며 최소값과 최대값 또한 알고 있다 (각각 70, 132)
- 그런데 내가 수학을 학생들에게 (25명) 가르치는데 그 방법이 남달라서 효과가 좋다는 것을 확신한고 있다고 하자.
- 이를 증명하는데 가장 확실하게 느낄수 있는 (?) 방법은 내 학생의 점수가 위의 모집단 점수의 최대값인 132점을 넘는 것이다.
- 132점을 넘는 학생은 모집단에 속한 학생이 아니라 다른 모집단에 (나의 교육방법을 교수받은 모집단) 속한 학생이라고 생각할 수 있는 것이다.
- 내가 가르친 학생들의 평균점수가 132점을 모두 넘는다면 한 학생이 아니라 나의 집단이 (샘플이) 모집단에 속하지 않는 특별한 집단이라고 생각할 수 있다.
- 그러나 현실적으로 이렇게 판단하기에는 넘어야 할 점수가 너무 크다.
n=4
iter <- 10000 n <- 4 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ca, n)) } m <- mean(means) sd1 <- sd(means) se <- sigma/sqrt(n) sd2 <- 2*sd1 sd3 <- 3*sd1 m sd1 se sd2 sd3 max(means) min(means) h4 <- hist(means) hist(means, main="Dist. of means, n=4", xlim=c(40,100), freq=F) curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE) abline(v = m, lty=2, lwd=3, col="blue") abline(v = mean(ca), lty=2, lwd=3, col="red") abline(v = (m - sd1), lty=2, lwd=1, col="blue") abline(v = (m - sd2), lty=2, lwd=1, col="blue") abline(v = (m - sd3), lty=2, lwd=1, col="blue") abline(v = (m + sd1), lty=2, lwd=1, col="blue") abline(v = (m + sd2), lty=2, lwd=1, col="blue") abline(v = (m + sd3), lty=2, lwd=1, col="blue")
> sd(means) [1] 7.495025 > s.ca <- sd(ca)/sqrt(n) > s.ca [1] 7.477513
n = 36
iter <- 10000 n <- 36 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ca, n)) } m <- mean(means) sd1 <- sd(means) se <- sigma/sqrt(n) sd2 <- 2*sd1 sd3 <- 3*sd1 m sd1 se sd2 sd3 max(means) min(means) h36 <- hist(means) hist(means, main="Dist. of means, n=36", xlim=c(60,80), freq=F) curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE) abline(v = m, lty=2, lwd=3, col="blue") abline(v = mean(ca), lty=2, lwd=3, col="red") abline(v = (m - sd1), lty=2, lwd=1, col="blue") abline(v = (m - sd2), lty=2, lwd=1, col="blue") abline(v = (m - sd3), lty=2, lwd=1, col="blue") abline(v = (m + sd1), lty=2, lwd=1, col="blue") abline(v = (m + sd2), lty=2, lwd=1, col="blue") abline(v = (m + sd3), lty=2, lwd=1, col="blue")
n = 100
iter <- 10000 n <- 100 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ca, n)) } m <- mean(means) sd1 <- sd(means) se <- sigma/sqrt(n) sd2 <- 2*sd1 sd3 <- 3*sd1 m sd1 se sd2 sd3 max(means) min(means) h100 <- hist(means) hist(means, main="Dist. of means, n=100", xlim=c(60,80), freq=F) curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE) abline(v = m, lty=2, lwd=3, col="blue") abline(v = mean(ca), lty=2, lwd=3, col="red") abline(v = (m - sd1), lty=2, lwd=1, col="blue") abline(v = (m - sd2), lty=2, lwd=1, col="blue") abline(v = (m - sd3), lty=2, lwd=1, col="blue") abline(v = (m + sd1), lty=2, lwd=1, col="blue") abline(v = (m + sd2), lty=2, lwd=1, col="blue") abline(v = (m + sd3), lty=2, lwd=1, col="blue")
n = 400
iter <- 10000 n <- 400 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ca, n)) } m <- mean(means) sd1 <- sd(means) se <- sigma/sqrt(n) sd2 <- 2*sd1 sd3 <- 3*sd1 m sd1 se sd2 sd3 max(means) min(means) h400 <- hist(means) hist(means, main="Dist. of means, n=400", xlim=c(60,80), freq=F) curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE) abline(v = m, lty=2, lwd=3, col="blue") abline(v = mean(ca), lty=2, lwd=3, col="red") abline(v = (m - sd1), lty=2, lwd=1, col="blue") abline(v = (m - sd2), lty=2, lwd=1, col="blue") abline(v = (m - sd3), lty=2, lwd=1, col="blue") abline(v = (m + sd1), lty=2, lwd=1, col="blue") abline(v = (m + sd2), lty=2, lwd=1, col="blue") abline(v = (m + sd3), lty=2, lwd=1, col="blue")
n = 900
iter <- 10000 n <- 900 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ca, n)) } m <- mean(means) sd1 <- sd(means) se <- sigma/sqrt(n) sd2 <- 2*sd1 sd3 <- 3*sd1 m sd1 se sd2 sd3 max(means) min(means) hist(means, main="Dist. of means, n=900", xlim=c(60,80), freq=F) curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE) abline(v = m, lty=2, lwd=3, col="blue") abline(v = mean(ca), lty=2, lwd=3, col="red") abline(v = (m - sd1), lty=2, lwd=1, col="blue") abline(v = (m - sd2), lty=2, lwd=1, col="blue") abline(v = (m - sd3), lty=2, lwd=1, col="blue") abline(v = (m + sd1), lty=2, lwd=1, col="blue") abline(v = (m + sd2), lty=2, lwd=1, col="blue") abline(v = (m + sd3), lty=2, lwd=1, col="blue")
xmin <- 40 xmax <- 100 ymax <- 2500 plot(h4, col=rgb(0,0,1,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax)) # first histogram plot(h36, col=rgb(1/5,1,0,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax), add=T) # second plot(h100, col=rgb(2/5,0,1,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax), add=T) plot(h400, col=rgb(3/5,1,0,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax), add=T) plot(h900, col=rgb(4/5,0,1,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax), add=T)
Sampling distribution in proportion in R
pop <- rbinom(100000, size = 1, prob = 0.5) par(mfrow=c(2,2)) iter <- 10000 n <- 5 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(pop, n)) } mean(means) hist(means, xlim=c(0,1), main=n) iter <- 10000 n <- 25 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(pop, n)) } mean(means) hist(means, xlim=c(0,1), main=n) iter <- 10000 n <- 100 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(pop, n)) } mean(means) hist(means, xlim=c(0,1), main=n) iter <- 10000 n <- 900 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(pop, n)) } mean(means) sd(means) var(means) hist(means, xlim=c(0,1), main=n) par(mfrow=c(1,1))
set.seed(2020) pop <- rbinom(100000, size = 1, prob = 0.4) par(mfrow=c(2,2)) iter <- 1000 ns <- c(25, 100, 400, 900) l.ns <- length(ns) for (i in 1:l.ns) { for(k in 1:iter) { means[k] = mean(sample(pop, ns[i])) } mean(means) sd(means) hist(means, xlim=c(0,1), main=n) } par(mfrow=c(1,1))
0.5가 비율인 (proportion) 모집단에 대한 여론 조사를 위해서 900명의 샘플을 취하고 이를 이용하여 모집단의 위치를 추정하자.
n <- 900 samp <- sample(pop, n) mean(samp) p <- mean(samp) q <- 1-p ser <- sqrt((p*q)/n) ser2 <- ser * 2 p - ser2 p + ser2
Sampling distribution in R e.g. 2
아주대학교 학생의 나이에 대한 모집단 정보가 있다고 하자. 아주대학교 학생의 학생 수는 모두 10,000명이라고 한다 (N = 10000). 데이터는 다음과 같이 구하여 r에 저장한다.
n.ajstu <- 100000 mean.ajstu <- 24.6 sd.ajstu <- 2 set.seed(1024) ajstu <- rnorm(n.ajstu, mean=mean.ajstu, sd=sd.ajstu) hist(ajstu,xlab="age", main="dist. of ajou stu", freq=F) abline(v=mean(ajstu), lwd=3, lty=2, col="red") curve(dnorm(x, mean=mean(ajstu), sd=sd(ajstu)), add=TRUE, col="blue")
n = 4
iter <- 10000 n <- 4 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ajstu, n)) } mean(ajstu) m <- mean(means) sd1 <- sd(means) ## sdev of the dist. of sample means sd1 sd(ajstu)/sqrt(4) sd2 <- 2*sd(means) sd3 <- 3*sd(means) max(means) min(means) h4 <- hist(means) hist(means, main="Dist. of means, n=4", xlim=c(21,29), freq=F) curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE) abline(v = m, lty=2, lwd=3, col="blue") abline(v = mean(ajstu), lty=2, lwd=3, col="red") abline(v = (m - sd1), lty=2, lwd=1, col="blue") abline(v = (m - sd2), lty=2, lwd=1, col="blue") abline(v = (m - sd3), lty=2, lwd=1, col="blue") abline(v = (m + sd1), lty=2, lwd=1, col="blue") abline(v = (m + sd2), lty=2, lwd=1, col="blue") abline(v = (m + sd3), lty=2, lwd=1, col="blue")
n = 36
n <- 36 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ajstu, n)) } mean(ajstu) m <- mean(means) m sd1 <- sd(means) ## sdev of the dist. of sample means sd1 sd(ajstu)/sqrt(n) sd2 <- 2*sd(means) sd3 <- 3*sd(means) max(means) min(means) h36 <- hist(means) hist(means, main="Dist. of means, n=36", xlim=c(21,29), freq=F) curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE) abline(v = m, lty=2, lwd=3, col="blue") abline(v = mean(ajstu), lty=2, lwd=3, col="red") abline(v = (m - sd1), lty=2, lwd=1, col="blue") abline(v = (m - sd2), lty=2, lwd=1, col="blue") abline(v = (m - sd3), lty=2, lwd=1, col="blue") abline(v = (m + sd1), lty=2, lwd=1, col="blue") abline(v = (m + sd2), lty=2, lwd=1, col="blue") abline(v = (m + sd3), lty=2, lwd=1, col="blue")
n = 100
n <- 100 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ajstu, n)) } mean(ajstu) m <- mean(means) m sd1 <- sd(means) ## sdev of the dist. of sample means sd1 sd(ajstu)/sqrt(n) sd2 <- 2*sd(means) sd3 <- 3*sd(means) max(means) min(means) h100 <- hist(means) hist(means, main="Dist. of means, n=100", xlim = c(20, 30), freq=F) curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE) abline(v = m, lty=2, lwd=3, col="blue") abline(v = mean(ajstu), lty=2, lwd=3, col="red") abline(v = (m - sd1), lty=2, lwd=1, col="blue") abline(v = (m - sd2), lty=2, lwd=1, col="blue") abline(v = (m - sd3), lty=2, lwd=1, col="blue") abline(v = (m + sd1), lty=2, lwd=1, col="blue") abline(v = (m + sd2), lty=2, lwd=1, col="blue") abline(v = (m + sd3), lty=2, lwd=1, col="blue")
> mean(ajstu) [1] 24.60763 > m <- mean(means) > m [1] 24.60779 > sd1 <- sd(means) ## sdev of the dist. of sample means > sd1 [1] 0.1983636 > sd(ajstu)/sqrt(n) [1] 0.1997546 > sd2 <- 2*sd(means) > sd3 <- 3*sd(means) > max(means) [1] 25.29987 > min(means) [1] 23.8735
n = 400
n <- 400 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ajstu, n)) } mean(ajstu) m <- mean(means) m sd1 <- sd(means) ## sdev of the dist. of sample means sd1 sd(ajstu)/sqrt(n) sd2 <- 2*sd(means) sd3 <- 3*sd(means) max(means) min(means) h400 <- hist(means) hist(means, main="Dist. of means, n=400", xlim = c(21,29), freq=F) curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE) abline(v = m, lty=2, lwd=3, col="blue") abline(v = mean(ajstu), lty=2, lwd=3, col="red") abline(v = (m - sd1), lty=2, lwd=1, col="blue") abline(v = (m - sd2), lty=2, lwd=1, col="blue") abline(v = (m - sd3), lty=2, lwd=1, col="blue") abline(v = (m + sd1), lty=2, lwd=1, col="blue") abline(v = (m + sd2), lty=2, lwd=1, col="blue") abline(v = (m + sd3), lty=2, lwd=1, col="blue")
> mean(ajstu) [1] 24.60763 > m <- mean(means) > m [1] 24.60927 > sd1 <- sd(means) ## sdev of the dist. of sample means > sd1 [1] 0.09943006 > sd(ajstu)/sqrt(n) [1] 0.09987731 > sd2 <- 2*sd(means) > sd3 <- 3*sd(means) > max(means) [1] 24.95824 > min(means) [1] 24.28413
n = 900
n <- 900 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ajstu, n)) } mean(ajstu) m <- mean(means) m sd1 <- sd(means) ## sdev of the dist. of sample means sd1 sd(ajstu)/sqrt(n) sd2 <- 2*sd(means) sd3 <- 3*sd(means) max(means) min(means) h900 <- hist(means) hist(means, main="Dist. of means, n=900", xlim = c(21,29), freq=F) curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE) abline(v = m, lty=2, lwd=3, col="blue") abline(v = mean(ajstu), lty=2, lwd=3, col="red") abline(v = (m - sd1), lty=2, lwd=1, col="blue") abline(v = (m - sd2), lty=2, lwd=1, col="blue") abline(v = (m - sd3), lty=2, lwd=1, col="blue") abline(v = (m + sd1), lty=2, lwd=1, col="blue") abline(v = (m + sd2), lty=2, lwd=1, col="blue") abline(v = (m + sd3), lty=2, lwd=1, col="blue")
xmin <- 21 xmax <- 28 ymax <- 3000 plot(h4, col=rgb(0,0,1,1/4), xlim=c(xmin, xmax), ylim=c(0,ymax)) # first histogram plot(h36, col=rgb(1/5,1,0,1/4), xlim=c(xmin, xmax), ylim=c(0,ymax), add=T) # second plot(h100, col=rgb(2/5,0,1,1/4), xlim=c(xmin, xmax), ylim=c(0,ymax), add=T) plot(h400, col=rgb(3/5,1,0,1/4), xlim=c(xmin, xmax), ylim=c(0,ymax), add=T) plot(h900, col=rgb(4/5,0,1,1/4), xlim=c(xmin, xmax), ylim=c(0,ymax), add=T)
n <- 10000 means <- rep (NA, iter) for(i in 1:iter){ means[i] = mean(sample(ajstu, n)) } h10000 <- hist(means) hist(means, main="Dist. of means, n=100000") abline(v = mean(means), lty=2, lwd=3, col="blue") abline(v = mean(ajstu), lty=2, lwd=3, col="red") mean(ajstu) mean(means) max(means) min(means)
sampling_distribution_in_r.1699832985.txt.gz · Last modified: 2023/11/13 08:49 by hkimscil