User Tools

Site Tools


sampling_distribution_in_r

Sampling distribution in R e.g. 1

n.ca <- 100000
mean.ca <- 70
sd.ca <- 15
set.seed(2020)
ca <- rnorm(n.ca, mean=mean.ca, sd=sd.ca)
hist(ca, xlab="ca", main="ca index", freq=F)
curve(dnorm(x, mean=mean(ca), sd=sd(ca)), add=TRUE, col="blue")
abline(v=mean.ca,lwd=3,lty=2, col="red")

n=4

iter <- 10000
n <- 4
means <- rep (NA, iter)
for(i in 1:iter){
  means[i] = mean(sample(ca, n))
}
m.ca <- mean(ca)
m <- mean(means)
sd1 <- sd(means)
sd2 <- 2*sd1
sd3 <- 3*sd1
max(means)
min(means)

h4 <- hist(means)
hist(means, main="Dist. of means, n=4", xlim=c(40,100), freq=F)
curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE)
abline(v = m, lty=2, lwd=3, col="blue")
abline(v = mean(ca), lty=2, lwd=3, col="red")
abline(v = (m - sd1), lty=2, lwd=1, col="blue")
abline(v = (m - sd2), lty=2, lwd=1, col="blue")
abline(v = (m - sd3), lty=2, lwd=1, col="blue")
abline(v = (m + sd1), lty=2, lwd=1, col="blue")
abline(v = (m + sd2), lty=2, lwd=1, col="blue")
abline(v = (m + sd3), lty=2, lwd=1, col="blue")

> sd(means)
[1] 7.495025

> s.ca <- sd(ca)/sqrt(n)
> s.ca
[1] 7.477513

n = 36

n <- 36
means <- rep (NA, iter)
for(i in 1:iter){
  means[i] = mean(sample(ca, n))
}
m.ca <- mean(ca)
m <- mean(means)
sd1 <- sd(means)
sd2 <- 2*sd1
sd3 <- 3*sd1
max(means)
min(means)

h36 <- hist(means)
hist(means, main="Dist. of means, n=36", xlim=c(60,80), freq=F)
curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE)
abline(v = m, lty=2, lwd=3, col="blue")
abline(v = mean(ca), lty=2, lwd=3, col="red")
abline(v = (m - sd1), lty=2, lwd=1, col="blue")
abline(v = (m - sd2), lty=2, lwd=1, col="blue")
abline(v = (m - sd3), lty=2, lwd=1, col="blue")
abline(v = (m + sd1), lty=2, lwd=1, col="blue")
abline(v = (m + sd2), lty=2, lwd=1, col="blue")
abline(v = (m + sd3), lty=2, lwd=1, col="blue")

n = 100

n <- 100
means <- rep (NA, iter)
for(i in 1:iter){
  means[i] = mean(sample(ca, n))
}
m <- mean(means)
sd1 <- sd(means)
sd2 <- 2*sd1
sd3 <- 3*sd1
max.means <- max(means)
min.means <- min(means)

h100 <- hist(means)
hist(means, main="Dist. of means, n=100", xlim=c(60,80), freq=F)
curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE)
abline(v = m, lty=2, lwd=3, col="blue")
abline(v = mean(ca), lty=2, lwd=3, col="red")
abline(v = (m - sd1), lty=2, lwd=1, col="blue")
abline(v = (m - sd2), lty=2, lwd=1, col="blue")
abline(v = (m - sd3), lty=2, lwd=1, col="blue")
abline(v = (m + sd1), lty=2, lwd=1, col="blue")
abline(v = (m + sd2), lty=2, lwd=1, col="blue")
abline(v = (m + sd3), lty=2, lwd=1, col="blue")

n = 400

n <- 400
means <- rep (NA, iter)
for(i in 1:iter){
  means[i] = mean(sample(ca, n))
}
m <- mean(means)
sd1 <- sd(means)
sd2 <- 2*sd1
sd3 <- 3*sd1
max.means <- max(means)
min.means <- min(means)

h400 <- hist(means)
hist(means, main="Dist. of means, n=400", xlim=c(60,80), freq=F)
curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE)
abline(v = m, lty=2, lwd=3, col="blue")
abline(v = mean(ca), lty=2, lwd=3, col="red")
abline(v = (m - sd1), lty=2, lwd=1, col="blue")
abline(v = (m - sd2), lty=2, lwd=1, col="blue")
abline(v = (m - sd3), lty=2, lwd=1, col="blue")
abline(v = (m + sd1), lty=2, lwd=1, col="blue")
abline(v = (m + sd2), lty=2, lwd=1, col="blue")
abline(v = (m + sd3), lty=2, lwd=1, col="blue")

n = 900

n <- 900
means <- rep (NA, iter)
for(i in 1:iter){
  means[i] = mean(sample(ca, n))
}
h900 <- hist(means)
m <- mean(means)
sd1 <- sd(means)
sd2 <- 2*sd1
sd3 <- 3*sd1
max.means <- max(means)
min.means <- min(means)

hist(means, main="Dist. of means, n=900", xlim=c(60,80), freq=F)
curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE)
abline(v = m, lty=2, lwd=3, col="blue")
abline(v = mean(ca), lty=2, lwd=3, col="red")
abline(v = (m - sd1), lty=2, lwd=1, col="blue")
abline(v = (m - sd2), lty=2, lwd=1, col="blue")
abline(v = (m - sd3), lty=2, lwd=1, col="blue")
abline(v = (m + sd1), lty=2, lwd=1, col="blue")
abline(v = (m + sd2), lty=2, lwd=1, col="blue")
abline(v = (m + sd3), lty=2, lwd=1, col="blue")

xmin <- 40
xmax <- 100
ymax <- 2500
plot(h4, col=rgb(0,0,1,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax))  # first histogram
plot(h36, col=rgb(1/5,1,0,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax), add=T)  # second
plot(h100, col=rgb(2/5,0,1,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax), add=T) 
plot(h400, col=rgb(3/5,1,0,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax), add=T)
plot(h900, col=rgb(4/5,0,1,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax), add=T)    

Sampling distribution in proportion in R

pop <- rbinom(100000, size = 1, prob = 0.5)
par(mfrow=c(2,2)) 
iter <- 10000
n <- 5
means <- rep (NA, iter)
for(i in 1:iter){
    means[i] = mean(sample(pop, n))
}
mean(means)
hist(means, xlim=c(0,1), main=n)

iter <- 10000
n <- 25
means <- rep (NA, iter)
for(i in 1:iter){
    means[i] = mean(sample(pop, n))
}
mean(means)
hist(means, xlim=c(0,1), main=n)

iter <- 10000
n <- 100
means <- rep (NA, iter)
for(i in 1:iter){
    means[i] = mean(sample(pop, n))
}
mean(means)
hist(means, xlim=c(0,1), main=n)

iter <- 10000
n <- 900
means <- rep (NA, iter)
for(i in 1:iter){
    means[i] = mean(sample(pop, n))
}
mean(means)
sd(means)
var(means)
hist(means, xlim=c(0,1), main=n)

par(mfrow=c(1,1)) 
set.seed(2020)
pop <- rbinom(100000, size = 1, prob = 0.4)
par(mfrow=c(2,2)) 
iter <- 1000
ns <- c(25, 100, 400, 900)
l.ns <- length(ns)
for (i in 1:l.ns) {
    for(k in 1:iter) {
        means[k] = mean(sample(pop, ns[i]))
    }
    mean(means)
    sd(means)
    hist(means, xlim=c(0,1), main=n)
}
par(mfrow=c(1,1)) 

0.5가 비율인 (proportion) 모집단에 대한 여론 조사를 위해서 900명의 샘플을 취하고 이를 이용하여 모집단의 위치를 추정하자.

n <- 900
samp <- sample(pop, n)
mean(samp)
p <- mean(samp)
q <- 1-p
ser <- sqrt((p*q)/n)
ser2 <- ser * 2
p - ser2 
p + ser2  

Sampling distribution in R e.g. 2

아주대학교 학생의 나이에 대한 모집단 정보가 있다고 하자. 아주대학교 학생의 학생 수는 모두 10,000명이라고 한다 (N = 10000). 데이터는 다음과 같이 구하여 r에 저장한다.

n.ajstu <- 100000
mean.ajstu <- 24.6
sd.ajstu <- 2
set.seed(1024)
ajstu <- rnorm(n.ajstu, mean=mean.ajstu, sd=sd.ajstu)
hist(ajstu,xlab="age", main="dist. of ajou stu", freq=F)
abline(v=mean(ajstu), lwd=3, lty=2, col="red")
curve(dnorm(x, mean=mean(ajstu), sd=sd(ajstu)), add=TRUE, col="blue")

n = 4

iter <- 10000
n <- 4
means <- rep (NA, iter)
for(i in 1:iter){
  means[i] = mean(sample(ajstu, n))
}

mean(ajstu)
m <- mean(means)
sd1 <- sd(means) ## sdev of the dist. of sample means
sd1
sd(ajstu)/sqrt(4)
sd2 <- 2*sd(means) 
sd3 <- 3*sd(means) 
max(means)
min(means)

h4 <- hist(means)
hist(means, main="Dist. of means, n=4", xlim=c(21,29), freq=F)
curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE)
abline(v = m, lty=2, lwd=3, col="blue")
abline(v = mean(ajstu), lty=2, lwd=3, col="red")
abline(v = (m - sd1), lty=2, lwd=1, col="blue")
abline(v = (m - sd2), lty=2, lwd=1, col="blue")
abline(v = (m - sd3), lty=2, lwd=1, col="blue")
abline(v = (m + sd1), lty=2, lwd=1, col="blue")
abline(v = (m + sd2), lty=2, lwd=1, col="blue")
abline(v = (m + sd3), lty=2, lwd=1, col="blue")

n = 36

n <- 36
means <- rep (NA, iter)
for(i in 1:iter){
  means[i] = mean(sample(ajstu, n))
}

mean(ajstu)
m <- mean(means)
m
sd1 <- sd(means) ## sdev of the dist. of sample means
sd1
sd(ajstu)/sqrt(n)
sd2 <- 2*sd(means) 
sd3 <- 3*sd(means) 
max(means)
min(means)

h36 <- hist(means)
hist(means, main="Dist. of means, n=36", xlim=c(21,29), freq=F)
curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE)
abline(v = m, lty=2, lwd=3, col="blue")
abline(v = mean(ajstu), lty=2, lwd=3, col="red")
abline(v = (m - sd1), lty=2, lwd=1, col="blue")
abline(v = (m - sd2), lty=2, lwd=1, col="blue")
abline(v = (m - sd3), lty=2, lwd=1, col="blue")
abline(v = (m + sd1), lty=2, lwd=1, col="blue")
abline(v = (m + sd2), lty=2, lwd=1, col="blue")
abline(v = (m + sd3), lty=2, lwd=1, col="blue")

n = 100

n <- 100
means <- rep (NA, iter)
for(i in 1:iter){
  means[i] = mean(sample(ajstu, n))
}

mean(ajstu)
m <- mean(means)
m
sd1 <- sd(means) ## sdev of the dist. of sample means
sd1
sd(ajstu)/sqrt(n)
sd2 <- 2*sd(means) 
sd3 <- 3*sd(means) 
max(means)
min(means)

h100 <- hist(means)
hist(means, main="Dist. of means, n=100", xlim = c(20, 30), freq=F)
curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE)
abline(v = m, lty=2, lwd=3, col="blue")
abline(v = mean(ajstu), lty=2, lwd=3, col="red")
abline(v = (m - sd1), lty=2, lwd=1, col="blue")
abline(v = (m - sd2), lty=2, lwd=1, col="blue")
abline(v = (m - sd3), lty=2, lwd=1, col="blue")
abline(v = (m + sd1), lty=2, lwd=1, col="blue")
abline(v = (m + sd2), lty=2, lwd=1, col="blue")
abline(v = (m + sd3), lty=2, lwd=1, col="blue")
> mean(ajstu)
[1] 24.60763
> m <- mean(means)
> m
[1] 24.60779
> sd1 <- sd(means) ## sdev of the dist. of sample means
> sd1
[1] 0.1983636
> sd(ajstu)/sqrt(n)
[1] 0.1997546
> sd2 <- 2*sd(means) 
> sd3 <- 3*sd(means) 
> max(means)
[1] 25.29987
> min(means)
[1] 23.8735

n = 400

n <- 400
means <- rep (NA, iter)
for(i in 1:iter){
  means[i] = mean(sample(ajstu, n))
}

mean(ajstu)
m <- mean(means)
m
sd1 <- sd(means) ## sdev of the dist. of sample means
sd1
sd(ajstu)/sqrt(n)
sd2 <- 2*sd(means) 
sd3 <- 3*sd(means) 
max(means)
min(means)

h400 <- hist(means)
hist(means, main="Dist. of means, n=400", xlim = c(21,29), freq=F)
curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE)
abline(v = m, lty=2, lwd=3, col="blue")
abline(v = mean(ajstu), lty=2, lwd=3, col="red")
abline(v = (m - sd1), lty=2, lwd=1, col="blue")
abline(v = (m - sd2), lty=2, lwd=1, col="blue")
abline(v = (m - sd3), lty=2, lwd=1, col="blue")
abline(v = (m + sd1), lty=2, lwd=1, col="blue")
abline(v = (m + sd2), lty=2, lwd=1, col="blue")
abline(v = (m + sd3), lty=2, lwd=1, col="blue")
> mean(ajstu)
[1] 24.60763
> m <- mean(means)
> m
[1] 24.60927
> sd1 <- sd(means) ## sdev of the dist. of sample means
> sd1
[1] 0.09943006
> sd(ajstu)/sqrt(n)
[1] 0.09987731
> sd2 <- 2*sd(means) 
> sd3 <- 3*sd(means) 
> max(means)
[1] 24.95824
> min(means)
[1] 24.28413

n = 900

n <- 900
means <- rep (NA, iter)
for(i in 1:iter){
  means[i] = mean(sample(ajstu, n))
}

mean(ajstu)
m <- mean(means)
m
sd1 <- sd(means) ## sdev of the dist. of sample means
sd1
sd(ajstu)/sqrt(n)
sd2 <- 2*sd(means) 
sd3 <- 3*sd(means) 
max(means)
min(means)

h900 <- hist(means)
hist(means, main="Dist. of means, n=900", xlim = c(21,29), freq=F)
curve(dnorm(x, mean=m, sd=sd1), col="blue", add=TRUE)
abline(v = m, lty=2, lwd=3, col="blue")
abline(v = mean(ajstu), lty=2, lwd=3, col="red")
abline(v = (m - sd1), lty=2, lwd=1, col="blue")
abline(v = (m - sd2), lty=2, lwd=1, col="blue")
abline(v = (m - sd3), lty=2, lwd=1, col="blue")
abline(v = (m + sd1), lty=2, lwd=1, col="blue")
abline(v = (m + sd2), lty=2, lwd=1, col="blue")
abline(v = (m + sd3), lty=2, lwd=1, col="blue")

xmin <- 21
xmax <- 28
ymax <- 3000
plot(h4, col=rgb(0,0,1,1/4), xlim=c(xmin, xmax), ylim=c(0,ymax))  # first histogram
plot(h36, col=rgb(1/5,1,0,1/4), xlim=c(xmin, xmax), ylim=c(0,ymax), add=T)  # second
plot(h100, col=rgb(2/5,0,1,1/4), xlim=c(xmin, xmax), ylim=c(0,ymax), add=T) 
plot(h400, col=rgb(3/5,1,0,1/4), xlim=c(xmin, xmax), ylim=c(0,ymax), add=T)
plot(h900, col=rgb(4/5,0,1,1/4), xlim=c(xmin, xmax), ylim=c(0,ymax), add=T)    

n <- 10000
means <- rep (NA, iter)
for(i in 1:iter){
    means[i] = mean(sample(ajstu, n))
}
h10000 <- hist(means)
hist(means, main="Dist. of means, n=100000")
abline(v = mean(means), lty=2, lwd=3, col="blue")
abline(v = mean(ajstu), lty=2, lwd=3, col="red")
mean(ajstu)
mean(means)
max(means)
min(means)
sampling_distribution_in_r.txt · Last modified: 2020/04/25 15:54 by hkimscil