sampling_distribution_in_r
Differences
This shows you the differences between two versions of the page.
Both sides previous revision · Previous revision · Next revision | Previous revision | ||
sampling_distribution_in_r [2023/11/13 08:49] – hkimscil | sampling_distribution_in_r [2024/03/20 14:15] (current) – [Sampling distribution in R e.g. 1] hkimscil | ||
---|---|---|---|
Line 1: | Line 1: | ||
====== Sampling distribution in R e.g. 1 ====== | ====== Sampling distribution in R e.g. 1 ====== | ||
< | < | ||
- | n.ca <- 100000 | + | n.ajstu <- 100000 |
- | mean.ca <- 70 | + | mean.ajstu <- 100 |
- | sd.ca <- 15 | + | sd.ajstu <- 10 |
- | set.seed(2020) | + | |
- | ca <- rnorm(n.ca, mean=mean.ca, | + | |
- | ca <- round(ca, 0) | + | |
- | hist(ca, xlab=" | + | |
- | curve(dnorm(x, | + | |
- | abline(v=mean.ca, | + | |
- | summary(ca) | + | |
- | mu <- round(mean(ca)) | + | |
- | sigma <- round(sd(ca)) | + | |
- | mu | + | |
- | sigma | + | |
- | </ | + | set.seed(1024) |
- | rnorm2 <- function(n, | + | ajstu <- rnorm2(n.ajstu, mean=mean.ajstu, sd=sd.ajstu) |
- | n.ca <- 100000 | + | |
- | mean.ca <- 70 | + | |
- | sd.ca <- 15 | + | |
- | set.seed(101) | + | |
- | ca <- rnorm2(n.ca, mean=mean.ca, sd=sd.ca) | + | |
- | hist(ca, xlab=" | + | |
- | curve(dnorm(x, | + | |
- | abline(v=mean.ca, | + | |
- | summary(ca) | + | |
- | mu <- round(mean(ca)) | + | |
- | sigma <- round(sd(ca)) | + | |
- | mu | + | |
- | sigma | + | |
- | </ | + | |
- | < | + | mean(ajstu) |
- | > summary(ca) | + | sd(ajstu) |
- | Min. 1st Qu. Median | + | var(ajstu) |
- | | + | |
- | > | + | |
- | </ | + | |
- | 최소값 70 | + | iter <- 10000 # number of sampling iterations (samples drawn per sample size) |
- | 최대값 132 | + | |
- | 대강의 아이디어. | + | n.4 <- 4 |
+ | means4 <- rep (NA, iter) | ||
+ | for(i in 1:iter){ | ||
+ | means4[i] = mean(sample(ajstu, | ||
+ | } | ||
- | * 위의 점수가 전국 고등학교 2년생의 (모집단) 수학점수라고 가정을 하자. 그리고, 이 모집단의 수학점수 평균은 70, 표준편차는 15임을 알고 있으며 최소값과 최대값 또한 알고 있다 | + | n.25 <- 25 |
- | * 그런데 내가 수학을 학생들에게 | + | means25 <- rep (NA, iter) |
- | | + | for(i in 1:iter){ |
- | * 132점을 넘는 학생은 모집단에 속한 학생이 아니라 다른 모집단에 (나의 교육방법을 교수받은 모집단) 속한 학생이라고 생각할 수 있는 것이다. | + | |
- | * 내가 가르친 학생들의 평균점수가 132점을 모두 넘는다면 한 학생이 아니라 나의 집단이 (샘플이) 모집단에 속하지 않는 특별한 집단이라고 생각할 수 있다. | + | } |
- | * 그러나 현실적으로 이렇게 판단하기에는 넘어야 할 점수가 너무 크다. | + | n.100 <- 100 |
+ | means100 <- rep (NA, iter) | ||
+ | for(i in 1:iter){ | ||
+ | means100[i] = mean(sample(ajstu, | ||
+ | } | ||
- | + | n.400 <- 400 | |
- | {{: | + | means400 |
- | ===== n=4 ===== | + | |
- | + | ||
- | < | + | |
- | iter <- 10000 | + | |
- | n <- 4 | + | |
- | means <- rep (NA, iter) | + | |
for(i in 1:iter){ | for(i in 1:iter){ | ||
- | | + | |
} | } | ||
- | m <- mean(means) | ||
- | sd1 <- sd(means) | ||
- | se <- sigma/ | ||
- | sd2 <- 2*sd1 | + | n.900 <- 900 |
- | sd3 <- 3*sd1 | + | means900 |
- | + | ||
- | m | + | |
- | sd1 | + | |
- | se | + | |
- | + | ||
- | sd2 | + | |
- | sd3 | + | |
- | max(means) | + | |
- | min(means) | + | |
- | + | ||
- | h4 <- hist(means) | + | |
- | hist(means, main=" | + | |
- | curve(dnorm(x, | + | |
- | abline(v = m, lty=2, lwd=3, col=" | + | |
- | abline(v = mean(ca), lty=2, lwd=3, col=" | + | |
- | abline(v = (m - sd1), lty=2, lwd=1, col=" | + | |
- | abline(v = (m - sd2), lty=2, lwd=1, col=" | + | |
- | abline(v = (m - sd3), lty=2, lwd=1, col=" | + | |
- | abline(v = (m + sd1), lty=2, lwd=1, col=" | + | |
- | abline(v = (m + sd2), lty=2, lwd=1, col=" | + | |
- | abline(v = (m + sd3), lty=2, lwd=1, col=" | + | |
- | </ | + | |
- | {{: | + | |
- | < | + | |
- | > sd(means) | + | |
- | [1] 7.495025 | + | |
- | + | ||
- | > s.ca <- sd(ca)/ | + | |
- | > s.ca | + | |
- | [1] 7.477513 | + | |
- | </ | + | |
- | + | ||
- | ===== n = 36 ===== | + | |
- | + | ||
- | < | + | |
- | iter <- 10000 | + | |
- | n <- 36 | + | |
- | means <- rep (NA, iter) | + | |
for(i in 1:iter){ | for(i in 1:iter){ | ||
- | | + | |
} | } | ||
- | m <- mean(means) | ||
- | sd1 <- sd(means) | ||
- | se <- sigma/ | ||
- | sd2 <- 2*sd1 | + | n.1600 <- 1600 |
- | sd3 <- 3*sd1 | + | means1600 |
- | + | ||
- | m | + | |
- | sd1 | + | |
- | se | + | |
- | + | ||
- | sd2 | + | |
- | sd3 | + | |
- | max(means) | + | |
- | min(means) | + | |
- | + | ||
- | h36 <- hist(means) | + | |
- | hist(means, main=" | + | |
- | curve(dnorm(x, | + | |
- | abline(v = m, lty=2, lwd=3, col=" | + | |
- | abline(v = mean(ca), lty=2, lwd=3, col=" | + | |
- | abline(v = (m - sd1), lty=2, lwd=1, col=" | + | |
- | abline(v = (m - sd2), lty=2, lwd=1, col=" | + | |
- | abline(v = (m - sd3), lty=2, lwd=1, col=" | + | |
- | abline(v = (m + sd1), lty=2, lwd=1, col=" | + | |
- | abline(v = (m + sd2), lty=2, lwd=1, col=" | + | |
- | abline(v = (m + sd3), lty=2, lwd=1, col=" | + | |
- | + | ||
- | </ | + | |
- | + | ||
- | {{: | + | |
- | ===== n = 100 ===== | + | |
- | < | + | |
- | iter <- 10000 | + | |
- | n <- 100 | + | |
- | means <- rep (NA, iter) | + | |
for(i in 1:iter){ | for(i in 1:iter){ | ||
- | | + | |
} | } | ||
- | m <- mean(means) | ||
- | sd1 <- sd(means) | ||
- | se <- sigma/ | ||
- | sd2 <- 2*sd1 | + | n.2500 <- 2500 |
- | sd3 <- 3*sd1 | + | means2500 |
- | + | ||
- | m | + | |
- | sd1 | + | |
- | se | + | |
- | + | ||
- | sd2 | + | |
- | sd3 | + | |
- | max(means) | + | |
- | min(means) | + | |
- | + | ||
- | + | ||
- | h100 <- hist(means) | + | |
- | hist(means, main=" | + | |
- | curve(dnorm(x, | + | |
- | abline(v = m, lty=2, lwd=3, col=" | + | |
- | abline(v = mean(ca), lty=2, lwd=3, col=" | + | |
- | abline(v = (m - sd1), lty=2, lwd=1, col=" | + | |
- | abline(v = (m - sd2), lty=2, lwd=1, col=" | + | |
- | abline(v = (m - sd3), lty=2, lwd=1, col=" | + | |
- | abline(v = (m + sd1), lty=2, lwd=1, col=" | + | |
- | abline(v = (m + sd2), lty=2, lwd=1, col=" | + | |
- | abline(v = (m + sd3), lty=2, lwd=1, col=" | + | |
- | </ | + | |
- | {{: | + | |
- | ===== n = 400 ===== | + | |
- | < | + | |
- | iter <- 10000 | + | |
- | n <- 400 | + | |
- | means <- rep (NA, iter) | + | |
for(i in 1:iter){ | for(i in 1:iter){ | ||
- | | + | |
} | } | ||
- | m <- mean(means) | ||
- | sd1 <- sd(means) | ||
- | se <- sigma/ | ||
- | sd2 <- 2*sd1 | + | h4 <- hist(means4) |
- | sd3 <- 3*sd1 | + | h25 <- hist(means25) |
+ | h100 <- hist(means100) | ||
+ | h400 <- hist(means400) | ||
+ | h900 <- hist(means900) | ||
+ | h1600 <- hist(means1600) | ||
+ | h2500 <- hist(means2500) | ||
- | m | ||
- | sd1 | ||
- | se | ||
- | sd2 | + | plot(h4, ylim=c(0, |
- | sd3 | + | plot(h25, add = T, col=" |
- | max(means) | + | plot(h100, add = T, col=" |
- | min(means) | + | plot(h400, add = T, col=" |
+ | plot(h900, add = T, col=" | ||
- | h400 <- hist(means) | ||
- | hist(means, main=" | ||
- | curve(dnorm(x, | ||
- | abline(v = m, lty=2, lwd=3, col=" | ||
- | abline(v = mean(ca), lty=2, lwd=3, col=" | ||
- | abline(v = (m - sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd3), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd3), lty=2, lwd=1, col=" | ||
- | </ | ||
- | {{: | ||
- | ===== n = 900 ===== | ||
- | < | + | sss <- c(4, |
- | iter <- 10000 | + | ses <- rep (NA, length(sss)) # std errors |
- | n <- 900 | + | for(i in 1:length(sss)){ |
- | means <- rep (NA, iter) | + | |
- | for(i in 1:iter){ | + | |
- | | + | |
} | } | ||
- | m <- mean(means) | ||
- | sd1 <- sd(means) | ||
- | se <- sigma/ | ||
- | sd2 <- 2*sd1 | + | ses |
- | sd3 <- 3*sd1 | + | se.1 <- ses |
+ | se.2 <- 2 * ses | ||
- | m | + | lower.s2 <- mean(ajstu)-se.2 |
- | sd1 | + | upper.s2 <- mean(ajstu)+se.2 |
- | se | + | data.frame(cbind(sss, |
- | sd2 | ||
- | sd3 | ||
- | max(means) | ||
- | min(means) | ||
- | |||
- | |||
- | hist(means, main=" | ||
- | curve(dnorm(x, | ||
- | abline(v = m, lty=2, lwd=3, col=" | ||
- | abline(v = mean(ca), lty=2, lwd=3, col=" | ||
- | abline(v = (m - sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd3), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd3), lty=2, lwd=1, col=" | ||
</ | </ | ||
- | {{: | ||
< | < | ||
- | xmin <- 40 | + | # n =1600 일 경우에 |
- | xmax <- 100 | + | # sample의 평균이 |
- | ymax <- 2500 | + | # 확률은 어떻게 구해야 할까? |
- | plot(h4, col=rgb(0, | + | |
- | plot(h36, col=rgb(1/5,1,0,1/4), xlim=c(xmin,xmax), ylim=c(0,ymax), add=T) # second | + | # n = 1600 일 경우에 |
- | plot(h100, col=rgb(2/5,0,1,1/4), xlim=c(xmin, | + | # sampling distribution은 |
- | plot(h400, col=rgb(3/5,1,0,1/4), xlim=c(xmin, | + | # Xbar ~ N(100, var(ajstu)/n.1600) |
- | plot(h900, col=rgb(4/5,0,1,1/4), xlim=c(xmin, | + | # 그리고, 위에서 standard error값은 |
+ | # sqrt(var(ajstu)/n.1600) | ||
+ | # 이것을 standard error라고 부른다 | ||
+ | # 따라서 | ||
+ | se.1600 <- sqrt(var(ajstu)/n.1600) | ||
+ | pnorm(100.15, mean(ajstu), se.1600) | ||
</ | </ | ||
- | {{: | ||
+ | {{: | ||
===== Sampling distribution in proportion in R ===== | ===== Sampling distribution in proportion in R ===== | ||
Line 348: | Line 185: | ||
- | |||
- | |||
- | |||
- | ====== Sampling distribution in R e.g. 2 ====== | ||
- | 아주대학교 학생의 나이에 대한 모집단 정보가 있다고 하자. 아주대학교 학생의 학생 수는 모두 10, | ||
- | < | ||
- | n.ajstu <- 100000 | ||
- | mean.ajstu <- 24.6 | ||
- | sd.ajstu <- 2 | ||
- | set.seed(1024) | ||
- | ajstu <- rnorm(n.ajstu, | ||
- | hist(ajstu, | ||
- | abline(v=mean(ajstu), | ||
- | curve(dnorm(x, | ||
- | </ | ||
- | {{: | ||
- | ===== n = 4 ===== | ||
- | < | ||
- | iter <- 10000 | ||
- | n <- 4 | ||
- | means <- rep (NA, iter) | ||
- | for(i in 1:iter){ | ||
- | means[i] = mean(sample(ajstu, | ||
- | } | ||
- | |||
- | mean(ajstu) | ||
- | m <- mean(means) | ||
- | sd1 <- sd(means) ## sdev of the dist. of sample means | ||
- | sd1 | ||
- | sd(ajstu)/ | ||
- | sd2 <- 2*sd(means) | ||
- | sd3 <- 3*sd(means) | ||
- | max(means) | ||
- | min(means) | ||
- | |||
- | h4 <- hist(means) | ||
- | hist(means, main=" | ||
- | curve(dnorm(x, | ||
- | abline(v = m, lty=2, lwd=3, col=" | ||
- | abline(v = mean(ajstu), | ||
- | abline(v = (m - sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd3), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd3), lty=2, lwd=1, col=" | ||
- | </ | ||
- | |||
- | {{: | ||
- | ===== n = 36 ===== | ||
- | < | ||
- | n <- 36 | ||
- | means <- rep (NA, iter) | ||
- | for(i in 1:iter){ | ||
- | means[i] = mean(sample(ajstu, | ||
- | } | ||
- | |||
- | mean(ajstu) | ||
- | m <- mean(means) | ||
- | m | ||
- | sd1 <- sd(means) ## sdev of the dist. of sample means | ||
- | sd1 | ||
- | sd(ajstu)/ | ||
- | sd2 <- 2*sd(means) | ||
- | sd3 <- 3*sd(means) | ||
- | max(means) | ||
- | min(means) | ||
- | |||
- | h36 <- hist(means) | ||
- | hist(means, main=" | ||
- | curve(dnorm(x, | ||
- | abline(v = m, lty=2, lwd=3, col=" | ||
- | abline(v = mean(ajstu), | ||
- | abline(v = (m - sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd3), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd3), lty=2, lwd=1, col=" | ||
- | </ | ||
- | {{: | ||
- | ===== n = 100 ===== | ||
- | < | ||
- | n <- 100 | ||
- | means <- rep (NA, iter) | ||
- | for(i in 1:iter){ | ||
- | means[i] = mean(sample(ajstu, | ||
- | } | ||
- | |||
- | mean(ajstu) | ||
- | m <- mean(means) | ||
- | m | ||
- | sd1 <- sd(means) ## sdev of the dist. of sample means | ||
- | sd1 | ||
- | sd(ajstu)/ | ||
- | sd2 <- 2*sd(means) | ||
- | sd3 <- 3*sd(means) | ||
- | max(means) | ||
- | min(means) | ||
- | |||
- | h100 <- hist(means) | ||
- | hist(means, main=" | ||
- | curve(dnorm(x, | ||
- | abline(v = m, lty=2, lwd=3, col=" | ||
- | abline(v = mean(ajstu), | ||
- | abline(v = (m - sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd3), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd3), lty=2, lwd=1, col=" | ||
- | </ | ||
- | < | ||
- | > mean(ajstu) | ||
- | [1] 24.60763 | ||
- | > m <- mean(means) | ||
- | > m | ||
- | [1] 24.60779 | ||
- | > sd1 <- sd(means) ## sdev of the dist. of sample means | ||
- | > sd1 | ||
- | [1] 0.1983636 | ||
- | > sd(ajstu)/ | ||
- | [1] 0.1997546 | ||
- | > sd2 <- 2*sd(means) | ||
- | > sd3 <- 3*sd(means) | ||
- | > max(means) | ||
- | [1] 25.29987 | ||
- | > min(means) | ||
- | [1] 23.8735 | ||
- | </ | ||
- | |||
- | {{: | ||
- | ===== n = 400 ===== | ||
- | < | ||
- | n <- 400 | ||
- | means <- rep (NA, iter) | ||
- | for(i in 1:iter){ | ||
- | means[i] = mean(sample(ajstu, | ||
- | } | ||
- | |||
- | mean(ajstu) | ||
- | m <- mean(means) | ||
- | m | ||
- | sd1 <- sd(means) ## sdev of the dist. of sample means | ||
- | sd1 | ||
- | sd(ajstu)/ | ||
- | sd2 <- 2*sd(means) | ||
- | sd3 <- 3*sd(means) | ||
- | max(means) | ||
- | min(means) | ||
- | |||
- | h400 <- hist(means) | ||
- | hist(means, main=" | ||
- | curve(dnorm(x, | ||
- | abline(v = m, lty=2, lwd=3, col=" | ||
- | abline(v = mean(ajstu), | ||
- | abline(v = (m - sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd3), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd3), lty=2, lwd=1, col=" | ||
- | </ | ||
- | < | ||
- | > mean(ajstu) | ||
- | [1] 24.60763 | ||
- | > m <- mean(means) | ||
- | > m | ||
- | [1] 24.60927 | ||
- | > sd1 <- sd(means) ## sdev of the dist. of sample means | ||
- | > sd1 | ||
- | [1] 0.09943006 | ||
- | > sd(ajstu)/ | ||
- | [1] 0.09987731 | ||
- | > sd2 <- 2*sd(means) | ||
- | > sd3 <- 3*sd(means) | ||
- | > max(means) | ||
- | [1] 24.95824 | ||
- | > min(means) | ||
- | [1] 24.28413 | ||
- | </ | ||
- | {{: | ||
- | |||
- | ===== n = 900 ===== | ||
- | < | ||
- | n <- 900 | ||
- | means <- rep (NA, iter) | ||
- | for(i in 1:iter){ | ||
- | means[i] = mean(sample(ajstu, | ||
- | } | ||
- | |||
- | mean(ajstu) | ||
- | m <- mean(means) | ||
- | m | ||
- | sd1 <- sd(means) ## sdev of the dist. of sample means | ||
- | sd1 | ||
- | sd(ajstu)/ | ||
- | sd2 <- 2*sd(means) | ||
- | sd3 <- 3*sd(means) | ||
- | max(means) | ||
- | min(means) | ||
- | |||
- | h900 <- hist(means) | ||
- | hist(means, main=" | ||
- | curve(dnorm(x, | ||
- | abline(v = m, lty=2, lwd=3, col=" | ||
- | abline(v = mean(ajstu), | ||
- | abline(v = (m - sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m - sd3), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd1), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd2), lty=2, lwd=1, col=" | ||
- | abline(v = (m + sd3), lty=2, lwd=1, col=" | ||
- | |||
- | {{: | ||
- | |||
- | < | ||
- | xmin <- 21 | ||
- | xmax <- 28 | ||
- | ymax <- 3000 | ||
- | plot(h4, col=rgb(0, | ||
- | plot(h36, col=rgb(1/ | ||
- | plot(h100, col=rgb(2/ | ||
- | plot(h400, col=rgb(3/ | ||
- | plot(h900, col=rgb(4/ | ||
- | </ | ||
- | |||
- | {{: | ||
- | |||
- | |||
- | < | ||
- | n <- 10000 | ||
- | means <- rep (NA, iter) | ||
- | for(i in 1:iter){ | ||
- | means[i] = mean(sample(ajstu, | ||
- | } | ||
- | h10000 <- hist(means) | ||
- | hist(means, main=" | ||
- | abline(v = mean(means), | ||
- | abline(v = mean(ajstu), | ||
- | mean(ajstu) | ||
- | mean(means) | ||
- | max(means) | ||
- | min(means) | ||
- | </ | ||
sampling_distribution_in_r.1699832985.txt.gz · Last modified: 2023/11/13 08:49 by hkimscil