User Tools

Site Tools


r:sampling_distribution

This is an old revision of the document!


rm(list=ls())
rnorm2 <- function(n,mean,sd){ 
  mean+sd*scale(rnorm(n)) 
}

se <- function(sample) {
  sd(sample)/sqrt(length(sample))
}

ss <- function(x) {
  sum((x-mean(x))^2)
}
################################

# reminder. 
pnorm(-1)
pnorm(1, lower.tail = F)
1-(pnorm(-1) + pnorm(1, lower.tail = F))
1-(pnorm(-1)*2)

pnorm(-2)
pnorm(2, lower.tail = F)
1-(pnorm(-2)*2)

pnorm(-3)
pnorm(3, lower.tail = F)
1-(pnorm(-3)*2)


# 68%
a <- qnorm(.32/2)
b <- qnorm(1-.32/2)
c(a, b)
c(-1, 1)
# 95%
c <- qnorm(.05/2)
d <- qnorm(1-.05/2)
c(c, d)
c(-2,2)
# 99% 
e <- qnorm(.01/2)
f <- qnorm(1-.01/2)
c(e,f)
c(-3,3)

pnorm(b)-pnorm(a)
pnorm(d)-pnorm(c)
pnorm(f)-pnorm(e)

################################
N.p <- 1000000
m.p <- 100
sd.p <- 10

p1 <- rnorm2(N.p, m.p, sd.p)
mean(p1)
sd(p1)

p2 <- rnorm2(N.p, m.p+20, sd.p)
mean(p2)
sd(p2)

hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5),
     main = "histogram of p1",)
abline(v=mean(p1), col="black", lwd=3)


s.size <- 10

means.temp <- c()
s1 <- sample(p1, s.size, replace = T)
mean(s1)
means.temp <- append(means.temp, mean(s1))
means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T)))
means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T)))
means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T)))
means.temp

iter <- 1000000
means <- c()
means <- rep(NA, iter)
for (i in 1:iter) {
  # means <- append(means, mean(sample(p1, s.size, replace = T)))
  means[i] <- mean(sample(p1, s.size, replace = T))
}

mean(means)
sd(means)
se.s <- sd(means)

lo1 <- mean(means)-se.s
hi1 <- mean(means)+se.s
lo2 <- mean(means)-2*se.s
hi2 <- mean(means)+2*se.s
lo3 <- mean(means)-3*se.s
hi3 <- mean(means)+3*se.s

hist(means, 
     xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), 
     col = rgb(1, 0, 0, .5))
abline(v=mean(means), col="black", lwd=2)
# abline(v=mean(p2), colo='darkgreen', lwd=3)
abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), 
       col=c("green","green", "blue", "blue", "orange", "orange"), 
       lwd=2)

# meanwhile . . . .
se.s

se.z <- sqrt(var(p1)/s.size)
se.z <- c(se.z)
se.z

# sd of sample means (sd(means))
# is sqrt(var(s1)/s.size) or 
# sd(s1) / sqrt(s.size) 
# = se.s

# when iter value goes to 
# unlimited value:
# mean(means) = mean(p1) 
# and
# sd(means) = sd(p1) / sqrt(s.size)
# that is, sd(means) = se.z
# This is called CLT (Central Limit Theorem)
mean(means)
mean(p1)
sd(means)
se.z

# because CLT
loz1 <- mean(p1)-se.z
hiz1 <- mean(p1)+se.z
loz2 <- mean(p1)-2*se.z
hiz2 <- mean(p1)+2*se.z
loz3 <- mean(p1)-3*se.z
hiz3 <- mean(p1)+3*se.z

c(lo1, loz1)
c(lo2, loz2)
c(lo3, loz3)

c(hi1, hiz1)
c(hi2, hiz2)
c(hi3, hiz3)


hist(means, 
     xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), 
     col = rgb(1, 0, 0, .5))
abline(v=mean(means), col="black", lwd=2)
abline(v=mean(p1), col="red", lwd=2)
# abline(v=mean(p2), colo='darkgreen', lwd=3)
abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), 
       col=c("green","green", "blue", "blue", "orange", "orange"), 
       lwd=2)
abline(v=c(loz1, hiz1, loz2, hiz2, loz3, hiz3), 
       col=c("red","red", "red", "red", "red", "red"), 
       lwd=2)


round(c(lo1, hi1))
round(c(lo2, hi2))
round(c(lo3, hi3))

round(c(loz1, hiz1))
round(c(loz2, hiz2))
round(c(loz3, hiz3))

m.sample.i.got <- 101.8

hist(means, 
     xlim = c(mean(means)-10*sd(means), mean(means)+10*sd(means)), 
     col = rgb(1, 0, 0, .5))
abline(v=mean(means), col="black", lwd=3)
abline(v=m.sample.i.got, col='darkgreen', lwd=3)

# what is the probablity of getting
# greater than 
# m.sample.i.got?
m.sample.i.got
pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)

# then, what is the probabilty of getting 
# greater than m.sample.i.got and
# less than corresponding value, which is
# mean(means) - m.sample.i.got - mean(means)
# (green line)
tmp <- mean(means) - (m.sample.i.got - mean(means))
abline(v=tmp, col='green', lwd=3)
2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)

### one more time 
mean(p2)
sd(p2)
sample(p2, s.size)
m.sample.i.got <- mean(sample(p2, s.size))
m.sample.i.got

hist(means, 
     xlim = c(mean(means)-15*sd(means), mean(means)+15*sd(means)), 
     col = rgb(1, 0, 0, .5))
abline(v=mean(means), col="black", lwd=2)
abline(v=m.sample.i.got, col='darkgreen', lwd=2)

# what is the probablity of getting
# greater than 
# m.sample.i.got?
m.sample.i.got
pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)

# then, what is the probabilty of getting 
# greater than m.sample.i.got and
# less than corresponding value, which is
# mean(means) - m.sample.i.got - mean(means)
# (green line)
tmp <- mean(means) - (m.sample.i.got - mean(means))
abline(v=tmp, col='green', lwd=2)
2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)
r/sampling_distribution.1757407662.txt.gz · Last modified: 2025/09/09 17:47 by hkimscil

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki