====== PS1. week02 ====== rm(list=ls()) rnorm2 <- function(n,mean,sd){ mean+sd*scale(rnorm(n)) } se <- function(sample) { sd(sample)/sqrt(length(sample)) } ss <- function(x) { sum((x-mean(x))^2) } ################################ N.p <- 1000000 m.p <- 100 sd.p <- 10 p1 <- rnorm2(N.p, m.p, sd.p) mean(p1) sd(p1) p2 <- rnorm2(N.p, m.p+20, sd.p) mean(p2) sd(p2) m.p1 <- mean(p1) sd.p1 <- sd(p1) var(p1) hist(p1, breaks=100, col=rgb(1,1,1,1)) abline(v=mean(p1),lwd=2) abline(v=mean(p1)-sd(p1), lwd=2) abline(v=mean(p1)+sd(p1), lwd=2) abline(v=c(m.p1-2*sd.p1, m.p1+2*sd.p1), lwd=2, col='red') abline(v=c(m.p1-3*sd.p1, m.p1+3*sd.p1), lwd=2, col='green') # area bet black = 68% # between red = 95% # between green = 99% pnorm(m.p1+sd.p1, m.p1, sd.p1) pnorm(m.p1-sd.p1, m.p1, sd.p1) pnorm(m.p1+sd.p1, m.p1, sd.p1) - pnorm(m.p1-sd.p1, m.p1, sd.p1) pnorm(m.p1+2*sd.p1, m.p1, sd.p1) - pnorm(m.p1-2*sd.p1, m.p1, sd.p1) pnorm(m.p1+3*sd.p1, m.p1, sd.p1) - pnorm(m.p1-3*sd.p1, m.p1, sd.p1) m.p1 sd.p1 (m.p1-m.p1)/sd.p1 ((m.p1-sd.p1) - m.p1) / sd.p1 (120-100)/10 pnorm(1)-pnorm(-1) pnorm(2)-pnorm(-2) pnorm(3)-pnorm(3) 1-pnorm(-2)*2 pnorm(2)-pnorm(-2) pnorm(120, 100, 10) pnorm(2)-pnorm(-2) zscore <- (120-100)/10 pnorm(zscore)-pnorm(-zscore) zscore # reminder. pnorm(-1) pnorm(1, 0, 1, lower.tail = F) pnorm(110, 100, 10, lower.tail = F) zscore <- (110-100)/10 pnorm(zscore, lower.tail = F) pnorm(118, 100, 10, lower.tail = F) pnorm(18/10, lower.tail = F) z.p1 <- (p1-mean(p1))/sd(p1) mean(z.p1) round(mean(z.p1),10) sd(z.p1) pnorm(1.8)-pnorm(-1.8) hist(z.p1, breaks=100, col=rgb(0,0,0,0)) abline(v=c(m.p1, -1.8, 1.8), col='red') 1-(pnorm(1.8)-pnorm(-1.8)) pnorm(1)-pnorm(-1) 1-(pnorm(-1)*2) pnorm(2)-pnorm(-2) 1-(pnorm(-2)*2) 1-(pnorm(-3)*2) # hist(p1, breaks=100, col=rgb(1,1,1,1)) abline(v=mean(p1),lwd=2) abline(v=mean(p1)-sd(p1), lwd=2) abline(v=mean(p1)+sd(p1), lwd=2) abline(v=c(m.p1-2*sd.p1, m.p1+2*sd.p1), lwd=2, col='red') abline(v=c(m.p1-3*sd.p1, m.p1+3*sd.p1), lwd=2, col='green') # 68% a <- qnorm(.32/2) b <- qnorm(1-.32/2) c(a, b) c(-1, 1) # 95% c <- qnorm(.05/2) d <- qnorm(1-.05/2) c(c, d) c(-2,2) # 99% e <- qnorm(.01/2) f <- qnorm(1-.01/2) c(e,f) c(-3,3) pnorm(b)-pnorm(a) c(a, b) pnorm(d)-pnorm(c) c(c, d) pnorm(f)-pnorm(e) c(e, f) qnorm(.5) qnorm(1) ################################ hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5), main = "histogram of p1 and p2",) abline(v=mean(p1), col="black", lwd=3) hist(p2, add=T, breaks=50, col=rgb(0,0,1,.5)) abline(v=mean(p2), col="violet", lwd=3) s.size <- 10 means.temp <- c() s1 <- sample(p1, s.size, replace = T) mean(s1) means.temp <- append(means.temp, mean(s1)) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) means.temp iter <- 1000000 # means <- c() means <- rep(NA, iter) for (i in 1:iter) { # means <- append(means, mean(sample(p1, s.size, replace = T))) means[i] <- mean(sample(p1, s.size, replace = T)) } length(means) mean(means) sd(means) se.s <- sd(means) hist(means, breaks=100, col=rgb(.1, 0, 0, .5)) abline(v=mean(means), col="red", lwd=2) # now we want to get sd of this distribution lo1 <- mean(means)-se.s hi1 <- mean(means)+se.s lo2 <- mean(means)-2*se.s hi2 <- mean(means)+2*se.s lo3 <- mean(means)-3*se.s hi3 <- mean(means)+3*se.s hist(means, xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), col = rgb(1, 0, 0, .5)) abline(v=mean(means), col="black", lwd=2) # abline(v=mean(p2), colo='darkgreen', lwd=3) abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), col=c("green","green", "blue", "blue", "orange", "orange"), lwd=2) # meanwhile . . . . se.s se.z <- sqrt(var(p1)/s.size) se.z <- c(se.z) se.z # sd of sample means (sd(means)) # is sqrt(var(s1)/s.size) or # sd(s1) / sqrt(s.size) # = se.s # when iter value goes to # unlimited value: # mean(means) = mean(p1) # and # sd(means) = sd(p1) / sqrt(s.size) # that is, sd(means) = se.z # This is called CLT (Central Limit Theorem) mean(means) mean(p1) sd(means) var(p1) sqrt(var(p1)/s.size) se.z sd(means) se.s se.z # because CLT loz1 <- mean(p1)-se.z hiz1 <- mean(p1)+se.z loz2 <- mean(p1)-2*se.z hiz2 <- mean(p1)+2*se.z loz3 <- mean(p1)-3*se.z hiz3 <- mean(p1)+3*se.z c(lo1, loz1) c(lo2, loz2) c(lo3, loz3) c(hi1, hiz1) c(hi2, hiz2) c(hi3, hiz3) hist(means, xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), col = rgb(1, 0, 0, .5)) abline(v=mean(means), col="black", lwd=2) # abline(v=mean(p2), colo='darkgreen', lwd=3) abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), col=c("green","green", "blue", "blue", "orange", "orange"), lwd=2) round(c(lo1, hi1)) round(c(lo2, hi2)) round(c(lo3, hi3)) round(c(loz1, hiz1)) round(c(loz2, hiz2)) round(c(loz3, hiz3)) m.sample.i.got <- mean(means)+ 1.5*sd(means) m.sample.i.got hist(means, xlim = c(mean(means)-10*sd(means), mean(means)+10*sd(means)), col = rgb(1, 0, 0, .5)) abline(v=mean(means), col="black", lwd=3) abline(v=m.sample.i.got, col='darkgreen', lwd=3) # what is the probablity of getting # greater than # m.sample.i.got? m.sample.i.got pnorm(m.sample.i.got, mean(means), sd(means), lower.tail = F) pnorm(m.sample.i.got, mean(p1), se.z, lower.tail = F) # then, what is the probabilty of getting # greater than m.sample.i.got and # less than corresponding value, which is # mean(means) - m.sample.i.got - mean(means) # (green line) tmp <- mean(means) - (m.sample.i.got - mean(means)) abline(v=tmp, col='green', lwd=3) 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) m.sample.i.got ### one more time mean(p2) sd(p2) sample(p2, s.size) m.sample.i.got <- mean(sample(p2, s.size)) m.sample.i.got hist(means, xlim = c(mean(means)-15*sd(means), mean(means)+15*sd(means)), col = rgb(1, 0, 0, .5)) abline(v=mean(means), col="black", lwd=2) abline(v=m.sample.i.got, col='darkgreen', lwd=2) # what is the probablity of getting # greater than # m.sample.i.got? m.sample.i.got pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) # then, what is the probabilty of getting # greater than m.sample.i.got and # less than corresponding value, which is # mean(means) - m.sample.i.got - mean(means) # (green line) tmp <- mean(means) - (m.sample.i.got - mean(means)) abline(v=tmp, col='green', lwd=2) 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) ====== output ====== > rm(list=ls()) > rnorm2 <- function(n,mean,sd){ + mean+sd*scale(rnorm(n)) + } > > se <- function(sample) { + sd(sample)/sqrt(length(sample)) + } > > ss <- function(x) { + sum((x-mean(x))^2) + } > .... > ################################ > N.p <- 1000000 > m.p <- 100 > sd.p <- 10 > > p1 <- rnorm2(N.p, m.p, sd.p) > mean(p1) [1] 100 > sd(p1) [1] 10 > > p2 <- rnorm2(N.p, m.p+20, sd.p) > mean(p2) [1] 120 > sd(p2) [1] 10 > > m.p1 <- mean(p1) > sd.p1 <- sd(p1) > var(p1) [,1] [1,] 100 > > hist(p1, breaks=100, col=rgb(1,1,1,1)) > abline(v=mean(p1),lwd=2) > abline(v=mean(p1)-sd(p1), lwd=2) > abline(v=mean(p1)+sd(p1), lwd=2) > abline(v=c(m.p1-2*sd.p1, m.p1+2*sd.p1), lwd=2, col='red') > abline(v=c(m.p1-3*sd.p1, m.p1+3*sd.p1), lwd=2, col='green') > > # area bet black = 68% > # between red = 95% > # between green = 99% > > pnorm(m.p1+sd.p1, m.p1, sd.p1) [1] 0.8413447 > pnorm(m.p1-sd.p1, m.p1, sd.p1) [1] 0.1586553 > pnorm(m.p1+sd.p1, m.p1, sd.p1) - + pnorm(m.p1-sd.p1, m.p1, sd.p1) [1] 0.6826895 > > pnorm(m.p1+2*sd.p1, m.p1, sd.p1) - + pnorm(m.p1-2*sd.p1, m.p1, sd.p1) [1] 0.9544997 > > pnorm(m.p1+3*sd.p1, m.p1, sd.p1) - + pnorm(m.p1-3*sd.p1, m.p1, sd.p1) [1] 0.9973002 > > m.p1 [1] 100 > sd.p1 [1] 10 > > (m.p1-m.p1)/sd.p1 [1] 0 > ((m.p1-sd.p1) - m.p1) / sd.p1 [1] -1 > (120-100)/10 [1] 2 > pnorm(1)-pnorm(-1) [1] 0.6826895 > pnorm(2)-pnorm(-2) [1] 0.9544997 > pnorm(3)-pnorm(-3) [1] 0.9973002 > > 1-pnorm(-2)*2 [1] 0.9544997 > pnorm(2)-pnorm(-2) [1] 0.9544997 > > pnorm(120, 100, 10) [1] 0.9772499 > pnorm(2)-pnorm(-2) [1] 0.9544997 > .... > zscore <- (120-100)/10 > pnorm(zscore)-pnorm(-zscore) [1] 0.9544997 > zscore [1] 2 > > > pnorm(-1) [1] 0.1586553 > pnorm(1, 0, 1, lower.tail = F) [1] 0.1586553 > pnorm(110, 100, 10, lower.tail = F) [1] 0.1586553 > zscore <- (110-100)/10 > pnorm(zscore, lower.tail = F) [1] 0.1586553 > > pnorm(118, 100, 10, lower.tail = F) [1] 0.03593032 > pnorm(18/10, lower.tail = F) [1] 0.03593032 > .... > z.p1 <- (p1-mean(p1))/sd(p1) > mean(z.p1) [1] -2.319195e-17 > round(mean(z.p1),10) [1] 0 > sd(z.p1) [1] 1 > pnorm(1.8)-pnorm(-1.8) [1] 0.9281394 > > hist(z.p1, breaks=100, col=rgb(0,0,0,0)) > abline(v=c(m.p1, -1.8, 1.8), col='red') > 1-(pnorm(1.8)-pnorm(-1.8)) [1] 0.07186064 > > .... > pnorm(1)-pnorm(-1) [1] 0.6826895 > 1-(pnorm(-1)*2) [1] 0.6826895 > > pnorm(2)-pnorm(-2) [1] 0.9544997 > 1-(pnorm(-2)*2) [1] 0.9544997 > > 1-(pnorm(-3)*2) [1] 0.9973002 > .... > # > hist(p1, breaks=100, col=rgb(1,1,1,1)) > abline(v=mean(p1),lwd=2) > abline(v=mean(p1)-sd(p1), lwd=2) > abline(v=mean(p1)+sd(p1), lwd=2) > abline(v=c(m.p1-2*sd.p1, m.p1+2*sd.p1), lwd=2, col='red') > abline(v=c(m.p1-3*sd.p1, m.p1+3*sd.p1), lwd=2, col='green') > > # 68% > a <- qnorm(.32/2) > b <- qnorm(1-.32/2) > c(a, b) [1] -0.9944579 0.9944579 > c(-1, 1) [1] -1 1 > > # 95% > c <- qnorm(.05/2) > d <- qnorm(1-.05/2) > c(c, d) [1] -1.959964 1.959964 > c(-2,2) [1] -2 2 > # 99% > e <- qnorm(.01/2) > f <- qnorm(1-.01/2) > c(e,f) [1] -2.575829 2.575829 > c(-3,3) [1] -3 3 > > pnorm(b)-pnorm(a) [1] 0.68 > c(a, b) [1] -0.9944579 0.9944579 > pnorm(d)-pnorm(c) [1] 0.95 > c(c, d) [1] -1.959964 1.959964 > pnorm(f)-pnorm(e) [1] 0.99 > c(e, f) [1] -2.575829 2.575829 > > qnorm(.5) [1] 0 > qnorm(1) [1] Inf > > .... > ################################ > hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5), + main = "histogram of p1 and p2",) > abline(v=mean(p1), col="black", lwd=3) > hist(p2, add=T, breaks=50, col=rgb(0,0,1,.5)) > abline(v=mean(p2), col="violet", lwd=3) > > s.size <- 10 > > means.temp <- c() > s1 <- sample(p1, s.size, replace = T) > mean(s1) [1] 99.64273 > means.temp <- append(means.temp, mean(s1)) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp <- append(means.temp, mean(sample(p1, s.size, replace = T))) > means.temp [1] 99.64273 107.15516 103.81192 103.12311 105.88372 > .... > iter <- 1000000 > # means <- c() > means <- rep(NA, iter) > for (i in 1:iter) { + # means <- append(means, mean(sample(p1, s.size, replace = T))) + means[i] <- mean(sample(p1, s.size, replace = T)) + } > length(means) [1] 1000000 > mean(means) [1] 99.99544 > sd(means) [1] 3.161886 > se.s <- sd(means) > > hist(means, breaks=100, col=rgb(.1, 0, 0, .5)) > abline(v=mean(means), col="red", lwd=2) > .... > # now we want to get sd of this distribution > lo1 <- mean(means)-se.s > hi1 <- mean(means)+se.s > lo2 <- mean(means)-2*se.s > hi2 <- mean(means)+2*se.s > lo3 <- mean(means)-3*se.s > hi3 <- mean(means)+3*se.s > > hist(means, + xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=2) > # abline(v=mean(p2), colo='darkgreen', lwd=3) > abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), + col=c("green","green", "blue", "blue", "orange", "orange"), + lwd=2) > .... > # meanwhile . . . . > se.s [1] 3.161886 > > se.z <- sqrt(var(p1)/s.size) > se.z <- c(se.z) > se.z [1] 3.162278 > > # sd of sample means (sd(means)) > # = se.s > > # when iter value goes to > # unlimited value: > # mean(means) = mean(p1) > # and > # sd(means) = sd(p1) / sqrt(s.size) > # that is, sd(means) = se.z > # This is called CLT (Central Limit Theorem) > mean(means) [1] 99.99544 > mean(p1) [1] 100 > sd(means) [1] 3.161886 > var(p1) [,1] [1,] 100 > sqrt(var(p1)/s.size) [,1] [1,] 3.162278 > se.z [1] 3.162278 > > sd(means) [1] 3.161886 > se.s [1] 3.161886 > se.z [1] 3.162278 > > # because CLT > loz1 <- mean(p1)-se.z > hiz1 <- mean(p1)+se.z > loz2 <- mean(p1)-2*se.z > hiz2 <- mean(p1)+2*se.z > loz3 <- mean(p1)-3*se.z > hiz3 <- mean(p1)+3*se.z > > c(lo1, loz1) [1] 96.83356 96.83772 > c(lo2, loz2) [1] 93.67167 93.67544 > c(lo3, loz3) [1] 90.50978 90.51317 > > c(hi1, hiz1) [1] 103.1573 103.1623 > c(hi2, hiz2) [1] 106.3192 106.3246 > c(hi3, hiz3) [1] 109.4811 109.4868 > > .... > hist(means, + xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=2) > # abline(v=mean(p2), colo='darkgreen', lwd=3) > abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3), + col=c("green","green", "blue", "blue", "orange", "orange"), + lwd=2) > > round(c(lo1, hi1)) [1] 97 103 > round(c(lo2, hi2)) [1] 94 106 > round(c(lo3, hi3)) [1] 91 109 > > round(c(loz1, hiz1)) [1] 97 103 > round(c(loz2, hiz2)) [1] 94 106 > round(c(loz3, hiz3)) [1] 91 109 > .... > m.sample.i.got <- mean(means)+ 1.5*sd(means) > m.sample.i.got [1] 104.7383 > > hist(means, + xlim = c(mean(means)-10*sd(means), mean(means)+10*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=3) > abline(v=m.sample.i.got, col='darkgreen', lwd=3) > > # what is the probablity of getting > # greater than > # m.sample.i.got? > m.sample.i.got [1] 104.7383 > pnorm(m.sample.i.got, mean(means), sd(means), lower.tail = F) [1] 0.0668072 > pnorm(m.sample.i.got, mean(p1), se.z, lower.tail = F) [1] 0.06701819 > > # then, what is the probabilty of getting > # greater than m.sample.i.got and > # less than corresponding value, which is > # mean(means) - m.sample.i.got - mean(means) > # (green line) > tmp <- mean(means) - (m.sample.i.got - mean(means)) > abline(v=tmp, col='green', lwd=3) > 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 0.1339882 > m.sample.i.got [1] 104.7383 > .... > ### one more time > mean(p2) [1] 120 > sd(p2) [1] 10 > sample(p2, s.size) [1] 120.9639 119.8341 134.0344 122.5446 121.4557 113.6820 114.7603 105.2138 122.7027 125.4131 > m.sample.i.got <- mean(sample(p2, s.size)) > m.sample.i.got [1] 120.2492 > > hist(means, + xlim = c(mean(means)-15*sd(means), mean(means)+15*sd(means)), + col = rgb(1, 0, 0, .5)) > abline(v=mean(means), col="black", lwd=2) > abline(v=m.sample.i.got, col='darkgreen', lwd=2) > > # what is the probablity of getting > # greater than > # m.sample.i.got? > m.sample.i.got [1] 120.2492 > pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 7.560352e-11 > > # then, what is the probabilty of getting > # greater than m.sample.i.got and > # less than corresponding value, which is > # mean(means) - m.sample.i.got - mean(means) > # (green line) > tmp <- mean(means) - (m.sample.i.got - mean(means)) > abline(v=tmp, col='green', lwd=2) > 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F) [1] 1.51207e-10 > ....