## NHANES lipid data
lipids <- read.delim("http://myweb.uiowa.edu/pbreheny/data/lipids.txt")
pop <- lipids$TRG
sam <- sample(pop, 25)
hist(pop, col="gray", border="white", breaks=seq(0, 400, length=99))
hist(sam, col="gray", border="white", breaks=seq(0, 400, length=10))
mean(pop)
mean(sam)

## Our first simulation
N <- 10000          ## Number of simulations to run
n <- 25             ## Sample size
means <- numeric(N) ## Setting up an empty vector
for (i in 1:N) {
  sam <- sample(pop, n)
  means[i] <- mean(sam)
}
hist(means, col="gray", border="white", breaks=seq(0, 400, length=99))

## CLT accuracy
mean(means)

SD <- sd(pop)
SE <- SD/sqrt(n)
sd(means)
SE

z <- sqrt(n) * (means-mean(pop)) / sd(pop)
hist(z, col="gray", border="white", freq=FALSE, breaks=99)
zz <- seq(-4, 4, length=101)
lines(zz, dnorm(zz))

## CLT accuracy part 2
## Probability that a sample mean is less than 100 mg/dL
mean(means <= 100)
pnorm(100, mean(pop), SE)  ## Using the location-scale normal
pnorm((100-mean(pop))/SE) ## Using the standard normal

## 90th percentile
quantile(means, .9)
qnorm(.9, mean(pop), SE) ## Using the location-scale normal
qnorm(.9)*SE + mean(pop) ## Using the standard normal

## Coverage simulation
N <- 10000
n <- 10
pi <- 0.25
x <- rbinom(N, prob=0.25, size=n)
covered <- numeric(N)
for (i in 1:N) {
  ci <- binom.test(x[i], n)$conf
  covered[i] <- (ci[1] < pi) & (pi < ci[2])
}
mean(covered)

## Coverage simulation loop
source("http://myweb.uiowa.edu/pbreheny/571/f14/labs/displayProgressBar.R")
N <- 10000
n <- 20
pi <- c(0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.5, 0.68, 0.84, 0.92, 0.96, 0.98, 0.99)
coverage <- numeric(length(pi))
for (j in 1:length(pi)) {
  x <- rbinom(N, prob=pi[j], size=n)
  covered <- numeric(N)
  for (i in 1:N) {
    ci <- binom.test(x[i], n)$conf
    covered[i] <- (ci[1] < pi[j]) & (pi[j] < ci[2])
  }
  coverage[j] <- mean(covered)
  displayProgressBar(j, length(pi))
}
plot(pi, coverage, type="l", lwd=3, ylim=c(0.8,1))
abline(h=0.95, col="red", lwd=3)

