# NHANES lipid data
lipids <- read.delim("http://myweb.uiowa.edu/pbreheny/data/lipids.txt")
pop <- lipids$TRG
sam <- sample(pop, 25)
hist(pop, col="gray", border="white", breaks=seq(0, 400, 5), las=1)
hist(sam, col="gray", border="white", breaks=seq(0, 400, 50), las=1)
mean(pop)
mean(sam)

# Simulation
N <- 10000          ## Number of simulations to run
n <- 25             ## Sample size
means <- numeric(N) ## Setting up an empty vector
for (i in 1:N) {
  sam <- sample(pop, n)
  means[i] <- mean(sam)
}
hist(means, col="gray", border="white", breaks=seq(0, 400, 5), las=1)

# CLT accuracy
mean(means)

SD <- sd(pop)
SE <- SD/sqrt(n)
sd(means)
SE

z <- sqrt(n) * (means-mean(pop)) / sd(pop)
hist(z, col="gray", border="white", freq=FALSE, breaks=99, las=1)
zz <- seq(-4, 4, length=101)
lines(zz, dnorm(zz))

# CLT accuracy part 2
## Probability that a sample mean is less than 100 mg/dL
mean(means <= 100)
pnorm(100, mean(pop), SE)  ## Using the location-scale normal
pnorm((100-mean(pop))/SE) ## Using the standard normal

## 90th percentile
quantile(means, .9)
qnorm(.9, mean(pop), SE) ## Using the location-scale normal
qnorm(.9)*SE + mean(pop) ## Using the standard normal

# Cystic fibrosis study
cf <- read.delim("http://myweb.uiowa.edu/pbreheny/data/cystic-fibrosis.txt")
Diff <- cf$Placebo - cf$Drug
hist(Diff, col="gray", border="white", las=1)
t.test(Diff)

# NHANES sim mean
N <- 10000          ## Number of simulations to run
n <- 25             ## Sample size
covered <- numeric(N)
for (i in 1:N) {
  sam <- sample(pop, n)
  covered[i] <- t.test(sam, mu=mean(pop))$p.value > 0.05
}
mean(covered)

# NHANES sim variance
N <- 10000          ## Number of simulations to run
n <- 25             ## Sample size
covered <- numeric(N)
for (i in 1:N) {
  sam <- sample(pop, n)
  ci <- var(sam)*(n-1)/qchisq(c(0.975,0.025), n-1)
  covered[i] <- (ci[1] < var(pop)) & (var(pop) < ci[2])
}
mean(covered)
