hist(c(300,2000), 
     prob = T, 
     xlim = c(-3.2,3.2),
     ylim = c(0,0.45),
     xlab = "",
     main = "Standard Normal")

X <- rnorm(3000000, mean = 0, sd = 1)
lines(density(X), lwd = 2, col = "blue") 
abline(v = 0, lwd = 2, lty = 2, col = "red")

nhanes <- read.delim('https://raw.githubusercontent.com/IowaBiostat/data-sets/main/lipids/lipids.txt')

## hist(nhanes$LDL,
##      xlab = "LDL Levels (mg/dL)",
##      main = "Histogram of LDL Levels")

{
hist(nhanes$LDL, prob = T, xlab = "LDL Levels (mg/dL)", main = "Histogram of LDL Levels")
xbar<- mean(nhanes$LDL)
std.dev<- sd(nhanes$LDL)
X <- rnorm(3000000, mean = xbar, sd = std.dev)
lines(density(X), lwd = 2, col = "blue") 
}

xbar <- mean(nhanes$LDL)
std.dev <- sd(nhanes$LDL)

# Using the pnorm function directly
pnorm(160, mean = xbar, sd = std.dev, lower.tail = FALSE)

# Or simply find the z-score first and plug it into the pnorm function
z <- (160 - xbar) / std.dev
pnorm(z, lower.tail = FALSE) # Note that we did not insert a mean/sd argument. R will default these arguments to the standard normal curve, mean  = 0, sd = 1

1 - pnorm(160, mean = xbar, sd = std.dev, lower.tail = TRUE)

sum(nhanes$LDL >= 160) / length(nhanes$LDL)

# Finding the area of the lower tails: P(X < 189) - P(X < 160)
pnorm(189, mean = xbar, sd=std.dev, lower.tail = TRUE) - 
pnorm(160, mean = xbar, sd=std.dev, lower.tail = TRUE)

#OR

# Finding the area of the upper tails: P(X > 160) - P(X > 189)
pnorm(160, mean = xbar, sd=std.dev, lower.tail = FALSE) -
pnorm(189, mean = xbar, sd=std.dev, lower.tail = FALSE)

#OR

# Finding the area outside the desired interval and taking the complement: 1 - (P(X < 160) + P(X > 189))
1 - (pnorm(160, mean = xbar, sd=std.dev, lower.tail = TRUE) +
pnorm(189, mean = xbar, sd=std.dev, lower.tail = FALSE))

# F is a shorthand for FALSE, likewise T is for TRUE
qnorm(p = 0.06872828, mean = xbar, sd=std.dev, lower.tail = F)

TRG <- nhanes$TRG
hist(TRG)
abline(v=mean(TRG), lty=1, lwd = 2, col="blue")
abline(v=median(TRG), lty=2, lwd = 2, col="red")
legend(x = "topright",
       legend = c("Mean","Median"),
       lty = c(1, 2),
       col = c("blue", "red"))

sample10 <- sample(TRG, 10)
sample30 <- sample(TRG, 30)
sample300 <- sample(TRG, 300)

mean(sample10)
mean(sample30)
mean(sample300)

# install.packages("readxl")
library(readxl)
# samps <- read_excel("Book.xlsx", col_names = TRUE)
# hist(samps$Means,
#      main = "Means of 10 subject study",
#      xlab = "")

ten <- NULL
thirty <- NULL
threehundred <- NULL
for (i in 1:30000){
  sample10 <- sample(nhanes$TRG, 10)
  sample30 <- sample(nhanes$TRG, 30)
  sample300 <- sample(nhanes$TRG, 300)

  ten[i] <- mean(sample10)
  thirty[i] <- mean(sample30)
  threehundred[i] <- mean(sample300)
}
hist(ten, main = "Means of 10 subject study", breaks = 15, xlab = "")

hist(thirty, main = "Means of 30 subject study", breaks = 15, xlab = "")
hist(threehundred, main = "Means of 300 subject study", breaks = 15, xlab = "")

x <- matrix(c(mean(ten), sd(ten), mean(thirty), sd(thirty), mean(threehundred), sd(threehundred), mean(nhanes$TRG), sd(nhanes$TRG)), nrow = 4, byrow = T)
rownames(x) <- c("sample of ten", "sample of thirty", "sample of three-hundred", "population")
colnames(x) <- c("mean", "sd")
x

# Study with 10 subjects
# Exact standard error based on the sampling distribution
sd(TRG)/sqrt(10)

# Study with 30 subjects
# Exact standard error based on the sampling distribution
sd(TRG)/sqrt(30)

# Study with 100 subjects
# Exact standard error based on the sampling distribution
sd(TRG)/sqrt(300)

mean_LDL <- mean(nhanes$LDL)
sd_LDL <- sd(nhanes$LDL)

pnorm(q = 123, mean = mean_LDL, sd = sd_LDL, lower.tail = F)

mean_LDL <- mean(nhanes$LDL)
sd_LDL <- sd(nhanes$LDL)

pnorm(q = 126, mean = mean_LDL, sd = sd_LDL, lower.tail = T) - pnorm(q = 118, mean = mean_LDL, sd = sd_LDL, lower.tail = T)

mean_LDL <- mean(nhanes$LDL)
sd_LDL <- sd(nhanes$LDL)

qnorm(p = 0.2, mean = mean_LDL, sd = sd_LDL, lower.tail = F)

mean_LDL <- mean(nhanes$LDL)
se_LDL <- sd(nhanes$LDL) / sqrt(50)

pnorm(q = 123, mean = mean_LDL, sd = se_LDL, lower.tail = F)

mean_LDL <- mean(nhanes$LDL)
se_LDL <- sd(nhanes$LDL) / sqrt(50)

pnorm(q = 126, mean = mean_LDL, sd = se_LDL, lower.tail = T) - pnorm(q = 118, mean = mean_LDL, sd = se_LDL, lower.tail = T)

mean_LDL <- mean(nhanes$LDL)
se_LDL <- sd(nhanes$LDL) / sqrt(50)

qnorm(0.025, mean = mean_LDL, sd = se_LDL)
qnorm(0.975, mean = mean_LDL, sd = se_LDL)
