## -----------------------------------------------------------------------------
#| include: false
library(googlesheets4)
library(ggplot2)
library(flextable)


## -----------------------------------------------------------------------------
#| echo: false
#| message: false
nhanes <- read.delim('https://raw.githubusercontent.com/IowaBiostat/data-sets/main/lipids/lipids.txt')

true_mean <- mean(nhanes$TRG)
true_sd <- sd(nhanes$TRG)

# plot original data
ggplot(nhanes, aes(TRG)) +
  theme_classic() +
  geom_histogram(
    fill="lightgrey", 
    color = "grey"
  ) +
  geom_segment(
    x=mean(nhanes$TRG), 
    y = 0, 
    yend = 400, 
    color = "firebrick", 
    linetype = 2,
    linewidth = 1
  ) +
  geom_segment(
    x=median(nhanes$TRG), 
    y = 0, 
    yend = 400, 
    color = "blue3", 
    linetype = 3,
    linewidth = 1
  ) +
  annotate(
    "text", 
    x = true_mean + 10, y = 420, # Positioned slightly above the line
    label = "Mean", color = "firebrick", fontface = "bold"
  ) +
  annotate(
    "text", 
    x = median(nhanes$TRG)-10, y = 420, 
    label = "Median", color = "blue3", fontface = "bold"
  ) +
  labs(
    title= "3026 Women in the NHANES Study\nWe will consider them the Population", 
    x="Triglyceride levels"
  )


## -----------------------------------------------------------------------------
#| eval: false
# names <- c( # put some names in the vector
#   "maggie",
#   "joe",
#   "jill",
#   "steve",
#   "Karen"
# )
# 
# n <- 2 # choose some number you want to sample
# sample(names, 2) # randomly choose n people from the group (population)


## -----------------------------------------------------------------------------
#| eval: false

# # read in the data
# nhanes <- read.delim('https://raw.githubusercontent.com/IowaBiostat/data-sets/main/lipids/lipids.txt')
# 
# # block 1
# sample10 <- sample(nhanes$TRG, 10)
# mean(sample10)
# 
# # block 2
# sample30 <- sample(nhanes$TRG, 30)
# mean(sample30)
# 
# # block 3
# sample300 <- sample(nhanes$TRG, 300)
# mean(sample300)


## -----------------------------------------------------------------------------
#| eval: true
#| echo: false
# Begin Demo
# Connect to Google spreadsheet
gs4_auth(
  email = "joshwells0417@gmail.com", # type in your gmail here
  scopes = "spreadsheets.readonly") 



# ----------------------------------
# data from sheet
clt <- read_sheet("https://docs.google.com/spreadsheets/d/1ZyfVeIVco_KJlK_MclUdPSiBDhE91GeCj2fXgheZJec/edit?usp=sharing")

# ----------------------------------

# simulation
N <- 100 # suppose 20 students show up and input values
# clt <- data.frame(
#   samp_means_10 = replicate(N, mean(sample(nhanes$TRG, 10))), 
#   samp_means_30 = replicate(N, mean(sample(nhanes$TRG, 30))),
#   samp_means_300 = replicate(N, mean(sample(nhanes$TRG, 300)))
# )

#-----------------------------------

# this is to keep the results from showing until we post them :)
# clt <- data.frame(
#   samp_means_10 = replicate(N, 0), 
#   samp_means_30 = replicate(N, 0),
#   samp_means_300 = replicate(N, 0)
# )
# -----------------------------------


# sampling distribution summary stats
summary <- data.frame(
  samp_mean = sapply(clt, mean), 
  samp_se = sapply(clt, sd), 
  true_mean = rep(true_mean, 3), 
  true_se = c(true_sd/sqrt(10), true_sd/sqrt(30), true_sd/sqrt(300))
)


## -----------------------------------------------------------------------------
#| echo: false
#| message: false
#| warning: false

ggplot(clt, aes(x = samp_means_10)) +
  theme_classic() +
  scale_x_continuous(limits = c(0, 400)) +
  labs(
    title = "Distribution of Sample Means (n=10) with Normal Curve", 
    x = "Mean Triglyceride Levels",
    y = "Density"
  )+
  # Overlay the Normal Curve suggestion by Google Gemini
  # Change y-axis to density
  geom_histogram(
    aes(y = after_stat(density)), 
    fill = "lightgrey", color = "grey"
  )  +
  stat_function(
    fun = dnorm, 
    args = list(
      mean = mean(clt$samp_means_10, na.rm = TRUE), 
      sd = sd(clt$samp_means_10, na.rm = TRUE)
    ),
    color = "dodgerblue4", linewidth = 1
  ) + 
  # end of gemini suggestion
  geom_vline(xintercept = true_mean, color = "firebrick", lty = 2) +
  annotate("text", 
           x=true_mean, y = 0.022, 
           label = "True Mean", color = "firebrick")

summary[1, ] |> flextable()


## -----------------------------------------------------------------------------
#| echo: false
#| message: false
#| warning: false

ggplot(clt, aes(x = samp_means_30)) +
  theme_classic() +
  scale_x_continuous(limits = c(0, 400)) +
  labs(
    title = "Distribution of Sample Means (n=30) with Normal Curve", 
    x = "Mean Triglyceride Levels",
    y = "Density"
  )+
  # Overlay the Normal Curve suggestion by Google Gemini
  # Change y-axis to density
  geom_histogram(
    aes(y = after_stat(density)), 
    fill = "lightgrey", color = "grey"
  )  +
  stat_function(
    fun = dnorm, 
    args = list(
      mean = mean(clt$samp_means_30, na.rm = TRUE), 
      sd = sd(clt$samp_means_30, na.rm = TRUE)
    ),
    color = "dodgerblue4", linewidth = 1
  ) + 
  # end of gemini suggestion
  geom_vline(xintercept = true_mean, color = "firebrick", lty = 2) +
  annotate("text", 
           x=true_mean, y = 0.032, 
           label = "True Mean", color = "firebrick")

summary[2, ] |> flextable()


## -----------------------------------------------------------------------------
#| echo: false
#| message: false
#| warning: false

ggplot(clt, aes(x = samp_means_300)) +
  theme_classic() +
  scale_x_continuous(limits = c(0, 400)) +
  labs(
    title = "Distribution of Sample Means (n=300) with Normal Curve", 
    x = "Mean Triglyceride Levels",
    y = "Density"
  )+
  # Overlay the Normal Curve suggestion by Google Gemini
  # Change y-axis to density
  geom_histogram(
    aes(y = after_stat(density)), 
    fill = "lightgrey", color = "grey"
  )  +
  stat_function(
    fun = dnorm, 
    args = list(
      mean = mean(clt$samp_means_300, na.rm = TRUE), 
      sd = sd(clt$samp_means_300, na.rm = TRUE)
    ),
    color = "dodgerblue4", linewidth = 1
  ) + 
  # end of gemini suggestion
  geom_vline(xintercept = true_mean, color = "firebrick", lty = 2) +
  annotate("text", 
           x=true_mean, y = .1, 
           label = "True Mean", color = "firebrick")

summary[3, ] |> flextable()


## -----------------------------------------------------------------------------
mean_LDL <- mean(nhanes$LDL)
sd_LDL <- sd(nhanes$LDL)

# standardize x = 123
z <- (123 - mean_LDL) / sd_LDL

# use pnorm to find area under standard normal curve
1 - pnorm(z)


## -----------------------------------------------------------------------------
mean_LDL <- mean(nhanes$LDL)
sd_LDL <- sd(nhanes$LDL)

# standardize x1= 118 and x2= 126
z1 <- (118 - mean_LDL) / sd_LDL
z2 <- (126 - mean_LDL) / sd_LDL

pnorm(z2) - pnorm(z1)


## -----------------------------------------------------------------------------
mean_LDL <- mean(nhanes$LDL)
sd_LDL <- sd(nhanes$LDL)

# find z value such that 20% is higher
z1 <- qnorm(.2, lower.tail=F)

# alternatively
z1 <- abs(qnorm(.2))

# unstandardize (put onto original scale)
z1 * sd_LDL + mean_LDL


## -----------------------------------------------------------------------------
mean_LDL <- mean(nhanes$LDL)
se_LDL <- sd(nhanes$LDL) / sqrt(50)

# standardize x=123
z <- (123 - mean_LDL) / se_LDL

# looking for probability greater than
1 - pnorm(z)


## -----------------------------------------------------------------------------
mean_LDL <- mean(nhanes$LDL)
se_LDL <- sd(nhanes$LDL) / sqrt(50)

# standardize x=118 and x=126
z1 <- (118 - mean_LDL) / se_LDL
z2 <- (126 - mean_LDL) / se_LDL

# find area between these z values
pnorm(z2) - pnorm(z1)


## -----------------------------------------------------------------------------
mean_LDL <- mean(nhanes$LDL)
se_LDL <- sd(nhanes$LDL) / sqrt(50)

# find z values that mark the quantiles 0.025 and 0.975
z_lower <- qnorm(0.025)
z_upper <- qnorm(0.975)

# transform z values onto scale of the data
x1 <- z_lower * se_LDL + mean_LDL
x2 <- z_upper * se_LDL + mean_LDL

c(lower_bound = x1, upper_bound = x2) |> round(digits = 3)

