# Setup
britdoc <- read.delim('https://raw.githubusercontent.com/IowaBiostat/data-sets/main/britdoc/britdoc.txt')
Time <- britdoc$PersonYears / 1000
X <- model.matrix( ~ Age + Smoking, britdoc)
y <- britdoc$Deaths

# MLE: Simple
beta <- rep(0, ncol(X))
for (i in 1:30) {
  eta <- drop(log(Time) + X %*% beta)
  mu <- exp(eta)
  W <- diag(mu)
  Info <- crossprod(X, W) %*% X
  beta <- beta + solve(Info) %*% crossprod(X, y-mu)
}

# Compare glm and homespun
cat('beta')
drop(beta) |> vapply(sprintf, '', fmt='%.4f') |> cat()
fit <- glm(Deaths ~ offset(log(PersonYears/1000)) + Age + Smoking, britdoc, family=poisson)
cat('coef(fit):')
coef(fit) |> vapply(sprintf, '', fmt='%.4f') |> cat()

# Improvement 1: Monitor convergence
converged <- function(old, new, eps=1e-6) {
  all(abs(old-new) < eps)}
beta <- rep(0, ncol(X)); iter <- 0
repeat {
  old <- beta
  iter <- iter + 1
  eta <- drop(log(Time) + X %*% beta)
  mu <- exp(eta)
  W <- diag(mu)
  Info <- crossprod(X, W) %*% X
  beta <- old + solve(Info) %*% crossprod(X, y-mu)
  if (converged(old, beta) | iter > 50) break
}

# Improvement 2: Better inititial value
pois_fit <- function(X, y, Time, eps=1e-6) {
  beta <- c(log(sum(y) / sum(Time)), rep(0, ncol(X)-1))
  iter <- 0
  repeat {
    old <- beta
    iter <- iter + 1
    eta <- drop(log(Time) + X %*% beta)
    mu <- exp(eta)
    W <- diag(mu)
    Info <- crossprod(X, W) %*% X
    beta <- old + solve(Info) %*% crossprod(X, y-mu)
    if (converged(old, beta, eps=eps)) break
  }
  eta <- drop(log(Time) + X %*% beta)
  loglik <- drop(crossprod(y, eta) - sum(exp(eta)))
  return(list(loglik=loglik, beta=drop(beta), iter=iter, r=y-exp(eta), eta=eta, Info=Info))
}
res <- pois_fit(X, y, Time)

# Newton's method: animation
par(mar=c(4.5, 4.5, 0.1, 0.1))
score <- function(beta) {
  out <- numeric(length(beta))
  for (j in 1:length(beta)) {
    out[j] <- crossprod(x, d - t*exp(x*beta[j]))
  }
  out
}
tangent_line <- function(beta, next_point=FALSE) {
  points(beta, score(beta), col='slateblue', pch=19)
  w <- t*exp(x*beta)
  intercept <- score(beta) + crossprod(x, w*x) * beta
  slope <- -crossprod(x, w*x)
  abline(intercept, slope)
  if (next_point) {
    points(-intercept/slope, 0, pch=19)
    abline(h=0, lwd=2, lty=2, col='gray')
  }
}
set.seed(7)
n <- 100
d <- rbinom(n, 1, 0.5)
t <- rexp(n)
x <- runif(n, -1, 1)
bb <- seq(-3, 3, 0.01)
plot(bb, score(bb), type='l', lwd=3, col='slateblue', las=1, bty='n', xlab=expression(beta), ylab=expression(u(beta)))
plot(bb, score(bb), type='l', lwd=3, col='slateblue', las=1, bty='n', xlab=expression(beta), ylab=expression(u(beta)))
b <- -2
tangent_line(-2)
plot(bb, score(bb), type='l', lwd=3, col='slateblue', las=1, bty='n', xlab=expression(beta), ylab=expression(u(beta)))
b <- -2
tangent_line(-2, next_point=TRUE)
plot(bb, score(bb), type='l', lwd=3, col='slateblue', las=1, bty='n', xlab=expression(beta), ylab=expression(u(beta)))
tangent_line(-0.758)
plot(bb, score(bb), type='l', lwd=3, col='slateblue', las=1, bty='n', xlab=expression(beta), ylab=expression(u(beta)))
tangent_line(-0.758, next_point=TRUE)
plot(bb, score(bb), type='l', lwd=3, col='slateblue', las=1, bty='n', xlab=expression(beta), ylab=expression(u(beta)))
tangent_line(0.185)
plot(bb, score(bb), type='l', lwd=3, col='slateblue', las=1, bty='n', xlab=expression(beta), ylab=expression(u(beta)))
tangent_line(0.185, next_point=TRUE)

# Wald tests
z <- res$beta / sqrt(diag(solve(res$Info)))
p <- 2*pnorm(-abs(z))

# Wald intervals
SE <- sqrt(diag(solve(res$Info)))
lwr <- res$beta + qnorm(0.025) * SE
upr <- res$beta + qnorm(0.975) * SE

# Wald vs summary
p
cbind(lwr, upr)
summary(fit)$coef

# LRT
X0 <- model.matrix( ~ Age, britdoc)
X1 <- model.matrix( ~ Age + Smoking, britdoc)
ll0 <- pois_fit(X0, y, Time)$loglik
ll1 <- pois_fit(X1, y, Time)$loglik
x <- 2*(ll1-ll0)
pchisq(x, 1, lower.tail=FALSE)

## ----LRT_ANOVA, echo=TRUE, results='hide'-------------------------------------
null <- glm(Deaths ~ offset(log(PersonYears/1000))
            + Age, britdoc, family=poisson)
anova(null, fit, test='Chisq')

# LR confidence interval
lr_chisq <- function(b, X, x, y, Time, ll1) {
  out <- double(length(b))
  for (j in length(b)) {
    res <- pois_fit(X, y, exp(log(Time) + x*b[j]))
    out[j] <- 2*(ll1-res$loglik)
  }
  out
}
f <- function(b) qchisq(0.95, 1) -
  lr_chisq(b, X[,1:5], X[,6], y, Time, ll1)
lwr <- uniroot(f, interval=c(-5, res$beta[6]))$root

# Score test
x <- X[, 6]
X0 <- X[, -6]
res0 <- pois_fit(X0, y, Time)
W <- diag(exp(res0$eta))
v <- solve(crossprod(X, W) %*% X)[6,6]
z <- drop(crossprod(x, res0$r)) * sqrt(v)
2*pnorm(-abs(z))

v <- 1/(crossprod(x, W) %*% x -
        crossprod(x, W) %*% X0 %*%
        solve(res0$Info) %*%
        crossprod(X0, W) %*% x)

# Comparison table
wald_test <- function(res, j) {
z <- res$beta / sqrt(diag(solve(res$Info)))
2*pnorm(-abs(z[j]))
}
wald_ci <- function(res, j) {
SE <- sqrt(diag(solve(res$Info)))[j]
lwr <- res$beta[j] + qnorm(0.025) * SE
upr <- res$beta[j] + qnorm(0.975) * SE
cbind(lwr, upr)
}
lr_test <- function(res1, res0) {
x <- 2*(res1$loglik-res0$loglik)
pchisq(x, 1, lower.tail=FALSE)
}
lr_ci <- function(res1, X, y, Time, j) {
lr_chisq <- function(b, X, x, y, Time, ll1) {
  out <- double(length(b))
  for (j in length(b)) {
    res <- pois_fit(X, y, exp(log(Time) + x*b[j]))
    out[j] <- 2*(ll1-res$loglik)
  }
  out
}
f <- function(b) lr_chisq(b, X[,-j], X[,j], y, Time, res1$loglik) - qchisq(0.95, 1)
lwr <- uniroot(f, interval=c(-5, res$beta[6]))$root
upr <- uniroot(f, interval=c(res$beta[6], 5))$root
c(lwr, upr)
}
score_test <- function(X, y, Time, j) {
x <- X[, j]
X0 <- X[, -j]
res0 <- pois_fit(X0, y, Time)
W <- diag(exp(res0$eta))
v <- solve(crossprod(X, W) %*% X)[6,6] # Same as
v <- 1/(crossprod(x, W) %*% x - crossprod(x, W) %*% X0 %*% solve(res0$Info) %*% crossprod(X0, W) %*% x)
z <- drop(crossprod(x, res0$r)) * drop(sqrt(v))
2*pnorm(-abs(z))
}
score_ci <- function(X, y, Time, j) {
score_chisq <- function(b, X, x) {
  out <- double(length(b))
  for (j in length(b)) {
    res <- pois_fit(X, y, exp(log(Time) + x*b))
    W <- diag(exp(res$eta))
    v <- 1/(crossprod(x, W) %*% x - crossprod(x, W) %*% X %*% solve(res$Info) %*% crossprod(X, W) %*% x)
    out[j] <- drop(crossprod(x, res$r)) * sqrt(v)
  }
  out^2
}
f <- function(b) score_chisq(b, X[,-j], X[,j]) - qchisq(0.95, 1)
lwr <- uniroot(f, interval=c(-5, res$beta[6]))$root
upr <- uniroot(f, interval=c(res$beta[6], 5))$root
c(lwr, upr)
}
res <- pois_fit(X, y, Time)
res0 <- pois_fit(model.matrix( ~ Age, britdoc), y, Time)
Tab <- rbind(c(exp(wald_ci(res, 6)), wald_test(res, 6)),
           c(exp(lr_ci(res, X, y, Time, 6)), lr_test(res, res0)),
           c(exp(score_ci(X, y, Time, 6)), score_test(X, y, Time, 6)))
rownames(Tab) <- c('Wald', 'LR', 'Score')
knitr::kable(Tab, 'latex', booktabs=TRUE, digits=c(4, 4, 5), escape=FALSE,
           col.names = c('Lower', 'Upper', '$p$')) |>
kableExtra::kable_styling(position = "center") |>
kableExtra::add_header_above(c(" " = 1, "95% interval" = 2))

