library(hdrm)
library(grpreg)
library(mgcv)
library(splines)
library(ggplot2)
library(sparsepca)
library(Rgraphviz)  # Bioconductor; I only used it to plot graphical models
library(glasso)
library(data.table)
bmd <- fread('https://hastie.su.domains/ElemStatLearn/datasets/bone.data') |>
  _[age <= 18]
bmd[, sex := stringr::str_to_title(gender)]

# Problems with polynomial regression
form <- list(
  formula(y ~ x + I(x^2)),
  formula(y ~ x + I(x^2) + I(x^3) + I(x^4)),
  formula(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6)))
for (J in 1:4) {
  f <- function(x) {
    dnorm(x, 0.5, 0.18)
  }
  main <- list(
    expression("Up to " * x^2),
    expression("Up to " * x^4),
    expression("Up to " * x^6)
  )
  N <- 50
  n <- 100
  xx <- seq(0, 1, len = 101)
  par(mfrow = c(1, 3))
  for (j in 1:3) {
    plot(
      xx, f(xx), col = "red", lwd = 3, type = "l", ylim = c(-1, 3),
      xlab = "x", ylab = "f(x)", main = main[[j]], bty = 'n'
    )
    if (j < J) {
      for (i in 1:N) {
        set.seed(i)
        x <- runif(n)
        y <- rnorm(n, f(x))
        fit <- lm(form[[j]])
        lines(xx, predict(fit, data.frame(x = xx)), col = '#B3B3B380')
      }
    }
    lines(xx, f(xx), col = "red", lwd = 3)
  }
}

# Piecewise constant
pc_fit <- function(s) {
  sub <- subset(bmd, sex == s)
  x <- sub$age
  y <- sub$spnbmd
  knots <- quantile(x, 1:2/3)
  c1 <- mean(y[x < knots[1]])
  c2 <- mean(y[x >= knots[1] & x < knots[2]])
  c3 <- mean(y[x >= knots[2]])
  list(df1 = data.frame(x1 = min(x), x2 = knots[1], y1 = c1, y2 = c1, sex = s),
       df2 = data.frame(x1 = knots[1], x2 = knots[2], y1 = c2, y2 = c2, sex = s),
       df3 = data.frame(x1 = knots[2], x2 = max(x), y1 = c3, y2 = c3, sex = s))
}
f <- pc_fit('Female')
m <- pc_fit('Male')
kdt <- bmd[, .(knots = quantile(age, probs = (1:2) / 3)), sex]
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~sex) + geom_point(col='gray') + 
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df3, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df3, color="#3366FF", lwd=1) +
  geom_vline(aes(xintercept = knots), data = kdt, lty = 2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Piecewise linear
pl_fit <- function(s) {
  sub <- subset(bmd, sex == s)
  x <- sub$age
  y <- sub$spnbmd
  knots <- quantile(x, 1:2/3)
  ind1 <- x < knots[1]
  ind2 <- x >= knots[1] & x < knots[2]
  ind3 <- x >= knots[2]
  x1 <- x[ind1]
  x2 <- x[ind2]
  x3 <- x[ind3]
  fit1 <- lm(y[ind1]~x1)
  fit2 <- lm(y[ind2]~x2)
  fit3 <- lm(y[ind3]~x3)
  yy1 <- predict(fit1, data.frame(x1=c(min(x), knots[1])))
  yy2 <- predict(fit2, data.frame(x2=c(knots[1], knots[2])))
  yy3 <- predict(fit3, data.frame(x3=c(knots[2], max(x))))
  list(df1 = data.frame(x1=min(x), x2=knots[1], y1=yy1[1], y2=yy1[2], sex = s),
       df2 = data.frame(x1=knots[1], x2=knots[2], y1=yy2[1], y2=yy2[2], sex = s),
       df3 = data.frame(x1=knots[2], x2=max(x), y1=yy3[1], y2=yy3[2], sex = s))
}
f <- pl_fit('Female')
m <- pl_fit('Male')
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~sex) + geom_point(col='gray') + 
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df3, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df3, color="#3366FF", lwd=1) +
  geom_vline(aes(xintercept=knots), data=kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Piecewise linear splines
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~ sex) + geom_point(col = 'gray') + 
  geom_smooth(method = lm, formula = y ~ bs(x, knots = quantile(x, 1:2/3), degree=1), se=FALSE) + 
  geom_vline(aes(xintercept = knots), data = kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Quadratic splines
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~ sex) + geom_point(col='gray') + 
  geom_smooth(method = lm, formula=y~bs(x, knots=quantile(x, 1:2/3), degree=2), se=FALSE) + 
  geom_vline(aes(xintercept=knots), data=kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Cubic splines
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~ sex) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~bs(x, knots=quantile(x, 1:2/3), degree=3), se=FALSE) + 
  geom_vline(aes(xintercept=knots), data=kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Natural cubic spline
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~ sex) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~ns(x, knots = quantile(x, 1:2/3)), se=FALSE) + 
  geom_vline(aes(xintercept=knots), data=kdt, lty = 2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Natural cubic spline: 6 df
kdt <- bmd[, .(knots = quantile(age, probs=(0:5)/5)), sex]
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~ sex) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~ns(x, df = 5), se=FALSE) + 
  geom_vline(aes(xintercept = knots), data = kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Overlay
ggplot(bmd, aes(age, spnbmd, group = sex, color = sex)) +
  geom_point(alpha = 0.5) + 
  geom_smooth(method = lm, formula = y ~ ns(x, df = 5), se = FALSE) +
  xlab('Age') + ylab("Relative change in spinal BMD") +
  guides(color = guide_legend(title = ''))

# SPAM: Rat eye data
Scheetz2006 <- read_data(Scheetz2006)
X <- Scheetz2006$X
y <- Scheetz2006$y

# Restrict to Chr5
ind <- which(Scheetz2006$fData$Chr == 5)

# Construct basis expansions
B <- expand_spline(X[, ind])
dim(X[,ind])  # 857 columns
dim(B$X)      # 857*3 columns

# Fit
penalty <- c("grLasso", "grMCP", "grSCAD")
cvfit <- vector("list", length(penalty))
lambda.min <- c(0.05, 0.15, 0.1)
for (i in 1:length(penalty)) {
  cvfit[[i]] <- cv.grpreg(B, y, penalty=penalty[i], lambda.min=lambda.min[i], seed=1)
}
names(cvfit) <- penalty

# Solution paths
par(mfrow=c(3,1))
for (i in 1:3) {
  fit <- cvfit[[i]]$fit
  l <- cvfit[[i]]$lambda.min
  plot(fit)
  abline(v=l, lty=2)
}

# Norm paths
par(mfrow=c(3,1))
for (i in 1:3) {
  fit <- cvfit[[i]]$fit
  l <- cvfit[[i]]$lambda.min
  plot(fit, norm=TRUE)
  abline(v=l)
}

# Number of genes, R^2, for group lasso and group MCP
ngenes_gl <- cvfit[['grLasso']] |> predict(type='groups') |> length()
rsq_gl <- summary(cvfit[['grLasso']])$r.squared |> max()
ngenes_gm <- cvfit[['grMCP']] |> predict(type='groups') |> length()
rsq_gm <- summary(cvfit[['grMCP']])$r.squared |> max()

# SPAM: Group lasso
normPath <- function(cvfit, i, ...) {
  fit <- cvfit[[i]]$fit
  l <- cvfit[[i]]$lambda.min
  plot(fit, norm=TRUE, bty="n", ...)
  abline(v=l)
}
normPath(cvfit, 1)

# SPAM: Group MCP
normPath(cvfit, 2)

# PNISR
col <- pal(3)
plot(X[,"1373534_at"], y, pch=19, cex=.8, xlab="PNISR", ylab="TRIM32", las=1, bty='n', col='gray70')
plot_spline(cvfit[[1]], "1373534_at", lambda = cvfit[[1]]$lambda.min, type='conditional', add=TRUE, col=col[1])
plot_spline(cvfit[[2]], "1373534_at", lambda = cvfit[[2]]$lambda.min, type='conditional', add=TRUE, col=col[2])
plot_spline(cvfit[[3]], "1373534_at", lambda = cvfit[[3]]$lambda.min, type='conditional', add=TRUE, col=col[3])
text(8.9, 8.4, "Group lasso")
text(8.9, 8.2, "Group SCAD")
text(9, 7.6, "Group MCP")

# Tukey illustration
X <- read.delim('https://raw.githubusercontent.com/IowaBiostat/data-sets/main/tukey/tukey.txt')
SVD <- svd(X)
U <- SVD$u
D <- diag(SVD$d)
V <- SVD$v
plotme <- function(q){
  Z <- U[,1:q]%*%tcrossprod(D[1:q,1:q],V[,1:q])
  Z <- pmax(Z, 0)
  Z <- pmin(Z, 1)
  par(mar=c(0,0,0,0))
  plot(NA, xlim=0:1, ylim=0:1, type="n", bty='n', xaxs='i', yaxs='i')
  rasterImage(as.raster(Z), 0, 0, 1, 1)
}
plotme(1)
plotme(2)
plotme(4)
plotme(8)
plotme(16)
plotme(32)
plotme(64)
plotme(128)

# Object size
s1 <- object.size(SVD)
s2 <- object.size(U[,1:64]) + object.size(SVD$d[1:64]) + object.size(V[,1:64])
s3 <- object.size(U[,1:128]) + object.size(SVD$d[1:128]) + object.size(V[,1:128])
print(s1, units="Mb")
print(s2, units="Mb")
print(s3, units="Mb")

# fit <- spca(X, k = 5, 0.1, scale = TRUE) # or use std(X)
# V <- fit$loadings
# Z <- X %*% V  # Principal components

# WHOARI: Sparse PCA
attach_data(whoari)
fit <- spca(std(X), 5, 0.05, verbose = FALSE)

V <- fit$loadings
for (j in 1:ncol(V)) {
  ind <- which(V[,j] != 0)
  v <- V[ind,j]
  names(v) <- colnames(X)[ind]
  print(v)
}

# WHOARI: Associations with PCs
pcData <- as.data.frame(std(X) %*% V)
fit <- lm(y~., pcData)
ci_plot(fit, tau = c(-1, -1, -1, -1, -1), sort = FALSE, xlab = expression(beta))

# Conditional independence graph
v <- letters[1:7]
el <- list(
  a = c('b', 'c'),
  b = c('a', 'c'),
  c = c('a', 'b', 'd', 'e'),
  d = c('c', 'f', 'g'),
  e = c('c', 'f'),
  f = c('d', 'e'),
  g = c('d')
)
g <- graphNEL(v, el)
col <- rep(c("#FF4E37FF", 'gray', "#008DFFFF"), c(3,2,2))
names(col) <- v
lab <- as.character(1:7)
names(lab) <- v
nAttrs <- list(fillcolor = col, label = lab)
plot(g, nodeAttrs = nAttrs, 'neato')

# Protein network data
x <- fread('https://raw.githubusercontent.com/IowaBiostat/data-sets/main/Sachs2005/Sachs2005.txt')
s <- var(x)
fit <- glasso(s, 0.1)
plot_glasso <- function(fit) {
  # Construct graph
  v <- colnames(x)
  p <- length(v)
  el <- vector('list', p)
  names(el) <- v
  for (j in 1:p) {
    el[[j]] <- setdiff(v[which(fit$wi[j,] != 0)], v[j])
  }
  g <- graphNEL(v, el)
  plot(g, nodeAttrs = nAttrs, 'circo')
}
lam <- c(0.001, 0.01, 0.02, 0.05, 0.1, 0.15)
par(mfrow = c(2,3), oma = c(0,0,1,0))
for (i in 1:6) {
  plot_glasso(glasso(s, lam[i]))
  mtext(bquote(lambda == .(lam[i])))
}

