library(hdrm)
library(grpreg)
library(mgcv)
library(splines)
library(data.table)
library(ggplot2)
library(sparsepca)
library(glinternet)
library(Rgraphviz)  # This is a Bioconductor package...you only need it for plotting the graphical models
library(glasso)
bmd <- fread('https://s3.amazonaws.com/pbreheny-data-sets/bmd.txt')

# Problems with polynomial regression
form <- list(
  formula(y ~ x + I(x^2)),
  formula(y ~ x + I(x^2) + I(x^3) + I(x^4)),
  formula(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6)))
for (J in 1:4) {
  f <- function(x){dnorm(x,0.5,0.18)}
  main <- list(expression("Up to "*x^2), expression("Up to "*x^4), expression("Up to "*x^6))
  N <- 50
  n <- 100
  xx <- seq(0, 1, len=101)
  par(mfrow=c(1, 3))
  for (j in 1:3) {
    plot(xx, f(xx), col="red", lwd=3, type="l", ylim=c(-1,3), xlab="x", ylab="f(x)", main=main[[j]])
    if (j < J) {
      for (i in 1:N) {
        set.seed(i)
        x <- runif(n)
        y <- rnorm(n,f(x))
        fit <- lm(form[[j]])
        lines(xx, predict(fit,data.frame(x=xx)), col='#B3B3B380')
      }
    }
    lines(xx,f(xx),col="red",lwd=3)
  }
}

# Piecewise constant
pc_fit <- function(g) {
  sub <- subset(bmd, gender==g)
  x <- sub$age
  y <- sub$spnbmd
  knots <- quantile(x, 1:2/3)
  c1 <- mean(y[x < knots[1]])
  c2 <- mean(y[x >= knots[1] & x < knots[2]])
  c3 <- mean(y[x >= knots[2]])
  list(df1 = data.frame(x1=min(x), x2=knots[1], y1=c1, y2=c1, gender=g),
       df2 = data.frame(x1=knots[1], x2=knots[2], y1=c2, y2=c2, gender=g),
       df3 = data.frame(x1=knots[2], x2=max(x), y1=c3, y2=c3, gender=g))
}
f <- pc_fit('female')
m <- pc_fit('male')
kdt <- bmd[, .(knots=quantile(age, probs=(1:2)/3)), gender]
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df3, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df3, color="#3366FF", lwd=1) +
  geom_vline(aes(xintercept=knots), data=kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Piecewise linear
pl_fit <- function(g) {
  sub <- subset(bmd, gender==g)
  x <- sub$age
  y <- sub$spnbmd
  knots <- quantile(x, 1:2/3)
  ind1 <- x < knots[1]
  ind2 <- x >= knots[1] & x < knots[2]
  ind3 <- x >= knots[2]
  x1 <- x[ind1]
  x2 <- x[ind2]
  x3 <- x[ind3]
  fit1 <- lm(y[ind1]~x1)
  fit2 <- lm(y[ind2]~x2)
  fit3 <- lm(y[ind3]~x3)
  yy1 <- predict(fit1, data.frame(x1=c(min(x), knots[1])))
  yy2 <- predict(fit2, data.frame(x2=c(knots[1], knots[2])))
  yy3 <- predict(fit3, data.frame(x3=c(knots[2], max(x))))
  list(df1 = data.frame(x1=min(x), x2=knots[1], y1=yy1[1], y2=yy1[2], gender=g),
       df2 = data.frame(x1=knots[1], x2=knots[2], y1=yy2[1], y2=yy2[2], gender=g),
       df3 = data.frame(x1=knots[2], x2=max(x), y1=yy3[1], y2=yy3[2], gender=g))
}
f <- pl_fit('female')
m <- pl_fit('male')
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df3, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df3, color="#3366FF", lwd=1) +
  geom_vline(aes(xintercept=knots), data=kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Piecewise linear splines
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~bs(x, knots=quantile(x, 1:2/3), degree=1), se=FALSE) + 
  geom_vline(aes(xintercept=knots), data=kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Quadratic splines
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~bs(x, knots=quantile(x, 1:2/3), degree=2), se=FALSE) + 
  geom_vline(aes(xintercept=knots), data=kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Cubic splines
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~bs(x, knots=quantile(x, 1:2/3), degree=3), se=FALSE) + 
  geom_vline(aes(xintercept=knots), data=kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Natural cubic spline
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~ns(x, knots=quantile(x, 1:2/3)), se=FALSE) + 
  geom_vline(aes(xintercept=knots), data=kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Natural cubic spline, 6 df
kdt <- bmd[, .(knots=quantile(age, probs=(0:5)/5)), gender]
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~ns(x, df=5), se=FALSE) + 
  geom_vline(aes(xintercept=knots), data=kdt, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Overlay
ggplot(bmd, aes(age, spnbmd, group=gender, color=gender)) + geom_point(alpha=0.5) + 
  geom_smooth(method=lm, formula=y~ns(x, df=5), se=FALSE) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# SPAM: Rat eye data
attachData(Scheetz2006)

# Restrict to Chr5
ind <- which(fData$Chr==5)

# Construct basis expansions
B <- expand_spline(X[, ind])
dim(X[,ind])  # 857 columns
dim(B$X)      # 857*3 columns

# Fit
if (file.exists('cache/cvfit.rds')) {
  cvfit <- readRDS('cache/cvfit.rds')
} else {
  penalty <- c("grLasso", "grMCP", "grSCAD")
  cvfit <- vector("list", length(penalty))
  lambda.min <- c(0.05, 0.15, 0.1)
  for (i in 1:length(penalty)) {
    cvfit[[i]] <- cv.grpreg(B, y, penalty=penalty[i], lambda.min=lambda.min[i], seed=1)
  }
  names(cvfit) <- penalty
  saveRDS(cvfit, file='cache/cvfit.rds')
}

# Solution paths
par(mfrow=c(3,1))
for (i in 1:3) {
  fit <- cvfit[[i]]$fit
  l <- cvfit[[i]]$lambda.min
  plot(fit)
  abline(v=l, lty=2)
}

# Norm paths
par(mfrow=c(3,1))
for (i in 1:3) {
  fit <- cvfit[[i]]$fit
  l <- cvfit[[i]]$lambda.min
  plot(fit, norm=TRUE)
  abline(v=l)
}

# Number of genes, R^2, for group lasso and group MCP
ngenes_gl <- cvfit[['grLasso']] |> predict(type='groups') |> length()
rsq_gl <- summary(cvfit[['grLasso']])$r.squared |> max()
ngenes_gm <- cvfit[['grMCP']] |> predict(type='groups') |> length()
rsq_gm <- summary(cvfit[['grMCP']])$r.squared |> max()

# SPAM: Group lasso
normPath <- function(cvfit, i, ...) {
  fit <- cvfit[[i]]$fit
  l <- cvfit[[i]]$lambda.min
  plot(fit, norm=TRUE, bty="n", ...)
  abline(v=l)
}
normPath(cvfit, 1)

# SPAM: Group MCP
normPath(cvfit, 2)

# PNISR
col <- pal(3)
plot(X[,"1373534_at"], y, pch=19, cex=.8, xlab="PNISR", ylab="TRIM32", las=1, bty='n', col='gray70')
plot_spline(cvfit[[1]], "1373534_at", lambda = cvfit[[1]]$lambda.min, type='conditional', add=TRUE, col=col[1])
plot_spline(cvfit[[2]], "1373534_at", lambda = cvfit[[2]]$lambda.min, type='conditional', add=TRUE, col=col[2])
plot_spline(cvfit[[3]], "1373534_at", lambda = cvfit[[3]]$lambda.min, type='conditional', add=TRUE, col=col[3])
text(8.9, 8.4, "Group lasso")
text(8.9, 8.2, "Group SCAD")
text(9, 7.6, "Group MCP")

# Tukey illustration
X <- fread('https://s3.amazonaws.com/pbreheny-data-sets/tukey.txt')
SVD <- svd(X)
U <- SVD$u
D <- diag(SVD$d)
V <- SVD$v
plotme <- function(q){
  Z <- U[,1:q]%*%tcrossprod(D[1:q,1:q],V[,1:q])
  Z <- pmax(Z, 0)
  Z <- pmin(Z, 1)
  par(mar=c(0,0,0,0))
  plot(NA, xlim=0:1, ylim=0:1, type="n", bty='n', xaxs='i', yaxs='i')
  rasterImage(as.raster(Z), 0, 0, 1, 1)
}
plotme(1)
plotme(2)
plotme(4)
plotme(8)
plotme(16)
plotme(32)
plotme(64)
plotme(128)

# Object size
s1 <- object.size(SVD)
s2 <- object.size(U[,1:64]) + object.size(SVD$d[1:64]) + object.size(V[,1:64])
s3 <- object.size(U[,1:128]) + object.size(SVD$d[1:128]) + object.size(V[,1:128])
print(s1, units="Mb")
print(s2, units="Mb")
print(s3, units="Mb")

# WHOARI: Sparse PCA
attachData(whoari)
fit <- spca(std(X), 5, 0.05, verbose=FALSE)

V <- fit$loadings
for (j in 1:ncol(V)) {
  ind <- which(V[,j] != 0)
  v <- V[ind,j]
  names(v) <- colnames(X)[ind]
  print(v)
}

# WHOARI: Associations with PCs
pcData <- as.data.frame(std(X) %*% V)
fit <- lm(y~., pcData)
CIplot(fit, tau=c(-1,-1,-1,-1,-1), sort=FALSE, xlab=expression(beta))

# Interactions
# Simulate some data
n <- 70
p <- 20
set.seed(2)
X <- matrix(rnorm(n*p), n, p)
y <- X[,1] + X[,4] - X[,1]*X[,2] + rnorm(n)

# Fit models
XX <- model.matrix(~(.)^2, as.data.frame(X))[,-1]
cvfit.lasso <- cv.glmnet(XX, y)
cvfit.glint <- glinternet.cv(X, y, rep(1, p))

# Number of main effects and interactions selected
b <- coef(cvfit.lasso, s=cvfit.lasso$lambda.min)[-1]
(las_main <- sum(b[1:20]!=0))
(las_int <- sum(b[-(1:20)]!=0))
b <- coef(cvfit.glint$glinternetFit)[[which.min(cvfit.glint$cvErr)]]
(gli_main <- length(b$mainEffects$cont))
(gli_int <- nrow(b$interactions$contcont))

# Maximum R^2
(las_rsq <- max(1-2*cvfit.glint$cvErr/var(y)))
(gli_rsq <- max(1-cvfit.lasso$cvm/var(y)))

# Conditional independence graph
v <- letters[1:7]
el <- list(
  a=c('b','c'),
  b=c('a','c'),
  c=c('a','b','d','e'),
  d=c('c','f','g'),
  e=c('c','f'),
  f=c('d','e'),
  g=c('d')
)
g <- graphNEL(v, el)
col <- rep(c("#FF4E37FF", 'gray', "#008DFFFF"), c(3,2,2))
names(col) <- v
lab <- as.character(1:7)
names(lab) <- v
nAttrs <- list(fillcolor=col, label=lab)
plot(g, nodeAttrs=nAttrs, 'neato')

# Protein network data
X <- fread('https://s3.amazonaws.com/pbreheny-data-sets/Sachs2005.txt')
S <- var(X)
fit <- glasso(S, 0.1)
plot_glasso <- function(fit) {
  # Construct graph
  v <- colnames(X)
  p <- length(v)
  el <- vector('list', p)
  names(el) <- v
  for (j in 1:p) {
    el[[j]] <- setdiff(v[which(fit$wi[j,] != 0)], v[j])
  }
  g <- graphNEL(v, el)
  plot(g, nodeAttrs=nAttrs, 'circo')
}
lam <- c(0.001, 0.01, 0.02, 0.05, 0.1, 0.15)
par(mfrow=c(2,3), oma=c(0,0,1,0))
for (i in 1:6) {
  plot_glasso(glasso(S, lam[i]))
  mtext(bquote(lambda==.(lam[i])))
}
