library(hdrm)
library(grpreg)
library(mgcv)
library(splines)
library(ggplot2)
library(sparsepca)
library(glinternet)
library(Rgraphviz)  # This is a Bioconductor package...you only need it for plotting the graphical models
library(glasso)


# Splines -----------------------------------------------------------------

bmd <- read.delim('https://s3.amazonaws.com/pbreheny-data-sets/bmd.txt')

## Slide 5
form <- list(formula(y~x+I(x^2)),formula(y~x+I(x^2)+I(x^3)+I(x^4)),formula(y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5)+I(x^6)))
f <- function(x){dnorm(x,0.5,0.18)}
main <- list(expression("Up to "*x^2),expression("Up to "*x^4),expression("Up to "*x^6))
N <- 50
n <- 100
xx <- seq(0,1,len=101)
op <- par(mfrow=c(1,3))
for (j in 1:3) {
  plot(xx,f(xx),col="red",lwd=3,type="l",ylim=c(-1,3),xlab="x",ylab="f(x)",main=main[[j]])
  for (i in 1:N) {
    x <- runif(n)
    y <- rnorm(n,f(x))
    fit <- lm(form[[j]])
    lines(xx,predict(fit,data.frame(x=xx)),col=rgb(.7,.7,.7,alpha=.5))
  }
}
par(op)

# Piecewise constant
pc_fit <- function(g) {
  sub <- subset(bmd, gender==g)
  x <- sub$age
  y <- sub$spnbmd
  knots <- quantile(x, 1:2/3)
  c1 <- mean(y[x < knots[1]])
  c2 <- mean(y[x >= knots[1] & x < knots[2]])
  c3 <- mean(y[x >= knots[2]])
  list(df1 = data.frame(x1=min(x), x2=knots[1], y1=c1, y2=c1, gender=g),
       df2 = data.frame(x1=knots[1], x2=knots[2], y1=c2, y2=c2, gender=g),
       df3 = data.frame(x1=knots[2], x2=max(x), y1=c3, y2=c3, gender=g))
}
f <- pc_fit('female')
m <- pc_fit('male')
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df3, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df3, color="#3366FF", lwd=1) +
  geom_vline(aes(xintercept=x1), data=f$df2, lty=2) +
  geom_vline(aes(xintercept=x2), data=f$df2, lty=2) +
  geom_vline(aes(xintercept=x1), data=m$df2, lty=2) +
  geom_vline(aes(xintercept=x2), data=m$df2, lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Piecewise linear
pl_fit <- function(g) {
  sub <- subset(bmd, gender==g)
  x <- sub$age
  y <- sub$spnbmd
  knots <- quantile(x, 1:2/3)
  ind1 <- x < knots[1]
  ind2 <- x >= knots[1] & x < knots[2]
  ind3 <- x >= knots[2]
  x1 <- x[ind1]
  x2 <- x[ind2]
  x3 <- x[ind3]
  fit1 <- lm(y[ind1]~x1)
  fit2 <- lm(y[ind2]~x2)
  fit3 <- lm(y[ind3]~x3)
  yy1 <- predict(fit1, data.frame(x1=c(min(x), knots[1])))
  yy2 <- predict(fit2, data.frame(x2=c(knots[1], knots[2])))
  yy3 <- predict(fit3, data.frame(x3=c(knots[2], max(x))))
  list(df1 = data.frame(x1=min(x), x2=knots[1], y1=yy1[1], y2=yy1[2], gender=g),
       df2 = data.frame(x1=knots[1], x2=knots[2], y1=yy2[1], y2=yy2[2], gender=g),
       df3 = data.frame(x1=knots[2], x2=max(x), y1=yy3[1], y2=yy3[2], gender=g))
}
f <- pl_fit('female')
m <- pl_fit('male')
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=f$df3, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df1, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df2, color="#3366FF", lwd=1) +
  geom_segment(aes(x=x1, xend=x2, y=y1, yend=y2), data=m$df3, color="#3366FF", lwd=1) +
  geom_vline(aes(xintercept=quantile(age, 1/3)), lty=2) +
  geom_vline(aes(xintercept=quantile(age, 2/3)), lty=2) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# Piecewise linear splines
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~bs(x, knots=quantile(x, 1:2/3), degree=1), se=FALSE) + 
  geom_vline(aes(xintercept=quantile(age, 1/3)), lty=2) +
  geom_vline(aes(xintercept=quantile(age, 2/3)), lty=2) + 
  xlab('Age') + ylab("Relative change in spinal BMD")

# Quadratic splines
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~bs(x, knots=quantile(x, 1:2/3), degree=2), se=FALSE) + 
  geom_vline(aes(xintercept=quantile(age, 1/3)), lty=2) +
  geom_vline(aes(xintercept=quantile(age, 2/3)), lty=2) + 
  xlab('Age') + ylab("Relative change in spinal BMD")

# Cubic splines
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~bs(x, knots=quantile(x, 1:2/3), degree=3), se=FALSE) + 
  geom_vline(aes(xintercept=quantile(age, 1/3)), lty=2) +
  geom_vline(aes(xintercept=quantile(age, 2/3)), lty=2) + 
  xlab('Age') + ylab("Relative change in spinal BMD")

# Natural cubic spline
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~ns(x, knots=quantile(x, 1:2/3)), se=FALSE) + 
  geom_vline(aes(xintercept=quantile(age, 1/3)), lty=2) +
  geom_vline(aes(xintercept=quantile(age, 2/3)), lty=2) + 
  xlab('Age') + ylab("Relative change in spinal BMD")

# Natural cubic spline, 6 df
ggplot(bmd, aes(age, spnbmd)) + facet_grid(~gender) + geom_point(col='gray') + 
  geom_smooth(method=lm, formula=y~ns(x, df=5), se=FALSE) + 
  geom_vline(aes(xintercept=quantile(age, 0)), lty=2) +
  geom_vline(aes(xintercept=quantile(age, 1/5)), lty=2) + 
  geom_vline(aes(xintercept=quantile(age, 2/5)), lty=2) +
  geom_vline(aes(xintercept=quantile(age, 3/5)), lty=2) + 
  geom_vline(aes(xintercept=quantile(age, 4/5)), lty=2) +
  geom_vline(aes(xintercept=quantile(age, 1)), lty=2) + 
  xlab('Age') + ylab("Relative change in spinal BMD")

# Overlay
ggplot(bmd, aes(age, spnbmd, group=gender, color=gender)) + geom_point(alpha=0.5) + 
  geom_smooth(method=lm, formula=y~ns(x, df=5), se=FALSE) +
  xlab('Age') + ylab("Relative change in spinal BMD")

# SPAM: Rat eye data
attachData(Scheetz2006)

# Restrict to Chr5
ind <- which(fData$Chr==5)
df <- 3
BB <- sapply(as.data.frame(X[,ind]), ns, simplify="array", df=df)
B <- t(apply(BB, 1, rbind))
group <- rep(1:length(ind), each=df)
Gene <- fData[ind, 'Symbol']

# Fit
penalty <- c("grLasso", "grMCP", "grSCAD")
cvfit <- vector("list", length(penalty))
lambda.min <- c(0.05, 0.15, 0.1)
set.seed(1)
for (i in 1:length(penalty)) cvfit[[i]] <- cv.grpreg(B, y, group, penalty=penalty[i], lambda.min=lambda.min[i])
names(cvfit) <- penalty

# Solution paths
op <- par(mfrow=c(3,1))
for (i in 1:3) {
  fit <- cvfit[[i]]$fit
  l <- cvfit[[i]]$lambda.min
  plot(fit)
  abline(v=l)
  g <- predict(fit, lambda=l, type="groups")
  print(g)
  print(predict(fit, lambda=l, type="norm")[g])
}
par(op)

# Norm paths
normPath <- function(cvfit, i, ...) {
  fit <- cvfit[[i]]$fit
  l <- cvfit[[i]]$lambda.min
  plot(fit, norm=TRUE, bty="n", ...)
  abline(v=l)
}
normPath(cvfit, 1)
normPath(cvfit, 2)

## Fig
Lines <- function(id, fit, l, ...) {
  x <- X[,ind[id]]
  xx <- seq(min(x), max(x), len=101)
  BB <- predict(ns(x, df=df), newx=xx)
  ind <- 1+((id-1)*df+1):((id-1)*df+df)
  yhat <- as.numeric(BB %*% coef(fit, lambda=l)[ind])
  i <- which.min(abs(xx-mean(x)))
  yhat <- yhat + mean(y) - yhat[i]
  lines(xx, yhat, ...)
}
col <- hdrm:::pal(3)
id <- 187
op <- par(mar=c(4,4,1,1))
plot(X[,ind[id]],y, pch=19, cex=.7, xlab="PNISR", ylab="TRIM32", las=1, bty='n')
for (i in 1:3) {Lines(id, cvfit[[i]]$fit, cvfit[[i]]$lambda.min, lwd=3, col=col[i])}
text(8.9, 8.4, "Group lasso")
text(8.9, 8.2, "Group SCAD")
text(9, 7.6, "Group MCP")
par(op)


# Principal components ----------------------------------------------------

# Tukey illustration
X <- read.delim('https://s3.amazonaws.com/pbreheny-data-sets/tukey.txt')
SVD <- svd(X)
U <- SVD$u
D <- diag(SVD$d)
V <- SVD$v
plotme <- function(q){
  Z <- U[,1:q]%*%tcrossprod(D[1:q,1:q],V[,1:q])
  Z <- pmax(Z, 0)
  Z <- pmin(Z, 1)
  op <- par(mar=c(0,0,0,0))
  plot(NA, xlim=0:1, ylim=0:1, type="n", bty='n', xaxs='i', yaxs='i')
  rasterImage(as.raster(Z), 0, 0, 1, 1)
  par(op)
}
plotme(1)
plotme(2)
plotme(4)
plotme(8)
plotme(16)
plotme(32)
plotme(64)
plotme(128)
s1 <- object.size(SVD)
s2 <- object.size(U[,1:64]) + object.size(SVD$d[1:64]) + object.size(V[,1:64])
s3 <- object.size(U[,1:128]) + object.size(SVD$d[1:128]) + object.size(V[,1:128])
print(s1, units="Mb")
print(s2, units="Mb")
print(s3, units="Mb")

# Analysis of WHO-ARI data
attachData(whoari)
fit <- spca(std(X), 5, 0.05, verbose=FALSE)

# Print non-zero loadings
V <- fit$loadings
for (j in 1:ncol(V)) {
  ind <- which(V[,j] != 0)
  v <- V[ind,j]
  names(v) <- colnames(X)[ind]
  print(v)
}

# Regress pneumonia score on principal components
pcData <- as.data.frame(std(X) %*% V)
fit <- lm(y~., pcData)
CIplot(fit, tau=c(-1,-1,-1,-1,-1), sort=FALSE, xlab=expression(beta))
summary(fit)

# Interactions ------------------------------------------------------------

# Simulate some data
n <- 70
p <- 20
set.seed(2)
X <- matrix(rnorm(n*p), n, p)
y <- X[,1] + X[,4] - X[,1]*X[,2] + rnorm(n)

# Fit models
XX <- model.matrix(~(.)^2, as.data.frame(X))[,-1]
cvfit.lasso <- cv.glmnet(XX, y)
cvfit.glint <- glinternet.cv(X, y, rep(1, p))

# Number of main effects and interactions selected
b <- coef(cvfit.lasso, s=cvfit.lasso$lambda.min)[-1]
sum(b[1:20]!=0)
sum(b[-(1:20)]!=0)
b <- coef(cvfit.glint$glinternetFit)[[which.min(cvfit.glint$cvErr)]]
length(b$mainEffects$cont)
nrow(b$interactions$contcont)

# Maximum R^2
max(1-2*cvfit.glint$cvErr/var(y))
max(1-cvfit.lasso$cvm/var(y))


# Graphical lasso ---------------------------------------------------------

# Illustration
v <- letters[1:7]
el <- list(
  a=c('b','c'),
  b=c('a','c'),
  c=c('a','b','d','e'),
  d=c('c','f','g'),
  e=c('c','f'),
  f=c('d','e'),
  g=c('d')
)
g <- graphNEL(v, el)
col <- rep(c("#FF4E37FF", 'gray', "#008DFFFF"), c(3,2,2))
names(col) <- v
lab <- as.character(1:7)
names(lab) <- v
nAttrs <- list(fillcolor=col, label=lab)
plot(g, nodeAttrs=nAttrs, 'neato')

# Protein network data
X <- read.delim('https://s3.amazonaws.com/pbreheny-data-sets/Sachs2003.txt')
S <- var(X)
fit <- glasso(S, 0.1)
plot_glasso <- function(fit) {
  # Construct graph
  v <- colnames(X)
  p <- length(v)
  el <- vector('list', p)
  names(el) <- v
  for (j in 1:p) {
    el[[j]] <- setdiff(v[which(fit$wi[j,] != 0)], v[j])
  }
  g <- graphNEL(v, el)
  plot(g, nodeAttrs=nAttrs, 'circo')
}
lam <- c(0.001, 0.01, 0.02, 0.05, 0.1, 0.15)
op <- par(mfrow=c(2,3), oma=c(0,0,1,0))
for (i in 1:6) {
  plot_glasso(glasso(S, lam[i]))
  mtext(bquote(lambda==.(lam[i])))
}
par(op)
