library(hdrm)

# Data (running examples)
attachData(bcTCGA)
X.TCGA <- X
y.TCGA <- y
Data <- Ex9.1(seed=72)
X <- Data$X
y <- Data$y
n <- nrow(X)
p <- ncol(X)
varType <- Data$varType

# Semi-penalized likelihood ratio test (no existing package yet)
splrt <- function(j, Data, lam) {
  X <- Data$X
  y <- Data$y
  L <- length(lam)
  fit0 <- ncvreg(X[,-j], y, lambda=lam, penalty='lasso')
  w <- rep(1, ncol(X))
  w[j] <- 0
  fit1 <- ncvreg(X, y, penalty.factor=w, lambda=lam, penalty='lasso')
  stat <- fit0$loss[L]-fit1$loss[L]
  pchisq(stat, 1, lower.tail=FALSE)
}

# Fit and inference
cvfit <- cv.ncvreg(Data$X, Data$y, penalty='lasso')
p <- sapply(1:60, splrt, Data=Data, lam = cvfit$lambda[1:cvfit$min])
DF <- data.frame(b=coef(cvfit)[-1], p=p, adj=p.adjust(p, method='BH'))
rownames(DF) <- colnames(Data$X)
head(DF[order(DF$p),], n=20)
fit <- cvfit$fit
s <- summary(fit, lambda=cvfit$lambda.min)
cbind(s$table[,c('Estimate', 'mfdr')], p=DF[rownames(s$table), 'p'], adj=DF[rownames(s$table), 'adj'])

# TCGA (takes a while)
cvfit <- cv.ncvreg(X.TCGA, y.TCGA, penalty='lasso')
ind <- predict(cvfit, type='vars')
p <- sapply(ind, splrt, Data=list(X=X.TCGA, y=y.TCGA), lam = cvfit$lambda[1:cvfit$min])
sort(p)

# Sample splitting --------------------------------------------------------

# Single split
set.seed(1)
ind <- as.logical(sample(rep(0:1, each=n/2)))
cvfit <- cv.glmnet(X[ind,], y[ind])
b <- coef(cvfit, s=cvfit$lambda.min)[-1]
tapply(b!=0, varType, sum)
XX <- X[!ind,which(b!=0)]
fit <- lm(y[!ind]~XX)
summary(fit)
summ <- summary(fit)$coefficients[-1,]
var.id <- gsub("XX", "", rownames(summ))
pval <- rep(1, ncol(X)); names(pval) <- colnames(X)
pval[var.id] <- summ[,4]
tapply(pval < 0.05, varType, sum)

# Multi-split
res <- Fig9.1(Data)
tapply(res < 0.05, varType, sum)

# TCGA (time-consuming)
p <- Ex9.2()
head(sort(p))


# Stability selection -----------------------------------------------------

# Example data
res <- Fig9.2(Data)
s <- res$Stability[,20]
s[s > 0.6]

# FDR Bound
fit <- glmnet(X, y)
EV <- res$Selected^2/(ncol(X)*(2*.8-1))
FDR <- EV/sapply(predict(fit, type="nonzero"), length)
max(which(FDR < .1))
which(res$Stability[,max(which(FDR < .1))] > .8)
which(res$Stability[,max(which(FDR < .3))] > .8)

# TCGA (takes a while)
res <- Fig9.3()
S <- res$Stability
s <- apply(S, 1, max)
head(sort(s[s>0.6], decreasing=TRUE), n=20)

# FDR Bound
fit <- glmnet(X.TCGA, y.TCGA)
EV <- res$Selected^2/(ncol(X.TCGA)*(2*.8-1))
FDR <- EV/sapply(predict(fit, type="nonzero"), length)
which(res$Stability[,max(which(FDR < .1))] > .8)
which(res$Stability[,max(which(FDR < .3))] > .8)


# Bootstrap (currently this section diverges from the book a bit) ---------

# Pairs
library(ggplot2)
B <- matrix(NA, 100, ncol(X), dimnames=list(1:100, colnames(X)))
pb <- txtProgressBar(1, 100, style=3)
for (i in 1:100) {
  ind <- sample(1:n, replace=TRUE)
  XX <- X[ind,]
  yy <- y[ind]
  fit.i <- glmnet(XX, yy, lambda=0.07)
  B[i,] <- as.numeric(coef(fit.i)[-1])
  setTxtProgressBar(pb, i)
}
boxplot(B[,varType=="A"], col="gray", pch="|", horizontal=TRUE, las=1, frame.plot=FALSE, cex=0.5, las=1, ylim=c(-1.5,1.5))
boxplot(B[,varType=="B"], col="gray", pch="|", horizontal=TRUE, las=1, frame.plot=FALSE, cex=0.5, las=1, ylim=c(-1,1))
bz <- apply(B!=0, 2, mean)

# Residuals
cvfit <- cv.glmnet(X, y)
SD <- sqrt(cvfit$cvm[cvfit$lambda==cvfit$lambda.min])
mu <- as.numeric(predict(cvfit, newx=X, s=cvfit$lambda.min))
B <- matrix(NA, 1000, ncol(X), dimnames=list(1:1000, colnames(X)))
pb <- txtProgressBar(1, 100, style=3)
for (i in 1:1000) {
  yy <- mu + rnorm(n, sd=SD)
  fit.i <- glmnet(X, yy, lambda=0.07)
  B[i,] <- as.numeric(coef(fit.i)[-1])
  setTxtProgressBar(pb, i)
}
Ba <- B[,varType=="A"]
CI <- cbind(apply(Ba, 2, median), t(apply(Ba, 2, quantile, probs=c(.025, .975))))
CIplot(CI[c(1,2,3,5,4,6),], sort=FALSE, xlab=expression(beta))
lines(c(1,1), c(5.5, 6.5), col="gray", lty=2, lwd=2, xpd=1)
lines(c(-1,-1), c(4.5, 5.5), col="gray", lty=2, lwd=2)
lines(c(0.5,0.5), c(2.5, 4.5), col="gray", lty=2, lwd=2)
lines(c(-0.5,-0.5), c(0.5, 2.5), col="gray", lty=2, lwd=2)

# Bootstrap "p-values"
boot_sel <- apply(B!=0, 2, mean)
DF <- data.frame(Var=colnames(X), Type=varType, bs=100*boot_sel, bz=100*(1-boot_sel))
ggplot(DF, aes(Var, bs, fill=varType)) + geom_bar(stat='identity') + coord_flip() + xlab('') + ylab('Selection percentage')
ggplot(DF, aes(Var, bz, fill=varType)) + geom_bar(stat='identity') + coord_flip() + xlab('') + ylab('Percent zero')

# Residuals, large lam
cvfit <- cv.glmnet(X, y)
SD <- sqrt(cvfit$cvm[cvfit$lambda==cvfit$lambda.min])
mu <- as.numeric(predict(cvfit, newx=X, s=cvfit$lambda.min))
B <- matrix(NA, 1000, ncol(X), dimnames=list(1:1000, colnames(X)))
pb <- txtProgressBar(1, 1000, style=3)
for (i in 1:1000) {
  yy <- mu + rnorm(n, sd=SD)
  fit.i <- glmnet(X, yy, lambda=0.35)
  B[i,] <- as.numeric(coef(fit.i)[-1])
  setTxtProgressBar(pb, i)
}
Ba <- B[,varType=="A"]
CI <- cbind(apply(Ba, 2, median), t(apply(Ba, 2, quantile, probs=c(.025, .975))))
CIplot(CI[c(1,2,3,5,4,6),], sort=FALSE, xlab=expression(beta), xlim=c(-1.5,1.5))
lines(c(1,1), c(5.5, 6.5), col="gray", lty=2, lwd=2, xpd=1)
lines(c(-1,-1), c(4.5, 5.5), col="gray", lty=2, lwd=2)
lines(c(0.5,0.5), c(2.5, 4.5), col="gray", lty=2, lwd=2)
lines(c(-0.5,-0.5), c(0.5, 2.5), col="gray", lty=2, lwd=2)

# MCP
set.seed(4)
cvfit <- cv.ncvreg(X, y)
SD <- sqrt(cvfit$cve[cvfit$lambda==cvfit$lambda.min])
mu <- as.numeric(predict(cvfit, X=X))
B <- matrix(NA, 1000, ncol(X), dimnames=list(1:1000, colnames(X)))
pb <- txtProgressBar(1, 1000, style=3)
for (i in 1:1000) {
  yy <- mu + rnorm(n, sd=SD)
  fit.i <- ncvreg(X, yy)
  B[i,] <- as.numeric(coef(fit.i, lambda=cvfit$lambda.min)[-1])
  setTxtProgressBar(pb, i)
}
Ba <- B[,varType=="A"]
CI <- cbind(apply(Ba, 2, median), t(apply(Ba, 2, quantile, probs=c(.025, .975))))
CIplot(CI[c(1,2,3,5,4,6),], sort=FALSE, xlab=expression(beta), xlim=c(-1.5,1.5))
lines(c(1,1), c(5.5, 6.5), col="gray", lty=2, lwd=2, xpd=1)
lines(c(-1,-1), c(4.5, 5.5), col="gray", lty=2, lwd=2)
lines(c(0.5,0.5), c(2.5, 4.5), col="gray", lty=2, lwd=2)
lines(c(-0.5,-0.5), c(0.5, 2.5), col="gray", lty=2, lwd=2)

Bb <- B[,varType=="B"]
CI <- cbind(apply(Bb, 2, median), t(apply(Bb, 2, quantile, probs=c(.025, .975))))
CIplot(CI, labels=paste0("B", 1:12), sort=FALSE, xlab=expression(beta))
