source('http://myweb.uiowa.edu/pbreheny/7600/s16/notes/fun.R')

# TCGA data ---------------------------------------------------------------

load(url('http://myweb.uiowa.edu/pbreheny/data/bcTCGA.RData'))
p <- ncol(X)
colnames(X) <- paste0("V", 1:p)
X.TCGA <- X
y.TCGA <- y

# Generate example data ---------------------------------------------------

set.seed(1)
n <- 100
p1 <- 6
N <- 100
p <- 60
K <- 3
bb <- numeric(6*K)
bb[(0:5)*K+1] <- c(1,-1,0.5,0.5,-0.5,-0.5)
require(Matrix)
A <- matrix(0.5, 3, 3) + 0.5*diag(3)
B <- diag(p-K*p1)
Sigma <- bdiag(A, A, A, A, A, A, B)
R <- chol(Sigma)

X <- as.matrix(matrix(rnorm(n * p), n, p) %*% R)
colnames(X) <- paste0("V", 1:p)
y <- X[,1:(p1*K)] %*% bb + rnorm(n)

varType <- vector("character", p)
varType[(0:5)*K+1] <- "A"
varType[c((0:5)*K+2, (0:5)*K+3)] <- "B"
varType[(p1*K+1):p] <- "C"


# FIR (for comparison) ----------------------------------------------------

# Lasso
require(ncvreg)
fit <- ncvreg(X, y, penalty="lasso")
FIR <- fir(fit)
b <- coef(fit, which=max(which(FIR$FIR < 0.1)))[-1]
tapply(b!=0, varType, sum)
b <- coef(fit, which=max(which(FIR$FIR < 0.05)))[-1]
tapply(b!=0, varType, sum)

# MCP
require(ncvreg)
fit <- ncvreg(X, y)
FIR <- fir(fit)
b <- coef(fit, which=max(which(FIR$FIR < 0.1)))[-1]
tapply(b!=0, varType, sum)
b <- coef(fit, which=max(which(FIR$FIR < 0.05)))[-1]
tapply(b!=0, varType, sum)


# Sample splitting --------------------------------------------------------

# Single split (Slides 5-6)
set.seed(1)
ind <- as.logical(sample(rep(0:1, each=n/2)))
require(glmnet)
cvfit <- cv.glmnet(X[ind,], y[ind])
b <- coef(cvfit, s=cvfit$lambda.min)[-1]
tapply(b!=0, varType, sum)
XX <- X[!ind,which(b!=0)]
fit <- lm(y[!ind]~XX)
summary(fit)
summ <- summary(fit)$coefficients[-1,]
var.id <- as.numeric(gsub("XXV", "", rownames(summ)))
pval <- rep(1, p)
pval[var.id] <- summ[,4]
tapply(pval < 0.05, varType, sum)
tapply(pval < 0.1, varType, sum)

# Multi-split (Slide 10)
pb <- txtProgressBar(1, 100, style=3)
P <- matrix(1, 100, p)
for (i in 1:100) {
  ind <- as.logical(sample(rep(0:1, each=n/2)))
  require(glmnet)
  cvfit <- cv.glmnet(X[ind,], y[ind])
  b <- coef(cvfit, s=cvfit$lambda.min)[-1]
  XX <- X[!ind,which(b!=0)]
  fit <- lm(y[!ind]~XX)
  summ <- summary(fit)$coefficients[-1,]
  var.id <- as.numeric(gsub("XXV", "", rownames(summ)))
  P[i, var.id] <- summ[,4]
  setTxtProgressBar(pb, i)
}
boxplot(apply(P, 2, median)[(0:5)*K+1],
        apply(P, 2, median)[c((0:5)*K+2, (0:5)*K+3)],
        apply(P, 2, median)[(p1*K+1):p], col="gray", frame.plot=FALSE, pch=19,
        names=LETTERS[1:3], las=1, ylim=c(0,1), ylab="p")
p.agg <- apply(P, 2, median)
tapply(p.agg < 0.05, varType, sum)

# TCGA (Slide 12) (this takes a while)
pb <- txtProgressBar(1, 100, style=3)
P <- matrix(1, 100, ncol(X.TCGA))
for (i in 1:100) {
  ind <- as.logical(sample(rep(0:1, each=length(y.TCGA)/2)))
  require(glmnet)
  cvfit <- cv.glmnet(X.TCGA[ind,], y.TCGA[ind], nfolds=5)
  b <- coef(cvfit, s=cvfit$lambda.min)[-1]
  XX <- X.TCGA[!ind,which(b!=0)]
  yy <- y.TCGA[!ind]
  fit <- lm(yy~XX)
  summ <- summary(fit)$coefficients[-1,]
  var.id <- as.numeric(gsub("XXV", "", rownames(summ)))
  P[i, var.id] <- summ[,4]
  setTxtProgressBar(pb, i)
}
p.agg <- apply(P, 2, median)
sum(p.agg < .05)


# Stability selection -----------------------------------------------------

# Slide 15
require(glmnet)
fit <- glmnet(X, y)
pb <- txtProgressBar(1, 100, style=3)
SS <- array(NA, dim=c(100, p, length(fit$lambda)), dimnames=list(1:100, colnames(X), fit$lambda))
Q <- matrix(NA, 100, length(fit$lambda))
for (i in 1:100) {
  ind <- as.logical(sample(rep(0:1, each=n/2)))
  fit.i <- glmnet(X[ind,], y[ind], lambda=fit$lambda)
  SS[i,,] <- as.matrix(coef(fit.i)[-1,]!=0)
  Q[i,] <- sapply(predict(fit.i, type="nonzero"), length)
  setTxtProgressBar(pb, i)
}
S <- apply(SS, 2:3, mean)
q <- apply(Q, 2, mean)

# FDR Bound (Slide 17)
EV <- q^2/(p*(2*.9-1))
FDR <- EV/sapply(predict(fit, type="nonzero"), length)
max(which(FDR < .1))
which(S[,max(which(FDR < .3))] > .9)

# Slide 15
l <- fit$lambda
col <- rep("gray", p)
col[varType=="A"] <- "red"
matplot(l, t(S), type="l", lty=1, xlim=rev(range(l)), col=col, lwd=2, las=1, bty="n",
        xlab=expression(lambda), ylab="Stability")

# TCGA (Slide 16)
require(glmnet)
fit <- glmnet(X.TCGA, y.TCGA, lambda.min=0.1)
pb <- txtProgressBar(1, 100, style=3)
SS <- array(NA, dim=c(100, ncol(X.TCGA), length(fit$lambda)), dimnames=list(1:100, colnames(X.TCGA), fit$lambda))
Q <- matrix(NA, 100, length(fit$lambda))
for (i in 1:100) {
  ind <- as.logical(sample(rep(0:1, each=length(y.TCGA)/2)))
  fit.i <- glmnet(X.TCGA[ind,], y.TCGA[ind], lambda=fit$lambda)
  SS[i,,] <- as.matrix(coef(fit.i)[-1,]!=0)
  Q[i,] <- sapply(predict(fit.i, type="nonzero"), length)
  setTxtProgressBar(pb, i)
}
S <- apply(SS, 2:3, mean)
q <- apply(Q, 2, mean)
l <- fit$lambda
col <- rep("gray", ncol(X.TCGA))
col[which(apply(S, 1, max) > 0.6)] <- "red"
matplot(l, t(S), type="l", lty=1, xlim=rev(range(l)), col=col, lwd=2, las=1, bty="n",
        xlab=expression(lambda), ylab="Stability")

# FDR bound (bottom of slide 17)
EV <- q^2/(p*(2*.9-1))
FDR <- EV/sapply(predict(fit, type="nonzero"), length)
max(which(FDR < .1))
fit$lambda[max(which(FDR < .1))]


# Bootstrap ---------------------------------------------------------------

# Slide 19
cvfit <- cv.glmnet(X, y)
SD <- sqrt(cvfit$cvm[cvfit$lambda==cvfit$lambda.min])
mu <- as.numeric(predict(cvfit, newx=X, s=cvfit$lambda.min))
B <- matrix(NA, 1000, p)
pb <- txtProgressBar(1, 1000, style=3)
for (i in 1:1000) {
  yy <- mu + rnorm(n, sd=SD)
  fit.i <- glmnet(X, yy, lambda=0.07)
  B[i,] <- as.numeric(coef(fit.i)[-1])
  setTxtProgressBar(pb, i)
}
Ba <- B[,varType=="A"]
CI <- cbind(apply(Ba, 2, median), t(apply(Ba, 2, quantile, probs=c(.025, .975))))
CIplot(CI, labels=paste0("A", 1:6), sort=FALSE, xlab=expression(beta))
lines(c(1,1), c(5.5, 6.5), col="gray", lty=2, lwd=2, xpd=1)
lines(c(-1,-1), c(4.5, 5.5), col="gray", lty=2, lwd=2)
lines(c(0.5,0.5), c(2.5, 4.5), col="gray", lty=2, lwd=2)
lines(c(-0.5,-0.5), c(0.5, 2.5), col="gray", lty=2, lwd=2)

# Bootstrap "p-values"
bz <- apply(B==0, 2, mean)
bz[varType=="A"]
bz[varType=="B"]
bz[varType=="C"]
barplot(bz[varType=="A"], border="white", horiz=TRUE, names=paste0("A", 1:6), las=1, xlim=c(0,1), xlab="Proportion equal to zero"); abline(v=0.05, lwd=2, lty=2, xpd=0)
barplot(bz[varType=="B"], border="white", horiz=TRUE, names=paste0("B", 1:12), las=1, xlim=c(0,1), xlab="Proportion equal to zero"); abline(v=0.05, lwd=2, lty=2, xpd=0)
barplot(bz[varType=="C"], border="white", horiz=TRUE, names=paste0("C", 1:42), las=1, xlim=c(0,1), xlab="Proportion equal to zero"); abline(v=0.05, lwd=2, lty=2, xpd=0)

# Slide 23
cvfit <- cv.glmnet(X, y)
SD <- sqrt(cvfit$cvm[cvfit$lambda==cvfit$lambda.min])
mu <- as.numeric(predict(cvfit, newx=X, s=cvfit$lambda.min))
B <- matrix(NA, 1000, p)
pb <- txtProgressBar(1, 1000, style=3)
for (i in 1:1000) {
  yy <- mu + rnorm(n, sd=SD)
  fit.i <- glmnet(X, yy, lambda=0.35)
  B[i,] <- as.numeric(coef(fit.i)[-1])
  setTxtProgressBar(pb, i)
}
Ba <- B[,varType=="A"]
CI <- cbind(apply(Ba, 2, median), t(apply(Ba, 2, quantile, probs=c(.025, .975))))
CIplot(CI, labels=paste0("A", 1:6), sort=FALSE, xlab=expression(beta), xlim=c(-1.5,1.5))
lines(c(1,1), c(5.5, 6.5), col="gray", lty=2, lwd=2, xpd=1)
lines(c(-1,-1), c(4.5, 5.5), col="gray", lty=2, lwd=2)
lines(c(0.5,0.5), c(2.5, 4.5), col="gray", lty=2, lwd=2)
lines(c(-0.5,-0.5), c(0.5, 2.5), col="gray", lty=2, lwd=2)

# Slide 25
require(ncvreg)
cvfit <- cv.ncvreg(X, y)
SD <- sqrt(cvfit$cve[cvfit$lambda==cvfit$lambda.min])
mu <- as.numeric(predict(cvfit, X=X))
B <- matrix(NA, 1000, p)
pb <- txtProgressBar(1, 1000, style=3)
for (i in 1:1000) {
  yy <- mu + rnorm(n, sd=SD)
  fit.i <- ncvreg(X, yy)
  B[i,] <- as.numeric(coef(fit.i, lambda=cvfit$lambda.min)[-1])
  setTxtProgressBar(pb, i)
}
Ba <- B[,varType=="A"]
CI <- cbind(apply(Ba, 2, median), t(apply(Ba, 2, quantile, probs=c(.025, .975))))
CIplot(CI, labels=paste0("A", 1:6), sort=FALSE, xlab=expression(beta))
lines(c(1,1), c(5.5, 6.5), col="gray", lty=2, lwd=2, xpd=1)
lines(c(-1,-1), c(4.5, 5.5), col="gray", lty=2, lwd=2)
lines(c(0.5,0.5), c(2.5, 4.5), col="gray", lty=2, lwd=2)
lines(c(-0.5,-0.5), c(0.5, 2.5), col="gray", lty=2, lwd=2)

# Bootstrap intervals for class "B" variables
Bb <- B[,varType=="B"]
CI <- cbind(apply(Bb, 2, median), t(apply(Bb, 2, quantile, probs=c(.025, .975))))
CIplot(CI, labels=paste0("B", 1:12), sort=FALSE, xlab=expression(beta))
