load(url("http://myweb.uiowa.edu/pbreheny/data/bcTCGA.RData"))
require(glmnet)
require(ncvreg)


# Breast cancer study -----------------------------------------------------

# Fit models: Enet
R <- S <- matrix(NA, 2, 4)
seed <- 1
set.seed(seed); cvfit1 <- cv.glmnet(X, y)
set.seed(seed); cvfit2 <- cv.glmnet(X, y, alpha=0.75)
set.seed(seed); cvfit3 <- cv.glmnet(X, y, alpha=0.5)
set.seed(seed); cvfit4 <- cv.glmnet(X, y, alpha=0.25)
R[1,] <- c(1-min(cvfit1$cvm)/var(y),
           1-min(cvfit2$cvm)/var(y),
           1-min(cvfit3$cvm)/var(y),
           1-min(cvfit4$cvm)/var(y))
S[1,] <- c(length(predict(cvfit1, type='nonzero')[[1]]),
           length(predict(cvfit2, type='nonzero')[[1]]),
           length(predict(cvfit3, type='nonzero')[[1]]),
           length(predict(cvfit4, type='nonzero')[[1]]))

# Fit models: Mnet
seed <- 1
cvfit5 <- cv.ncvreg(X, y, seed=seed)
cvfit6 <- cv.ncvreg(X, y, alpha=0.75, seed=seed)
cvfit7 <- cv.ncvreg(X, y, alpha=0.5, seed=seed)
cvfit8 <- cv.ncvreg(X, y, alpha=0.25, seed=seed)
R[2,] <- c(1-min(cvfit5$cve)/var(y),
           1-min(cvfit6$cve)/var(y),
           1-min(cvfit7$cve)/var(y),
           1-min(cvfit8$cve)/var(y))
S[2,] <- c(predict(cvfit5, 'nvars'),
           predict(cvfit6, 'nvars'),
           predict(cvfit7, 'nvars'),
           predict(cvfit8, 'nvars'))

# Table (Slide 9)
cbind(c(R[1,], R[2,]), c(S[1,], S[2,]))

# Correlation (Slide 11)
A <- cor(X)
rho <- A[upper.tri(A)]
hist(rho, breaks=seq(-1,1,by=.05), col="gray", border="white", main="", las=1)
mean(abs(rho)<0.4)

# Figure (Slide 12)
par(mfrow=c(1,3), mar=c(4, 4, 2, 0), cex=1)

plot(cvfit6$fit, bty='n')
mtext(expression(alpha==0.75), line=0.5)
abline(v=cvfit6$lambda.min, lty=2, lwd=2)

plot(cvfit7$fit, bty='n')
mtext(expression(alpha==0.5), line=0.5)
abline(v=cvfit7$lambda.min, lty=2, lwd=2)

plot(cvfit8$fit, bty='n')
mtext(expression(alpha==0.25), line=0.5)
abline(v=cvfit8$lambda.min, lty=2, lwd=2)

# Rat data ----------------------------------------------------------------

load(url("http://myweb.uiowa.edu/pbreheny/data/Scheetz2006.RData"))
require(ncvreg)

## Restrict to n w/ highest variance
n <- 5000
ind <- order(apply(X, 2, var), decreasing=TRUE)[1:n]
names(ind) <- colnames(X)[ind]
X <- X[,ind]

## Fit
seed <- 1
cvfit <- vector("list", 8)
cvfit[[1]] <- cv.ncvreg(X, y, dfmax=35, penalty="lasso", seed=seed, nfolds=30)
cvfit[[2]] <- cv.ncvreg(X, y, dfmax=35, penalty="lasso", alpha=.75, seed=seed, nfolds=30)
cvfit[[3]] <- cv.ncvreg(X, y, dfmax=35, penalty="lasso", alpha=.5, seed=seed, nfolds=30)
cvfit[[4]] <- cv.ncvreg(X, y, dfmax=50, penalty="lasso", alpha=.25, seed=seed, nfolds=30)
cvfit[[5]] <- cv.ncvreg(X, y, dfmax=35, seed=seed, nfolds=30)
cvfit[[6]] <- cv.ncvreg(X, y, dfmax=35, alpha=.75, seed=seed, nfolds=30)
cvfit[[7]] <- cv.ncvreg(X, y, dfmax=35, alpha=.5, seed=seed, nfolds=30)
cvfit[[8]] <- cv.ncvreg(X, y, dfmax=35, alpha=.25, seed=seed, nfolds=30)
R <- matrix(sapply(cvfit, function(x) 1-min(x$cve)/var(y)), 4, 2)
S <- matrix(sapply(cvfit, function(x) predict(x, type="nvars")), 4, 2)

# Table (Slide 17)
cbind(c(R[,1], R[,2]), c(S[,1], S[,2]))

# Correlation (Slide 18)
A <- cor(X)
rho <- A[upper.tri(A)]
hist(rho, breaks=seq(-1,1,by=.05), col="gray", border="white", main="", las=1)
mean(abs(rho)<0.4)
mean(abs(rho)>0.6)

# Which genes are those?
b <- coef(cvfit[[8]])[-1]
b[b!=0]
fData[names(b[b!=0]),]
