library(hdrm)

# Setup
N <- 100
n <- 25
p <- 100
xnam <- paste0("V", 1:p)
form <- as.formula(paste("y ~ ", paste(xnam, collapse= "+")))
if (!file.exists('df.rds')) {
  set.seed(1)
  yobs <- yhat_lasso <- yhat_forward <- yhat_lm <- matrix(NA, N, n)
  v <- numeric(N)
  pb <- progress::progress_bar$new(total = N)
  for (i in 1:N) {
    X <- matrix(runif(n*p), n, p)
    dat <- as.data.frame(X)
    y <- rnorm(n)
    yobs[i,] <- y
    
    # lm
    fit <- lm(y~V1+V2+V3+V4+V5, dat)
    yhat_lm[i,] <- fit$fitted
    
    # Forward
    fit0 <- lm(y~1, data=dat)
    fit <- step(fit0, scope=form, direction="forward", trace=0, k=log(n), steps=5)
    yhat_forward[i,] <- fit$fitted
    
    # Lasso
    fit <- glmnet(X, y)
    yhat_lasso[i,] <- predict(fit, X, s=0.2)
    v[i] <- nrow(predict(fit, s=0.2, type='nonzero'))
    pb$tick()
  }
  saveRDS(mget(c('yobs', 'yhat_lasso', 'yhat_lm', 'yhat_forward', 'v')), 'df.rds')
}
out <- readRDS('df.rds')
sum(vapply(1:n, function(i) cov(out$yobs[,i], out$yhat_lm[,i]), numeric(1)))       #  5 df
sum(vapply(1:n, function(i) cov(out$yobs[,i], out$yhat_forward[,i]), numeric(1)))  #  19 df
sum(vapply(1:n, function(i) cov(out$yobs[,i], out$yhat_lasso[,i]), numeric(1)))    #  5 df
mean(out$v)                                                                        # 10 df

# fit <- ncvreg(X, y, penalty="lasso")
# AIC(fit)
# BIC(fit)

# AIC, BIC for pollution data
par(mar=c(4, 4, 2, 0.5))
Fig2.6()

for (i in 1:5) {
  par(bty="n", mar=rep(0, 4))
  plot(c(0, 5), c(-0.1, 1.1), type="n",xaxt="n",yaxt="n",xlab="",ylab="")
  for (j in 1:5) {
    if (i==j) polygon(j-1+c(0,1,1,0),c(0,0,1,1),col="white")
    else polygon(j-1+c(0,1,1,0),c(0,0,1,1),col="gray80")
    text(j-1+.5,.5,labels=j)
  }
}

# cvfit <- cv.glmnet(X, y)
# plot(cvfit)

# Example: Code to fit the lasso model to the pollution data
attach_data('pollution')
X <- std(X)
fit <- glmnet(X, y)
plot(fit, label=TRUE)
cvfit <- cv.glmnet(X, y, nfolds=length(y))
plot(cvfit)

# Version shown in class
Fig2.7()

# Value of lambda minimizing CV error
cvfit$lambda.min

# Number of nonzero coefficients
nrow(predict(fit, type='nonzero', s=cvfit$lambda.min))

# Range of lambda values within 1 SE of min
ind <- which(cvfit$lambda==cvfit$lambda.min)
range(fit$lambda[which(cvfit$cvm < cvfit$cvup[ind])])

cvfit <- cv.glmnet(X, y)
rsq <- 1-cvfit$cvm/var(y)

# cvfit <- cv.ncvreg(X, y, penalty="lasso")
# plot(cvfit, type="rsq")

par(mar=c(4, 5 ,0.5, 0.5))
Fig2.9()

# ncvreg summary
cvfit <- cv.ncvreg(X, y, penalty="lasso")

summary(cvfit)
# lasso-penalized linear regression with n=60, p=15
# At minimum cross-validation error (lambda=1.4949):
# -------------------------------------------------
#   Nonzero coefficients: 11
#   Cross-validation error (deviance): 1628.06
#   R-squared: 0.57
#   Signal-to-noise ratio: 1.34
#   Scale estimate (sigma): 40.349

