# ------------------------------------------------------------------------------ # -------- BlockBoost functions for (approx.) normal outcomes ------------------ # -------- 01/11/2011 ------------------ # ------------------------------------------------------------------------------ # This file contains R-code and auxiliary functions for # blockwise boosting as described in Tutz & Gertheiss (2010). blockboost <- function(x, y, k, penalty = c("ridge","diff"), crit = c("CV","GCV","RSS"), constant = T, lambda, mfinal = 20) { # The blockwise Boost with input: # x: data matrix with rows corresponding to training samples # and columns corresponding to predictors. # y: vector of response values. # k: block size (must be > 1). # penalty: the type of penalty, one of "ridge" or "diff". # crit: the selection criterion, one of "CV", "GCV" # or "RSS" (residual sum of squares). # constant: logical, shall a constant be included? # lambda: penalty parameter. # mfinal: number of boosting iterations. # Return is a list with: # alphahat: a vector of fitted intercept values, with entry m corresponding # to the intercept after m boosting iterations. # Betahat: a matrix with column m corresponding to the coefficient vector # estimated after m boosting iterations. # yhat: fitted response values. # aic: a vector of AIC values, with entry m corresponding to AIC # after m boosting iterations. ## check block size if (k < 2 | k%%1 != 0) stop("block size k must be natural > 1") ## type of penalty and selection criterion penalty <- switch(match.arg(penalty, c("ridge","diff")), ridge="ridge", diff="diff") crit <- switch(match.arg(crit, c("CV","GCV","RSS")), CV="CV", GCV="GCV", RSS="RSS") ## Length of training and test data ny <- length(y) ## Design and penalty matrices (the constant is not penalized) if (penalty=="diff") { D1 <- cbind(diag(-1,k-1),0) + cbind(0,diag(1,k-1)) D1 <- rbind(c(1,rep(0,k-1)),D1,c(rep(0,k-1),1)) Omega <- t(D1)%*%D1 } else { Omega <- diag(1,k) } if (constant) { x <- cbind(1,x) Omega <- rbind(0,cbind(0,Omega)) } ## Length of the coefficient vector (incl.constant) nz <- ncol(x) ## Initialization yhat <- numeric(ny) Betahat <- matrix(0,nrow=nz,ncol=mfinal) aic <- numeric(mfinal) ## Boosting Iterations for (m in 1:mfinal) { ## Computation of working response ywork <- y-yhat ## Fitting the weak learner if (k < 400) Update <- smallblocklearner(yvalues=ywork, zlearn=x, Omega=Omega, lambda=lambda, crit=crit, constant=constant, Hat=T) else Update <- bigblocklearner(yvalues=ywork, zlearn=x, Omega=Omega, lambda=lambda, crit=crit, constant=constant, Hat=T) ## Updating yhat <- yhat + Update$learn if (m==1) Betahat[,1] <- Update$betahat else Betahat[,m] <- Betahat[,m-1] + Update$betahat ## AIC if (m==1) { Bm <- Update$Hat Bmpart <- diag(1,ny) - Bm } else { Bmpart <- (diag(1,ny) - Update$Hat)%*%Bmpart Bm <- diag(1,ny) - Bmpart } trBm <- sum(diag(Bm)) aic[m] <- log(mean((y-yhat)^2)) + (1+trBm/ny)/(1-(trBm + 2)/ny) } ## Intercept if (constant) { alphahat <- as.numeric(Betahat[1,]) Betahat <- Betahat[-1,] } else { alphahat <- 0 } ## Output: return(list("alphahat"=alphahat,"Betahat"=Betahat,"yhat"=yhat,"aic"=aic)) } # -------- auxiliary functions ------------------------------------------------- library(lars) # some functions from the lars package are needed smallblocklearner <- function(yvalues, w=1, zlearn, ztest=NULL, Omega, lambda, Hat=F, constant=T, crit="RSS") { # The weak learner which selects blocks of k adjacent predictors and fits # a (weighted) generalized ridge regression model. # This implementation should be used for small block sizes k # Input: # yvalues: real valued training response values # w: vector of weights # zlearn: training design matrix # ztest: test design matrix # Omega: penalty matrix # lambda: penalty parameter # Hat: logical, shall the hat matrix be returned? # constant: logical, do the first column of z and first column/row # of Omega correspond to the (not penalized) intercept? # crit: the selection criterion, one of "CV", "GCV" or "RSS", default # is "RSS". # Return is a list with: # betahat: the estimated coefficient vector # learn: a vector of fitted training response values # test: a vector of fitted test response values # Hat: the hat matrix ## Initialization nz <- ncol(zlearn) if (constant) { k <- ncol(Omega) - 1 Selected <- k+1 XWX <- t(zlearn[,1:(k+1)])%*%(zlearn[,1:(k+1)]*w) XWX[lower.tri(XWX)] <- 0 Omega[lower.tri(Omega)] <- 0 XWy <- t(zlearn[,1:(k+1)])%*%(yvalues*w) } else { k <- ncol(Omega) Selected <- k XWX <- t(zlearn[,1:k])%*%(zlearn[,1:k]*w) XWX[lower.tri(XWX)] <- 0 Omega[lower.tri(Omega)] <- 0 XWy <- t(zlearn[,1:k])%*%(yvalues*w) } ## (Weighted) generalized ridge regression if (Hat | crit == "CV" | crit == "GCV") { Hh <- chol2inv(chol(XWX + lambda*Omega)) betahat <- Hh%*%XWy if (constant) {H <- zlearn[,1:(k+1)]%*%Hh%*%t(zlearn[,1:(k+1)]*w)} else {H <- zlearn[,1:k]%*%Hh%*%t(zlearn[,1:k]*w)} } else { betahat <- chol2inv(chol(XWX + lambda*Omega))%*%XWy H <- NULL } if (constant) {yhat <- zlearn[,1:(k+1)]%*%betahat} else {yhat <- zlearn[,1:k]%*%betahat} if (crit == "CV") { Ih <- 1 - diag(H) cv <- sum((w*(yvalues - yhat)/Ih)^2) } else if (crit == "GCV") { Ih <- 1 - diag(H) gcv <- sum(w*(yvalues - yhat)^2)/sum(Ih)^2 } else {rss <- sum(w*(yvalues - yhat)^2)} ## Finding the right variable block istart <- ifelse(constant,k+2,k+1) for(i in istart:nz) { ## Matrix update if (constant) { XWX[1,2:k] <- XWX[1,3:(k+1)] XWX[2:k,2:k] <- XWX[3:(k+1),3:(k+1)] XWX[1,k+1] <- sum(w*zlearn[,i]) XWX[k+1,k+1] <- sum(w*zlearn[,i]^2) XWX[2:k,k+1] <- t(zlearn[,(i-k+1):(i-1)])%*%(zlearn[,i]*w) XWy[2:k] <- XWy[3:(k+1)] XWy[k+1] <- zlearn[,i]%*%(yvalues*w) } else { XWX[1:(k-1),1:(k-1)] <- XWX[2:k,2:k] XWX[k,k] <- sum(w*zlearn[,i]^2) XWX[1:(k-1),k] <- t(zlearn[,(i-k+1):(i-1)])%*%(zlearn[,i]*w) XWy[1:(k-1)] <- XWy[2:k] XWy[k] <- zlearn[,i]%*%(yvalues*w) } ## (Weighted) generalized ridge regression if (Hat | crit == "CV" | crit == "GCV") { Hh <- chol2inv(chol(XWX + lambda*Omega)) betahati <- Hh%*%XWy } else { betahati <- chol2inv(chol(XWX + lambda*Omega))%*%XWy } if (constant) {yhati <- zlearn[,c(1,(i-k+1):i)]%*%betahati} else {yhati <- zlearn[,(i-k+1):i]%*%betahati} if (crit == "CV") { if (constant) {Hi <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {Hi <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} Ihi <- 1 - diag(Hi) cvi <- sum((w*(yvalues - yhati)/Ihi)^2) ## Finding the best model in terms of minimum CV if(cvi < cv) { cv <- cvi Selected <- i betahat <- betahati yhat <- yhati H <- Hi } } else if (crit == "GCV") { if (constant) {Hi <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {Hi <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} Ihi <- 1 - diag(Hi) gcvi <- sum(w*(yvalues - yhati)^2)/sum(Ihi)^2 ## Finding the best model in terms of minimum GCV if(gcvi < gcv) { gcv <- gcvi Selected <- i betahat <- betahati yhat <- yhati H <- Hi } } else { rssi <- sum(w*(yvalues - yhati)^2) ## Finding the best model in terms of minimum RSS if(rssi < rss) { rss <- rssi Selected <- i betahat <- betahati yhat <- yhati if (Hat) { if (constant) {H <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {H <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} } } } } ## Coefficients if (constant) {betahat <- c(betahat[1],rep(0,Selected-k-1),betahat[-1],rep(0,nz-Selected))} else {betahat <- c(rep(0,Selected-k),betahat,rep(0,nz-Selected))} ## Test response values if (length(ztest)>0) {test <- as.numeric(ztest%*%betahat)} else {test <- NULL} ## Output return(list("betahat"=betahat,"learn"=as.numeric(yhat), "test"=test,"Hat"=H)) } bigblocklearner <- function(yvalues, w=1, zlearn, ztest=NULL, Omega, lambda, Hat=F, constant=T, crit="RSS") { # The weak learner which selects blocks of k adjacent predictors and fits # a (weighted) generalized ridge regression model. # This implementation should be used for large block sizes k # Input: # yvalues: real valued training response values # w: vector of weights # zlearn: training design matrix # ztest: test design matrix # Omega: penalty matrix # lambda: penalty parameter # Hat: logical, shall the hat matrix be returned? # constant: logical, do the first column of z and first column/row # of Omega correspond to the (not penalized) intercept? # crit: the selection criterion, one of "CV", "GCV" or "RSS", default # is "RSS". # Return is a list with: # betahat: the estimated coefficient vector # learn: a vector of fitted training response values # test: a vector of fitted test response values # Hat: the hat matrix ## Initialization nz <- ncol(zlearn) if (constant) { k <- ncol(Omega) - 1 Selected <- k+1 } else { k <- ncol(Omega) Selected <- k } betahat <- numeric(nz) ## Finding the right variable block ## Initialization if (constant) { cXX <- chol(t(zlearn[,1:(k+1)])%*%(zlearn[,1:(k+1)]*w) + lambda*Omega) XWy <- t(zlearn[,1:(k+1)])%*%(yvalues*w) } else { cXX <- chol(t(zlearn[,1:k])%*%(zlearn[,1:k]*w) + lambda*Omega) XWy <- t(zlearn[,1:k])%*%(yvalues*w) } if (Hat | crit == "CV" | crit == "GCV") { Hh <- chol2inv(cXX) betahat <- Hh%*%XWy if (constant) {H <- zlearn[,1:(k+1)]%*%Hh%*%t(zlearn[,1:(k+1)]*w)} else {H <- zlearn[,1:k]%*%Hh%*%t(zlearn[,1:k]*w)} } else { betahat <- backsolve(cXX,backsolve(cXX,XWy,transpose=T)) H <- NULL } if (constant) {yhat <- zlearn[,1:(k+1)]%*%betahat} else {yhat <- zlearn[,1:k]%*%betahat} if (crit == "CV") { Ih <- 1 - diag(H) cv <- sum((w*(yvalues - yhat)/Ih)^2) } else if (crit == "GCV") { Ih <- 1 - diag(H) gcv <- sum(w*(yvalues - yhat)^2)/sum(Ih)^2 } else {rss <- sum(w*(yvalues - yhat)^2)} ## Finding the right variable block istart <- ifelse(constant,k+2,k+1) for(i in istart:nz) { ## chol/matrix update if (constant) { cXX <- upchol(-downdateR(cXX,2),c(sum(w*zlearn[,i]), t(zlearn[,(i-k+1):(i-1)])%*%(zlearn[,i]*w),sum(w*zlearn[,i]^2))+ lambda*Omega[,k+1]) XWy[2:k] <- XWy[3:(k+1)] XWy[k+1] <- zlearn[,i]%*%(yvalues*w) } else { cXX <- upchol(-downdateR(cXX,1), c(t(zlearn[,(i-k+1):(i-1)])%*%(zlearn[,i]*w),sum(w*zlearn[,i]^2))+ lambda*Omega[,k]) XWy[1:(k-1)] <- XWy[2:k] XWy[k] <- zlearn[,i]%*%(yvalues*w) } ## (Weighted) generalized ridge regression if (Hat | crit == "CV" | crit == "GCV") { Hh <- chol2inv(cXX) betahati <- Hh%*%XWy } else { betahati <- backsolve(cXX,backsolve(cXX,XWy,transpose=T)) } if (constant) {yhati <- zlearn[,c(1,(i-k+1):i)]%*%betahati} else {yhati <- zlearn[,(i-k+1):i]%*%betahati} if (crit == "CV") { if (constant) {Hi <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {Hi <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} Ihi <- 1 - diag(Hi) cvi <- sum((w*(yvalues - yhati)/Ihi)^2) ## Finding the best model in terms of minimum CV if(cvi < cv) { cv <- cvi Selected <- i betahat <- betahati yhat <- yhati H <- Hi } } else if (crit == "GCV") { if (constant) {Hi <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {Hi <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} Ihi <- 1 - diag(Hi) gcvi <- sum(w*(yvalues - yhati)^2)/sum(Ihi)^2 ## Finding the best model in terms of minimum GCV if(gcvi < gcv) { gcv <- gcvi Selected <- i betahat <- betahati yhat <- yhati H <- Hi } } else { rssi <- sum(w*(yvalues - yhati)^2) ## Finding the best model in terms of minimum RSS if(rssi < rss) { rss <- rssi Selected <- i betahat <- betahati yhat <- yhati if (Hat) { if (constant) {H <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {H <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} } } } } ## Coefficients if (constant) {betahat <- c(betahat[1],rep(0,Selected-k-1),betahat[-1],rep(0,nz-Selected))} else {betahat <- c(rep(0,Selected-k),betahat,rep(0,nz-Selected))} ## Test response values if (length(ztest)>0) {test <- as.numeric(ztest%*%betahat)} else {test <- NULL} ## Output return(list("betahat"=betahat,"learn"=as.numeric(yhat), "test"=test,"Hat"=H)) } upchol <- function(R,x) { # Update Cholesky factorization R of A if a column (and row) x # is added to A p <- ncol(R) + 1 r <- backsolve(R,x[-p],transpose=T) rpp <- x[p] - sum(r^2) R <- cbind(rbind(R,0),c(r,sqrt(rpp))) R }