# ------------------------------------------------------------------------------ # -------- BlockBoost functions for binary outcomes ---------------------------- # -------- 01/26/2009 ---------------------------- # ------------------------------------------------------------------------------ # This file contains modifications of the logitBoost R-code (R-package # boost 1.0-0 written by Marcel Dettling, 2008) and auxiliary functions # for blockwise boosting. logitblockboost <- function(xlearn, ylearn, xtest, k, penalty = c("ridge","diff"), lambda, mfinal = 20) { # The blockwise logitBoost with input: # xlearn: data matrix with rows corresponding to training samples # and columns corresponding to predictors. # ylearn: vector of training class labels, 0/1-coded. # xtest: data matrix with the same structure as xlearn but containing # test samples. # k: block size (must be > 1). # penalty: the type of penalty, one of "ridge" or "diff". # lambda: penalty parameter. # mfinal: number of boosting iterations. # Return is a list with: # alphahat: a vector of fitted intercept values, with entry m corresponding # to the intercept after m boosting iterations. # Betahat: a matrix with column m corresponding to the coefficient vector # estimated after m boosting iterations. # ptest: estimated probabilities of being class 1 given the test cases. ## check block size if (k < 2 | k%%1 != 0) stop("block size k must be natural > 1") ## type of penalty penalty <- switch(match.arg(penalty, c("ridge","diff")), ridge="ridge", diff="diff") ## Length of training and test data learn <- nrow(xlearn) test <- nrow(xtest) ## Design and penalty matrices (the constant is not penalized) xlearn <- cbind(1,xlearn) xtest <- cbind(1,xtest) if (penalty=="diff") { D1 <- cbind(diag(-1,k-1),0) + cbind(0,diag(1,k-1)) D1 <- rbind(c(1,rep(0,k-1)),D1,c(rep(0,k-1),1)) Omega <- t(D1)%*%D1 } else { Omega <- diag(1,k) } Omega <- cbind(0,rbind(0,Omega)) ## Length of the coefficient vector (incl.constant) nz <- dim(xlearn)[2] ## Initialization Flearn <- numeric(learn) Ftest <- numeric(test) plearn <- rep(1/2, learn) ptest <- matrix(0, test, mfinal) Betahat <- matrix(0,nrow=nz,ncol=mfinal+1) ## Boosting Iterations for (m in 1:mfinal) { ## Computation of working response and weights w <- pmax(plearn*(1-plearn), 1e-24) ywork <- (ylearn-plearn)/w ## Fitting the weak learner if (k < 100) Update <- smallblocklearner(ywork, w, xlearn, xtest, Omega, lambda) else Update <- bigblocklearner(ywork, w, xlearn, xtest, Omega, lambda) flearn <- Update$learn ftest <- Update$test ## Updating and probabilities Flearn <- Flearn + (1/2)*flearn Ftest <- Ftest + (1/2)*ftest plearn <- 1/(1+exp((-2)*Flearn)) ptest[,m] <- 1/(1+exp((-2)*Ftest)) Betahat[,m+1] <- Betahat[,m] + Update$betahat } # Constant alphahat <- as.numeric(Betahat[1,]) Betahat <- Betahat[-1,] ## Output: return(list("alphahat"=alphahat,"Betahat"=Betahat,"ptest"=ptest)) } cvlogitblockboost <- function(xlearn, ylearn, k, lambda, mfinal = 30, K=3, penalty = c("ridge","diff"), printFold = F) { # K-fold cross validation of blockwise logitBoost with input: # xlearn: data matrix with rows corresponding to training samples # and columns corresponding to predictors. # ylearn: vector of training class labels, 0/1-coded. # k: vector of block sizes k (each k must be > 1) # lambda: vector of penalty parameters # mfinal: maximum number of boosting iterations # K: number of folds for cross validation # penalty: the type of penalty, one of "ridge" or "diff". # printFold: logical, print the actual fold? # Return is a 3 dimensional array with number of misclassified observations, # and dimensions corresponding to # [values of k, values of lambda, number of boosting iterations] n <- length(ylearn) nK <- floor(n/K) ind <- sample(n,n) result <- array(0,dim=c(length(k),length(lambda),mfinal)) dimnames(result) <- list(k,lambda,1:mfinal) for (kj in seq(along=k)) { for (lam in seq(along=lambda)) { for (i in 1:K) { if (printFold) cat("Fold",i,"\n") # test observations if (i < K) indi <- ind[(i-1)*nK+(1:nK)] else indi <- ind[((i-1)*nK+1):n] # fitting modellogitBB <- logitblockboost(xlearn=xlearn[-indi,], ylearn=ylearn[-indi], xtest=xlearn[indi,], k=k[kj], penalty=penalty, lambda=lambda[lam], mfinal=mfinal) # misclassification for (m in 1:mfinal) { result[kj,lam,m] <- result[kj,lam,m] + sum(abs(ylearn[indi]+1 - apply(cbind(1-modellogitBB$ptest[,m], modellogitBB$ptest[,m]),1,which.max))) } } } } return(result) } # -------- auxiliary functions ------------------------------------------------- library(lars) # some functions from the lars package are needed smallblocklearner <- function(yvalues, w=1, zlearn, ztest=NULL, Omega, lambda, Hat=F, constant=T, crit="RSS") { # The weak learner which selects blocks of k adjacent predictors and fits # a (weighted) generalized ridge regression model. # This implementation should be used for small block sizes k # Input: # yvalues: real valued training response values # w: vector of weights # zlearn: training design matrix # ztest: test design matrix # Omega: penalty matrix # lambda: penalty parameter # Hat: logical, shall the hat matrix be returned? # constant: logical, do the first column of z and first column/row # of Omega correspond to the (not penalized) intercept? # crit: the selection criterion, one of "CV", "GCV" or "RSS", default # is "RSS". # Return is a list with: # betahat: the estimated coefficient vector # learn: a vector of fitted training response values # test: a vector of fitted test response values # Hat: the hat matrix ## Initialization nz <- ncol(zlearn) if (constant) { k <- ncol(Omega) - 1 Selected <- k+1 XWX <- t(zlearn[,1:(k+1)])%*%(zlearn[,1:(k+1)]*w) XWX[lower.tri(XWX)] <- 0 Omega[lower.tri(Omega)] <- 0 XWy <- t(zlearn[,1:(k+1)])%*%(yvalues*w) } else { k <- ncol(Omega) Selected <- k XWX <- t(zlearn[,1:k])%*%(zlearn[,1:k]*w) XWX[lower.tri(XWX)] <- 0 Omega[lower.tri(Omega)] <- 0 XWy <- t(zlearn[,1:k])%*%(yvalues*w) } ## (Weighted) generalized ridge regression if (Hat | crit == "CV" | crit == "GCV") { Hh <- chol2inv(chol(XWX + lambda*Omega)) betahat <- Hh%*%XWy if (constant) {H <- zlearn[,1:(k+1)]%*%Hh%*%t(zlearn[,1:(k+1)]*w)} else {H <- zlearn[,1:k]%*%Hh%*%t(zlearn[,1:k]*w)} } else { betahat <- chol2inv(chol(XWX + lambda*Omega))%*%XWy H <- NULL } if (constant) {yhat <- zlearn[,1:(k+1)]%*%betahat} else {yhat <- zlearn[,1:k]%*%betahat} if (crit == "CV") { Ih <- 1 - diag(H) cv <- sum((w*(yvalues - yhat)/Ih)^2) } else if (crit == "GCV") { Ih <- 1 - diag(H) gcv <- sum(w*(yvalues - yhat)^2)/sum(Ih)^2 } else {rss <- sum(w*(yvalues - yhat)^2)} ## Finding the right variable block istart <- ifelse(constant,k+2,k+1) for(i in istart:nz) { ## Matrix update if (constant) { XWX[1,2:k] <- XWX[1,3:(k+1)] XWX[2:k,2:k] <- XWX[3:(k+1),3:(k+1)] XWX[1,k+1] <- sum(w*zlearn[,i]) XWX[k+1,k+1] <- sum(w*zlearn[,i]^2) XWX[2:k,k+1] <- t(zlearn[,(i-k+1):(i-1)])%*%(zlearn[,i]*w) XWy[2:k] <- XWy[3:(k+1)] XWy[k+1] <- zlearn[,i]%*%(yvalues*w) } else { XWX[1:(k-1),1:(k-1)] <- XWX[2:k,2:k] XWX[k,k] <- sum(w*zlearn[,i]^2) XWX[1:(k-1),k] <- t(zlearn[,(i-k+1):(i-1)])%*%(zlearn[,i]*w) XWy[1:(k-1)] <- XWy[2:k] XWy[k] <- zlearn[,i]%*%(yvalues*w) } ## (Weighted) generalized ridge regression if (Hat | crit == "CV" | crit == "GCV") { Hh <- chol2inv(chol(XWX + lambda*Omega)) betahati <- Hh%*%XWy } else { betahati <- chol2inv(chol(XWX + lambda*Omega))%*%XWy } if (constant) {yhati <- zlearn[,c(1,(i-k+1):i)]%*%betahati} else {yhati <- zlearn[,(i-k+1):i]%*%betahati} if (crit == "CV") { if (constant) {Hi <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {Hi <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} Ihi <- 1 - diag(Hi) cvi <- sum((w*(yvalues - yhati)/Ihi)^2) ## Finding the best model in terms of minimum CV if(cvi < cv) { cv <- cvi Selected <- i betahat <- betahati yhat <- yhati H <- Hi } } else if (crit == "GCV") { if (constant) {Hi <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {Hi <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} Ihi <- 1 - diag(Hi) gcvi <- sum(w*(yvalues - yhati)^2)/sum(Ihi)^2 ## Finding the best model in terms of minimum GCV if(gcvi < gcv) { gcv <- gcvi Selected <- i betahat <- betahati yhat <- yhati H <- Hi } } else { rssi <- sum(w*(yvalues - yhati)^2) ## Finding the best model in terms of minimum RSS if(rssi < rss) { rss <- rssi Selected <- i betahat <- betahati yhat <- yhati if (Hat) { if (constant) {H <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {H <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} } } } } ## Coefficients if (constant) {betahat <- c(betahat[1],rep(0,Selected-k-1),betahat[-1],rep(0,nz-Selected))} else {betahat <- c(rep(0,Selected-k),betahat,rep(0,nz-Selected))} ## Test response values if (length(ztest)>0) {test <- as.numeric(ztest%*%betahat)} else {test <- NULL} ## Output return(list("betahat"=betahat,"learn"=as.numeric(yhat), "test"=test,"Hat"=H)) } bigblocklearner <- function(yvalues, w=1, zlearn, ztest=NULL, Omega, lambda, Hat=F, constant=T, crit="RSS") { # The weak learner which selects blocks of k adjacent predictors and fits # a (weighted) generalized ridge regression model. # This implementation should be used for large block sizes k # Input: # yvalues: real valued training response values # w: vector of weights # zlearn: training design matrix # ztest: test design matrix # Omega: penalty matrix # lambda: penalty parameter # Hat: logical, shall the hat matrix be returned? # constant: logical, do the first column of z and first column/row # of Omega correspond to the (not penalized) intercept? # crit: the selection criterion, one of "CV", "GCV" or "RSS", default # is "RSS". # Return is a list with: # betahat: the estimated coefficient vector # learn: a vector of fitted training response values # test: a vector of fitted test response values # Hat: the hat matrix ## Initialization nz <- ncol(zlearn) if (constant) { k <- ncol(Omega) - 1 Selected <- k+1 } else { k <- ncol(Omega) Selected <- k } betahat <- numeric(nz) ## Finding the right variable block ## Initialization if (constant) { cXX <- chol(t(zlearn[,1:(k+1)])%*%(zlearn[,1:(k+1)]*w) + lambda*Omega) XWy <- t(zlearn[,1:(k+1)])%*%(yvalues*w) } else { cXX <- chol(t(zlearn[,1:k])%*%(zlearn[,1:k]*w) + lambda*Omega) XWy <- t(zlearn[,1:k])%*%(yvalues*w) } if (Hat | crit == "CV" | crit == "GCV") { Hh <- chol2inv(cXX) betahat <- Hh%*%XWy if (constant) {H <- zlearn[,1:(k+1)]%*%Hh%*%t(zlearn[,1:(k+1)]*w)} else {H <- zlearn[,1:k]%*%Hh%*%t(zlearn[,1:k]*w)} } else { betahat <- backsolve(cXX,backsolve(cXX,XWy,transpose=T)) H <- NULL } if (constant) {yhat <- zlearn[,1:(k+1)]%*%betahat} else {yhat <- zlearn[,1:k]%*%betahat} if (crit == "CV") { Ih <- 1 - diag(H) cv <- sum((w*(yvalues - yhat)/Ih)^2) } else if (crit == "GCV") { Ih <- 1 - diag(H) gcv <- sum(w*(yvalues - yhat)^2)/sum(Ih)^2 } else {rss <- sum(w*(yvalues - yhat)^2)} ## Finding the right variable block istart <- ifelse(constant,k+2,k+1) for(i in istart:nz) { ## chol/matrix update if (constant) { cXX <- upchol(-downdateR(cXX,2),c(sum(w*zlearn[,i]), t(zlearn[,(i-k+1):(i-1)])%*%(zlearn[,i]*w),sum(w*zlearn[,i]^2))+ lambda*Omega[,k+1]) XWy[2:k] <- XWy[3:(k+1)] XWy[k+1] <- zlearn[,i]%*%(yvalues*w) } else { cXX <- upchol(-downdateR(cXX,1), c(t(zlearn[,(i-k+1):(i-1)])%*%(zlearn[,i]*w),sum(w*zlearn[,i]^2))+ lambda*Omega[,k]) XWy[1:(k-1)] <- XWy[2:k] XWy[k] <- zlearn[,i]%*%(yvalues*w) } ## (Weighted) generalized ridge regression if (Hat | crit == "CV" | crit == "GCV") { Hh <- chol2inv(cXX) betahati <- Hh%*%XWy } else { betahati <- backsolve(cXX,backsolve(cXX,XWy,transpose=T)) } if (constant) {yhati <- zlearn[,c(1,(i-k+1):i)]%*%betahati} else {yhati <- zlearn[,(i-k+1):i]%*%betahati} if (crit == "CV") { if (constant) {Hi <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {Hi <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} Ihi <- 1 - diag(Hi) cvi <- sum((w*(yvalues - yhati)/Ihi)^2) ## Finding the best model in terms of minimum CV if(cvi < cv) { cv <- cvi Selected <- i betahat <- betahati yhat <- yhati H <- Hi } } else if (crit == "GCV") { if (constant) {Hi <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {Hi <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} Ihi <- 1 - diag(Hi) gcvi <- sum(w*(yvalues - yhati)^2)/sum(Ihi)^2 ## Finding the best model in terms of minimum GCV if(gcvi < gcv) { gcv <- gcvi Selected <- i betahat <- betahati yhat <- yhati H <- Hi } } else { rssi <- sum(w*(yvalues - yhati)^2) ## Finding the best model in terms of minimum RSS if(rssi < rss) { rss <- rssi Selected <- i betahat <- betahati yhat <- yhati if (Hat) { if (constant) {H <- zlearn[,c(1,(i-k+1):i)]%*%Hh%*%t(zlearn[,c(1,(i-k+1):i)]*w)} else {H <- zlearn[,(i-k+1):i]%*%Hh%*%t(zlearn[,(i-k+1):i]*w)} } } } } ## Coefficients if (constant) {betahat <- c(betahat[1],rep(0,Selected-k-1),betahat[-1],rep(0,nz-Selected))} else {betahat <- c(rep(0,Selected-k),betahat,rep(0,nz-Selected))} ## Test response values if (length(ztest)>0) {test <- as.numeric(ztest%*%betahat)} else {test <- NULL} ## Output return(list("betahat"=betahat,"learn"=as.numeric(yhat), "test"=test,"Hat"=H)) } upchol <- function(R,x) { # Update Cholesky factorization R of A if a column (and row) x # is added to A p <- ncol(R) + 1 r <- backsolve(R,x[-p],transpose=T) rpp <- x[p] - sum(r^2) R <- cbind(rbind(R,0),c(r,sqrt(rpp))) R }