# Variable selection using CIC # Additive isotonic regression # y=f(x1)+f(x2)+...+f(x_L)+Za+e # xmat is n x L full column rank -- continuous, constrained predictors # dir contains L direction indicators: TRUE if increasing, FALSE if decreasing # Z is n by p matrix of categorical, unconstrained predictors -- # levels must be numbered consecutively from 1 # finds model selection criteria over subsets of columns of xmat and zmat # # uses EXPECTED VALUES of df for CIC -- computationally intensive! # nsim = number of simulations for calculation of E(df) addiso=function(y,xmat,dir,zmat,nsim){ n=length(y) capl=length(xmat)/n np=length(zmat)/n nv=np+capl cval=1.5 # create delta matrix delta=matrix(nrow=(n-1)*capl,ncol=n) varind=matrix(FALSE,nrow=capl,ncol=(n-1)*capl) nd=0 for(l in 1:capl){ del=makedelta(xmat[,l]) if(!dir[l]){del=-del} nl=length(del)/n delta[(nd+1):(nd+nl),]=del varind[l,(nd+1):(nd+nl)]=TRUE nd=nd+nl } delta=delta[1:nd,] varind=varind[,1:nd] # make dummy matrix for the categorical predictors nlev=1:(np+1) for(i in 1:np){nlev[i+1]=max(zmat[,i])-1} ndum=sum(nlev)-np+1 cmat=matrix(0,nrow=n,ncol=ndum) cmat[,1]=1:n*0+1 cind=matrix(FALSE,nrow=np,ncol=ndum);cind[,1]=TRUE for(ivar in 1:np){ for(ilev in 1:(nlev[ivar+1])){ column=sum(nlev[1:ivar])+ilev cmat[zmat[,ivar]==ilev,column]=1 cind[ivar,column]=TRUE } } # make delta matrix have rows orthogonal to column space of cmat pcmat=cmat%*%solve(t(cmat)%*%cmat)%*%t(cmat) delta=delta-t(pcmat%*%t(delta)) # loop through all subsets of columns of xmat and zmat nmod=2^nv cic=1:nmod adf=1:nmod # start with empty set! fit0=mean(y);df0=1 sse0=sum((y-fit0)^2) cic[1]=log(sse0)+log(2/(n-cval)+1) for(i in 2:nmod){ binrep=getbin(i,nv) usedel=1:nd<0 for(l in 1:capl){if(binrep[l]==1){usedel[varind[l,]]=TRUE}} delsub=delta[usedel,] if(sum(usedel>0)){ proj=hinge0(y,delsub) dfcone=proj$df ycone=proj$yhat }else{ycone=1:n*0;dfcone=0} usedum=1:(ndum)<0 usedum[1]=TRUE for(l in 1:np){ if(binrep[l+capl]==1){usedum[cind[l,]]=TRUE} } dmat=cmat[,usedum] dproj=dmat%*%solve(t(dmat)%*%dmat)%*%t(dmat) theta=ycone+dproj%*%y df=dfcone+sum(usedum) sse=sum((y-theta)^2) # now get expected values of df under H_0:not used if(sum(usedel)>0){ usedf=1:nsim for(isim in 1:nsim){ ysim=rnorm(n) psim=hinge0(ysim,delsub) cdf=psim$df usedf[isim]=cdf+sum(usedum) } adf[i]=mean(usedf)+sum(usedum) }else{adf[i]=sum(usedum)} cic[i]=log(sse)+log(2*adf[i]/(n-cval*adf[i])+1) xsend=cbind(xmat[,binrep[1:capl]==1],dmat) dflin=length(xsend)/n } ans=new.env() ans$theta=theta ans$df=adf ans$cic=cic ans } # make cone edges makedelta=function(x){ n=length(x) xs=sort(x) xu=1:n*0 # find unique x values nu=1;xu[1]=xs[1];sm=1e-12 for(i in 2:n){ if(xs[i]>xs[i-1]+sm){ nu=nu+1 xu[nu]=xs[i] } } delta=matrix(nrow=nu-1,ncol=n) for(i in 1:(nu-1)){ delta[i,x<=xu[i]]=0;delta[i,x>xu[i]]=1 delta[i,]=delta[i,]-mean(delta[i,]) } delta } # cone projection hinge0=function(y,delta){ n=length(y) m=length(delta)/n sm=1e-8 h=1:m<0 obs=1:m check=0 b2=delta%*%y if(max(b2)>sm){ i=min(obs[b2==max(b2)]) h[i]=TRUE } if(max(b2)(-sm)) { check=1 theta=t(xmat)%*%a b2=delta%*%(y-theta) obs=1:m if(max(b2)>sm){ i=min(obs[b2==max(b2)]) check=0 h[i]=TRUE } } } coefs=1:m*0 coefs[h]=a ans=new.env() ans$yhat=theta ans$nrep=nrep ans$df=sum(h) ans$coefs=coefs ans } ################################################### # function to find binary representation of number # ################################################## getvars=function(num){ i=num digits=0 power=0 while(digits==0){ if(i<2^power){digits=power} power=power+1 } binry=1:digits*0 if(num>0){binry[1]=1} i=i-2^(digits-1) power=digits-2 for(p in power:0){ if(i>=2^p){ i=i-2^p binry[digits-p]=1 } } binry } getbin=function(num,capl){ br=getvars(num-1) digits=length(br) binrep=1:capl*0 binrep[(capl-digits+1):capl]=br binrep }