#There are 5 functions here: hpcc,clust.var.spline, # plot.pclust,penalty,vdist hpcc<-function(x,clust.init,alpha=.4,plot.iter=F,plot.result=T, df=5,force=F,forcenum=1) { # This function returns the final classification and the list of # total variance at each merge (which aids determination of the # correct number of clusters without use of a penalty function). #x is the data in two columns (each row is a data point) #clust.init is the initial clustering; it has the same length as the # number of rows in x, with each entry giving the number of the cluster # to which that point initially belongs. #alpha control the relative weight of variance along the curve and # variance about the curve #plot.iter determines whether to plot intermediate results at each # iteration #plot.result determines whether to produce a plot of the final result #df specifies the number of degrees of freedom to use in each feature # cluster #force=T causes clustering to continue until the number of clusters # is equal to forcenum, instead of stopping when the total variance # increases # plot.pclust is called to plot the result if plot.result=T. # functions clust.var.spline, vdist, and penalty are also called. # trace/debugging info can be printed by uncommenting the cat # statements # x is an n x 2 matrix of points n<-length(x[,1]) p<-length(x[1,]) #p must be 2 #uses clust.init as initial clustering. class1<-clust.init if(plot.result==T) {plot.pclust(x,class1)} #initialize variables. lastmerge<-c(0,100) clvars<-0 mergevars<-0 totalvar.track<-rep(0,50) totalvar.counter<-0 done<-F while(done==F) { classes<-unique(class1) clnum<-length(classes) lastm<-min(lastmerge) lastn<-max(lastmerge) oldclvars<-clvars clvars<-rep(0,clnum) # Find variance of each cluster for(i in 1:clnum) { #cat("Clvars for cluster ",i,": ") if (ilastn) {clvars[i]<-oldclvars[i+1]} else { #singletons are labeled 0 by mclust. We arbitrarily give them #a large variance to force their inclusion in other clusters. if(classes[i]==0) {clvars[i]<-100} else { clvars[i]<-clust.var.spline(x[class1==classes[i],],plot.me=plot.iter, dg=df,alpha)} }} #cat(clvars[i],"\n") } #calculate variance for each possible merge #alternate approaches: # 1. calculate merge variances only until one is found that will # reduce total variance (could be adapted to simulated annealing # for large numbers of clusters) # 2. could randomly choose merges to evaluate # 3. only look at merges of clusters which are "close" oldmergevars<-mergevars mergevars<-matrix(0,clnum,clnum) for(i in 1:clnum) { for(j in i:clnum) { #cat("mergevars of ",i,j," : ") if(i==j){mergevars[i,j]<-clvars[i]} else { if(ilastn&&j>lastn) {mergevars[i,j]<-oldmergevars[i+1,j+1]} else { mergevars[i,j]<- clust.var.spline(rbind(x[class1==classes[i],], x[class1==classes[j],]),plot.me=plot.iter, dg=df,alpha) } } } #cat(mergevars[i,j],"\n") }} #choose merge to minimize total variance and update class1 #if no merge will reduce variance and force=F, we are done nomergevar<-sum(clvars)+penalty(clnum) currentmin<-rep(0,3) #variance, clust num 1, clust num 2 currentmin[1]<-1000 # this is just for initialization totalvars<-mergevars #initialize totalvars for(i in 1:clnum) { for(j in i:clnum) { if (j==i) {totalvars[i,j]<-nomergevar} else { totalvars[i,j]<-mergevars[i,j]+sum(clvars)-clvars[i]-clvars[j]+penalty(clnum-1) if(totalvars[i,j]nomergevar) { if(force==F||clnum==forcenum) { cat("Done!\n") done<-T} else { #cat("Forcing a merge.\n") #cat("If we stop merging now, the total variance is: ",nomergevar,"\n") #cat("Merging ",currentmin[2],currentmin[3], # " will yield total variance of ",currentmin[1],"\n") lastmerge<-c(currentmin[2],currentmin[3]) class1[class1==classes[currentmin[2]]]<-classes[currentmin[3]] if(plot.result==T) {plot.pclust(x,class1) }}} else { if(clnum==forcenum) {cat(" Princlust Done!/n") done<-T } else{ #cat("If we stop merging now, the total variance is: ",nomergevar,"\n") #cat("Merging ",currentmin[2],currentmin[3]," will yield total # variance of ",currentmin[1],"\n") lastmerge<-c(currentmin[2],currentmin[3]) class1[class1==classes[currentmin[2]]]<-classes[currentmin[3]] if(plot.result==T) {plot.pclust(x,class1)}}} } if(plot.result==T) {plot.pclust(x,class1) } classes<-class1 return(classes,totalvar.track,x) } #------------------------------------------------------------------ clust.var.spline<-function(x,plot.me=F,dg=5,alpha=.4) { #This is the function which calls principal.curve # #If plot.me is T, then the iterative plots of the principal curve #will be displayed as they are being fit. #if there are fewer than 7 points in a cluster, we fit a principal component #line instead of a principal curve. #fewer than seven points if(length(x[,1])<7) { temp<-prcomp(cbind(x[,1],x[,2])) temp.slope<-temp$rot[2,1]/temp$rot[1,1] temp.int<-mean(x[,2])-(temp.slope)*(mean(x[,1])) temp.coef<-c(temp.int,temp.slope) a<-projpoints(x,temp.coef) if(plot.me) {plot(x[,1],x[,2]) abline(temp.coef) points(a,pch=2) } vabout<-vdist(x,a) nn<-length(x[,1]) epsilon<-a[1:(nn-1),]-a[2:nn,] mu<-apply(epsilon,2,mean) mu<-matrix(mu,byrow=T,ncol=2,nrow=nn-1) valong<-(.5)*vdist(epsilon,mu) } #more than seven points else { temp<-principal.curve(x,plot.true=plot.me,df=dg,maxit=3) vabout<-temp$dist nn<-length(temp$s[,1]) # For closed principal curves, the nn index in epsilon should wrap # around, so length(epsilon) = nn. For open curves, we have nn-1 segments. # We must put the points in the s matrix in the correct order before # calculating variance along the curve. The ordering is provided by # the $tag value from principal.curve() news<-temp$s[temp$tag,] epsilon<-news[1:(nn-1),]-news[2:nn,] mu<-apply(epsilon,2,mean) mu<-matrix(mu,byrow=T,ncol=2,nrow=nn-1) valong<-(.5)*vdist(epsilon,mu) } total<-alpha*vabout+valong total} #------------------------------------------------------------------ plot.pclust<-function(x,classif) { #x is the data that was used by hpcc (an n x 2 matrix) #classif is the classification (output of hpcc) #a graphics device must already be open classes<-unique(classif) clnum<-length(classes) plot(x,type="n",xlab="X",ylab="Y") for(i in 1:clnum) { points(x[classif==classes[i],],pch=as.character(i)) } fnord<-23 fnord} #------------------------------------------------------------------ penalty<-function(k) { #A penalty can be specified here. k is the number of clusters. final<-0 final } #------------------------------------------------------------------ vdist<-function(x,y) { #x, y are n x p matrices. Each row is a point. total<-0 n<-length(x[,1]) p<-length(x[1,]) for(i in 1:n) { for(j in 1:p) {total<-total+((x[i,j]-y[i,j])^2) } } total} #------------------------------------------------------------------