#This file has two functions: cem and calc.likelihood cem<-function(x,df=5,clust.init=rep(0,(length(x[,1]))),trace=F, max.iter=12,plot.me=F,trim=0,mix=T,var.bound=-1) { #x is the data in two columns (each row is a data point) #clust.init is the initial clustering; it has the same length as the # number of rows in x, with each entry giving the number of the cluster # to which that point initially belongs. The feature clusters are # numbered by consecutive integers starting with 1; 0 indicates the # noise cluster. #trace=T causes debugging information to be printed #max.iter is the number of EM iterations to do #plot.me=T causes results to be plotted #trim can be used to compute a robust estimate of the variance about the # curve for any feature cluster. The trim amount should be between # 0 and 0.4. # mix determines whether to use the likelihood or the mixture # likelihood (recommended) for each point when forming the new # clustering at each iteration. The default value T uses the # mixture likelihood. #var.bound is a lower bound on the estimate of the variance about the # curve for any feature cluster. If var.bound < 0, it is computed # automatically based on the range of the data. Setting var.bound # equal to zero removes any constraint from the variance estimate. n<-length(x[,1]) area<-(max(x[,1])-min(x[,1]))*(max(x[,2])-min(x[,2])) done<-F iterations<-0 # initialize the clustering and the variables which will keep track of # results at each iteration clust<-clust.init clust.log<-rep(0,max.iter) overall.loglikelihood<-rep(0,max.iter) all.curvevars<-matrix(rep(0,max.iter*length(unique(clust))), ncol=length(unique(clust))) all.varalong<-matrix(rep(0,max.iter*length(unique(clust))), ncol=length(unique(clust))) all.meanalong<-matrix(rep(0,max.iter*length(unique(clust))), ncol=length(unique(clust))) all.curvelengths<-matrix(rep(0,max.iter*length(unique(clust))), ncol=length(unique(clust))) all.mixprop<-matrix(rep(0,max.iter*length(unique(clust))), ncol=length(unique(clust))) #compute the variance bound, if needed if (var.bound<0) { var.bound<- (max(c(range(x[,1])[2]-range(x[,1])[1], range(x[,2])[2]-range(x[,2])[1]))/4000)^2 } ### MAIN PROGRAM LOOP ### while (done==F) { #initialize loop variables last.clust<-clust classes<-unique(clust) num.of.classes<-length(classes) curvelengths<-rep(0,num.of.classes) curvevars<-rep(0,num.of.classes) varalong<-rep(0,num.of.classes) meanalong<-rep(0,num.of.classes) sqdists<-matrix(rep(0,n*num.of.classes),ncol=num.of.classes) distalong<-matrix(rep(NA,n*num.of.classes),ncol=num.of.classes) likelihoods<-matrix(rep(0,n*num.of.classes),ncol=num.of.classes) likelihoods.mix<-matrix(rep(0,n*num.of.classes),ncol=num.of.classes) like.along<-matrix(rep(0,n*num.of.classes),ncol=num.of.classes) like.about<-matrix(rep(0,n*num.of.classes),ncol=num.of.classes) if (trace) {cat("Clustering\n",clust,"\n")} if (plot.me) { plot(x,xlab="X",ylab="Y",main="CEM Clustering",pch=" ")} #------------------------------------------------------ #fit curve and get parameter estimates for each cluster for ( j in 1:num.of.classes) { if (classes[j]>0) { #this is a feature cluster filter<-(clust==classes[j]) n.curve <- length(x[filter, 1]) if (n.curve!=sum(filter)) {stop("Filter error.")} curve <- principal.curve(x[filter,],maxit=5, smoother= "smooth.spline", df = df) if (plot.me) { plot(x,xlab="X",ylab="Y") points(x[clust==classes[j],],pch=classes[j]+3) points(x[clust!=classes[j],]) lines(curve) } curvelengths[j] <- (curve$lambda[curve$tag])[n.curve] if (trace) { cat("\n","j= ",j," \n") cat("classes[j]= ",classes[j]," \n") cat("n.curve= ",n.curve," \n") cat("curvelengths[j]= ",curvelengths[j]," \n") cat("length(curve$lambda) ",length(curve$lambda)," \n") cat("length(curve$tag)",length(curve$tag)," \n") cat("curve$lambda[curve$tag]",curve$lambda[curve$tag]," \n") } #naive variance estimate: curvevars[j] <- (curve$dist)/(n.curve - 1) #or robust estimate from trimmed variance temp.dists<-x[filter,] temp.dists<-((temp.dists[,1]-curve$s[,1])^2)+ ((temp.dists[,2]-curve$s[,2])^2) temp.dists<-temp.dists[order(temp.dists)] if (trim>0) { trimlow<-ceiling((trim/2)*length(temp.dists)) trimhigh<-floor((1-(trim/2))*length(temp.dists)) temp.dists<-temp.dists[c(trimlow:trimhigh)] } #check variance bound curvevars[j] <-sum(temp.dists)/(length(temp.dists)-1) if (curvevars[j]0) { ### uniform*normal method: like.about[i,j]<-(1/(sqrt(2*pi*curvevars[j])))* (exp((-1/2)*(sqdists[i,j]/curvevars[j]))) like.along[i,j]<-(1/curvelengths[j]) mix.prop<-(sum(clust==classes[j]))/n likelihoods.mix[i,j]<- mix.prop*like.about[i,j]*like.along[i,j] likelihoods[i,j]<- like.about[i,j]*like.along[i,j] } if (classes[j]==0) { mix.prop<-(sum(clust==classes[j]))/n likelihoods[i,j]<-1/area likelihoods.mix[i,j]<-mix.prop/area like.along[i,j]<-0 like.about[i,j]<-0 } } } #--------------------------------------------------------- #save all parameter estimates all.curvevars[iterations+1,]<-curvevars all.varalong[iterations+1,]<-varalong all.meanalong[iterations+1,]<-meanalong all.curvelengths[iterations+1,]<-curvelengths mix.prop<-rep(NA,num.of.classes) for (j in 1:num.of.classes) {mix.prop[j]<-(sum(clust==classes[j]))/n} all.mixprop[iterations+1,]<-mix.prop #--------------------------------------------------------- # make new clustering if (trace) { for (j in 1:num.of.classes) { cat("Likelihoods for cluster ",j,"\n",round(likelihoods[,j],dig=4),"\n\n") } cat("\n\nVar along: ",varalong,"\n") cat("\nVar about: ",curvevars,"\n") cat("\nMean along: ",meanalong,"\n") } if (mix) { for (i in 1:n) { largest<-max(likelihoods.mix[i,]) for (j in 1:num.of.classes) { if (likelihoods.mix[i,j]==largest) {clust[i]<-classes[j]} } }} if (!mix) { for (i in 1:n) { largest<-max(likelihoods[i,]) for (j in 1:num.of.classes) { if (likelihoods[i,j]==largest) {clust[i]<-classes[j]} } }} #--------------------------------------------------------- # test for convergence of clustering or max number of iterations iterations<-iterations+1 sameness.counter<-0 for(i in 1:n) {if (last.clust[i]==clust[i]) {sameness.counter<-sameness.counter+1}} clust.log[iterations]<-sameness.counter if(sameness.counter==n) {done<-T} if(iterations>=max.iter) {done<-T} temp<-0 for (i in 1:n) { temp<-temp+log(sum(likelihoods.mix[i,])) } overall.loglikelihood[iterations]<-temp if (overall.loglikelihood[iterations]>=max(overall.loglikelihood[1:iterations])) { clust.best<-clust } } # end of while loop -- END OF MAIN PROGRAM LOOP #--------------------------------------------------------- return(varalong,meanalong,curvelengths,curvevars,likelihoods,likelihoods.mix, clust,iterations,clust.log,x,df,var.bound,like.along,like.about, distalong, sqdists,clust.best,overall.loglikelihood, all.curvevars,all.varalong,all.meanalong,all.curvelengths,all.mixprop) } #--------------------------------------------------------- #--------------------------------------------------------- calc.likelihood<- function(cem.obj, mix.best = T) { #cem.obj is the output from cem(). it includes: #x is the data #df is the degrees of freedom #clust is the classification (0 indicates the noise cluster) #likelihoods is a matrix of likelihoods (one column per cluster) #if trim was used in cem, then the likelihood calculated here will #reflect it #mix.best is used to indicate if the best overall loglikelihood (over #all iterations of CEM that were run) should be used n <- length(cem.obj$x[, 1]) if(mix.best == F) { num.clusters <- length(unique(cem.obj$clust)) } if(mix.best == T) { num.clusters <- length(unique(cem.obj$clust.best)) } cluster.sizes <- rep(0, num.clusters) score <- 0 if(mix.best == F) { for(i in 1:num.clusters) { cluster.sizes[i] <- sum(cem.obj$clust == unique(cem.obj$clust)[i]) } #now the entries of cluster.sizes correspond to each column of the #likelihood matrix (regardless of ordering) for(i in 1:n) { this.lik <- 0 for(j in 1:num.clusters) { this.lik <- this.lik + ((cluster.sizes[j]/n) * cem.obj$likelihoods[i, j]) } score <- score + log(this.lik) } } if(mix.best == T) { score <- max(cem.obj$overall.loglikelihood[cem.obj$ overall.loglikelihood != 0]) } #assume n-1 curve clusters, 1 noise cluster. #estimate curves, mixture proportions, noise num.parameters <- ((num.clusters - 1) * (cem.obj$df + 2)) + ( num.clusters - 1) + 1 bic <- (2 * score) - (num.parameters * log(n)) bic2 <- (2 * score) - (2 * num.parameters * log(n)) #2= num of dimensions (bic2 is experimental) return(score, bic, bic2) }