##################################################################################
## Cutoff-Based Tree Node Selection for Hierarchical Threshold Clustering (HTC) ##
##################################################################################
## Author: Thomas Girke
## Last update: May 4, 2007
## Utility: (A1) Cut a tree/dendrogram at all height levels to obtain clusters for all of its nodes.
##          (A2) Remove all duplicated clusters.
##          (B1) Select clusters based on a given cutoff in an all-against-all comparison of their members. 

## How to run the script:
##	source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/selectNodes.R") 	
##	y <- matrix(rnorm(500), 100, 5, dimnames=list(paste("g", 1:100, sep=""), paste("t", 1:5, sep=""))) # Test sample
##	c <- cor(t(y), method="pearson"); d <- as.dist(1-c); hr <- hclust(d, method = "complete", members=NULL) # Test sample
##	Uniclusters <- nodes2Cluster(myhclust=hr, rootdist=2) # The argument 'rootdist' defines the starting distance from the root of the tree to avoid testing of very large clusters.
## 	clusterQCselect <- clusterQC(clusterlist=Uniclusters, simMatrix=as.matrix(c), method="min", cutoff=0.7, output="df")

## (A) Define nodes2Cluster function that cuts tree at all height levels and removes all duplicated clusters
nodes2Cluster <- function(myhclust=hr, rootdist=2) {
	## Define looping data frame to limit memory requirements for cutting very large trees.
	## The cuts are performed here in slices of 100 cut levels. This corresponds to 100 iterations for a tree with 10,000 items.
	rootdist  <- rootdist # Define the minimum distance from root of tree to avoid testing of very large clusters
	looplim <- length(myhclust$height)-rootdist
	loopDF <- data.frame(from=seq(1, looplim, by=100), to=c((seq(1, looplim, by=100)-1)[-1], looplim))
	UniclustersAll <- NULL
	for(i in 1:length(loopDF[,1])) {
		## (1) Cut tree at all height levels that are defined in 'hr$height'
		cutheightslice <- myhclust$height[loopDF[i,1]:loopDF[i,2]]
		Allcuts <- sapply(cutheightslice, function(x) cutree(myhclust, h=x))

		## (2) Remove all duplicated clusters in the 'Allcuts' matrix and return the unique cluster set as a list object.
		loopvec <- 1:length(Allcuts[1,])
		Uniclusters <- unlist(lapply(loopvec, function(x) as.vector(tapply(names(Allcuts[,x]), as.vector(Allcuts[,x]), paste, collapse=" "))))
		Uniclusters <- unique(Uniclusters)
		UniclustersAll <- c(UniclustersAll, Uniclusters)
		UniclustersAll <- unique(UniclustersAll)
		cat("\n Number of tree levels processed: <=", i*100, "\n")
	}
	UniclustersAll <- strsplit(UniclustersAll, " ")
	names(UniclustersAll) <- 1:length(UniclustersAll)
	cat("\n Finished duplicate removal", "\n")
	UniclustersAll
}

## (B) Select clusters based on a given cutoff in an all-against-all comparison of their members.
clusterQC <- function(clusterlist=Uniclusters, simMatrix=c, method="min", single=1, cutoff=0.7, output="df") {
	## Append to 'clusterlist' all items as size one clusters that are not represented by a cluster of size one. This prevents that certain items are entirely removed from the end result. 
	size_one <- sapply(clusterlist, length)
	singlets <- unlist(clusterlist[size_one==1])
	allmembers <- unique(unlist(clusterlist))
	missing_singlets <- allmembers[!allmembers %in% singlets]
	clusterlist <- c(clusterlist, as.list(missing_singlets))
	names(clusterlist) <- 1:length(clusterlist)

	## Cluster scoring
	desc_size_index <- rev(order(sapply(clusterlist, length))) # sort clusters by size
	clusterlist <- clusterlist[desc_size_index] # sort clusters by size
	clusterlist_cluster <- clusterlist[sapply(clusterlist, length)>1] # remove single item clusters
	clusterlist_singlet <- clusterlist[sapply(clusterlist, length)==1] # store single item clusters
	clusterQCvalueDF <- sapply(clusterlist_cluster, function(x) summary(as.vector(as.dist(simMatrix[x, x]))))
	clusterQCvalueDF <- t(clusterQCvalueDF)
	singlet_data <- matrix(single, nrow=length(clusterlist_singlet), ncol=6, dimnames=list(names(clusterlist_singlet), 1:6)) # The default distance value for single item clusters is 1. The 'single' argument allows to change this value. 
	clusterQCvalueDF <- rbind(clusterQCvalueDF, singlet_data)
	colnames(clusterQCvalueDF) <- c("min", "qu1", "median", "mean", "qu3", "max")
	
	## Remove all clusters with pairs that do not meet the cutoff criterion
	clusterQCvalues <- clusterQCvalueDF[,method] # Possible cutoff calculation methods are: "min", "qu1", "median", "mean", "qu3", "max".
	clusterlist <- clusterlist[clusterQCvalues >= cutoff]
	cat("\n Finished removal of clusters below cutoff", "\n")
	
	## Sort list by descending cluster size
	desc_size_index <- rev(order(sapply(clusterlist, length)))
	clusterlist <- clusterlist[desc_size_index]
	updatelist <- clusterlist
	
	## Hierarchical cluster removal from large to small clusters	
	for(i in 1:length(clusterlist)) {
		if(i <= length(updatelist)) {
			remove_index <- sapply(updatelist, function(x) sum(updatelist[[i]] %in% x))
			remove_index[i] <- 0
		 	updatelist <- updatelist[remove_index==0]
		 }
		cat( "\n Loop no:", i, "\n")
	}

	## Sort list by descending cluster size
	sort_index <- rev(order(sapply(updatelist, length)))
	updatelist <- updatelist[sort_index]
	
	## Add cluster stats data
	final_cluster_list <- list(clusters=updatelist, cluster_stats=clusterQCvalueDF[names(updatelist),])
	names(final_cluster_list$clusters) <- 1:length(final_cluster_list$clusters)
	rownames(final_cluster_list$cluster_stats) <- 1:length(final_cluster_list$cluster_stats[,1])
	
	## Return results as list
	if(output=="list") {
		return(final_cluster_list)
	}
	
	## Return results as data frame
	if(output=="df") {
		final_cluster_df <- data.frame(ID=unlist(final_cluster_list$clusters), CL_ID=rep(names(final_cluster_list$clusters), sapply(final_cluster_list$clusters, length)))
		counts <- as.data.frame(table(final_cluster_df$CL_ID))
		final_cluster_df <- merge(final_cluster_df, counts, by.x="CL_ID", by.y=1, all.x=T)
		final_cluster_df <- merge(final_cluster_df, final_cluster_list$cluster_stats, by.x="CL_ID", by.y=0, all.x=T)
		final_cluster_df <- final_cluster_df[,c(2,1,3:length(final_cluster_df))]
		names(final_cluster_df)[3] <- "CL_SZ"
		final_cluster_df <- final_cluster_df[order(-final_cluster_df$CL_SZ, final_cluster_df$CL_ID),]
		return(final_cluster_df)
	}
}