################################################################################## ## Cutoff-Based Tree Node Selection for Hierarchical Threshold Clustering (HTC) ## ################################################################################## ## Author: Thomas Girke ## Last update: May 4, 2007 ## Utility: (A1) Cut a tree/dendrogram at all height levels to obtain clusters for all of its nodes. ## (A2) Remove all duplicated clusters. ## (B1) Select clusters based on a given cutoff in an all-against-all comparison of their members. ## How to run the script: ## source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/selectNodes.R") ## y <- matrix(rnorm(500), 100, 5, dimnames=list(paste("g", 1:100, sep=""), paste("t", 1:5, sep=""))) # Test sample ## c <- cor(t(y), method="pearson"); d <- as.dist(1-c); hr <- hclust(d, method = "complete", members=NULL) # Test sample ## Uniclusters <- nodes2Cluster(myhclust=hr, rootdist=2) # The argument 'rootdist' defines the starting distance from the root of the tree to avoid testing of very large clusters. ## clusterQCselect <- clusterQC(clusterlist=Uniclusters, simMatrix=as.matrix(c), method="min", cutoff=0.7, output="df") ## (A) Define nodes2Cluster function that cuts tree at all height levels and removes all duplicated clusters nodes2Cluster <- function(myhclust=hr, rootdist=2) { ## Define looping data frame to limit memory requirements for cutting very large trees. ## The cuts are performed here in slices of 100 cut levels. This corresponds to 100 iterations for a tree with 10,000 items. rootdist <- rootdist # Define the minimum distance from root of tree to avoid testing of very large clusters looplim <- length(myhclust$height)-rootdist loopDF <- data.frame(from=seq(1, looplim, by=100), to=c((seq(1, looplim, by=100)-1)[-1], looplim)) UniclustersAll <- NULL for(i in 1:length(loopDF[,1])) { ## (1) Cut tree at all height levels that are defined in 'hr$height' cutheightslice <- myhclust$height[loopDF[i,1]:loopDF[i,2]] Allcuts <- sapply(cutheightslice, function(x) cutree(myhclust, h=x)) ## (2) Remove all duplicated clusters in the 'Allcuts' matrix and return the unique cluster set as a list object. loopvec <- 1:length(Allcuts[1,]) Uniclusters <- unlist(lapply(loopvec, function(x) as.vector(tapply(names(Allcuts[,x]), as.vector(Allcuts[,x]), paste, collapse=" ")))) Uniclusters <- unique(Uniclusters) UniclustersAll <- c(UniclustersAll, Uniclusters) UniclustersAll <- unique(UniclustersAll) cat("\n Number of tree levels processed: <=", i*100, "\n") } UniclustersAll <- strsplit(UniclustersAll, " ") names(UniclustersAll) <- 1:length(UniclustersAll) cat("\n Finished duplicate removal", "\n") UniclustersAll } ## (B) Select clusters based on a given cutoff in an all-against-all comparison of their members. clusterQC <- function(clusterlist=Uniclusters, simMatrix=c, method="min", single=1, cutoff=0.7, output="df") { ## Append to 'clusterlist' all items as size one clusters that are not represented by a cluster of size one. This prevents that certain items are entirely removed from the end result. size_one <- sapply(clusterlist, length) singlets <- unlist(clusterlist[size_one==1]) allmembers <- unique(unlist(clusterlist)) missing_singlets <- allmembers[!allmembers %in% singlets] clusterlist <- c(clusterlist, as.list(missing_singlets)) names(clusterlist) <- 1:length(clusterlist) ## Cluster scoring desc_size_index <- rev(order(sapply(clusterlist, length))) # sort clusters by size clusterlist <- clusterlist[desc_size_index] # sort clusters by size clusterlist_cluster <- clusterlist[sapply(clusterlist, length)>1] # remove single item clusters clusterlist_singlet <- clusterlist[sapply(clusterlist, length)==1] # store single item clusters clusterQCvalueDF <- sapply(clusterlist_cluster, function(x) summary(as.vector(as.dist(simMatrix[x, x])))) clusterQCvalueDF <- t(clusterQCvalueDF) singlet_data <- matrix(single, nrow=length(clusterlist_singlet), ncol=6, dimnames=list(names(clusterlist_singlet), 1:6)) # The default distance value for single item clusters is 1. The 'single' argument allows to change this value. clusterQCvalueDF <- rbind(clusterQCvalueDF, singlet_data) colnames(clusterQCvalueDF) <- c("min", "qu1", "median", "mean", "qu3", "max") ## Remove all clusters with pairs that do not meet the cutoff criterion clusterQCvalues <- clusterQCvalueDF[,method] # Possible cutoff calculation methods are: "min", "qu1", "median", "mean", "qu3", "max". clusterlist <- clusterlist[clusterQCvalues >= cutoff] cat("\n Finished removal of clusters below cutoff", "\n") ## Sort list by descending cluster size desc_size_index <- rev(order(sapply(clusterlist, length))) clusterlist <- clusterlist[desc_size_index] updatelist <- clusterlist ## Hierarchical cluster removal from large to small clusters for(i in 1:length(clusterlist)) { if(i <= length(updatelist)) { remove_index <- sapply(updatelist, function(x) sum(updatelist[[i]] %in% x)) remove_index[i] <- 0 updatelist <- updatelist[remove_index==0] } cat( "\n Loop no:", i, "\n") } ## Sort list by descending cluster size sort_index <- rev(order(sapply(updatelist, length))) updatelist <- updatelist[sort_index] ## Add cluster stats data final_cluster_list <- list(clusters=updatelist, cluster_stats=clusterQCvalueDF[names(updatelist),]) names(final_cluster_list$clusters) <- 1:length(final_cluster_list$clusters) rownames(final_cluster_list$cluster_stats) <- 1:length(final_cluster_list$cluster_stats[,1]) ## Return results as list if(output=="list") { return(final_cluster_list) } ## Return results as data frame if(output=="df") { final_cluster_df <- data.frame(ID=unlist(final_cluster_list$clusters), CL_ID=rep(names(final_cluster_list$clusters), sapply(final_cluster_list$clusters, length))) counts <- as.data.frame(table(final_cluster_df$CL_ID)) final_cluster_df <- merge(final_cluster_df, counts, by.x="CL_ID", by.y=1, all.x=T) final_cluster_df <- merge(final_cluster_df, final_cluster_list$cluster_stats, by.x="CL_ID", by.y=0, all.x=T) final_cluster_df <- final_cluster_df[,c(2,1,3:length(final_cluster_df))] names(final_cluster_df)[3] <- "CL_SZ" final_cluster_df <- final_cluster_df[order(-final_cluster_df$CL_SZ, final_cluster_df$CL_ID),] return(final_cluster_df) } }