############################################################### ## Computation of Similarity Measures for Clustering Results ## ############################################################### ## Author: Thomas Girke ## Last update: June 29, 2009 ## Utility: speed and memory efficient computation of similarity ## coefficients for comparing two clustering results C and K. ## The current set of methods includes the Jaccard Index, the Rand ## Index and the Adjusted Rand Index. ## Definition of Indices: ## Jaccard Index: J(C,K) = a/(a+b+c) ## Rand Index: R(C,K) = (a+d)/(a+b+c+d) ## Adjusted Rand Index: AR(C,K) = (2*(a*d-c*b)) / ((a+b)*(b+d)+(a+c)*(c+d)) ## a = number of pairs that cluster together in both C and K ## b = number of pairs that cluster together in C, but not K ## c = number of pairs that cluster together in K, but not C ## d = number of pairs that are not joined into clusters in K and C ## How to run the cindex() function: ## cindex(clV1=clV1, clV2=clV2, self=FALSE, minSZ=1, method="jaccard") ## # clV1/clV2: named vectors where the cluster IDs are values and the item labels are names ## # method: supported similarity methods are "jaccard", "rand" and "arand" ## # self: FALSE to ignore or TRUE to include clusters with single items ## # minSZ: selection of a minimum cluster size ## More detailed instructions are available on this page: ## http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/R_BioCondManual.html#clustering_jaccard ## Compute Jaccard and Rand Indices for two clustering results stored in named vectors. cindex <- function(clV1=clV1, clV2=clV2, self=FALSE, minSZ=1, method="jaccard") { ## (A) Conversions to assure proper data structure ## Allow selection of a minimum cluster size. if(minSZ > 1) { CLSZ <- table(clV1) clV1 <- clV1[clV1 %in% names(CLSZ[CLSZ >= minSZ])] CLSZ <- table(clV2) clV2 <- clV2[clV2 %in% names(CLSZ[CLSZ >= minSZ])] } ## Remove non-common labels in clV1 and clV2 clV1 <- clV1[names(clV1) %in% names(clV2)] clV2 <- clV2[names(clV2) %in% names(clV1)] ## (B) Compute matrix that provides the common pairs among the two clustering results cpairMA <- sapply(unique(clV1), function(x) sapply(unique(clV2), function(z) sum(names(clV1[clV1==x]) %in% names(clV2[clV2==z])))) colnames(cpairMA) <- unique(clV1); rownames(cpairMA) <- unique(clV2) olMA <- cpairMA # Copy for export. To debug result, use: sum(names(clV1[clV1==2]) %in% names(clV2[clV2==28])) if(self==TRUE) { # Plus cpairMA/2 includes self comparisons and minus cpairMA/2 excludes them. cpairMA <- cpairMA^2/2 + cpairMA/2 } if(self==FALSE) { cpairMA <- cpairMA^2/2 - cpairMA/2 } ## (C) Compute Similarity Indices if(self==TRUE) { NpairsCL1 <- tapply(names(clV1), clV1, length); NpairsCL1 <- NpairsCL1^2/2 + NpairsCL1/2 NpairsCL2 <- tapply(names(clV2), clV2, length); NpairsCL2 <- NpairsCL2^2/2 + NpairsCL2/2 } if(self==FALSE) { NpairsCL1 <- tapply(names(clV1), clV1, length); NpairsCL1 <- NpairsCL1^2/2 - NpairsCL1/2 NpairsCL2 <- tapply(names(clV2), clV2, length); NpairsCL2 <- NpairsCL2^2/2 - NpairsCL2/2 } ja <- sum(cpairMA) jb <- sum(NpairsCL1[colnames(cpairMA)] - colSums(cpairMA)) jc <- sum(NpairsCL2[rownames(cpairMA)] - rowSums(cpairMA)) ## Return results as list if(method=="jaccard") { jindex <- ja/(ja+jb+jc) return(list(intersects=olMA, variables=unlist(list(a=ja, b=jb, c=jc)), Jaccard_Index=jindex)) } if(method=="rand") { Nitems <- length(names(clV1)) # clV1 and clV2 contain same items (see above) if(self==TRUE) { jd <- (Nitems^2/2 + Nitems/2) - ja - jb -jc } if(self==FALSE) { jd <- (Nitems^2/2 - Nitems/2) - ja - jb -jc } rindex <- (ja+jd)/(ja+jb+jc+jd) return(list(intersects=olMA, variables=unlist(list(a=ja, b=jb, c=jc, d=jd)), Rand_Index=rindex)) } if(method=="arand") { Nitems <- length(names(clV1)) # clV1 and clV2 contain same items (see above) if(self==TRUE) { jd <- (Nitems^2/2 + Nitems/2) - ja - jb -jc } if(self==FALSE) { jd <- (Nitems^2/2 - Nitems/2) - ja - jb -jc } arindex <- (2*(ja*jd - jc*jb)) / ((ja+jb)*(jb+jd)+(ja+jc)*(jc+jd)) return(list(intersects=olMA, variables=unlist(list(a=ja, b=jb, c=jc, d=jd)), Adjusted_Rand_Index=arindex)) } }