########################################################################## ## Exercise: Large-scale expression array analysis with complex queries ## ########################################################################## ## Generate sample expression data sets (here 20 files) ExpIDs <- paste("Exp", 1:20, ".xls", sep="") for(i in ExpIDs) { Int <- runif(12000, min=10, max=100) PMA <- sample(c("P","M","A"), 12000, replace = TRUE) ID <- paste(rep("hs_", 12000), 1:12000, sep="") Exp <- data.frame(row.names=ID, Int, PMA) write.table(Exp, i, quote=F, col.names = NA, sep="\t") cat("Created file:", i, "\n") } ## Define analysis parameters ExpDef <- data.frame(File=ExpIDs, Rep=sort(rep(1:(length(ExpIDs)/2), 2)), Comp=sort(rep(1:(length(ExpIDs)/4), 4))) ## Assemble the two data types in separate data frames in the order defined by ExpDef if(exists("Exp")) { rm(Exp) } for(i in 1:length(ExpDef$File)) { temp <- read.table(as.character(ExpDef$File[i]), header=T, row.names=1, sep="\t") if(!exists("Exp")) { my_order <- row.names(temp) } if(!exists("Exp")) { Exp <- temp[,-c(1:2)] } # Exp will be created in first loop temp <- temp[my_order,] # Sorts all imported data sets as specified in my_order Exp <- cbind(Exp, temp) names(Exp)[(length(Exp)-1):length(Exp)] <- paste(names(temp), ExpDef$File[i], sep="_") cat("Loop status:", names(Exp), "\n") } ## Some formatting steps ExpInt <- Exp[ ,grep("^Int_", names(Exp))] names(ExpInt) <- gsub("Int_", "", names(ExpInt)) ExpInt <- ExpInt[,ExpDef$File] # Assigns column order of ExpDef ExpPMA <- Exp[ ,grep("^PMA_", names(Exp))] names(ExpPMA) <- gsub("PMA_", "", names(ExpPMA)) ExpPMA <- ExpPMA[,ExpDef$File] # Assigns column order of ExpDef ## Calculate ratios ExpInt <- ExpInt[ ,as.vector(ExpDef[,1])] mean_df <- t(apply(ExpInt, 1, function(x) tapply(x, ExpDef$Rep, mean))) colnames(mean_df) <- as.vector(tapply(ExpDef$File, ExpDef$Rep, paste, collapse="_")) mean_df1 <- mean_df[ ,seq(1, length(mean_df[1,]), by=2)] mean_df2 <- mean_df[ ,seq(2, length(mean_df[1,]), by=2)] ratio_df <- mean_df1/mean_df2 colnames(ratio_df) <- paste(colnames(mean_df1), colnames(mean_df2), sep="_") ratio_df <- log2(ratio_df) ## Generate PMA data frame ExpPMA <- ExpPMA[ ,as.vector(ExpDef[,1])] ExpPMAlog <- ExpPMA=="P" | ExpPMA=="M" PMA_df <- t(apply(ExpPMAlog, 1, function(x) tapply(x, ExpDef$Comp, sum))) colnames(PMA_df) <- as.vector(tapply(ExpDef$File, ExpDef$Comp, paste, collapse="_")) ## Query strategy, e.g.: ## fold difference >=2 ## AND ## P|M in X comparisons query_df <- (ratio_df >= 1 | ratio_df <= -1) & PMA_df>=2 rowsum_vec <- rowSums(query_df) ratio_df[rowsum_vec > 4,] PMA_df[rowsum_vec > 4,] write.table(ratio_df[rowsum_vec > 2,], "my_query.xls", quote=F, sep="\t", col.names = NA) ## Clean up directory unlink(ExpIDs) ## Task convert script into a function with arguments that allows to use different query parameters.