##########################################################################
## Exercise: Large-scale expression array analysis with complex queries ##
##########################################################################

## Generate sample expression data sets (here 20 files)
ExpIDs <- paste("Exp", 1:20, ".xls", sep="")
for(i in ExpIDs) {
	Int <- runif(12000, min=10, max=100)
	PMA <- sample(c("P","M","A"), 12000, replace = TRUE)
	ID <- paste(rep("hs_", 12000), 1:12000, sep="")
	Exp <- data.frame(row.names=ID, Int, PMA)
	write.table(Exp, i, quote=F, col.names = NA, sep="\t")
	cat("Created file:", i, "\n")
}

## Define analysis parameters
ExpDef <- data.frame(File=ExpIDs, Rep=sort(rep(1:(length(ExpIDs)/2), 2)), Comp=sort(rep(1:(length(ExpIDs)/4), 4)))

## Assemble the two data types in separate data frames in the order defined by ExpDef 
if(exists("Exp")) { rm(Exp) }
for(i in 1:length(ExpDef$File)) {
	temp <- read.table(as.character(ExpDef$File[i]), header=T, row.names=1, sep="\t")
	if(!exists("Exp")) { my_order <- row.names(temp) }
	if(!exists("Exp")) { Exp <- temp[,-c(1:2)] } # Exp will be created in first loop
	temp <- temp[my_order,] # Sorts all imported data sets as specified in my_order
	Exp <- cbind(Exp, temp)
	names(Exp)[(length(Exp)-1):length(Exp)] <- paste(names(temp), ExpDef$File[i], sep="_")
	cat("Loop status:", names(Exp), "\n")
}

## Some formatting steps
ExpInt <- Exp[ ,grep("^Int_", names(Exp))]
names(ExpInt) <- gsub("Int_", "", names(ExpInt))
ExpInt <- ExpInt[,ExpDef$File] # Assigns column order of ExpDef
ExpPMA <- Exp[ ,grep("^PMA_", names(Exp))]
names(ExpPMA) <- gsub("PMA_", "", names(ExpPMA))
ExpPMA <- ExpPMA[,ExpDef$File] # Assigns column order of ExpDef

## Calculate ratios
ExpInt <- ExpInt[ ,as.vector(ExpDef[,1])]
mean_df <- t(apply(ExpInt, 1, function(x) tapply(x, ExpDef$Rep, mean)))
colnames(mean_df) <- as.vector(tapply(ExpDef$File, ExpDef$Rep, paste, collapse="_"))
mean_df1 <- mean_df[ ,seq(1, length(mean_df[1,]), by=2)]
mean_df2 <- mean_df[ ,seq(2, length(mean_df[1,]), by=2)]
ratio_df <- mean_df1/mean_df2
colnames(ratio_df) <- paste(colnames(mean_df1), colnames(mean_df2), sep="_")
ratio_df <- log2(ratio_df)

## Generate PMA data frame
ExpPMA <- ExpPMA[ ,as.vector(ExpDef[,1])]
ExpPMAlog <- ExpPMA=="P" | ExpPMA=="M"
PMA_df <- t(apply(ExpPMAlog, 1, function(x) tapply(x, ExpDef$Comp, sum)))
colnames(PMA_df) <- as.vector(tapply(ExpDef$File, ExpDef$Comp, paste, collapse="_"))
	
## Query strategy, e.g.: 
##	fold difference >=2 
##	       AND 
##	P|M in X comparisons
query_df <- (ratio_df >= 1 | ratio_df <= -1) & PMA_df>=2
rowsum_vec <- rowSums(query_df)
ratio_df[rowsum_vec > 4,]
PMA_df[rowsum_vec > 4,]
write.table(ratio_df[rowsum_vec > 2,], "my_query.xls", quote=F, sep="\t", col.names = NA)

## Clean up directory
unlink(ExpIDs)

## Task convert script into a function with arguments that allows to use different query parameters.