#############################
## R PROGRAMMING EXERCISES ##
#############################
## Workshop: Programming in R
## May 14, 2011

##########################################
## (1) Loops: for/while and apply loops ##
##########################################
## Create a sample matrix

myMA <- matrix(rnorm(500), 100, 5, dimnames=list(1:100, paste("C", 1:5, sep="")))

## (1.1) Compute the mean of each row in myMA by applying the mean function in a for loop

myve_for <- NULL
for(in seq(along=myMA[,1])) {
        myve_for <- c(myve_for, mean(as.numeric(myMA[i, ])))
}
myResult <- cbind(myMA, mean_for=myve_for)
myResult[1:4, ]

## (1.2) Compute the mean of each row in myMA by applying the mean function in a while loop

<- 1
myve_while <- NULL
while(z <= length(myMA[,1])) {
        myve_while <- c(myve_while, mean(as.numeric(myMA[z, ])))
        z <- z + 1
}
myResult <- cbind(myMA, mean_for=myve_for, mean_while=myve_while)
myResult[1:4, ]

## (1.3) How can one confirm that the results from both mean calculations are identical?

sum(myResult[,6] != myResult[,7])

## (1.4) Compute the mean of each row in myMA by applying the mean function in an apply loop

myve_apply <- apply(myMA, 1, mean)
myResult <- cbind(myMA, mean_for=myve_for, mean_while=myve_while, mean_apply=myve_apply)
myResult[1:4, ]

## (1.5) Compute the row means of myMA by taking advantage of the built-in looping behavior
## of the mean() function. This will require the transpose of myMA and its conversion into
## a data frame.

mymean <- mean(as.data.frame(t(myMA)))
myResult <- cbind(myMA, mean_for=myve_for, mean_while=myve_while, mean_apply=myve_apply, mean_int=mymean)
myResult[1:4, ]

## (1.6) How can one confirm that the values from all four mean calculations are identical?
## The unique() and length() functions can be used for this step.

myq <- apply(myResult[,6:9], 1, function(x) length(unique(x)))
sum(myq != 1)

## (1.7) Replace one mean value in the last column by '0' and repeat the identity check in step (1.6)

myResult[1,8] <- 0
myq <- apply(myResult[,6:9], 1, function(x) length(unique(x)))
sum(myq != 1)

###################
## (2) Functions ##
###################

## (2.1) Save the above code to a script file named "myMeans.R". Then convert the code to a
## function (myMeans) that returns by default the final data frame from step (1.5) with all
## four mean calculations. To test and debug the function, source your script and then
## execute your myMeans function like this:

        ## source("myMeans.R")
        ## myMeans()

## (2.2) Use the following code as basis to implement a function that allows the user
## to compute the mean for any combination of columns in a matrix or data frame.
## One argument of this function should specify the input data set and another should
## allow the selection of the columns by providing a grouping vector (factor).

myMA <- matrix(rnorm(100000), 10000, 10, dimnames=list(1:10000, paste("C", 1:10, sep="")))

myList <- tapply(colnames(myMA), c(1,1,1,2,2,2,3,3,4,4), list) 
names(myList) <- sapply(myList, paste, collapse="_")
myMAmean <- sapply(myList, function(x) apply(myMA[,x], 1, mean))
myMAmean[1:4,] 

myMAcomp <- function(myMA=myMA, group=c(1,1,1,2,2,2,3,3,4,4), myfct=mean) {
        myList <- tapply(colnames(myMA), group, list)
        names(myList) <- sapply(myList, paste, collapse="_")
        myMAmean <- sapply(myList, function(x) apply(myMA[,x], 1, myfct))
        return(myMAmean)
}
myMAcomp(myMA=myMA, group=c(1,1,1,2,2,2,3,3,4,4), myfct=mean)[1:4,] 

#######################################################
## (3) Improving Speed Performance by Avoiding Loops ##
#######################################################

## (3.1) Implement a speed improved version of your previous function from step (2.2).
## Use the following code for its implementation. In this code the R loop over the
## data intensive dimension is avoided by using the rowMeans() function.

myList <- tapply(colnames(myMA), c(1,1,1,2,2,2,3,3,4,4), list)
myMAmean <- sapply(myList, function(x) rowMeans(myMA[,x]))
colnames(myMAmean) <- sapply(myList, paste, collapse="_")
myMAmean[1:4,]

## (3.2) Compare the speed performance of the implementation from step 2.2 with the
## speed improved version from step (3.1) using the system.time() function.

###############################################
## (4) Looping over lists with lapply/sapply ##
###############################################
## Create a sample list populated with character vectors of different lengths

setlist <- lapply(11:30, function(x) sample(letters, x, replace=TRUE))
names(setlist) <- paste("S", seq(along=setlist), sep="") 

## (4.1) Compute the length for all pairwise intersects of the vectors stored in 'setlist'.
## The intersects can be determined with the '%in%' function like this:
##              sum(setlist[[1]] %in% setlist[[2]])

setlist <- sapply(setlist, unique)
olMA <- sapply(names(setlist), function(x) sapply(names(setlist), function(y) sum(setlist[[x]] %in% setlist[[y]])))
olMA 

## (4.2) Plot the resulting intersect matrix as heat map
## The heatmap.2() function from the gplots library can be used for this.
library("gplots")
heatmap.2(olMA, trace="none", Colv="none", Rowv="none", dendrogram="none", col=colorpanel(40, "darkred", "orange", "yellow")) 

######################################################################################
## (5) Batch operations on many files: import, export, calculations, renaming, etc. ##
######################################################################################
## Follow the instructions of this manual section:
## http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/R_Programming.html

########################################################
## (6) Basic Utilities for DNA Sequence Manipulations ##
########################################################

## (6.1) Write a RevComp() function that returns the reverse and complement of a DNA sequence.
## Include an argument that will allow to return only the reversed sequence, the complemented
## sequence or the reversed and complemented sequence. The following R functions will be useful
## for the implementation:
        ## x <- c("ATGCATTGGACGTTAG")
        ## substring(x, 1:nchar(x), 1:nchar(x))
        ## rev(x)
        ## paste(x, collapse="")
        ## chartr("ATGC", "TACG", x)

revComp <- function(myseq, type="RC") {
        seqmod <- myseq[1] # The substring function below works only on a single string.
        if(type=="R") {
                seqmod <- substring(seqmod, 1:nchar(seqmod), 1:nchar(seqmod))
                seqmod <- rev(seqmod)
                seqmod <- paste(seqmod, collapse="")          
        }
        if(type=="C") { 
                seqmod <- chartr("ATGC", "TACG", seqmod) 
        }
        if(type=="RC") {
                seqmod <- chartr("ATGC", "TACG", seqmod) 
                seqmod <- substring(seqmod, 1:nchar(seqmod), 1:nchar(seqmod))
                seqmod <- rev(seqmod)
                seqmod <- paste(seqmod, collapse="")          
        }
        return(seqmod)
}
cat("\n## (1) REVERSE AND COMPLEMENT OF DNA SEQUENCES")
cat("\n## (1.1) Creates random sample DNA sequences\n\t myDNA <- sapply(1:100, function(x) paste(sample(c(\"A\",\"T\",\"G\",\"C\"), 20, replace=T), collapse=\"\"))\n")
cat("\t\t # Creates 100 sequences each with 20 residues.\n")
cat("\n## (1.2) How to run revComp function\n\t revComp(myseq=myDNA[1], type=\"RC\")\n")
cat("\t\t # Option R: reverse \n\t\t # Option C: complement \n\t\t # Option RC: reverse and complement\n")

## (6.2) Write a function that applies the RevComp() function to many sequences stored in a vector.

cat("\n## (1.3) Apply function to many sequences with sapply/lapply \n\t sapply(myDNA, function(x) revComp(myseq=x, type=\"RC\"))\n")

## (6.3) Write a function that will translate one or many DNA sequences in all three reading frames into proteins.
## The following commands will simplify this task:

        ## AAdf <- read.table(file="AA.txt", header=T, sep="\t")  
        ## AAv <- AAdf[,2]; names(AAv) <- AAdf[,1]
        ## y <- gsub("(...)", "\\1_", y)
        ## y <- unlist(strsplit(y, "_"))
        ## y <- y[grep("∧...$", y)]
        ## AAv[y]

## Import genetic code and store it in named vector AAv
cat("\n## (2) TRANSLATION OF DNA INTO AA SEQUENCES")
AAdf <- read.table(file="http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/AA.txt", header=T, sep="\t")
AAv <- AAdf[,2]; names(AAv) <- AAdf[,1]
cat("\n## (2.1) The genetic code was imported and is stored in named vector AAv\n")

## Define translate function
translateDNA <- function(myseq, frame=1, code=AAv) {
        seqmod <- myseq[1] # Function assumes only a single string
        tripSplit <- function(seqmod=seqmod) { # Internal function for converting DNA string to vector of triplets
                seqmod <- gsub("(...)", "\\1_", seqmod) 
                seqmod <- unlist(strsplit(seqmod, "_")) 
                seqmod <- seqmod[grep("^...$", seqmod)] 
                return(seqmod)        
        }
        if(frame==1) {
                seqmod <- tripSplit(seqmod) 
        }
        if(frame==2) {
                seqmod <- gsub("^.", "", seqmod)
                seqmod <- tripSplit(seqmod) 
        }
        if(frame==3) {
                seqmod <- gsub("^..", "", seqmod)
                seqmod <- tripSplit(seqmod) 
        }
        myAA <- AAv[seqmod]
        myAA <- paste(myAA, collapse="")
        return(myAA)
}
cat("\n## (2.2) How to run translateDNA function \n\t translateDNA(myseq=myDNA[1], frame=1, code=AAv) \n")
cat("\t\t # The three possible reading frames (1, 2 or 3) can be selected under the frame option.\n")

## (3.3) Apply function to many sequences.
cat("\n## (2.3) Apply function to many sequences with sapply/lapply \n\t mypep <- sapply(myDNA, function(x) translateDNA(myseq=x, frame=1))\n")
cat("\n## (2.4) Return result in data frame \n\t data.frame(DNA=names(mypep), PEP=as.vector(mypep))[1:12,]\n")



## Non R code, just for web upload:
## scp R_programming_exercises.R tgirke@cache.ucr.edu:~/public_html/Documents/R_BioCond/My_R_Scripts/