############################# ## R PROGRAMMING EXERCISES ## ############################# ## Workshop: Programming in R ## December 8, 2011 ########################################## ## (1) Loops: for/while and apply loops ## ########################################## ## Create a sample matrix myMA <- matrix(rnorm(500), 100, 5, dimnames=list(1:100, paste("C", 1:5, sep=""))) ## (1.1) Compute the mean of each row in myMA by applying the mean function in a for loop myve_for <- NULL for(i in seq(along=myMA[,1])) { myve_for <- c(myve_for, mean(as.numeric(myMA[i, ]))) } myResult <- cbind(myMA, mean_for=myve_for) myResult[1:4, ] ## (1.2) Compute the mean of each row in myMA by applying the mean function in a while loop z <- 1 myve_while <- NULL while(z <= length(myMA[,1])) { myve_while <- c(myve_while, mean(as.numeric(myMA[z, ]))) z <- z + 1 } myResult <- cbind(myMA, mean_for=myve_for, mean_while=myve_while) myResult[1:4, ] ## (1.3) How can one confirm that the results from both mean calculations are identical? sum(myResult[,6] != myResult[,7]) ## (1.4) Compute the mean of each row in myMA by applying the mean function in an apply loop myve_apply <- apply(myMA, 1, mean) myResult <- cbind(myMA, mean_for=myve_for, mean_while=myve_while, mean_apply=myve_apply) myResult[1:4, ] ## (1.5) Compute the row means of myMA by taking advantage of the built-in looping behavior ## of the mean() function. This will require the transpose of myMA and its conversion into ## a data frame. mymean <- mean(as.data.frame(t(myMA))) myResult <- cbind(myMA, mean_for=myve_for, mean_while=myve_while, mean_apply=myve_apply, mean_int=mymean) myResult[1:4, ] ## (1.6) How can one confirm that the values from all four mean calculations are identical? ## The unique() and length() functions can be used for this step. myq <- apply(myResult[,6:9], 1, function(x) length(unique(x))) sum(myq != 1) ## (1.7) Replace one mean value in the last column by '0' and repeat the identity check in step (1.6) myResult[1,8] <- 0 myq <- apply(myResult[,6:9], 1, function(x) length(unique(x))) sum(myq != 1) ################### ## (2) Functions ## ################### ## (2.1) Save the above code to a script file named "myMeans.R". Then convert the code to a ## function (myMeans) that returns by default the final data frame from step (1.5) with all ## four mean calculations. To test and debug the function, source your script and then ## execute your myMeans function like this: ## source("myMeans.R") ## myMeans() ## (2.2) Use the following code as basis to implement a function that allows the user ## to compute the mean for any combination of columns in a matrix or data frame. ## One argument of this function should specify the input data set and another should ## allow the selection of the columns by providing a grouping vector (factor). myMA <- matrix(rnorm(100000), 10000, 10, dimnames=list(1:10000, paste("C", 1:10, sep=""))) myList <- tapply(colnames(myMA), c(1,1,1,2,2,2,3,3,4,4), list) names(myList) <- sapply(myList, paste, collapse="_") myMAmean <- sapply(myList, function(x) apply(myMA[,x], 1, mean)) myMAmean[1:4,] myMAcomp <- function(myMA=myMA, group=c(1,1,1,2,2,2,3,3,4,4), myfct=mean) { myList <- tapply(colnames(myMA), group, list) names(myList) <- sapply(myList, paste, collapse="_") myMAmean <- sapply(myList, function(x) apply(myMA[,x], 1, myfct)) return(myMAmean) } myMAcomp(myMA=myMA, group=c(1,1,1,2,2,2,3,3,4,4), myfct=mean)[1:4,] ####################################################### ## (3) Improving Speed Performance by Avoiding Loops ## ####################################################### ## (3.1) Implement a speed improved version of your previous function from step (2.2). ## Use the following code for its implementation. In this code the R loop over the ## data intensive dimension is avoided by using the rowMeans() function. myList <- tapply(colnames(myMA), c(1,1,1,2,2,2,3,3,4,4), list) myMAmean <- sapply(myList, function(x) rowMeans(myMA[,x])) colnames(myMAmean) <- sapply(myList, paste, collapse="_") myMAmean[1:4,] ## (3.2) Compare the speed performance of the implementation from step 2.2 with the ## speed improved version from step (3.1) using the system.time() function. ############################################### ## (4) Looping over lists with lapply/sapply ## ############################################### ## Create a sample list populated with character vectors of different lengths setlist <- lapply(11:30, function(x) sample(letters, x, replace=TRUE)) names(setlist) <- paste("S", seq(along=setlist), sep="") ## (4.1) Compute the length for all pairwise intersects of the vectors stored in 'setlist'. ## The intersects can be determined with the '%in%' function like this: ## sum(setlist[[1]] %in% setlist[[2]]) setlist <- sapply(setlist, unique) olMA <- sapply(names(setlist), function(x) sapply(names(setlist), function(y) sum(setlist[[x]] %in% setlist[[y]]))) olMA ## (4.2) Plot the resulting intersect matrix as heat map ## The heatmap.2() function from the gplots library can be used for this. library("gplots") heatmap.2(olMA, trace="none", Colv="none", Rowv="none", dendrogram="none", col=colorpanel(40, "darkred", "orange", "yellow")) ###################################################################################### ## (5) Batch operations on many files: import, export, calculations, renaming, etc. ## ###################################################################################### ## Follow the instructions of this manual section: ## http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/R_Programming.html ######################################################## ## (6) Basic Utilities for DNA Sequence Manipulations ## ######################################################## ## (6.1) Write a RevComp() function that returns the reverse and complement of a DNA sequence. ## Include an argument that will allow to return only the reversed sequence, the complemented ## sequence or the reversed and complemented sequence. The following R functions will be useful ## for the implementation: ## x <- c("ATGCATTGGACGTTAG") ## substring(x, 1:nchar(x), 1:nchar(x)) ## rev(x) ## paste(x, collapse="") ## chartr("ATGC", "TACG", x) revComp <- function(myseq, type="RC") { seqmod <- myseq[1] # The substring function below works only on a single string. if(type=="R") { seqmod <- substring(seqmod, 1:nchar(seqmod), 1:nchar(seqmod)) seqmod <- rev(seqmod) seqmod <- paste(seqmod, collapse="") } if(type=="C") { seqmod <- chartr("ATGC", "TACG", seqmod) } if(type=="RC") { seqmod <- chartr("ATGC", "TACG", seqmod) seqmod <- substring(seqmod, 1:nchar(seqmod), 1:nchar(seqmod)) seqmod <- rev(seqmod) seqmod <- paste(seqmod, collapse="") } return(seqmod) } cat("\n## (1) REVERSE AND COMPLEMENT OF DNA SEQUENCES") cat("\n## (1.1) Creates random sample DNA sequences\n\t myDNA <- sapply(1:100, function(x) paste(sample(c(\"A\",\"T\",\"G\",\"C\"), 20, replace=T), collapse=\"\"))\n") cat("\t\t # Creates 100 sequences each with 20 residues.\n") cat("\n## (1.2) How to run revComp function\n\t revComp(myseq=myDNA[1], type=\"RC\")\n") cat("\t\t # Option R: reverse \n\t\t # Option C: complement \n\t\t # Option RC: reverse and complement\n") ## (6.2) Write a function that applies the RevComp() function to many sequences stored in a vector. cat("\n## (1.3) Apply function to many sequences with sapply/lapply \n\t sapply(myDNA, function(x) revComp(myseq=x, type=\"RC\"))\n") ## (6.3) Write a function that will translate one or many DNA sequences in all three reading frames into proteins. ## The following commands will simplify this task: ## AAdf <- read.table(file="AA.txt", header=T, sep="\t") ## AAv <- AAdf[,2]; names(AAv) <- AAdf[,1] ## y <- gsub("(...)", "\\1_", y) ## y <- unlist(strsplit(y, "_")) ## y <- y[grep("∧...\$", y)] ## AAv[y] ## Import genetic code and store it in named vector AAv cat("\n## (2) TRANSLATION OF DNA INTO AA SEQUENCES") AAdf <- read.table(file="http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/AA.txt", header=T, sep="\t") AAv <- AAdf[,2]; names(AAv) <- AAdf[,1] cat("\n## (2.1) The genetic code was imported and is stored in named vector AAv\n") ## Define translate function translateDNA <- function(myseq, frame=1, code=AAv) { seqmod <- myseq[1] # Function assumes only a single string tripSplit <- function(seqmod=seqmod) { # Internal function for converting DNA string to vector of triplets seqmod <- gsub("(...)", "\\1_", seqmod) seqmod <- unlist(strsplit(seqmod, "_")) seqmod <- seqmod[grep("^...\$", seqmod)] return(seqmod) } if(frame==1) { seqmod <- tripSplit(seqmod) } if(frame==2) { seqmod <- gsub("^.", "", seqmod) seqmod <- tripSplit(seqmod) } if(frame==3) { seqmod <- gsub("^..", "", seqmod) seqmod <- tripSplit(seqmod) } myAA <- AAv[seqmod] myAA <- paste(myAA, collapse="") return(myAA) } cat("\n## (2.2) How to run translateDNA function \n\t translateDNA(myseq=myDNA[1], frame=1, code=AAv) \n") cat("\t\t # The three possible reading frames (1, 2 or 3) can be selected under the frame option.\n") ## (3.3) Apply function to many sequences. cat("\n## (2.3) Apply function to many sequences with sapply/lapply \n\t mypep <- sapply(myDNA, function(x) translateDNA(myseq=x, frame=1))\n") cat("\n## (2.4) Return result in data frame \n\t data.frame(DNA=names(mypep), PEP=as.vector(mypep))[1:12,]\n") ############################ ## (7) Build an R Package ## ############################ ## Save one or more of your functions to a file called 'script.R'. ## Generate the package with the package.skeleton function package.skeleton(name="mypackage", code_files=c("script1.R""), namespace=TRUE) ## Build tarball of the package R CMD build mypackage ## Install package install.packages("mypackage_1.0.tar.gz", repos=NULL, type="source")