########################################################## ### Identify Over-Represented Strings in Sequence Sets ### ########################################################## ## Authors: Ugoeze Nwokedi, Kevin Horan, Thomas Girke (thomas.girke@ucr.edu, UC Riverside) ## Last update: July 14, 2007 ## Utility: ## (1) Generate a database of all possible DNA/RNA/PEP strings of a given length ## (2) Sequence import into R ## (3) Determine the frequency of the strings in reference and test sequences ## (4) Calculate over-representation p-value ## (5) Large-scale meta analysis by looping over entries in string database ## How to run the script: ## source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/wordFinder.R") ############################################################################################## ## (1) Function to Create a Datatabes of All Possible DNA/RNA/PEP Strings of a Given Length ## ############################################################################################## ## Create X'mer Database makeXmerDB <- function(type="DNA", xmer=4) { ## Define sequence type if(type=="DNA") { N <- c("A", "T", "G", "C") } if(type=="RNA") { N <- c("A", "U", "G", "C") } if(type=="PEP") { N <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y") } ## Create sequence strings myseq <- N seqlength <- xmer if(xmer>=2) { for(i in 1:(seqlength-1)) { myseq <- sapply(as.vector(myseq), function(x) paste(x, N, sep="")) } } myseq <- as.vector(myseq) return(myseq) } cat("\n# (1.1) Execute the following command, to generate a 6-mer database:\n\t myDB <- makeXmerDB(type=\"DNA\", xmer=6)\n") ## Remove reverse and complement strings in pattern database. They create duplicated results when searching both DNA strands. ## Note that the number of removed sequences will be less than half of the number of elements in myDB, because the reverse and ## complement of palindromic sequences is identical to their original sequences. removeDups <- function(myDBF=myDB) { myDB <- myDB temp <- paste(myDB, as.vector(sapply(myDB, rev_comp_fct, rev=T, comp=T)), sep="_") temp <- strsplit(temp, "_") temp <- lapply(temp, sort) temp <- unlist(lapply(temp, paste, collapse="_")) myDB <- myDB[!duplicated(temp)] return(myDB) } cat("\n# (1.2) Only for searches of both DNA strands: remove reverse and complement patterns which create duplicated results: \n\t myDB <- removeDups(myDB=myDB) \n") ################################## ## (2) Sequence Import Function ## ################################## seq_imp_fct <- function(fileloc) { my_fasta <- readLines(fileloc) # reads file line-wise into vector y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign y <- as.vector(y); y[y==-1] <- 0 index <- which(y==0) distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)]) distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry distance <- data.frame(distance, dist=distance[,2]-distance[,1]) seq_no <- 1:length(y[y==0]) index <- rep(seq_no, as.vector(distance[,3])) my_fasta <- data.frame(index, y, my_fasta) my_fasta[my_fasta[,2]==0,1] <- 0 seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F) seq <- as.vector(seq[2:length(seq)]) Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3]) ID <- gsub("^>| .*", "", as.character(Desc), perl=T) Desc <- gsub("^.*? ", "", as.character(Desc), perl=T) my_fasta <- data.frame(ID, Desc, Length=nchar(seq), seq) my_fasta } cat("\n# (2.1) Import sample sequences into R like this: \n\t myseq <- seq_imp_fct(\"http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/Samples/sampleDNA.txt\"); myseq <- myseq[1:8,]\n") cat("\n# (2.2) Run this command to set all sequences to upper case since the function gregexpr() is case sensitive! \n\t myseq$seq <- gsub(\"([A-Z])\", \"\\\\U\\\\1\", as.character(myseq$seq), perl=T, ignore.case=T)\n") cat("\n# (2.3) To speed up the sequence loading process, save and load them like this: \n\t save(myseq, file=\"myseq\"); load(file=\"myseq\") \n") ## Reverse and/or Complement Function rev_comp_fct <- function(seq=mystr, rev=T, comp=T) { if(rev==T) { seq <- as.vector(unlist(strsplit(seq, split=""))) seq <- rev(seq) seq <- paste(seq, collapse="") } if(comp==T) { seq <- gsub("A", "1", seq, ignore.case = T) seq <- gsub("T", "2", seq, ignore.case = T) seq <- gsub("C", "3", seq, ignore.case = T) seq <- gsub("G", "4", seq, ignore.case = T) seq <- gsub("1", "T", seq, ignore.case = T) seq <- gsub("2", "A", seq, ignore.case = T) seq <- gsub("3", "G", seq, ignore.case = T) seq <- gsub("4", "C", seq, ignore.case = T) } seq } ################################# ## (3) Match Counting Function ## ################################# ## The matchCounter function can be used to calculate how often every word in the counts ## Define matchCounter function matchCounter <- function(seq, ids, myDB, strands=2, pos=T, mm=0) { testseq <- seq myids <- ids ## Search only positive strand if(strands==1) { mymatch <- lapply(myDB, function(x) gregexpr(x, testseq, perl=T)) } ## Search both strands if(strands==2) { myDB2 <- paste(myDB, "|", rev_comp_fct(myDB, rev=T, comp=T), sep="") mymatch <- lapply(myDB2, function(x) gregexpr(x, testseq, perl=T)) } names(mymatch) <- myDB motifs <- myDB[sort(rep(1:length(myDB), length(myids)))] ids <- myids[rep(1:length(myids), length(myDB))] count <- unlist(lapply(names(mymatch), function(x) lapply(mymatch[[x]], function(y) sum(y!=-1) ))) if(pos==F) { mymatchDF <- data.frame(motifs, ids, count) return(mymatchDF) } if(pos==T) { pos <- unlist(lapply(names(mymatch), function(x) lapply(mymatch[[x]], function(y) paste(y, collapse=", ") ))) pos[pos=="-1"] <- "0" mymatchDF <- data.frame(motifs, ids, count, pos) if(mm>0) { mm <- lapply(myDB, function(x) agrep(x, testseq, max=mm, value=F)) fixzero <- sapply(mm, length) mm[fixzero==0] <- 0 mm <- lapply(mm, function(x) 1:length(testseq) %in% x) names(mm) <- myDB mm <- unlist(mm) mymatchDF <- data.frame(motifs, ids, count, pos, mm) } return(mymatchDF) } } cat("\n# (3.1) Run matchCounter function: \n\t mymatchDF <- matchCounter(seq=as.character(myseq$seq), ids=as.character(myseq$ID), myDB=myDB, strands=2, pos=T, mm=0); mymatchDF[1:12,] \n \t\t # To speed up this process, set \"pos=F\".\n") ################################################ ## (4) Calculate Over-Representation P-Values ## ################################################ ## Formula and variables: ## phyper(k, D, n-D, N, lower.tail=F) ## k: number of motif matches in sample sequence (number of good items in sequence sample) ## D: number of motif matches in sequence database (number of good items in sequence database) ## n: number of words in sequence database minus D (number of bad items in sequence database) ## N: number of words in sample sequence (number of drawn items) enrichTest <- function(myseq=myseq, mymatchDF=mymatchDF, xmer=6) { ## Obtain values for required variables motifMatches <- tapply(mymatchDF$count, mymatchDF$motif, sum) k <- as.vector(mymatchDF$count) motifMatches <- tapply(mymatchDF$count, mymatchDF$motif, sum) D <- as.vector(motifMatches[mymatchDF$motifs]) n <- sum(nchar(myseq$seq)-(xmer-1)) N <- nchar(myseq$seq)-(xmer-1) names(N) <- myseq$ID N <- as.vector(N[mymatchDF$ids]) ## Calculate hypergeometric p-values pval <- phyper(k-1, D, n-D, N, lower.tail=F) ## Bonferroni p-value adjustment. Multiplyer is the number of sequences that show at least one match for a given motif. N_Match_Seq <- tapply(mymatchDF$count, mymatchDF$motif, function(x) sum(x>=1)) N_Match_Seq <- as.vector(N_Match_Seq[mymatchDF$motifs]) N_Match_Seq[N_Match_Seq==0] <- 1 adj_pval <- pval * N_Match_Seq adj_pval[adj_pval>1] <- 1 ## Assemble results in data frame mymatchDF <- data.frame(mymatchDF, D=D, n=n, N=N, pval=pval, adj_pval) return(mymatchDF) } cat("\n# (4.1) Calculate over-representation P-values: \n\t mymatchDF <- enrichTest(myseq=myseq, mymatchDF=mymatchDF, xmer=6); mymatchDF[order(mymatchDF$pval),][1:12,]\n") ############################################################################## ## (5) Large-Scale Meta Analysis by Looping Over Entries in String Database ## ############################################################################## ## This meta function is for genome-wide analyses. It runs the above steps in a memory ## efficient manner by looping over the entries in the string database and storing ## only the results for enriched patterns. metaAnalysis <- function(seq=as.character(myseq$seq), myDB=myDB, cutoff=0.01) { cont <- NULL for(i in 1:length(myDB)) { mymatchDF <- matchCounter(seq=as.character(myseq$seq), ids=as.character(myseq$ID), myDB=myDB[i], strands=2, pos=T, mm=0) mymatchDF <- enrichTest(myseq=myseq, mymatchDF=mymatchDF, xmer=6) mymatchDF_filter <- mymatchDF[mymatchDF$adj_pval <= cutoff, ] cont <- rbind(cont, mymatchDF_filter) cat("Iteration: ", i, ", N rows: ", length(cont[,1]), "\n", sep="") } return(cont) } cat("\n# (5.1) Run Genome-Wide Analysis: \n\t finalDF <- metaAnalysis(seq=as.character(myseq$seq), myDB=myDB, cutoff=0.05) \n\n") ## How to run meta analysis: ## Generate motif database ## Import genome sequences: ftp://ftp.arabidopsis.org/home/tair/Sequences/blast_datasets/TAIR7_blastsets/TAIR7_upstream_1000_20070405 ## Run metaAnalysis function