##################################################################
### R Script for Pattern Searching in DNA or Protein Sequences ###
##################################################################
# Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside
# Last update: Nov 22, 2006
# Utility:
# Sequence import function
# Reverse and complement function
# Pattern searching function
# Positional parsing
# HTML and TEXT report of results
# How to run the script:
# source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/patternSearch.R")
##################################
## (1) Sequence Import Function ##
##################################
seq_imp_fct <- function(fileloc) {
my_fasta <- readLines(fileloc) # reads file line-wise into vector
y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign
y <- as.vector(y); y[y==-1] <- 0
index <- which(y==0)
distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)])
distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry
distance <- data.frame(distance, dist=distance[,2]-distance[,1])
seq_no <- 1:length(y[y==0])
index <- rep(seq_no, as.vector(distance[,3]))
my_fasta <- data.frame(index, y, my_fasta)
my_fasta[my_fasta[,2]==0,1] <- 0
seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F)
seq <- as.vector(seq[2:length(seq)])
Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3])
ID <- gsub("^>| .*", "", as.character(Desc), perl=T)
Desc <- gsub("^.*? ", "", as.character(Desc), perl=T)
my_fasta <- data.frame(ID, Desc, Length=nchar(seq), seq)
my_fasta
}
cat("\n# (1.1) Import sequences into R like this: \n\t myseq <- seq_imp_fct(\"http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/Samples/sampleDNA.txt\")\n")
cat("\n# (1.2) Run this command to set all sequences to upper case since the function gregexpr() is case sensitive! \n\t myseq$seq <- gsub(\"([A-Z])\", \"\\\\U\\\\1\", as.character(myseq$seq), perl=T, ignore.case=T)\n")
cat("\n# (1.3) To speed up the sequence loading process, save them as R object like this: \n\t save(myseq, file=\"myseq\")\n")
cat("\n# (1.4) Next time reload the sequences with this command: \n\t load(file=\"myseq\")\n")
############################################
## (2) Reverse and/or Complement Function ##
############################################
rev_comp_fct <- function(seq=mystr, rev=T, comp=T) {
if(rev==T) {
seq <- as.vector(unlist(strsplit(seq, split="")))
seq <- rev(seq)
seq <- paste(seq, collapse="")
}
if(comp==T) {
seq <- gsub("A", "1", seq, ignore.case = T)
seq <- gsub("T", "2", seq, ignore.case = T)
seq <- gsub("C", "3", seq, ignore.case = T)
seq <- gsub("G", "4", seq, ignore.case = T)
seq <- gsub("1", "T", seq, ignore.case = T)
seq <- gsub("2", "A", seq, ignore.case = T)
seq <- gsub("3", "G", seq, ignore.case = T)
seq <- gsub("4", "C", seq, ignore.case = T)
}
seq
}
cat("\n# (2) The reverse and/or complement of a sequence can be obtained like this: \n\t rev_comp_fct(seq=\"TCGATCGTACGTTCAGCT\", rev=T, comp=T)\n")
###################################
## (3) Pattern Matching Function ##
###################################
# This function counts the matches of a user defined pattern (reg expr) in a sequence data frame generated by above 'seq_imp_fct'.
# In addition to the pattern counts, the function provides the positions of the pattern matches.
# Define pattern
cat("\n# (3.1) Define segmented pattern in upper case like this:\n\t p1 <- \"GGTTCCA\"; p2 <- \"ACCAG\"; mind <- 10; maxd <- 25 \n\t\t # p1: first pattern segment; p2: second pattern segment; mind: min distance; maxd: max distance\n")
# Define pattern function
pattern_fct <- function(pattern, sequences=myseq, segmented=T, revcomp=F, genPos=T) { # expects sequences in fourth column of 'sequence' data frame
if(segmented==T & revcomp==F) {
pattern <- paste(p1, ".{", mind, ",", maxd, "}", p2, sep="")
cat("\n Your regular expression pattern looks like this: \n\t", pattern, "\n")
}
if(segmented==T & revcomp==T) {
p1 <- rev_comp_fct(seq=p1, rev=T, comp=T)
p2 <- rev_comp_fct(seq=p2, rev=T, comp=T)
pattern <- paste(p2, ".{", mind, ",", maxd, "}", p1, sep="")
cat("\n Your regular expression pattern looks like this: \n\t", pattern, "\n")
}
if(segmented==F) {
cat("\n Your regular expression pattern looks like this: \n\t", pattern, "\n")
}
pos <- gregexpr(pattern, as.character(sequences$seq), perl=T) # 'gregexpr' returns list with pattern positions
posS <- posS <- unlist(lapply(pos, function(x) paste(x, collapse=", "))) # retrieves positions where pattern matches in each sequence
posS[posS==-1] <- 0
if(genPos==T) {
posE <- unlist(lapply(pos, function(x) paste((as.vector(unlist(x)) + as.vector(unlist(attributes(x))) - 1), collapse=", ")))
posE[posS==0] <- 0
hitsv <- unlist(lapply(pos, function(x) if(x[1]==-1) { 0 } else { length(x) })) # counts the number of hits per sequence
posDF <- data.frame(ID=sequences[,1], hitsv, posS, posE, Length=sequences$Length)
posDF <- posDF[posDF[,2]>=1,]
posDF <- data.frame(
ID=rep(posDF[,1], posDF[,2]),
start=unlist(strsplit(as.character(posDF$posS), ", ")),
end=unlist(strsplit(as.character(posDF$posE), ", ")),
count=rep(posDF[,2], posDF[,2]),
seq_length=rep(posDF[,5], posDF[,2]))
pos_frame <<- posDF
cat("\n The object \"pos_frame\" has been generated.\n")
}
hitsv <- unlist(lapply(pos, function(x) if(x[1]==-1) { 0 } else { length(x) })) # counts the number of hits per sequence
sequences <- data.frame(sequences[,1:3], Position=as.vector(posS), Hits=hitsv, sequences$seq)
mypattern <<- pattern
cat("\n Number of sequences with at least one match:", sum(sequences$Hits>=1), "\n Total number of matches:", sum(sequences$Hits),"\n\n")
sequences
}
cat("\n# (3.2) Run pattern search: \n\t Qmyseq <- pattern_fct(pattern, sequences=myseq, segmented=T, revcomp=F, genPos=T) \n\t\t # The argument \"pattern\" allows to specify regular expression patterns directly when \"segmented=F\"\n")
#######################
## (4.1) HTML Report ##
#######################
html_report <- function(Qmyseq=Qmyseq, myfile="search", all=T, nhits) {
export <- Qmyseq[Qmyseq$Hits>=1,]
if(all!=T) { export <- export[1:nhits,] }
export_count <- length(export[,1])
seqhtml <- gsub(paste("(", mypattern, ")", sep=""), "\\1", as.character(export$seq), perl=T, ignore.case=T)
seqhtml <- paste(seqhtml, rep("
", length(seqhtml)))
export <- data.frame(export[,-6], seq=seqhtml)
export <- data.frame(Acc=paste(">", export[,1], sep=""), Stats=paste(paste("Seq_length:", export[,3]), paste(paste("No Hits:", export[,5]), "Hit Pos:", export[,4]), export[,2], sep=", "), seq=export[,6])
export <- data.frame(Acc=paste(export[,1], export[,2]), seq=export$seq)
export <- as.vector(unlist(t(export)))
export <- c(
"",
"
", paste("Pattern used for search:", mypattern), paste("Number of sequences with at least one match:", sum(Qmyseq$Hits>=1)), paste("Total number of matches:", sum(Qmyseq$Hits)), paste("Number of sequences exported to this page:", export_count, "", "", "") write.table(export, file=paste(myfile, ".html", sep=""), quote=F, row.names=F, col.names=F) } cat("\n# (4.1) Export results in HTML format: \n\t html_report(Qmyseq=Qmyseq, myfile=\"search\", all=T, nhits) \n\t\t # Settings to export only 10 first hits: \"all=F, nhits=10\".\n") ######################## ## (4.2) FASTA Report ## ######################## fasta_report <- function(Qmyseq=Qmyseq, myfile="search", all=T, nhits) { export <- Qmyseq[Qmyseq$Hits>=1,] if(all!=T) { export <- export[1:nhits,] } export_count <- length(export[,1]) export <- data.frame(Acc=paste(">", export[,1], sep=""), Stats=paste(paste("Seq_length:", export[,3]), paste(paste("No Hits:", export[,5]), "Hit Pos:", export[,4]), export[,2], sep=", "), seq=export[,6]) export <- data.frame(Acc=paste(export[,1], export[,2]), seq=export$seq) export <- as.vector(unlist(t(export))) export <- c( paste("Pattern used for search:", mypattern), paste("Number of sequences with at least one match:", sum(Qmyseq$Hits>=1)), paste("Total number of matches:", sum(Qmyseq$Hits)), paste("Number of sequences exported to this page:", export_count, "\n"), export) write.table(export, file=paste(myfile, ".txt", sep=""), quote=F, row.names=F, col.names=F) } cat("\n# (4.2) Export results in fasta format: \n\t fasta_report(Qmyseq=Qmyseq, myfile=\"search\", all=T, nhits) \n\t\t # Settings to export only 10 first hits: \"all=F, nhits=10\".\n") ################################### ## (5) Position Parsing Function ## ################################### # This step requires a position data frame of the below format, which is automatically generated by the pattern_fct() # function. The following example is for parsing ORFs from chromosomes. The ID numbers in the first colum # need to match the ones in myseq. Additional tracking columns can be appended at the end! Very long sequences like chromosomes # require a lot of memory for parsing (~5GB per 100 million bps). # ID start end count seq_length # 1 500 521 7852 30432563 # 2 500 521 4853 19705359 # 3 500 521 6048 23470805 # 4 500 521 4655 18585042 # 5 500 521 7072 26992728 # C 500 541 88 154478 # M 500 541 122 366924 # Define parse function parsePos <- function(myseql=myseq[,c(1,4)], parseIDs=pos_frame$ID, coor=pos_frame) { parsel <- as.list(as.character(myseql[,2])) names(parsel) <- myseql[,1] parsel <- parsel[unique(as.vector(parseIDs))] myids <- names(parsel) parsel <- lapply(parsel, strsplit, "") parsel <- lapply(parsel, unlist) exceedl <- apply(coor, 1, function(x) parsel[[x[1]]][as.numeric(x[2]):as.numeric(x[3])]) exceed <- lapply(exceedl, function(x) sum(is.na(x))) if(sum(unlist(exceed))>0) { bad_coor <- which(as.vector(unlist(exceed))>0) cat("Warning: the coordinates of the following sequences are off, which caused NA insertions:\n", myids[bad_coor], "\n") } parsel <- as.list(apply(coor[,1:3], 1, function(x) paste(parsel[[as.vector(x[1])]][as.numeric(x[2]):as.numeric(x[3])],collapse=""))) names(parsel) <- paste(coor[,1], "Seg", coor[,2], coor[,3], sep="_") parsel } cat("\n# (5.1) Parse coordinates:\n\t seql <- parsePos(myseql=myseq[,c(1,4)], parseIDs=pos_frame$ID, coor=pos_frame)\n") # Function to add up- or down-stream to position data frame getUpDown <- function(pos_frame=pos_frame, getUp=0, getDown=0) { pos_frame$start <- as.numeric(as.vector(pos_frame$start)) - getUp pos_frame$end <- as.numeric(as.vector(pos_frame$end)) + getDown pos_frame$start[pos_frame$start < 1] <- 1 pos_frame$end[pos_frame$end > pos_frame$seq_length] <- pos_frame$seq_length[pos_frame$end > pos_frame$seq_length] pos_frame } cat("\n# (5.2) Up- or down-stream regions can be retrieved by providing length info in getDown/getUp arguments: \n\t pos_frame <- getUpDown(pos_frame=pos_frame, getUp=0, getDown=0)\n") # Export sequence list to text file in fasta format seql2fasta <- function(seql=seql, myfile="myfile") { names(seql) <- paste(">", names(seql), sep="") export <- data.frame(IDs=names(seql), seq=unlist(seql)) export <- as.vector(unlist(t(export))) write.table(export, file=paste(myfile, ".txt", sep=""), quote=F, row.names=F, col.names=F) } cat("\n# (5.3) Export parse results in fasta format:\n\t seql2fasta(seql=seql, myfile=\"myfile\")\n")"), export, "