######################################################## ### R Script to Translate DNA into Protein Sequences ### ######################################################## # Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside # Last update: Nov 26, 2006 # Utility: # Translates one or many DNA sequences in all six reading frames into proteins # Export of results in FASTA format # How to run the script: # source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/translateDNA.R") ################################## ## (1) Sequence Import Function ## ################################## seq_imp_fct <- function(fileloc) { my_fasta <- readLines(fileloc) # reads file line-wise into vector y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign y <- as.vector(y); y[y==-1] <- 0 index <- which(y==0) distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)]) distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry distance <- data.frame(distance, dist=distance[,2]-distance[,1]) seq_no <- 1:length(y[y==0]) index <- rep(seq_no, as.vector(distance[,3])) my_fasta <- data.frame(index, y, my_fasta) my_fasta[my_fasta[,2]==0,1] <- 0 seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F) seq <- as.vector(seq[2:length(seq)]) Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3]) ID <- gsub("^>| .*", "", as.character(Desc), perl=T) Desc <- gsub("^.*? ", "", as.character(Desc), perl=T) my_fasta <- data.frame(ID, Desc, Length=nchar(seq), seq) my_fasta } cat("\n# (1.1) Import 2058 ORFs from Halobacterium into R like this: \n\t myseq <- seq_imp_fct(\"ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Halobacterium_sp/AE004437.ffn\")\n") cat("\n# (1.2) Import codon table:\n\t AAdf <- read.table(file=\"http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/AA.txt\", header=T, sep=\"\\t\")\n") ############################### ## (2) Define translateDNA() ## ############################### # (a) Translate a single DNA sequence translate <- function(mystr="TGGCTATCCCATGATTTTCTCCAC", frame, pepCode=pepCode) { # argument "pepCode" can have the values 3 or 4; the argument "frame" can have one integer between 1 and 6. mystr <- gsub("(...)", "\\1_", mystr) # inserts '_' between all characters in sequence string (required for step). mystr <- unlist(strsplit(mystr , "_")) # generates vectorized triplet sequence mystr <- mystr[grep("^...$", mystr)] mystr <- data.frame(Order=1:length(mystr), Seq=mystr) if(frame==1|frame==2|frame==3) { mystr <- merge(mystr, AAdf, by.x="Seq", by.y="Codon", all.x=T) # Merges codon table with input sequence. The order column is used in the next step to sort back to the initial order in the sequence string. mystr[,pepCode] <- as.character(mystr[,pepCode]) mystr[,pepCode][is.na(mystr[,pepCode])] <- "X" # assigns X's to triplets with unusual characters (e.g. N's) mystr <- paste(as.vector(mystr[order(mystr[,2]), pepCode]), collapse="") } if(frame==4|frame==5|frame==6) { mystr <- merge(mystr, AAdf, by.x="Seq", by.y="AntiCodon", all.x=T) # Merges codon table with input sequence. The order column is used in the next step to sort back to the initial order in the sequence string. mystr <- mystr[,c(1,2,4,5,6,3)] # obtains same column order as for frames 1-3 mystr[,pepCode] <- as.character(mystr[,pepCode]) mystr[,pepCode][is.na(mystr[,pepCode])] <- "X" # assigns X's to triplets with unusual characters (e.g. N's) mystr <- paste(as.vector(mystr[rev(order(mystr[,2])), pepCode]), collapse="") # generates reverse protein sequence!!! } mystr } # (b) Transform input DNA into 6 reading frames (list object) and translate them with the translate() function. translateDNA <- function(myseq=myseq, frame=c(1,2,3,4,5,6), pepCode="single") { # argument frame allows to select any combination of the 6 reading frames if(pepCode=="single") { pepCode <- 3 } if(pepCode=="triple") { pepCode <- 4 } myseq <- gsub("([A-Z])", "\\U\\1", as.character(myseq), perl=T, ignore.case=T) # takes care of case inconsistencies frame1 <- myseq frame2 <- gsub("^.", "", myseq) frame3 <- gsub("^..", "", myseq) myl <- list(frame1=frame1, frame2=frame2, frame3=frame3, frame4=frame1, frame5=frame2, frame6=frame3) # The next line runs translate() only on the reading frames that are specified by the provided 'frame' argument myl <- lapply(names(myl)[frame], function(x) translate(mystr=myl[x], frame=which(names(myl) %in% x), pepCode=pepCode)) names(myl) <- paste(rep("frame", length(frame)), frame, sep="") myl } cat("\n# (2.1) Translate sample DNAs in all 6 reading frames:\n\t pep_list <- apply(data.frame(myseq[1:10,4]), 1, function(x) translateDNA(myseq=x, frame=c(1,2,3,4,5,6), pepCode=\"single\"))\n") cat("\n# (2.2) Assign names to protein sequences (e.g. names of DNA sequences):\n\t names(pep_list) <- myseq[1:10,1]\n\t pep_list <- as.list(unlist(pep_list))\n") ########################################################## ## (3) Export protein list to text file in fasta format ## ########################################################## seql2fasta <- function(seql=pep_list, myfile="myfile") { names(seql) <- paste(">", names(seql), sep="") export <- data.frame(IDs=names(seql), seq=unlist(seql)) export <- as.vector(unlist(t(export))) write.table(export, file=paste(myfile, ".txt", sep=""), quote=F, row.names=F, col.names=F) } cat("\n# (3.1) Export protein sequences in fasta format:\n\t seql2fasta(seql=pep_list, myfile=\"myfile\")\n")