########################################################
### R Script to Translate DNA into Protein Sequences ###
########################################################
# Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside
# Last update: Nov 26, 2006
# Utility: 
#	Translates one or many DNA sequences in all six reading frames into proteins
#	Export of results in FASTA format

# How to run the script:
# 	source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/translateDNA.R")

##################################
## (1) Sequence Import Function ##
##################################
seq_imp_fct <- function(fileloc) {
	my_fasta <- readLines(fileloc) # reads file line-wise into vector 
	y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign 
	y <- as.vector(y);  y[y==-1] <- 0
	index <- which(y==0)
	distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)])
	distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry 
	distance <- data.frame(distance, dist=distance[,2]-distance[,1])
	seq_no <- 1:length(y[y==0])
	index <- rep(seq_no, as.vector(distance[,3]))
	my_fasta <- data.frame(index, y, my_fasta)
	my_fasta[my_fasta[,2]==0,1] <- 0
	seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F)
	seq <- as.vector(seq[2:length(seq)])
	Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3])
	ID <- gsub("^>| .*", "", as.character(Desc), perl=T)
	Desc <- gsub("^.*? ", "", as.character(Desc), perl=T)
	my_fasta <- data.frame(ID, Desc, Length=nchar(seq), seq)
	my_fasta
}	
cat("\n# (1.1) Import 2058 ORFs from Halobacterium into R like this: \n\t myseq <- seq_imp_fct(\"ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Halobacterium_sp/AE004437.ffn\")\n")
cat("\n# (1.2) Import codon table:\n\t AAdf <- read.table(file=\"http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/AA.txt\", header=T, sep=\"\\t\")\n") 

###############################
## (2) Define translateDNA() ##
###############################
# (a) Translate a single DNA sequence
translate <- function(mystr="TGGCTATCCCATGATTTTCTCCAC", frame, pepCode=pepCode) { # argument "pepCode" can have the values 3 or 4; the argument "frame" can have one integer between 1 and 6.
	mystr <- gsub("(...)", "\\1_", mystr) # inserts '_' between all characters in sequence string (required for step).
	mystr <- unlist(strsplit(mystr , "_")) # generates vectorized triplet sequence
	mystr <- mystr[grep("^...$", mystr)]
	mystr <- data.frame(Order=1:length(mystr), Seq=mystr)
	if(frame==1|frame==2|frame==3) {
		mystr <- merge(mystr, AAdf, by.x="Seq", by.y="Codon", all.x=T) # Merges codon table with input sequence. The order column is used in the next step to sort back to the initial order in the sequence string.
		mystr[,pepCode] <- as.character(mystr[,pepCode])
		mystr[,pepCode][is.na(mystr[,pepCode])] <- "X" # assigns X's to triplets with unusual characters (e.g. N's)
		mystr <- paste(as.vector(mystr[order(mystr[,2]), pepCode]), collapse="")
	}
	if(frame==4|frame==5|frame==6) {
		mystr <- merge(mystr, AAdf, by.x="Seq", by.y="AntiCodon", all.x=T) # Merges codon table with input sequence. The order column is used in the next step to sort back to the initial order in the sequence string.
		mystr <- mystr[,c(1,2,4,5,6,3)] # obtains same column order as for frames 1-3
		mystr[,pepCode] <- as.character(mystr[,pepCode])
		mystr[,pepCode][is.na(mystr[,pepCode])] <- "X" # assigns X's to triplets with unusual characters (e.g. N's)
		mystr <- paste(as.vector(mystr[rev(order(mystr[,2])), pepCode]), collapse="") # generates reverse protein sequence!!!
	}
	mystr
}
# (b) Transform input DNA into 6 reading frames (list object) and translate them with the translate() function.
translateDNA <- function(myseq=myseq, frame=c(1,2,3,4,5,6), pepCode="single") { # argument frame allows to select any combination of the 6 reading frames 
	if(pepCode=="single") { pepCode <- 3 } 
	if(pepCode=="triple") { pepCode <- 4 }
	myseq <- gsub("([A-Z])", "\\U\\1", as.character(myseq), perl=T, ignore.case=T) # takes care of case inconsistencies
	frame1 <- myseq
	frame2 <- gsub("^.", "", myseq)
	frame3 <- gsub("^..", "", myseq)
	myl <- list(frame1=frame1, frame2=frame2, frame3=frame3, frame4=frame1, frame5=frame2, frame6=frame3)
	# The next line runs translate() only on the reading frames that are specified by the provided 'frame' argument
	myl <- lapply(names(myl)[frame], function(x) translate(mystr=myl[x], frame=which(names(myl) %in% x), pepCode=pepCode))
	names(myl) <- paste(rep("frame", length(frame)), frame, sep="")
	myl
}
cat("\n# (2.1) Translate sample DNAs in all 6 reading frames:\n\t pep_list <- apply(data.frame(myseq[1:10,4]), 1, function(x) translateDNA(myseq=x, frame=c(1,2,3,4,5,6), pepCode=\"single\"))\n")
cat("\n# (2.2) Assign names to protein sequences (e.g. names of DNA sequences):\n\t names(pep_list) <- myseq[1:10,1]\n\t pep_list <- as.list(unlist(pep_list))\n")

##########################################################
## (3) Export protein list to text file in fasta format ##
##########################################################
seql2fasta <- function(seql=pep_list, myfile="myfile") {
        names(seql) <- paste(">", names(seql), sep="")
        export <- data.frame(IDs=names(seql), seq=unlist(seql))
        export <- as.vector(unlist(t(export)))
        write.table(export, file=paste(myfile, ".txt", sep=""), quote=F, row.names=F, col.names=F)
}
cat("\n# (3.1) Export protein sequences in fasta format:\n\t seql2fasta(seql=pep_list, myfile=\"myfile\")\n")