####################################################################################################
### R & BioConductor Script for Trimming Adaptor and Low Quality Positions of 454 sRNA Sequences ###
####################################################################################################
# Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside
# Utility: This script brings 454 sequences into the proper orientation, trims off the 
# adaptor segments and tags the low quality calls (default score <14) with Ns. The 
# current draft implementation is not very fast. The processing of 50,000 sequences 
# will take 15-25 minutes. 
# Requirements: cross_match program from the Phred/Phrap/Consed package
# Code tag where one can adjust filter settings: '### adjust'
# How it works:
# (A) Some format editing on the command-line
	# Requirements:
		# clean directory containing
			# (a) 454.seq and 454.seq.qual files
			# (b) The 2 adaptor files are exprected to be in parent directory under the names 
				# adaptfor.txt
				# adaptrev.txt
	# Make sure the extension of *.qual matches the one expected by cross_match, e.g.
	# $ mv 454.qual 454.seq.qual 
	# For the import of the *.qual data into R, the *.qual file needs to have spaces at end of lines:
	# $ perl -p -i -w -e 's/(^.*)/$1 /g' 454.seq.qual 
# (B) Processing in R
	# Run the following script from the directory with your 454 sequences like this:
	# source("454_sRNA_Trim.R")
	# 	or 
	# R CMD BATCH --no-save 454_sRNA_Trim.R

# Start of R script
# (1) Split sequence batch into smaller sets of 50,000 per file
	# Define Sequence Import Function 'seq_imp_fct'	
	seq_imp_fct <- function(fileloc) {
		my_fasta <- readLines(fileloc) # reads file line-wise into vector 
		y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign 
		y <- as.vector(y);  y[y==-1] <- 0
		index <- which(y==0)
		distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)])
		distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry 
		distance <- data.frame(distance, dist=distance[,2]-distance[,1])
		seq_no <- 1:length(y[y==0])
		index <- rep(seq_no, as.vector(distance[,3]))
		my_fasta <- data.frame(index, y, my_fasta)
		my_fasta[my_fasta[,2]==0,1] <- 0
		seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F)
		seq <- as.vector(seq[2:length(seq)])
		Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3])
		my_fasta <- data.frame(Desc, Length=nchar(seq), seq)
		my_fasta
	}	
	# Generate smaller sequence batches of max 50,000 sequences per file. This is required for cross_match	
	seq <- seq_imp_fct(fileloc="454.seq")
	export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
	slice <- data.frame(FROM=seq(1,length(seq[,1]), by=50000), TO=seq(1,length(seq[,1]), by=50000)+49999)
	slice[length(slice[,1]),2] <- length(seq[,1])
	for (i in 1:length(slice[,1])) {
		write.table(as.vector(as.character(t(export_seq[slice[i,1]:slice[i,2],]))), file=paste(i, ".seq", sep=""), quote=F, row.names=F, col.names=F)
	}
	qual <- seq_imp_fct(fileloc="454.seq.qual")
	export_qual <- data.frame(Desc=qual[,1], seq=qual[,3])
	for (i in 1:length(slice[,1])) {
		write.table(as.vector(as.character(t(export_qual[slice[i,1]:slice[i,2],]))), file=paste(i, ".seq.qual", sep=""), quote=F, row.names=F, col.names=F)
	}
	
# (2) Run cross_match in 2 steps to tag forward adaptors with Ns and reverse adaptors with Xs
#     Run cross_match on all generated *.seq files:
	for (i in 1:length(slice[,1])) {
		if(i==1) { system(paste("rm -f ", "all.seq.screen", sep="")) } # removes in fist loop an already existing "all.seq.screen" file, since data slices will be appended to a file of this name
		# next import steps protect Ns that are inserted by 454.com for positions with score=0
		seq <- seq_imp_fct(fileloc=paste(i, ".seq", sep=""))
		export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
		export_seq$seq <- gsub("N", "Z", as.character(seq$seq))
		write.table(as.vector(as.character(t(export_seq))), file=paste(i, ".seq", sep=""), quote=F, row.names=F, col.names=F)
		### adjust
    		system(paste("cross_match ",  paste(i, ".seq ", sep=""), "../adaptfor.txt -minmatch 15 -minscore 14 -screen > screen.out", sep="")) # runs cross_match for forward adaptor only
		# next import steps tag forward adaptor with N's
		seq <- seq_imp_fct(fileloc=paste(i, ".seq.screen", sep=""))
		export_seq <- data.frame(Desc=seq[,1], seq=seq[,3])
		export_seq$seq <- gsub("X", "N", as.character(seq$seq))
		write.table(as.vector(as.character(t(export_seq))), file=paste(i, ".seq.screen", sep=""), quote=F, row.names=F, col.names=F)
		system(paste("mv ", paste(i, ".seq", sep=""), ".screen ",  paste(i, ".seq", sep=""),  sep=""))
		### adjust
		system(paste("cross_match ",  paste(i, ".seq ", sep=""), "../adaptrev.txt -minmatch 15 -minscore 14 -screen > screen.out", sep="")) #  runs cross_match for reverse adaptor only, gets tagged with X's
		system(paste("cat ", paste(i, ".seq", sep=""), ".screen ", ">> ", "all.seq.screen", sep="")) 
		system(paste("rm ", "-f ", paste(i, ".seq ", sep=""), paste(i, ".seq.qual ", sep=""), paste(i, ".seq.screen ", sep=""), paste(i, ".seq.log", sep=""), sep=""))
	}
	
# (3) Import of cross_matched sequences into R
	# Import of 'all.seq.screen'
	seq <- seq_imp_fct(fileloc="all.seq.screen")
	cat("\n", "Sequence batch imported containing ", length(seq[,1]), " entries", "\n")
	# Tag forward adapter with "F" and reverse adapter with "R", and revert Zs (454 Ns) back to Ns
	seqedit <- gsub("N", "F", as.character(seq$seq))
	seqedit <- gsub("X", "R", as.character(seqedit))
	seqedit <- gsub("Z", "N", as.character(seqedit))
	seq <- data.frame(seq[,-3], seq=seqedit)
	seq <- data.frame(Desc=gsub(" .*", "", as.character(seq[,1]), perl=T), seq[,-1])
	qual <- seq_imp_fct(fileloc="454.seq.qual")
	cat("\n", "Quality batch imported containing ", length(qual[,1]), " entries", "\n")
	qual <- data.frame(qual[,-3], seq=gsub("[ ]{2,}", " ", as.character(qual$seq)))
	# qual[qual[,3]=="N",3] <- 0 # removed May 08, 06
	qual <- data.frame(Desc=gsub(" .*", "", as.character(qual[,1]), perl=T), qual[,-1])

# (4) For larger data sets, loop over slices of 10,000 sequences
# In this loop the sequences are (1) vector trimmed, (2) quality trimmed and (3) orientation adjusted. 
	slicedf <- data.frame(FROM=seq(1, length(seq[,1]), by=10000), 
		              TO=c(seq(0, length(seq[,1]), by=10000)[2:((as.integer(length(seq[,1])/10000))+1)], 
				      length(seq[,1])))
	appendDF <- data.frame(Seq=NULL, Phred=NULL)
	for(i in 1:length(slicedf[,1])){
		cat("\n", "Start of 10K slice no: ", i, "\n")
		seqslice <- seq[slicedf[i,1]:slicedf[i,2],]
		qualslice <- qual[slicedf[i,1]:slicedf[i,2],]
		# Write sequences and qual data in vectorized format into list files
		seqslicel <- strsplit(as.character(seqslice[,3]),"")
		qualslicel <- strsplit(as.character(qualslice[,3])," ")
		names(seqslicel) <- seqslice[,1]
		names(qualslicel) <- qualslice[,1]
		qualslicel <- lapply(qualslicel, as.numeric)
		# Replaces all NTs with Phred < 14 by Ns; this includes vector/adaptor positions
		### adjust
		seqtriml <- lapply(names(seqslicel), function(x) replace(seqslicel[[x]], which(qualslicel[[x]]<14), "N")) 
		names(seqtriml) <- names(seqslicel)
		# Convert all Ns that fall into adaptor regions back to Fs and Rs
		seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="F"), "F")) 
		names(seqtriml) <- names(seqslicel)
		seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="R"), "R")) 
		names(seqtriml) <- names(seqslicel)
		
		# To determine orientation of sequences, generate data frame with adaptor positions
		zzz <- lapply(seqtriml, paste, collapse="")
		posF <- lapply(names(zzz), function(x) gregexpr("(F){1,}", as.character(zzz[[x]]), perl=T))
		posR <- lapply(names(zzz), function(x) gregexpr("(R){1,}", as.character(zzz[[x]]), perl=T))
		names(posF) <- names(seqtriml); names(posR) <- names(seqtriml) 
		myposF <- unlist(lapply(names(posF), function(x) as.vector(unlist(posF[[x]][[1]]))[1]))
		myposR <- unlist(lapply(names(posR), function(x) as.vector(unlist(posR[[x]][[1]]))[1]))
		myposF[myposF==-1] <- 0; myposR[myposR==-1] <- 0
		mylengthF <- lapply(names(posF), function(x) as.vector(unlist(attributes(posF[[x]][[1]])))[1])
		mylengthF <- unlist(mylengthF)
		mylengthF[mylengthF==-1] <- 0
		mylengthR <- lapply(names(posR), function(x) as.vector(unlist(attributes(posR[[x]][[1]])))[1])
		mylengthR <- unlist(mylengthR)
		mylengthR[mylengthR==-1] <- 0
		orientationDF <- data.frame(F1=myposF, F2=myposF+mylengthF-1, R1=myposR, R2=myposR+mylengthR-1, Length=seqslice[,2])
		orientationDF <- data.frame(orientationDF, 
				Orient=(orientationDF[,3]-orientationDF[,2])/abs(orientationDF[,3]-orientationDF[,2]))
		# The following steps are for sequences that contain only one adaptor. This approximation seems to give fairly good results.
		orientationDF[orientationDF[,2]==-1 & orientationDF[,3] < (orientationDF[,5]/2), 6] <- -1
		orientationDF[orientationDF[,4]==-1 & orientationDF[,1] < ((orientationDF[,5]/2)*0.4), 6] <- 1
		orientationDF <- data.frame(ID=names(seqtriml), orientationDF)
		
		# Generate data frame with insert parsing positions
		zzz <- lapply(seqtriml, paste, collapse="")
		# fix for gregexpr version change between R 2.2.0 and 2.3.0
		# R version 2.0.0:# pos <- lapply(names(zzz), function(x) gregexpr("[ATGCN]{1,}", as.character(zzz[[x]]), perl=T))
		pos <- lapply(names(zzz), function(x) gregexpr("[^ATGCN][ATGCN]+", paste("", as.character(zzz[[x]])), perl=T))
		names(pos) <- names(seqtriml)
		mypos <- as.vector(unlist(pos))
		mypos[mypos==-1] <- 0
		mylength <- lapply(names(pos), function(x) as.vector(unlist(attributes(pos[[x]][[1]]))))
		mylength <- unlist(mylength)
		# fix for gregexpr version change between R 2.2.0 and 2.3.0
		mylength[mylength==-1] <- 0
		mylength <- mylength-1
		mylength[mylength==-1] <- 0
		count <- lapply(names(pos), function(x) length(pos[[x]][[1]]))
		count <- as.vector(unlist(count))
		IDs <- rep(names(pos), count)
		InsertFrame <- data.frame(ID=IDs, UniID=paste(IDs, unlist(sapply(count, function(x) seq(1,x))), sep="."), 
				InsertCount=rep(count,count), Pos=mypos, Length=mylength)
		InsertFrame <- merge(InsertFrame, orientationDF, by.x="ID", by.y="ID", all.x=T)
		
		# Position parsing of inserts using InsertFrame
		InsertFrame <- data.frame(InsertFrame, End=InsertFrame[,4]+InsertFrame[,5]-1)
		InsertFrame <- InsertFrame[,c(1:5, length(InsertFrame), 6:(length(InsertFrame)-1))]
		InsertFrame <- InsertFrame[!InsertFrame$Pos==0, ] # added May 8, 06 to remove no-insert sequences 
		mylist <- apply(InsertFrame[,c(4,6)], 1, list)
		mylist <- lapply(mylist, function(x) as.vector(as.matrix(unlist(x))))
		names(mylist) <- as.vector(InsertFrame$UniID)
		myindex <- as.vector(InsertFrame$ID)
		seqtriml_long <- seqtriml[myindex]; names(seqtriml_long) <- as.vector(InsertFrame$UniID) # creates long version of 'seqtriml' that contains entries with several inserts in duplicates
		quall_long <- qualslicel[myindex]; names(quall_long) <- as.vector(InsertFrame$UniID) # creates long version of 'quall' that contains entries with several inserts in duplicates
		insert_list <- lapply(names(mylist), function(x) seqtriml_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])])
		insert_qual_list <- lapply(names(mylist), function(x) quall_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])])
		names(insert_list) <- as.vector(InsertFrame$UniID)
		names(insert_qual_list) <- as.vector(InsertFrame$UniID)
		
		# Reverse and complement of antisense sequences
		insert_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_list[[x]]))
		insert_list_rev <- lapply(insert_list_rev, paste, collapse="")	
		names(insert_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2])
		insert_list_rev <- lapply(insert_list_rev, function(x) gsub("A", "1", x))
		insert_list_rev <- lapply(insert_list_rev, function(x) gsub("T", "2", x))
		insert_list_rev <- lapply(insert_list_rev, function(x) gsub("C", "3", x))
		insert_list_rev <- lapply(insert_list_rev, function(x) gsub("G", "4", x))
		insert_list_rev <- lapply(insert_list_rev, function(x) gsub("1", "T", x))
		insert_list_rev <- lapply(insert_list_rev, function(x) gsub("2", "A", x))
		insert_list_rev <- lapply(insert_list_rev, function(x) gsub("3", "G", x))
		insert_list_rev <- lapply(insert_list_rev, function(x) gsub("4", "C", x))
		insert_qual_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_qual_list[[x]]))
		names(insert_qual_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2])		
		insert_qual_list_rev <- lapply(insert_qual_list_rev, paste, collapse=" ")
		
		# Combine everything in final sequence object	
		insert_list_for <- insert_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])]
		insert_list_for <- lapply(insert_list_for, paste, collapse="")
		insert_list_final <- c(insert_list_rev, insert_list_for)
		insert_list_final <- insert_list_final[names(insert_list)]
		insert_qual_list_for <- insert_qual_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])] 
		insert_qual_list_for <- lapply(insert_qual_list_for, paste, collapse=" ")
		insert_qual_list_final <- c(insert_qual_list_rev, insert_qual_list_for)
		insert_qual_list_final <- insert_qual_list_final[names(insert_list)]
		
		# Remove terminal Ns. Most of the code here is to adjust qual data set.
		tempDF <- data.frame(as.data.frame(unlist(insert_list_final)), as.data.frame(unlist(insert_qual_list_final)))
		names(tempDF)[1:2] <- c("Seq", "Phred")
		remove_start <- nchar(as.character(tempDF$Seq)) - nchar(gsub("^N{1,}", "", as.character(tempDF$Seq), perl=T))
		remove_end <- nchar(as.character(tempDF$Seq)) - nchar(gsub("N{1,}$", "", as.character(tempDF$Seq), perl=T))
		NremoveDF <- data.frame(remove_start, remove_end)
		tempDF2 <- data.frame(row.names=row.names(tempDF), Seq=gsub("^N{1,}|N{1,}$", "", as.character(tempDF$Seq), perl=T), tempDF[,2])	
		qual_removel <- strsplit(as.character(tempDF2[,2]), " ")
		names(qual_removel) <- row.names(tempDF2)
		NremoveDF <- data.frame(remove_start, remove_end)
		Nremovel <- apply(NremoveDF, 1, list)
		Nremovel <- lapply(Nremovel, function(x) as.vector(as.matrix(unlist(x))))
		names(Nremovel) <-  row.names(tempDF2)		 
		qual_removel <- lapply(names(qual_removel), function(x) qual_removel[[x]][(1+Nremovel[[x]][1]):(length(as.vector(qual_removel[[x]]))-Nremovel[[x]][2])])
		qual_removel <- lapply(qual_removel, paste, collapse=" ")
		tempDF <- data.frame(row.names=row.names(tempDF2), Seq=tempDF2[,1], Phred=unlist(qual_removel)) 
		
		appendDF <- rbind(appendDF, tempDF)
		cat("\n", "End of 10K slice no: ", i, "\n")
	}
# (5) Create final result data frame 
	finalDF <- appendDF
	zzz <- gregexpr("N", as.character(finalDF[,1]), perl=T)
	Ncount <- lapply(zzz, function(x) length(unlist(x))); Ncount <- unlist(Ncount)
	# Obtain Ncount: for_loop here necessary for very large sequence sets (>100,000)
	names(zzz) <- rownames(finalDF)
	incDF <- data.frame(FROM=seq(1, length(zzz), by=10000), 
			TO=c(seq(0, length(zzz), by=10000)[2:((as.integer(length(zzz)/10000))+1)], length(zzz)))
	Ncountfix <- NULL
	for(i in 1:length(incDF[,1])) {
		Ncountfix <- c(Ncountfix, unlist(lapply(names(zzz[incDF[i,1]:incDF[i,2]]), function(x) as.vector(unlist(zzz[[x]][[1]]))[1])))
	cat("\n", "N count loop no: ", i, "\n")
	}
	Ncount[which(Ncountfix==-1)] <- 0
	
	finalDF <- data.frame(finalDF, 
			InsertLength=nchar(as.character(finalDF[,1])),
			mPhred=as.vector(apply(finalDF, 1, function(x) mean(as.numeric(unlist(strsplit((as.character(x[2]))," ")))))),
			Ncount=Ncount
		)
	names(finalDF)[1:2] <- c("Seq", "Phred")
	finalDF <- data.frame(ID=rownames(finalDF), finalDF)

# (6) Filtering steps
	# Remove all sequences that have insert lengths of less than 15bp
	### adjust
	finalDF <- finalDF[finalDF$InsertLength>=15,]
	# Remove inserts with >=20% Ns
	### adjust
	finalDF <- finalDF[(finalDF$Ncount/finalDF$InsertLength)<=0.2,]
	
	# Adjust name extensions to filter results
        namev <- gsub("\\..*$", "", as.character(rownames(finalDF)), perl=T) 
        namev <- paste(sort(namev), unlist(lapply(as.vector(table(sort(namev))), function(x) 1:x)), sep=".")
        finalDF <- data.frame(finalDF, sort=1:length(finalDF[,1]))
        finalDF <- data.frame(ID=sort(namev), finalDF[order(rownames(finalDF)),-1])
        finalDF <- finalDF[order(finalDF$sort),]
	
	# Export sequences and qual data to two fasta batch files
	export_seq <- data.frame(Acc=paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" "), seq=finalDF[,2])
	write.table(as.vector(as.character(t(export_seq))), file="final_seq.txt", quote=F, row.names=F, col.names=F)
	export_qual <- data.frame(Acc=paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" "), seq=finalDF[,3])
	write.table(as.vector(as.character(t(export_qual))), file="final_qual.txt", quote=F, row.names=F, col.names=F)

#	# Optional steps
#	# Average Phred scores of inserts
#	mylist <- strsplit(as.character(finalDF[,3]), " "); mylist <- lapply(mylist, as.numeric); mean(unlist(lapply(mylist, mean)))