#################################################################################################### ### R & BioConductor Script for Trimming Adaptor and Low Quality Positions of 454 sRNA Sequences ### #################################################################################################### # Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside # Utility: This script brings 454 sequences into the proper orientation, trims off the # adaptor segments and tags the low quality calls (default score <14) with Ns. The # current draft implementation is not very fast. The processing of 50,000 sequences # will take 15-25 minutes. # Requirements: cross_match program from the Phred/Phrap/Consed package # Code tag where one can adjust filter settings: '### adjust' # How it works: # (A) Some format editing on the command-line # Requirements: # clean directory containing # (a) 454.seq and 454.seq.qual files # (b) The 2 adaptor files are exprected to be in parent directory under the names # adaptfor.txt # adaptrev.txt # Make sure the extension of *.qual matches the one expected by cross_match, e.g. # $ mv 454.qual 454.seq.qual # For the import of the *.qual data into R, the *.qual file needs to have spaces at end of lines: # $ perl -p -i -w -e 's/(^.*)/$1 /g' 454.seq.qual # (B) Processing in R # Run the following script from the directory with your 454 sequences like this: # source("454_sRNA_Trim.R") # or # R CMD BATCH --no-save 454_sRNA_Trim.R # Start of R script # (1) Split sequence batch into smaller sets of 50,000 per file # Define Sequence Import Function 'seq_imp_fct' seq_imp_fct <- function(fileloc) { my_fasta <- readLines(fileloc) # reads file line-wise into vector y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign y <- as.vector(y); y[y==-1] <- 0 index <- which(y==0) distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)]) distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry distance <- data.frame(distance, dist=distance[,2]-distance[,1]) seq_no <- 1:length(y[y==0]) index <- rep(seq_no, as.vector(distance[,3])) my_fasta <- data.frame(index, y, my_fasta) my_fasta[my_fasta[,2]==0,1] <- 0 seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F) seq <- as.vector(seq[2:length(seq)]) Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3]) my_fasta <- data.frame(Desc, Length=nchar(seq), seq) my_fasta } # Generate smaller sequence batches of max 50,000 sequences per file. This is required for cross_match seq <- seq_imp_fct(fileloc="454.seq") export_seq <- data.frame(Desc=seq[,1], seq=seq[,3]) slice <- data.frame(FROM=seq(1,length(seq[,1]), by=50000), TO=seq(1,length(seq[,1]), by=50000)+49999) slice[length(slice[,1]),2] <- length(seq[,1]) for (i in 1:length(slice[,1])) { write.table(as.vector(as.character(t(export_seq[slice[i,1]:slice[i,2],]))), file=paste(i, ".seq", sep=""), quote=F, row.names=F, col.names=F) } qual <- seq_imp_fct(fileloc="454.seq.qual") export_qual <- data.frame(Desc=qual[,1], seq=qual[,3]) for (i in 1:length(slice[,1])) { write.table(as.vector(as.character(t(export_qual[slice[i,1]:slice[i,2],]))), file=paste(i, ".seq.qual", sep=""), quote=F, row.names=F, col.names=F) } # (2) Run cross_match in 2 steps to tag forward adaptors with Ns and reverse adaptors with Xs # Run cross_match on all generated *.seq files: for (i in 1:length(slice[,1])) { if(i==1) { system(paste("rm -f ", "all.seq.screen", sep="")) } # removes in fist loop an already existing "all.seq.screen" file, since data slices will be appended to a file of this name # next import steps protect Ns that are inserted by 454.com for positions with score=0 seq <- seq_imp_fct(fileloc=paste(i, ".seq", sep="")) export_seq <- data.frame(Desc=seq[,1], seq=seq[,3]) export_seq$seq <- gsub("N", "Z", as.character(seq$seq)) write.table(as.vector(as.character(t(export_seq))), file=paste(i, ".seq", sep=""), quote=F, row.names=F, col.names=F) ### adjust system(paste("cross_match ", paste(i, ".seq ", sep=""), "../adaptfor.txt -minmatch 15 -minscore 14 -screen > screen.out", sep="")) # runs cross_match for forward adaptor only # next import steps tag forward adaptor with N's seq <- seq_imp_fct(fileloc=paste(i, ".seq.screen", sep="")) export_seq <- data.frame(Desc=seq[,1], seq=seq[,3]) export_seq$seq <- gsub("X", "N", as.character(seq$seq)) write.table(as.vector(as.character(t(export_seq))), file=paste(i, ".seq.screen", sep=""), quote=F, row.names=F, col.names=F) system(paste("mv ", paste(i, ".seq", sep=""), ".screen ", paste(i, ".seq", sep=""), sep="")) ### adjust system(paste("cross_match ", paste(i, ".seq ", sep=""), "../adaptrev.txt -minmatch 15 -minscore 14 -screen > screen.out", sep="")) # runs cross_match for reverse adaptor only, gets tagged with X's system(paste("cat ", paste(i, ".seq", sep=""), ".screen ", ">> ", "all.seq.screen", sep="")) system(paste("rm ", "-f ", paste(i, ".seq ", sep=""), paste(i, ".seq.qual ", sep=""), paste(i, ".seq.screen ", sep=""), paste(i, ".seq.log", sep=""), sep="")) } # (3) Import of cross_matched sequences into R # Import of 'all.seq.screen' seq <- seq_imp_fct(fileloc="all.seq.screen") cat("\n", "Sequence batch imported containing ", length(seq[,1]), " entries", "\n") # Tag forward adapter with "F" and reverse adapter with "R", and revert Zs (454 Ns) back to Ns seqedit <- gsub("N", "F", as.character(seq$seq)) seqedit <- gsub("X", "R", as.character(seqedit)) seqedit <- gsub("Z", "N", as.character(seqedit)) seq <- data.frame(seq[,-3], seq=seqedit) seq <- data.frame(Desc=gsub(" .*", "", as.character(seq[,1]), perl=T), seq[,-1]) qual <- seq_imp_fct(fileloc="454.seq.qual") cat("\n", "Quality batch imported containing ", length(qual[,1]), " entries", "\n") qual <- data.frame(qual[,-3], seq=gsub("[ ]{2,}", " ", as.character(qual$seq))) # qual[qual[,3]=="N",3] <- 0 # removed May 08, 06 qual <- data.frame(Desc=gsub(" .*", "", as.character(qual[,1]), perl=T), qual[,-1]) # (4) For larger data sets, loop over slices of 10,000 sequences # In this loop the sequences are (1) vector trimmed, (2) quality trimmed and (3) orientation adjusted. slicedf <- data.frame(FROM=seq(1, length(seq[,1]), by=10000), TO=c(seq(0, length(seq[,1]), by=10000)[2:((as.integer(length(seq[,1])/10000))+1)], length(seq[,1]))) appendDF <- data.frame(Seq=NULL, Phred=NULL) for(i in 1:length(slicedf[,1])){ cat("\n", "Start of 10K slice no: ", i, "\n") seqslice <- seq[slicedf[i,1]:slicedf[i,2],] qualslice <- qual[slicedf[i,1]:slicedf[i,2],] # Write sequences and qual data in vectorized format into list files seqslicel <- strsplit(as.character(seqslice[,3]),"") qualslicel <- strsplit(as.character(qualslice[,3])," ") names(seqslicel) <- seqslice[,1] names(qualslicel) <- qualslice[,1] qualslicel <- lapply(qualslicel, as.numeric) # Replaces all NTs with Phred < 14 by Ns; this includes vector/adaptor positions ### adjust seqtriml <- lapply(names(seqslicel), function(x) replace(seqslicel[[x]], which(qualslicel[[x]]<14), "N")) names(seqtriml) <- names(seqslicel) # Convert all Ns that fall into adaptor regions back to Fs and Rs seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="F"), "F")) names(seqtriml) <- names(seqslicel) seqtriml <- lapply(names(seqtriml), function(x) replace(seqtriml[[x]], which(seqslicel[[x]]=="R"), "R")) names(seqtriml) <- names(seqslicel) # To determine orientation of sequences, generate data frame with adaptor positions zzz <- lapply(seqtriml, paste, collapse="") posF <- lapply(names(zzz), function(x) gregexpr("(F){1,}", as.character(zzz[[x]]), perl=T)) posR <- lapply(names(zzz), function(x) gregexpr("(R){1,}", as.character(zzz[[x]]), perl=T)) names(posF) <- names(seqtriml); names(posR) <- names(seqtriml) myposF <- unlist(lapply(names(posF), function(x) as.vector(unlist(posF[[x]][[1]]))[1])) myposR <- unlist(lapply(names(posR), function(x) as.vector(unlist(posR[[x]][[1]]))[1])) myposF[myposF==-1] <- 0; myposR[myposR==-1] <- 0 mylengthF <- lapply(names(posF), function(x) as.vector(unlist(attributes(posF[[x]][[1]])))[1]) mylengthF <- unlist(mylengthF) mylengthF[mylengthF==-1] <- 0 mylengthR <- lapply(names(posR), function(x) as.vector(unlist(attributes(posR[[x]][[1]])))[1]) mylengthR <- unlist(mylengthR) mylengthR[mylengthR==-1] <- 0 orientationDF <- data.frame(F1=myposF, F2=myposF+mylengthF-1, R1=myposR, R2=myposR+mylengthR-1, Length=seqslice[,2]) orientationDF <- data.frame(orientationDF, Orient=(orientationDF[,3]-orientationDF[,2])/abs(orientationDF[,3]-orientationDF[,2])) # The following steps are for sequences that contain only one adaptor. This approximation seems to give fairly good results. orientationDF[orientationDF[,2]==-1 & orientationDF[,3] < (orientationDF[,5]/2), 6] <- -1 orientationDF[orientationDF[,4]==-1 & orientationDF[,1] < ((orientationDF[,5]/2)*0.4), 6] <- 1 orientationDF <- data.frame(ID=names(seqtriml), orientationDF) # Generate data frame with insert parsing positions zzz <- lapply(seqtriml, paste, collapse="") # fix for gregexpr version change between R 2.2.0 and 2.3.0 # R version 2.0.0:# pos <- lapply(names(zzz), function(x) gregexpr("[ATGCN]{1,}", as.character(zzz[[x]]), perl=T)) pos <- lapply(names(zzz), function(x) gregexpr("[^ATGCN][ATGCN]+", paste("", as.character(zzz[[x]])), perl=T)) names(pos) <- names(seqtriml) mypos <- as.vector(unlist(pos)) mypos[mypos==-1] <- 0 mylength <- lapply(names(pos), function(x) as.vector(unlist(attributes(pos[[x]][[1]])))) mylength <- unlist(mylength) # fix for gregexpr version change between R 2.2.0 and 2.3.0 mylength[mylength==-1] <- 0 mylength <- mylength-1 mylength[mylength==-1] <- 0 count <- lapply(names(pos), function(x) length(pos[[x]][[1]])) count <- as.vector(unlist(count)) IDs <- rep(names(pos), count) InsertFrame <- data.frame(ID=IDs, UniID=paste(IDs, unlist(sapply(count, function(x) seq(1,x))), sep="."), InsertCount=rep(count,count), Pos=mypos, Length=mylength) InsertFrame <- merge(InsertFrame, orientationDF, by.x="ID", by.y="ID", all.x=T) # Position parsing of inserts using InsertFrame InsertFrame <- data.frame(InsertFrame, End=InsertFrame[,4]+InsertFrame[,5]-1) InsertFrame <- InsertFrame[,c(1:5, length(InsertFrame), 6:(length(InsertFrame)-1))] InsertFrame <- InsertFrame[!InsertFrame$Pos==0, ] # added May 8, 06 to remove no-insert sequences mylist <- apply(InsertFrame[,c(4,6)], 1, list) mylist <- lapply(mylist, function(x) as.vector(as.matrix(unlist(x)))) names(mylist) <- as.vector(InsertFrame$UniID) myindex <- as.vector(InsertFrame$ID) seqtriml_long <- seqtriml[myindex]; names(seqtriml_long) <- as.vector(InsertFrame$UniID) # creates long version of 'seqtriml' that contains entries with several inserts in duplicates quall_long <- qualslicel[myindex]; names(quall_long) <- as.vector(InsertFrame$UniID) # creates long version of 'quall' that contains entries with several inserts in duplicates insert_list <- lapply(names(mylist), function(x) seqtriml_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])]) insert_qual_list <- lapply(names(mylist), function(x) quall_long[[x]][(mylist[[x]][1]):(mylist[[x]][2])]) names(insert_list) <- as.vector(InsertFrame$UniID) names(insert_qual_list) <- as.vector(InsertFrame$UniID) # Reverse and complement of antisense sequences insert_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_list[[x]])) insert_list_rev <- lapply(insert_list_rev, paste, collapse="") names(insert_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2]) insert_list_rev <- lapply(insert_list_rev, function(x) gsub("A", "1", x)) insert_list_rev <- lapply(insert_list_rev, function(x) gsub("T", "2", x)) insert_list_rev <- lapply(insert_list_rev, function(x) gsub("C", "3", x)) insert_list_rev <- lapply(insert_list_rev, function(x) gsub("G", "4", x)) insert_list_rev <- lapply(insert_list_rev, function(x) gsub("1", "T", x)) insert_list_rev <- lapply(insert_list_rev, function(x) gsub("2", "A", x)) insert_list_rev <- lapply(insert_list_rev, function(x) gsub("3", "G", x)) insert_list_rev <- lapply(insert_list_rev, function(x) gsub("4", "C", x)) insert_qual_list_rev <- lapply(as.vector(InsertFrame[InsertFrame$Orient==-1,2]), function(x) rev(insert_qual_list[[x]])) names(insert_qual_list_rev) <- as.vector(InsertFrame[InsertFrame$Orient==-1,2]) insert_qual_list_rev <- lapply(insert_qual_list_rev, paste, collapse=" ") # Combine everything in final sequence object insert_list_for <- insert_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])] insert_list_for <- lapply(insert_list_for, paste, collapse="") insert_list_final <- c(insert_list_rev, insert_list_for) insert_list_final <- insert_list_final[names(insert_list)] insert_qual_list_for <- insert_qual_list[as.vector(InsertFrame[InsertFrame$Orient==1,2])] insert_qual_list_for <- lapply(insert_qual_list_for, paste, collapse=" ") insert_qual_list_final <- c(insert_qual_list_rev, insert_qual_list_for) insert_qual_list_final <- insert_qual_list_final[names(insert_list)] # Remove terminal Ns. Most of the code here is to adjust qual data set. tempDF <- data.frame(as.data.frame(unlist(insert_list_final)), as.data.frame(unlist(insert_qual_list_final))) names(tempDF)[1:2] <- c("Seq", "Phred") remove_start <- nchar(as.character(tempDF$Seq)) - nchar(gsub("^N{1,}", "", as.character(tempDF$Seq), perl=T)) remove_end <- nchar(as.character(tempDF$Seq)) - nchar(gsub("N{1,}$", "", as.character(tempDF$Seq), perl=T)) NremoveDF <- data.frame(remove_start, remove_end) tempDF2 <- data.frame(row.names=row.names(tempDF), Seq=gsub("^N{1,}|N{1,}$", "", as.character(tempDF$Seq), perl=T), tempDF[,2]) qual_removel <- strsplit(as.character(tempDF2[,2]), " ") names(qual_removel) <- row.names(tempDF2) NremoveDF <- data.frame(remove_start, remove_end) Nremovel <- apply(NremoveDF, 1, list) Nremovel <- lapply(Nremovel, function(x) as.vector(as.matrix(unlist(x)))) names(Nremovel) <- row.names(tempDF2) qual_removel <- lapply(names(qual_removel), function(x) qual_removel[[x]][(1+Nremovel[[x]][1]):(length(as.vector(qual_removel[[x]]))-Nremovel[[x]][2])]) qual_removel <- lapply(qual_removel, paste, collapse=" ") tempDF <- data.frame(row.names=row.names(tempDF2), Seq=tempDF2[,1], Phred=unlist(qual_removel)) appendDF <- rbind(appendDF, tempDF) cat("\n", "End of 10K slice no: ", i, "\n") } # (5) Create final result data frame finalDF <- appendDF zzz <- gregexpr("N", as.character(finalDF[,1]), perl=T) Ncount <- lapply(zzz, function(x) length(unlist(x))); Ncount <- unlist(Ncount) # Obtain Ncount: for_loop here necessary for very large sequence sets (>100,000) names(zzz) <- rownames(finalDF) incDF <- data.frame(FROM=seq(1, length(zzz), by=10000), TO=c(seq(0, length(zzz), by=10000)[2:((as.integer(length(zzz)/10000))+1)], length(zzz))) Ncountfix <- NULL for(i in 1:length(incDF[,1])) { Ncountfix <- c(Ncountfix, unlist(lapply(names(zzz[incDF[i,1]:incDF[i,2]]), function(x) as.vector(unlist(zzz[[x]][[1]]))[1]))) cat("\n", "N count loop no: ", i, "\n") } Ncount[which(Ncountfix==-1)] <- 0 finalDF <- data.frame(finalDF, InsertLength=nchar(as.character(finalDF[,1])), mPhred=as.vector(apply(finalDF, 1, function(x) mean(as.numeric(unlist(strsplit((as.character(x[2]))," ")))))), Ncount=Ncount ) names(finalDF)[1:2] <- c("Seq", "Phred") finalDF <- data.frame(ID=rownames(finalDF), finalDF) # (6) Filtering steps # Remove all sequences that have insert lengths of less than 15bp ### adjust finalDF <- finalDF[finalDF$InsertLength>=15,] # Remove inserts with >=20% Ns ### adjust finalDF <- finalDF[(finalDF$Ncount/finalDF$InsertLength)<=0.2,] # Adjust name extensions to filter results namev <- gsub("\\..*$", "", as.character(rownames(finalDF)), perl=T) namev <- paste(sort(namev), unlist(lapply(as.vector(table(sort(namev))), function(x) 1:x)), sep=".") finalDF <- data.frame(finalDF, sort=1:length(finalDF[,1])) finalDF <- data.frame(ID=sort(namev), finalDF[order(rownames(finalDF)),-1]) finalDF <- finalDF[order(finalDF$sort),] # Export sequences and qual data to two fasta batch files export_seq <- data.frame(Acc=paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" "), seq=finalDF[,2]) write.table(as.vector(as.character(t(export_seq))), file="final_seq.txt", quote=F, row.names=F, col.names=F) export_qual <- data.frame(Acc=paste(finalDF[,1], "InsertLength:", finalDF[,4], "mPhred:", round(finalDF[,5], 1), "Ncount", finalDF[,6], sep=" "), seq=finalDF[,3]) write.table(as.vector(as.character(t(export_qual))), file="final_qual.txt", quote=F, row.names=F, col.names=F) # # Optional steps # # Average Phred scores of inserts # mylist <- strsplit(as.character(finalDF[,3]), " "); mylist <- lapply(mylist, as.numeric); mean(unlist(lapply(mylist, mean)))