###################################### # (A) Sequence Batch Import Function # ###################################### # This script provides the following functionality for sequence management and analysis: # (1) Batch sequence import into R data frame # (2) Motif searching with hit statistics # (3) Analysis of sequence composition # (4) All-against-all sequence comparisons # (5) Generation of phylogenetic trees # Download sample proteome from NCBI in fasta format. The chosen example is from Halobacterium sp. which contains 2058 proteins: myfileloc <- "ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Halobacterium_sp/AE004437.faa" # Larger example from Arabidopsis with 30690 proteins: # ftp://ftp.arabidopsis.org/home/tair/home/tair/Sequences/blast_datasets/TAIR6_pep_20051108 # (A1) Define Sequence Import Function 'seq_imp_fct' # This function does the following: # (a) reads fasta batch file into a single vector # (b) writes each sequence into a single field using a grouping vector # (c) provides length information for each sequence # (d) organizes description lines and sequences in a single data frame seq_imp_fct <- function(fileloc) { my_fasta <- readLines(fileloc) # reads file line-wise into vector y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign y <- as.vector(y); y[y==-1] <- 0 # The 'rep' function is used here to generate a grouping vector/column for writing each sequence into a single field # This 'indirect' approach runs much faster than the alternative 'for loop' below. index <- which(y==0) distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)]) distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry distance <- data.frame(distance, dist=distance[,2]-distance[,1]) seq_no <- 1:length(y[y==0]) index <- rep(seq_no, as.vector(distance[,3])) my_fasta <- data.frame(index, y, my_fasta) my_fasta[my_fasta[,2]==0,1] <- 0 # Alternative 'for loop' that generates above grouping function, but is 100 times slower # counter <- 0 # index <- NULL # for(i in y) { # if(i==1) { index <- c(index, counter) } else # { index <- c(index, 0); counter <- counter + 1 } # } # my_fasta <- data.frame(index, y, my_fasta) # Use the 'paste' function in a 'tapply' loop to write each sequence into a single field # This runs much faster than the alternative 'for loop' below. seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F) seq <- as.vector(seq[2:length(seq)]) # Alternative 'for loop' that does the same as above 'tapply' function, but is >20 times slower # seq <- NULL # for(i in 1:max(my_fasta[,1])) { # seq <- c(seq, paste(my_fasta[my_fasta[,1]==i,3], collapse="")) # } Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3]) my_fasta <- data.frame(Desc, Length=nchar(seq), seq) my_fasta } # (A2) Apply 'seq_imp_fct' to import Halobacterium proteome Halobact <- seq_imp_fct(fileloc=myfileloc) # Generate separate column with accession numbers. This regular expression needs to be adjusted for other title line formats Acc <- gsub(".*gb\\|(.*)\\|.*" , "\\1", as.character(Halobact[,1]), perl=T) # regular expression specific for this example Halobact <- data.frame(Acc, Halobact) ################################# # (B) Pattern Matching Function # ################################# # (B1) Define pattern function # This function counts the matches of a user defined pattern (reg expr) in a sequence data frame generated by above 'seq_imp_fct'. # In addition to the pattern counts, the function provides the positions of the pattern matches. pattern_fct <- function(pattern, sequences) { # expects sequences in fourth column of 'sequence' data frame pos <- gregexpr(pattern, as.character(sequences[,4]), perl=T) # 'gregexpr' returns list with pattern positions posv <- unlist(lapply(pos, paste, collapse=", ")); posv[posv==-1] <- 0 # retrieves positions where pattern matches in each sequence hitsv <- unlist(lapply(pos, function(x) if(x[1]==-1) { 0 } else { length(x) })) # counts the number of hits per sequence sequences <- data.frame(sequences[,1:3], Position=as.vector(posv), Hits=hitsv, sequences[,4]) sequences } # (B2) Apply pattern function patterndf <- pattern_fct(pattern="H..H{1,2}", sequences=Halobact) # Print a title message to explain the following output cat("These sequences in the Halobacterium proteome contain the motif \"H..H{1,2}\" more than two times:","\n") print(patterndf[patterndf[,5]>2, 1:5]) # Tag pattern in sequence by setting pattern to upper case and remaining sequence to lower case tag <- gsub("([A-Z])", "\\L\\1", as.character(patterndf[patterndf[,5]>2, 6]), perl=T, ignore.case=T) tag <- gsub("(H..H{1,2})", "\\U\\1", as.character(tag), perl=T, ignore.case=T) export <- data.frame(patterndf[patterndf[,5]>2,-6], tag) export <- data.frame(Acc=paste(">", export[,1], sep=""), seq=export[,6]) # Possibility to export any sequence subset in fasta format to an external file write.table(as.vector(as.character(t(export))), file="export.txt", quote=F, row.names=F, col.names=F) # Print a title message to explain the output cat("The sequences with the \"H..H{1,2}\" pattern in upper case have been written to a file 'export.txt' in your current working directory", "\n", "\n") # (B3) Check AA distribution in sequences AA <- data.frame(AA=c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")) AAstat <- lapply(strsplit(as.character(patterndf[patterndf[,5]>2, 6]),""), table) for(i in 1:length(AAstat)) { AAperc <- AAstat[[i]] / patterndf[patterndf[,5]>2, 3][i] *100 # calculates AA content in percent using length information AA <- merge(AA, as.data.frame(AAperc), by.x="AA", by.y="Var1", all=T) } names(AA)[2:length(AA)] <- as.vector(patterndf[patterndf[,5]>2, 1]) AASummary <- data.frame(AA, Mean=apply(AA[,2:length(AA)], 1, mean, na.rm=T)) for(i in 1:length(patterndf[patterndf[,5]>2, 3])) { patterndf[patterndf[,5]>2, 3][i] } # Print a title message to explain the following output cat("The identified sequences with two or more \"H..H{1,2}\" motifs have the following AA composition:", "\n") print(AASummary) cat("If you encounter in the next steps error messages, then Needle and/or Phylip may not be available on your system!!!!!!!", "\n") ###################################### # Run Needle in All-Against-All Mode # ###################################### # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # This part is for UNIX-like OSs that have EMBOSS installed # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # The following code runs the Needle program from EMBOSS in all-against-all mode # to generate a distance matrix based on scores obtained in each pairwise alignment. # The same code can be used to run Water, BLAST2 or other pairwise sequence analysis # programs in all-against-all mode. # In the last step the obtained distance matrix is used to plot a similarity tree in R. # The same matrix can also be used to calculate a phylogenetic tree with the Phylip # software (see below). patterndf <- patterndf[patterndf[,5]>2,] # start with above patterndf system("rm -f my_needle_file") for(i in 1:length(patterndf[,1])) { cat(as.character(paste(">", as.vector(patterndf[i,1]),sep="")), as.character(as.vector(patterndf[i,6])), file="file1", sep="\n") for(j in 1:length(patterndf[,1])) { cat(as.character(paste(">", as.vector(patterndf[j,1]),sep="")), as.character(as.vector(patterndf[j,6])), file="file2", sep="\n") system("needle file1 file2 stdout -gapopen 10.0 -gapextend 0.5 >> my_needle_file") } } # Import needle results and plot them as tree score <- readLines("my_needle_file") score <- score[grep("^# Score", score, perl=T)] score <- gsub(".* " , "", as.character(score), perl=T) score <- as.numeric(score) scorem <- matrix(score, length(patterndf[,1]), length(patterndf[,1]), dimnames=list(as.vector(patterndf[,1]),as.vector(patterndf[,1]))) scorem <- as.dist(1/scorem) hc <- hclust(scorem, method="complete") plot(hc, hang = -1, main="Distance Tree Based on Needle All-Against-All Comparison") # Print a title message to explain the generated output cat("The created all-against-all distance matrix ('scorem') of Needle scores is plotted in form of a distance tree.", "\n") ################### # PHYLIP Analysis # ################### # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # This part is for UNIX-like OSs that have PHYLIP installed # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # In this step the obtained distance matrix is exported in Phylip format to a file named 'infile'. # This file is used to generate a phylogenetic tree with the 'neighbor' program in Phylip. After this # step the retree module is used to view and edit the tree. scorem <- matrix(score, length(patterndf[,1]), length(patterndf[,1]), dimnames=list(as.vector(patterndf[,1]),as.vector(patterndf[,1]))) z <- 1/scorem zv <- as.vector(z) zv[seq(1,64, by=9)] <- 0 z <- matrix(zv, 8, 8, dimnames=list(names(as.data.frame(scorem)), names(as.data.frame(scorem)))) z <- round(z, 7) write.table(as.data.frame(z), file="infile", quote=F, col.names=F, sep="\t") # Print a title message to explain the generated output cat("The distance matrix was exported in Phylip format to a file named 'infile'. Follow the instructions in the next comment lines to continue with the Phylip anaylsis.", "\n") # system("vim infile") # add number of sequences in first line # system("phylip neighbor") # choose: r, n, y # system("mv -f outtree intree") # system("phylip retree") # choose: y, m, x, y, r