######################################
# (A) Sequence Batch Import Function #
######################################
# This script provides the following functionality for sequence management and analysis:
#	(1) Batch sequence import into R data frame
#	(2) Motif searching with hit statistics
#	(3) Analysis of sequence composition
#	(4) All-against-all sequence comparisons
#	(5) Generation of phylogenetic trees

# Download sample proteome from NCBI in fasta format. The chosen example is from Halobacterium sp. which contains 2058 proteins:
myfileloc <- "ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Halobacterium_sp/AE004437.faa"
# Larger example from Arabidopsis with 30690 proteins:
# ftp://ftp.arabidopsis.org/home/tair/home/tair/Sequences/blast_datasets/TAIR6_pep_20051108

# (A1) Define Sequence Import Function 'seq_imp_fct'
# This function does the following:
#	(a) reads fasta batch file into a single vector
#	(b) writes each sequence into a single field using a grouping vector 
#	(c) provides length information for each sequence
#	(d) organizes description lines and sequences in a single data frame

seq_imp_fct <- function(fileloc) {
	my_fasta <- readLines(fileloc) # reads file line-wise into vector 
	y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign
	y <- as.vector(y);  y[y==-1] <- 0

	# The 'rep' function is used here to generate a grouping vector/column for writing each sequence into a single field
	# This 'indirect' approach runs much faster than the alternative 'for loop' below.
	index <- which(y==0)
	distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)])
	distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry 
	distance <- data.frame(distance, dist=distance[,2]-distance[,1])
	seq_no <- 1:length(y[y==0])
	index <- rep(seq_no, as.vector(distance[,3]))
	my_fasta <- data.frame(index, y, my_fasta)
	my_fasta[my_fasta[,2]==0,1] <- 0
	
	# Alternative 'for loop' that generates above grouping function, but is 100 times slower
	# counter <- 0
	# index <- NULL
	# for(i in y) {
	#	if(i==1) { index <- c(index, counter) } else 
	# 	{ index <- c(index, 0); counter <- counter + 1 }
	# }
	# my_fasta <- data.frame(index, y, my_fasta)

	# Use the 'paste' function in a 'tapply' loop to write each sequence into a single field
	# This runs much faster than the alternative 'for loop' below.
	seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F)
	seq <- as.vector(seq[2:length(seq)])
		
	# Alternative 'for loop' that does the same as above 'tapply' function, but is >20 times slower
	# seq <- NULL
	#	for(i in 1:max(my_fasta[,1])) {
	#	seq <- c(seq, paste(my_fasta[my_fasta[,1]==i,3], collapse=""))
	#	}
	Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3])
	my_fasta <- data.frame(Desc, Length=nchar(seq), seq)
	my_fasta
}
# (A2) Apply 'seq_imp_fct' to import Halobacterium proteome 
Halobact <- seq_imp_fct(fileloc=myfileloc)
# Generate separate column with accession numbers. This regular expression needs to be adjusted for other title line formats 
Acc <- gsub(".*gb\\|(.*)\\|.*" , "\\1", as.character(Halobact[,1]), perl=T) # regular expression specific for this example
Halobact <- data.frame(Acc, Halobact)

#################################
# (B) Pattern Matching Function #
#################################
# (B1) Define pattern function
# This function counts the matches of a user defined pattern (reg expr) in a sequence data frame generated by above 'seq_imp_fct'.
# In addition to the pattern counts, the function provides the positions of the pattern matches.
pattern_fct <- function(pattern, sequences) { # expects sequences in fourth column of 'sequence' data frame 
	pos <- gregexpr(pattern, as.character(sequences[,4]), perl=T) # 'gregexpr' returns list with pattern positions
	posv <- unlist(lapply(pos, paste, collapse=", ")); posv[posv==-1] <- 0 # retrieves positions where pattern matches in each sequence
	hitsv <- unlist(lapply(pos, function(x) if(x[1]==-1) { 0 } else { length(x) })) # counts the number of hits per sequence
	sequences <- data.frame(sequences[,1:3], Position=as.vector(posv), Hits=hitsv, sequences[,4])
	sequences
}
# (B2) Apply pattern function
patterndf <- pattern_fct(pattern="H..H{1,2}", sequences=Halobact)
# Print a title message to explain the following output
cat("These sequences in the Halobacterium proteome contain the motif \"H..H{1,2}\" more than two times:","\n")
print(patterndf[patterndf[,5]>2, 1:5])
# Tag pattern in sequence by setting pattern to upper case and remaining sequence to lower case
tag <- gsub("([A-Z])", "\\L\\1", as.character(patterndf[patterndf[,5]>2, 6]), perl=T, ignore.case=T)
tag <- gsub("(H..H{1,2})", "\\U\\1", as.character(tag), perl=T, ignore.case=T)
export <- data.frame(patterndf[patterndf[,5]>2,-6], tag)
export <- data.frame(Acc=paste(">", export[,1], sep=""), seq=export[,6])
# Possibility to export any sequence subset in fasta format to an external file
write.table(as.vector(as.character(t(export))), file="export.txt", quote=F, row.names=F, col.names=F)
# Print a title message to explain the output
cat("The sequences with the \"H..H{1,2}\" pattern in upper case have been written to a file 'export.txt' in your current working directory", "\n", "\n")

# (B3) Check AA distribution in sequences
AA <- data.frame(AA=c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"))
AAstat <- lapply(strsplit(as.character(patterndf[patterndf[,5]>2, 6]),""), table)
for(i in 1:length(AAstat)) {
	AAperc <- AAstat[[i]] / patterndf[patterndf[,5]>2, 3][i] *100 # calculates AA content in percent using length information  
	AA <- merge(AA, as.data.frame(AAperc), by.x="AA", by.y="Var1", all=T)
}
names(AA)[2:length(AA)] <- as.vector(patterndf[patterndf[,5]>2, 1])
AASummary <- data.frame(AA, Mean=apply(AA[,2:length(AA)], 1, mean, na.rm=T))
for(i in 1:length(patterndf[patterndf[,5]>2, 3])) {
	patterndf[patterndf[,5]>2, 3][i]
}
# Print a title message to explain the following output
cat("The identified sequences with two or more \"H..H{1,2}\" motifs have the following AA composition:", "\n")
print(AASummary)
cat("If you encounter in the next steps error messages, then Needle and/or Phylip may not be available on your system!!!!!!!", "\n")
	
######################################
# Run Needle in All-Against-All Mode #
######################################
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# This part is for UNIX-like OSs that have EMBOSS installed
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# The following code runs the Needle program from EMBOSS in all-against-all mode
# to generate a distance matrix based on scores obtained in each pairwise alignment.
# The same code can be used to run Water, BLAST2 or other pairwise sequence analysis
# programs in all-against-all mode. 
# In the last step the obtained distance matrix is used to plot a similarity tree in R.
# The same matrix can also be used to calculate a phylogenetic tree with the Phylip 
# software (see below).
patterndf <- patterndf[patterndf[,5]>2,] # start with above patterndf
system("rm -f my_needle_file")
for(i in 1:length(patterndf[,1])) {
 	cat(as.character(paste(">", as.vector(patterndf[i,1]),sep="")), as.character(as.vector(patterndf[i,6])), file="file1", sep="\n")
 	for(j in 1:length(patterndf[,1])) {	
 		cat(as.character(paste(">", as.vector(patterndf[j,1]),sep="")), as.character(as.vector(patterndf[j,6])), file="file2", sep="\n")
 		system("needle file1 file2 stdout -gapopen 10.0 -gapextend 0.5 >> my_needle_file")
 	}
}
 
# Import needle results and plot them as tree
score <- readLines("my_needle_file")
score <- score[grep("^# Score", score, perl=T)]
score <- gsub(".* " , "", as.character(score), perl=T)
score <- as.numeric(score)
scorem <- matrix(score, length(patterndf[,1]), length(patterndf[,1]), dimnames=list(as.vector(patterndf[,1]),as.vector(patterndf[,1])))
scorem <- as.dist(1/scorem)
hc <- hclust(scorem, method="complete")
plot(hc, hang = -1, main="Distance Tree Based on Needle All-Against-All Comparison")
# Print a title message to explain the generated output
cat("The created all-against-all distance matrix ('scorem') of Needle scores is plotted in form of a distance tree.", "\n")

###################
# PHYLIP Analysis #
###################
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# This part is for UNIX-like OSs that have PHYLIP installed
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# In this step the obtained distance matrix is exported in Phylip format to a file named 'infile'.  
# This file is used to generate a phylogenetic tree with the 'neighbor' program in Phylip. After this
# step the retree module is used to view and edit the tree.  
scorem <- matrix(score, length(patterndf[,1]), length(patterndf[,1]), dimnames=list(as.vector(patterndf[,1]),as.vector(patterndf[,1])))
z <- 1/scorem
zv <- as.vector(z)
zv[seq(1,64, by=9)] <- 0
z <- matrix(zv, 8, 8, dimnames=list(names(as.data.frame(scorem)), names(as.data.frame(scorem))))
z <- round(z, 7)
write.table(as.data.frame(z), file="infile", quote=F, col.names=F, sep="\t")
# Print a title message to explain the generated output
cat("The distance matrix was exported in Phylip format to a file named 'infile'. Follow the instructions in the next comment lines to continue with the Phylip anaylsis.", "\n")
# system("vim infile") # add number of sequences in first line
# system("phylip neighbor") # choose: r, n, y
# system("mv -f outtree intree")
# system("phylip retree") # choose: y, m, x, y, r