##############################################
# Sequence Parsing by Positional Coordinates #
##############################################
# Utility:
# Script parses sequences based on positional information provided 
# by a coordinate matrix like the one below. The parsing is performed
# on vectorized sequences in a list container.
# Format of coordinate matrix 
#        ID	start	end
#        ID001	20	60
#        ID001	120	190
#        ID002	20	60
#        ID002	120	190
#      
# To demo what the script does, run it like this:
#        source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/positionParsing.txt")

# Create a sample set of sequences (here random sequences)
fx <- function(test) {x <- as.integer(runif(200, min=1, max=5)) # sets length of each random sequence to '200'
        x[x==1] <- "A"; x[x==2] <- "T"; x[x==3] <- "G"; x[x==4] <- "C"; paste(x, sep = "", collapse ="")
}
z1 <- c()
for(i in 1:5) { # sets number of generated sequences to '5'
	z1 <- c(fx(i), z1)
}
SeqID <- c("ID001","ID002","ID003","ID004","ID005")
z1 <- data.frame(SeqID=SeqID, Seq=z1)
cat("Source sequences:", "\n")
print(z1)

# Write the sequence data frame into a list object.
# This is an important step for handling sets of vectorized sequences of variable length, since
# list objects are more versatile in handling them than data frames. 
seq_list <- apply(as.matrix(z1[,2]), 1, list) # generates list with list components
seq_list <- lapply(seq_list, unlist) # transforms list components to character vectors
seq_list <- lapply(seq_list, strsplit, "") # vectorizes sequences by splitting them into separate vector fields
seq_list <- lapply(seq_list, unlist) # transforms list components to character vectors
names(seq_list) <- SeqID # names sequence entries, e.g. by their IDs

# Generate coordinate data frame with positions to be parsed from sequences.
pos_matrix <- data.frame(ID=sort(rep(SeqID, 2)), start=rep(c(20,120),5), end=rep(c(60,190),5))
cat("Position matrix:", "\n")
print(pos_matrix)

# Define sequence parse function
seq_parse_fct <- function(seq_list, pos_matrix) {
	# Parse sequences in nested 'for loop'
	# nesting required to parse more than one fragment per sequence ID
	parse_list_all <- list(NULL) # create empty list as list container
	if (length(setdiff(names(seq_list), as.vector(pos_matrix[,1]))) > 1) stop("The number of sequences and coordinate groups differ") # Check if all sequence IDs are contained in the sequence list and in the coordinate matrix
	for(i in names(seq_list)) { # loop over each sequence ID
		sub_pos_matrix <- pos_matrix[pos_matrix[,1]==i,]
		parse_list <- list(NULL) # creates empty list of vectorslength container
		for(j in 1:length(sub_pos_matrix[,1])) { # loop over all fragments of a sequence 
			index <- eval(parse(text=paste(sub_pos_matrix[j,2],":", sub_pos_matrix[j,3])))
			parse_list <- c(parse_list, list(seq_list[[i]][index]))
			if (length(parse_list[[1]][[1]])==0) { parse_list <- parse_list[-1] } # removes in first loop first (empty) component
		}
	parse_list_all <- c(parse_list_all, list(parse_list))
	if (length(parse_list_all[[1]][[1]])==0) { parse_list_all <- parse_list_all[-1] } # removes in first loop first (empty) component
	}
	names(parse_list_all) <- SeqID # names sequence entries, e.g. by their IDs
	parse_list_all
}
parse_list_all <- seq_parse_fct(seq_list, pos_matrix)
cat("Vectorized sequence fragments are managed in the list object 'parse_list_all'.", "\n", "Print first fragment set as sample:", "\n")
print(parse_list_all[[1]])

# Collapse sequence vectors into strings
parse_seq_list <- lapply(parse_list_all, unlist) # joins sequence vectors in each list
parse_seq_list <- lapply(parse_seq_list, paste, collapse="") # collapses vectorised sequences into strings
cat("Concatenated sequence fragments:", "\n")
parsed_result <- data.frame(SeqID, Parsed_Seq=unlist(parse_seq_list))
print(parsed_result)