############################################## # Sequence Parsing by Positional Coordinates # ############################################## # Utility: # Script parses sequences based on positional information provided # by a coordinate matrix like the one below. The parsing is performed # on vectorized sequences in a list container. # Format of coordinate matrix # ID start end # ID001 20 60 # ID001 120 190 # ID002 20 60 # ID002 120 190 # # To demo what the script does, run it like this: # source("http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/positionParsing.txt") # Create a sample set of sequences (here random sequences) fx <- function(test) {x <- as.integer(runif(200, min=1, max=5)) # sets length of each random sequence to '200' x[x==1] <- "A"; x[x==2] <- "T"; x[x==3] <- "G"; x[x==4] <- "C"; paste(x, sep = "", collapse ="") } z1 <- c() for(i in 1:5) { # sets number of generated sequences to '5' z1 <- c(fx(i), z1) } SeqID <- c("ID001","ID002","ID003","ID004","ID005") z1 <- data.frame(SeqID=SeqID, Seq=z1) cat("Source sequences:", "\n") print(z1) # Write the sequence data frame into a list object. # This is an important step for handling sets of vectorized sequences of variable length, since # list objects are more versatile in handling them than data frames. seq_list <- apply(as.matrix(z1[,2]), 1, list) # generates list with list components seq_list <- lapply(seq_list, unlist) # transforms list components to character vectors seq_list <- lapply(seq_list, strsplit, "") # vectorizes sequences by splitting them into separate vector fields seq_list <- lapply(seq_list, unlist) # transforms list components to character vectors names(seq_list) <- SeqID # names sequence entries, e.g. by their IDs # Generate coordinate data frame with positions to be parsed from sequences. pos_matrix <- data.frame(ID=sort(rep(SeqID, 2)), start=rep(c(20,120),5), end=rep(c(60,190),5)) cat("Position matrix:", "\n") print(pos_matrix) # Define sequence parse function seq_parse_fct <- function(seq_list, pos_matrix) { # Parse sequences in nested 'for loop' # nesting required to parse more than one fragment per sequence ID parse_list_all <- list(NULL) # create empty list as list container if (length(setdiff(names(seq_list), as.vector(pos_matrix[,1]))) > 1) stop("The number of sequences and coordinate groups differ") # Check if all sequence IDs are contained in the sequence list and in the coordinate matrix for(i in names(seq_list)) { # loop over each sequence ID sub_pos_matrix <- pos_matrix[pos_matrix[,1]==i,] parse_list <- list(NULL) # creates empty list of vectorslength container for(j in 1:length(sub_pos_matrix[,1])) { # loop over all fragments of a sequence index <- eval(parse(text=paste(sub_pos_matrix[j,2],":", sub_pos_matrix[j,3]))) parse_list <- c(parse_list, list(seq_list[[i]][index])) if (length(parse_list[[1]][[1]])==0) { parse_list <- parse_list[-1] } # removes in first loop first (empty) component } parse_list_all <- c(parse_list_all, list(parse_list)) if (length(parse_list_all[[1]][[1]])==0) { parse_list_all <- parse_list_all[-1] } # removes in first loop first (empty) component } names(parse_list_all) <- SeqID # names sequence entries, e.g. by their IDs parse_list_all } parse_list_all <- seq_parse_fct(seq_list, pos_matrix) cat("Vectorized sequence fragments are managed in the list object 'parse_list_all'.", "\n", "Print first fragment set as sample:", "\n") print(parse_list_all[[1]]) # Collapse sequence vectors into strings parse_seq_list <- lapply(parse_list_all, unlist) # joins sequence vectors in each list parse_seq_list <- lapply(parse_seq_list, paste, collapse="") # collapses vectorised sequences into strings cat("Concatenated sequence fragments:", "\n") parsed_result <- data.frame(SeqID, Parsed_Seq=unlist(parse_seq_list)) print(parsed_result)