#########################################
## Basic Sequence and Range Operations ##
#########################################
## Workshop exercise code from Oct 30, 2010
## For updates see this site:
## http://manuals.bioinformatics.ucr.edu/home/ht-seq#TOC-Exercises

##########################
## (1) Random Sequences ##
##########################
## (1.1.) Write a function that creates any number of random DNA sequences of any length.
## Store the random sequences in a DNAStingSet object named "randset".
library(Biostrings)
rand <- sapply(1:100, function(x) paste(sample(c("A","T","G","C"), 21, replace=T), collapse=""))
randset <- DNAStringSet(rand)
unlist(randset)

## (1.2) Extract any number of pseudo reads from the following reference genome and then concatenate them.

ref <- DNAString(paste(sample(c("A","T","G","C"), 100000, replace=T), collapse=""))
randstart <- sample(1:(length(ref)-15), 1000)
randreads <- Views(ref, randstart, width=15)
subject(randreads)

## (1.3) Enumerate duplicated sequences in your random DNA sequence set and then remove the duplicates.
table(as.character(dsetv))
randset[!duplicated(randset)]

######################################
## (2) Basic Sequence Manipulations ##
######################################
## (2.1) Create for your random DNA sequences the complement, the reverse and
## the reverse and complement sequences.
complement(randset)
reverse(randset)
reverseComplement(randset)

## (2.1) Translate your random DNA sequences into proteins.
translate(randset)

####################################
## (3) Sequence Import and Export ##
####################################
## (3.1) Import the following sequences into a DNAStringSet object.
## Save to ../db: ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Halobacterium_sp_uid217/AE004437.ffn

myseq <- read.DNAStringSet("../db/AE004437.ffn", "fasta")
names(myseq) <- sub(" .*", "", names(myseq))

## (3.2) Use the trimLRPatterns() function to remove the first and last triplet from each sequence.

trimLRPatterns(Lpattern = "ATG", Rpattern="TGG", subject=myseq, max.Lmismatch=c(0,0,0), max.Rmismatch=c(1,1,1))

## (3.3) Translate the first and second reading frame of all sequences into proteins and export them to a FASTA file.

mypeps1 <- translate(myseq)
mypeps2 <- translate(DNAStringSet(myseq, 2, width(myseq)))
write.XStringSet(mypeps1, file="mypeps1.txt", format="fasta", width=80)

## (3.4) Confirm whether the translated sequences are identical to the ones on NCBI

pep1 <- sapply(as.list(mypeps1), toString)
pep1 <-  gsub("\\*", "", pep1)
ncbi <- read.AAStringSet("../db/AE004437.faa", "fasta")
pep2 <- sapply(as.list(ncbi), toString)
mypeps1[which(pep1!=pep2)]; ncbi[which(pep1!=pep2)]

## (3.4) Extract the following sequences and save their first 100bp to a FASTA file.

myids <- c("gb|AE004437.1|:c1214690-1214457", "gb|AE004437.1|:c1696384-1695860", "gb|AE004437.1|:c84415-83885")
mysubseq <- myseq[names(myseq) %in% myids]
mysubseq <- DNAStringSet(mysubseq, 1, 100)
write.XStringSet(mysubseq, file="mysubseq.txt", format="fasta", width=80)


#############################
## (4) Multiple Alignments ##
#############################
## (4.1) Import the following sequences into a AAStringSet object.

p450 <- read.AAStringSet("../db/p450.mul", "fasta")

## (4.2) Consensus Matrix of Multiple Alignments
## Identify in the p450 multiple alignment the heme binding site containing the only invariant cysteine.

which(consensusMatrix(p450)["C",]==5)

## (4.3) Return the heme binding site including 5 AA upstream and downstream.

p450sub <- AAStringSet(p450, 456, 466)

######################################################
## (5) Align Short Reads Against a Reference Genome ##
######################################################
## (5.1) Determine the number of matches of the DNA string "ATGGTGATG" in the 'ref' sample when allowing 0, 1, 2 and 3 mismatches.

test <- matchPattern("ATGGTGATG", ref, max.mismatch=1)
as.data.frame(test) # Useful for export.
lapply(0:3, function(x) matchPattern("ATGGTGATG", ref, max.mismatch=x))
sapply(0:3, function(x) countPattern("ATGGTGATG", ref, max.mismatch=x))


#############################################
## (6) Storing and Parsing Sequence Ranges ##
#############################################

## (6.1) Collapse overlapping ranges in the following sample IRanges object
ir <- IRanges(c(1, 8, 14, 15, 19, 34, 40), width = c(12, 6, 6, 15, 6, 2, 7))

reduce(ir, min.gapwidth=1)
disjoin(ir) # Returns disjoint ranges.
gaps(ir) # Returns uncovered regions (gaps) in an IRanges object.


## (6.2) Extract sequence fragments from the following sample chr with an IRanges object
chr <- DNAString("GCATATTACAATCGATCCGCATATTAC")
irextract <- IRanges(start = c(1, 25), width = 3)
Views(chr, irextract) # Example for extracting sub-sequences and storing them in a Views object.
seqselect(chr, irextract) # Example for extracting and joining sequences (e.g. exons).

## (6.3) Find overlappling ranges among the following sample IRange objects
ir1 <- ir
ir2 <- reduce(ir)

ol <- findOverlaps(ir1, ir2, minoverlap=1); as.matrix(ol) # Identifies overlapping ranges.

## (6.4) Parse the gene and intergenic sequences from the first human chromosome as shown in above IRanges/GTF example.
## http://manuals.bioinformatics.ucr.edu/home/ht-seq#TOC-Parsing-Gene-Intergenic-Regions-Bas


## (6.5) Plot the length distribution of the two sequence sets in a bar plot to determine the differences among the two sets.
## The cut function can be used to enumerate the sequences for specific length intervals, e.g. 1-100, 101-1000, 1001-5000, 5001-10000, >10000.

y <- width(interchr1); inter_ranges <- cut(y, right=F, breaks=c(1, 100, 1000, 5000, 10000, max(y)+1), labels=c("1-100","101-1000","1001-5000", "5001-10000", ">10000"))
y <- width(geneschr1); gene_ranges <- cut(y, right=F, breaks=c(1, 100, 1000, 5000, 10000, max(y)+1), labels=c("1-100","101-1000","1001-5000", "5001-10000", ">10000"))
countMA <- rbind(table(inter_ranges), table(gene_ranges)); rownames(countMA) <- c("inter_ranges", "gene_ranges")
barplot(countMA, ylim=c(0,2500), beside=T, legend.text=rownames(countMA))