### R code from vignette source 'Rsequences.Rnw'

###################################################
### code chunk number 1: Rsequences.Rnw:139-146
###################################################
myseq <- c("ATGCAGACATAGTG", "ATGAACATAGATCC", "GTACAGATCAC") # Sample sequence data set.
myseq[grep("ATG", myseq)] # String searching with regular expression support.
pos1 <- regexpr("AT", myseq) # Searches 'myseq' for first match of pattern "AT".
as.numeric(pos1); attributes(pos1)$match.length # Returns position information of matches.
pos2 <- gregexpr("AT", myseq) # Searches 'myseq' for all matches of pattern "AT".
as.numeric(pos2[[1]]); attributes(pos2[[1]])$match.length # Returns positions of matches in first sequence.
gsub("^ATG", "atg", myseq) # String substitution with regular expression support.


###################################################
### code chunk number 2: Rsequences.Rnw:149-152
###################################################
nchar(myseq) # Computes length of strings.
substring(myseq[1], c(1,3), c(2,5)) # Positional parsing of several fragments from one string.
substring(myseq, c(1,4,7), c(2,6,10)) # Positional parsing of many strings.


###################################################
### code chunk number 3: Rsequences.Rnw:160-162
###################################################
rand <- sapply(1:100, function(x) paste(sample(c("A","T","G","C"), sample(10:20), replace=T), collapse=""))
rand[1:3]


###################################################
### code chunk number 4: Rsequences.Rnw:166-167
###################################################
table(c(rand[1:4], rand[1]))


###################################################
### code chunk number 5: Rsequences.Rnw:171-177
###################################################
library(Biostrings)
ref <- DNAString(paste(sample(c("A","T","G","C"), 100000, replace=T), collapse=""))
randstart <- sample(1:(length(ref)-15), 1000)
randreads <- Views(ref, randstart, width=15)
rand_set <- DNAStringSet(randreads)
unlist(rand_set)


###################################################
### code chunk number 6: Rsequences.Rnw:210-216
###################################################
# system("wget ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Halobacterium_sp_uid217/AE004437.ffn")
myseq <- read.DNAStringSet("AE004437.ffn")
myseq[1:3]
sub <- myseq[grep("99.*", names(myseq))]
length(sub)
write.XStringSet(sub, file="AE004437sub.ffn", width=80)


###################################################
### code chunk number 7: Rsequences.Rnw:226-234
###################################################
library(Biostrings)
d <- DNAString("GCATAT-TAC")
d
d[1:4]
r <- RNAString("GCAUAU-UAC") 
r <- RNAString(d) # Converts d into RNAString object.
p <- AAString("HCWYHH")
b <- BString("I store any set of characters. Other XString objects store only the IUPAC characters.")


###################################################
### code chunk number 8: Rsequences.Rnw:242-250
###################################################
dset <- DNAStringSet(c("GCATATTAC", "AATCGATCC", "GCATATTAC")) 
names(dset) <- c("seq1", "seq2", "seq3") # Assigns names
dset[1:2]
width(dset) # Returns the length of each sequences
d <- dset[[1]] # The [[ subsetting operator returns a single entry as XString object
dset2 <- c(dset, dset) # Appends/concatenates two XStringSet objects
dsetchar <- as.character(dset) # Converts XStringSet to named vector 
dsetone <- unlist(dset) # Collapses many sequences to a single one stored in a DNAString container


###################################################
### code chunk number 9: Rsequences.Rnw:253-254
###################################################
DNAStringSet(dset, start=c(1,2,3), end=c(4,8,5)) 


###################################################
### code chunk number 10: Rsequences.Rnw:262-265
###################################################
origMAlign <- read.DNAMultipleAlignment(filepath = system.file("extdata",
              "msx2_mRNA.aln", package = "Biostrings"), format = "clustal")
origMAlign


###################################################
### code chunk number 11: Rsequences.Rnw:274-283
###################################################
phred <- 1:9
phreda <- paste(sapply(as.raw((phred)+33), rawToChar), collapse=""); phreda
as.integer(charToRaw(phreda))-33 
dset <- DNAStringSet(sapply(1:100, function(x) paste(sample(c("A","T","G","C"), 20, replace=T), collapse=""))) # Creates random sample sequence.
myqlist <- lapply(1:100, function(x) sample(1:40, 20, replace=T)) # Creates random Phred score list.
myqual <- sapply(myqlist, function(x) toString(PhredQuality(x))) # Converts integer scores into ASCII characters.
myqual <- PhredQuality(myqual) # Converts to a PhredQuality object.
dsetq1 <- QualityScaledDNAStringSet(dset, myqual) # Combines DNAStringSet and quality data in QualityScaledDNAStringSet object.
dsetq1[1:2]


###################################################
### code chunk number 12: Rsequences.Rnw:292-296
###################################################
randset <- DNAStringSet(rand)
complement(randset[1:2])
reverse(randset[1:2])
reverseComplement(randset[1:2])


###################################################
### code chunk number 13: Rsequences.Rnw:299-300
###################################################
translate(randset[1:2])


###################################################
### code chunk number 14: Rsequences.Rnw:308-318 (eval = FALSE)
###################################################
## myseq1 <- read.DNAStringSet("ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Halobacterium_sp_uid217/AE004437.ffn") 
## mypos <- matchPattern("ATGGTG", myseq1[[1]], max.mismatch=1) # Finds pattern matches in reference 
## countPattern("ATGGCT", myseq1[[1]], max.mismatch=1) # Counts only the corresponding matches
## tmp <- c(DNAStringSet("ATGGTG"), DNAStringSet(mypos)) # Results shoen in DNAStringSet object
## consensusMatrix(tmp) # Returns a consensus  matrix for query and hits.
## myvpos <- vmatchPattern("ATGGCT", myseq1, max.mismatch=1) # Finds all pattern matches in reference
## myvpos # The results are stored as MIndex object.
## Views(myseq1[[1]], start(myvpos[[1]]), end(myvpos[[1]])) # Retrieves the result for single entry
## sapply(seq(along=myseq1), function(x) 
##        as.character(Views(myseq1[[x]], start(myvpos[[x]]), end(myvpos[[x]])))) # All matches.


###################################################
### code chunk number 15: Rsequences.Rnw:321-328 (eval = FALSE)
###################################################
## myseq <- DNAStringSet(c("ATGCAGACATAGTG", "ATGAACATAGATCC", "GTACAGATCAC"))
## myseq[grep("^ATG", myseq, perl=TRUE)] # String searching with regular expression support
## pos1 <- regexpr("AT", myseq) # Searches 'myseq' for first match of pattern "AT"
## as.numeric(pos1); attributes(pos1)$match.length # Returns position information of matches
## pos2 <- gregexpr("AT", myseq) # Searches 'myseq' for all matches of pattern "AT"
## as.numeric(pos2[[1]]); attributes(pos2[[1]])$match.length # Match positions in first sequence
## DNAStringSet(gsub("^ATG", "NNN", myseq)) # String substitution with regular expression support


###################################################
### code chunk number 16: Rsequences.Rnw:336-338
###################################################
pwm <- PWM(DNAStringSet(c("GCT", "GGT", "GCA"))) 
library(seqLogo); seqLogo(t(t(pwm) * 1/colSums(pwm)))


###################################################
### code chunk number 17: Rsequences.Rnw:341-343
###################################################
chr <- DNAString("AAAGCTAAAGGTAAAGCAAAA") 
matchPWM(pwm, chr, min.score=0.9) # Searches sequence for PWM matches with score better than min.score.


###################################################
### code chunk number 18: Rsequences.Rnw:363-372
###################################################
library(GenomicRanges); library(rtracklayer)
gr <- GRanges(seqnames = Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)), ranges = IRanges(1:10, end = 7:16, names = head(letters, 10)), strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)), score = 1:10, GC = seq(1, 0, length = 10)) # Example of creating a GRanges object with its constructor function.
gff <- import.gff("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/Samples/gff3.gff", 
                  asRangedData=FALSE) # Imports a simplified GFF3 genome annotation file.
seqlengths(gff) <- end(ranges(gff[which(elementMetadata(gff)[,"type"]=="chromosome"),])) 
names(gff) <- 1:length(gff) # Assigns names to corresponding slot.
gff[1:4,]
gff_rd <- as(gff, "RangedData") # Coerces GRanges object to RangedData class.
gff_gr <- as(gff_rd, "GRanges") # Coerces RangedData object to GRanges class. 


###################################################
### code chunk number 19: Rsequences.Rnw:380-386 (eval = FALSE)
###################################################
## gff[1:4]; gff[1:4, c("type", "group")]; gff[2] <- gff[3] # Subsetting and replacement 
## c(gff[1:2], gff[401:402]) # GRanges objects can be concatenated with the c() function.
## seqnames(gff); ranges(gff); strand(gff); seqlengths(gff) # Accessor functions 
## start(gff[1:4]); end(gff[1:4]); width(gff[1:4]) # Direct access to IRanges components
## elementMetadata(gff); elementMetadata(gff)[, "type"] # Accessing metadata component. 
## gff[elementMetadata(gff)[ ,"type"] == "gene"] # Returns only gene ranges.


###################################################
### code chunk number 20: Rsequences.Rnw:389-397 (eval = FALSE)
###################################################
## strand(gff) <- "*" # Erases the strand information
## reduce(gff) # Collapses overlapping ranges to continuous ranges.
## gaps(gff) # Returns uncovered regions.
## disjoin(gff) # Returns disjoint ranges.
## coverage(gff) # Returns coverage of ranges.
## findOverlaps(gff, gff[1:4]) # Returns the index pairings for the overlapping ranges. 
## countOverlaps(gff, gff[1:4]) # Counts overlapping ranges 
## subsetByOverlaps(gff, gff[1:4]) # Returns only overlapping ranges 


###################################################
### code chunk number 21: Rsequences.Rnw:400-405 (eval = FALSE)
###################################################
## sp <- split(gff) # Stores every range in separate component of a GRangesList object
## split(gff, seqnames(gff)) # Stores ranges of each chromosome in separate component.
## unlist(sp) # Returns data as GRanges object
## sp[1:4, "type"] # Subsetting of GRangesList objects is similar to GRanges objects.
## lapply(sp[1:4], length); sapply(sp[1:4], length) # Looping over GRangesList objects similar to lists


###################################################
### code chunk number 22: Rsequences.Rnw:422-432 (eval = FALSE)
###################################################
## chr <- read.DNAStringSet("AE004437.fna")
## writeLines(readLines("AE004437.gff")[-c(1:7)], "AE004437.gff2")
## gff <- import.gff("AE004437.gff2", asRangedData=FALSE)
## gffgene <- gff[elementMetadata(gff)[,"type"]=="gene"]
## gene <- DNAStringSet(Views(chr[[1]], IRanges(start(gffgene), end(gffgene))))
## names(gene) <- elementMetadata(gffgene)[,"group"]
## pos <- elementMetadata(gffgene[strand(gffgene) == "+"])[,"group"]
## translate(gene[names(gene) %in% pos])
## neg <- elementMetadata(gffgene[strand(gffgene) == "-"])[,"group"]
## translate(reverseComplement(gene[names(gene) %in% neg]))


