######################################### ## Basic Sequence and Range Operations ## ######################################### ## Workshop exercise code from Oct 30, 2010 ## For updates see this site: ## http://manuals.bioinformatics.ucr.edu/home/ht-seq#TOC-Exercises ########################## ## (1) Random Sequences ## ########################## ## (1.1.) Write a function that creates any number of random DNA sequences of any length. ## Store the random sequences in a DNAStingSet object named "randset". library(Biostrings) rand <- sapply(1:100, function(x) paste(sample(c("A","T","G","C"), 21, replace=T), collapse="")) randset <- DNAStringSet(rand) unlist(randset) ## (1.2) Extract any number of pseudo reads from the following reference genome and then concatenate them. ref <- DNAString(paste(sample(c("A","T","G","C"), 100000, replace=T), collapse="")) randstart <- sample(1:(length(ref)-15), 1000) randreads <- Views(ref, randstart, width=15) subject(randreads) ## (1.3) Enumerate duplicated sequences in your random DNA sequence set and then remove the duplicates. table(as.character(dsetv)) randset[!duplicated(randset)] ###################################### ## (2) Basic Sequence Manipulations ## ###################################### ## (2.1) Create for your random DNA sequences the complement, the reverse and ## the reverse and complement sequences. complement(randset) reverse(randset) reverseComplement(randset) ## (2.1) Translate your random DNA sequences into proteins. translate(randset) #################################### ## (3) Sequence Import and Export ## #################################### ## (3.1) Import the following sequences into a DNAStringSet object. ## Save to ../db: ftp://ftp.ncbi.nih.gov/genbank/genomes/Bacteria/Halobacterium_sp_uid217/AE004437.ffn myseq <- read.DNAStringSet("../db/AE004437.ffn", "fasta") names(myseq) <- sub(" .*", "", names(myseq)) ## (3.2) Use the trimLRPatterns() function to remove the first and last triplet from each sequence. trimLRPatterns(Lpattern = "ATG", Rpattern="TGG", subject=myseq, max.Lmismatch=c(0,0,0), max.Rmismatch=c(1,1,1)) ## (3.3) Translate the first and second reading frame of all sequences into proteins and export them to a FASTA file. mypeps1 <- translate(myseq) mypeps2 <- translate(DNAStringSet(myseq, 2, width(myseq))) write.XStringSet(mypeps1, file="mypeps1.txt", format="fasta", width=80) ## (3.4) Confirm whether the translated sequences are identical to the ones on NCBI pep1 <- sapply(as.list(mypeps1), toString) pep1 <- gsub("\\*", "", pep1) ncbi <- read.AAStringSet("../db/AE004437.faa", "fasta") pep2 <- sapply(as.list(ncbi), toString) mypeps1[which(pep1!=pep2)]; ncbi[which(pep1!=pep2)] ## (3.4) Extract the following sequences and save their first 100bp to a FASTA file. myids <- c("gb|AE004437.1|:c1214690-1214457", "gb|AE004437.1|:c1696384-1695860", "gb|AE004437.1|:c84415-83885") mysubseq <- myseq[names(myseq) %in% myids] mysubseq <- DNAStringSet(mysubseq, 1, 100) write.XStringSet(mysubseq, file="mysubseq.txt", format="fasta", width=80) ############################# ## (4) Multiple Alignments ## ############################# ## (4.1) Import the following sequences into a AAStringSet object. p450 <- read.AAStringSet("../db/p450.mul", "fasta") ## (4.2) Consensus Matrix of Multiple Alignments ## Identify in the p450 multiple alignment the heme binding site containing the only invariant cysteine. which(consensusMatrix(p450)["C",]==5) ## (4.3) Return the heme binding site including 5 AA upstream and downstream. p450sub <- AAStringSet(p450, 456, 466) ###################################################### ## (5) Align Short Reads Against a Reference Genome ## ###################################################### ## (5.1) Determine the number of matches of the DNA string "ATGGTGATG" in the 'ref' sample when allowing 0, 1, 2 and 3 mismatches. test <- matchPattern("ATGGTGATG", ref, max.mismatch=1) as.data.frame(test) # Useful for export. lapply(0:3, function(x) matchPattern("ATGGTGATG", ref, max.mismatch=x)) sapply(0:3, function(x) countPattern("ATGGTGATG", ref, max.mismatch=x)) ############################################# ## (6) Storing and Parsing Sequence Ranges ## ############################################# ## (6.1) Collapse overlapping ranges in the following sample IRanges object ir <- IRanges(c(1, 8, 14, 15, 19, 34, 40), width = c(12, 6, 6, 15, 6, 2, 7)) reduce(ir, min.gapwidth=1) disjoin(ir) # Returns disjoint ranges. gaps(ir) # Returns uncovered regions (gaps) in an IRanges object. ## (6.2) Extract sequence fragments from the following sample chr with an IRanges object chr <- DNAString("GCATATTACAATCGATCCGCATATTAC") irextract <- IRanges(start = c(1, 25), width = 3) Views(chr, irextract) # Example for extracting sub-sequences and storing them in a Views object. seqselect(chr, irextract) # Example for extracting and joining sequences (e.g. exons). ## (6.3) Find overlappling ranges among the following sample IRange objects ir1 <- ir ir2 <- reduce(ir) ol <- findOverlaps(ir1, ir2, minoverlap=1); as.matrix(ol) # Identifies overlapping ranges. ## (6.4) Parse the gene and intergenic sequences from the first human chromosome as shown in above IRanges/GTF example. ## http://manuals.bioinformatics.ucr.edu/home/ht-seq#TOC-Parsing-Gene-Intergenic-Regions-Bas ## (6.5) Plot the length distribution of the two sequence sets in a bar plot to determine the differences among the two sets. ## The cut function can be used to enumerate the sequences for specific length intervals, e.g. 1-100, 101-1000, 1001-5000, 5001-10000, >10000. y <- width(interchr1); inter_ranges <- cut(y, right=F, breaks=c(1, 100, 1000, 5000, 10000, max(y)+1), labels=c("1-100","101-1000","1001-5000", "5001-10000", ">10000")) y <- width(geneschr1); gene_ranges <- cut(y, right=F, breaks=c(1, 100, 1000, 5000, 10000, max(y)+1), labels=c("1-100","101-1000","1001-5000", "5001-10000", ">10000")) countMA <- rbind(table(inter_ranges), table(gene_ranges)); rownames(countMA) <- c("inter_ranges", "gene_ranges") barplot(countMA, ylim=c(0,2500), beside=T, legend.text=rownames(countMA))