### R code from vignette source 'Rchipseq.Rnw' ################################################### ### code chunk number 1: Rchipseq.Rnw:211-212 (eval = FALSE) ################################################### ## system("ln -s /R-pkgs/data data") ################################################### ### code chunk number 2: Rchipseq.Rnw:217-221 (eval = FALSE) ################################################### ## system("wget http://biocluster.ucr.edu/~tgirke/HTML_Presentations/Manuals/Rngsapps/chipseqBioc2012/data.zip") ## system("unzip data.zip") ## download.file("http://faculty.ucr.edu/~tgirke/HTML_Presentations/Manuals/Rngsapps/chipseqBioc2012/Rchipseq.pdf", "Rchipseq.pdf") ## download.file("http://faculty.ucr.edu/~tgirke/HTML_Presentations/Manuals/Rngsapps/chipseqBioc2012/Rchipseq.R", "Rchipseq.R") ################################################### ### code chunk number 3: Rchipseq.Rnw:237-250 ################################################### library(Rsubread); library(Rsamtools) dir.create("results") # Note: all output data will be written to directory 'results' buildindex(basename="./results/tair10chr.fasta", reference="./data/tair10chr.fasta") # Build indexed reference genome targets <- read.delim("./data/targets.txt") # Import experiment design information targets input <- paste("./data/", targets[,3], sep="") output <- paste("./results/", targets[,3], ".sam", sep="") reference <- "./results/tair10chr.fasta" for(i in seq(along=targets[,3])) { align(index=reference, readfile1=input[i], output_file=output[i], nthreads=8, indels=1, TH1=2) asBam(file=output[i], destination=gsub(".sam", "", output[i]), overwrite=TRUE, indexDestination=TRUE) unlink(output[i]) } ################################################### ### code chunk number 4: Rchipseq.Rnw:260-267 ################################################### library(rtracklayer); library(GenomicRanges) samples <- as.character(targets$Fastq) samplespath <- paste("./results/", samples, ".bam", sep="") aligns <- readBamGappedAlignments(samplespath[3]) cov <- coverage(aligns) islands <- slice(cov, lower = 15) islands[[1]] ################################################### ### code chunk number 5: Rchipseq.Rnw:274-283 ################################################### plotCov <- function(mycov=cov, mychr=1, mypos=c(1,1000), mymain="Coverage", ...) { op <- par(mar=c(8,3,6,1)) plot(as.numeric(mycov[[mychr]][mypos[1]:mypos[2]]), type="l", lwd=1, col="blue", ylab="", main=mymain, xlab="", xaxt="n", ...) axis(1, las = 2, at=seq(1,mypos[2]-mypos[1], length.out= 10), labels=as.integer(seq(mypos[1], mypos[2], length.out= 10))) par(op) } plotCov(mycov=cov, mychr="Chr1", mypos=c(1,120000)) # Remember: read data is trunctated to first 100kbp ################################################### ### code chunk number 6: Rchipseq.Rnw:291-298 ################################################### chip_signal_list <- sapply(samplespath, list) for(i in seq(along=samplespath)) { aligns <- readBamGappedAlignments(samplespath[i]) chip_signal_list[[i]] <- as(aligns, "GRanges") } chip_signal_list[["./results/SRR038845.fastq.bam"]][1:4,] chip_signal_list <- sapply(names(chip_signal_list), function(x) resize(chip_signal_list[[x]], width = 200)) ################################################### ### code chunk number 7: Rchipseq.Rnw:308-314 ################################################### Nreads <- sapply(names(chip_signal_list), function(x) length(chip_signal_list[[x]])) normfactor <- 10^6/Nreads chip_signal_list <- sapply(names(chip_signal_list), function(x) coverage(chip_signal_list[[x]]) * normfactor[x]) chip_signal_list[["./results/SRR038845.fastq.bam"]][1:2,] chip_peak_list <- sapply(names(chip_signal_list), function(x) slice(chip_signal_list[[x]], lower=5)) chip_peak_list[[1]][[1]][1:3] ################################################### ### code chunk number 8: Rchipseq.Rnw:322-337 ################################################### library(BayesPeak) sig <- readBamGappedAlignments(samplespath[1]) bgr <- readBamGappedAlignments(samplespath[3]) sig <- as(as(sig, "GRanges"), "RangedData") bgr <- as(as(bgr, "GRanges"), "RangedData") raw.output <- bayespeak(treatment=sig, control=bgr, start = 1, end = 100000) # unreliable.jobs <- log(raw.output$QC$lambda1) < 1.5 # Removal of false positives due to overfitting. # bpeaks <- as.data.frame(summarise.peaks(raw.output, method = "lowerbound", exclude.jobs = unreliable.jobs)) bpeaks <- as.data.frame(summarise.peaks(raw.output, method = "lowerbound")) source("./data/Fct/chipseqFct.R") # Imports the rangeCoverage function. sigcovDF <- rangeCoverage(summaryFct=viewMeans, myname="sig_", peaksIR=bpeaks[,1:3], sig=sig, readextend=1, normfactor=10^6/length(start(sig))) bgrcovDF <- rangeCoverage(summaryFct=viewMeans, myname="bgr_", peaksIR=bpeaks[,1:3], sig=bgr, readextend=1, normfactor=10^6/length(start(bgr))) bpeaksDF <- cbind(bpeaks, sigcovDF[,-1], bgrcovDF[,-1]) bpeaksDF[1:4,] ################################################### ### code chunk number 9: Rchipseq.Rnw:344-345 ################################################### plotCov(mycov=chip_signal_list[[1]], mychr="Chr1", mypos=c(11000, 11600), ylim=c(0,200)) ################################################### ### code chunk number 10: Rchipseq.Rnw:353-357 ################################################### simple_peak <- as.data.frame(as(chip_peak_list[[1]], "IRangesList")) # simple_peak <- as.data.frame(chip_peak_list[[1]]) commonpeaks <- subsetByOverlaps(as(bpeaks, "RangedData"), as(simple_peak, "RangedData"), minoverlap=100) bpeaksDF[bpeaksDF$start %in% start(commonpeaks),][1:4,] ################################################### ### code chunk number 11: Exercise 1 ################################################### pubpeaks <- read.delim("./data/Kaufmann_peaks100k.txt") # Published peaks for first 100kbp on chromosomes. pubpeaks <- pubpeaks[order(pubpeaks$space, pubpeaks$start),] pubpeaks[1:4,] # Import olRanges function, which accepts two GRranges (IRanges) objects source("./data/Fct/rangeoverlapper.R") ################################################### ### code chunk number 12: Exercise 1 ################################################### myol1 <- subsetByOverlaps(as(pubpeaks, "RangedData"), as(simple_peak, "RangedData"), minoverlap=1) myol2 <- olRanges(query=as(as(pubpeaks, "RangedData"), "GRanges"), subject=as(as(simple_peak, "RangedData"), "GRanges"), output="df") # myol2[myol2["OLpercQ"] >= 50,] myol3 <- olRanges(query=as(as(pubpeaks, "RangedData"), "GRanges"), subject=as(as(bpeaks, "RangedData"), "GRanges"), output="df") # myol3[myol3["OLpercQ"] >= 50,] ################################################### ### code chunk number 13: Rchipseq.Rnw:392-402 ################################################### library(rtracklayer); library(GenomicRanges); library(Rsamtools) gff <- import.gff("./data/TAIR10_GFF3_trunc.gff", asRangedData=FALSE) seqlengths(gff) <- end(ranges(gff[which(elementMetadata(gff)[,"type"]=="chromosome"),])) subgene_index <- which(elementMetadata(gff)[,"type"] == "gene") gffsub <- gff[subgene_index,] # Returns only gene ranges strand(gffsub) <- "*" # For strand insensitive analysis gffsub[1:4,1:2] ids <- elementMetadata(gffsub)[, "group"] gffgene <- gffsub gffsub <- split(gffsub) # Coerce to GRangesList ################################################### ### code chunk number 14: Rchipseq.Rnw:409-417 ################################################### library(ChIPpeakAnno) annoRD <- unlist(gffsub) names(annoRD) <- gsub(".*=", "", elementMetadata(annoRD)[, "group"]) annoRD <- as(annoRD, "RangedData") peaksRD <- RangedData(space=bpeaksDF$space, IRanges(bpeaksDF$start, bpeaksDF$end)) annotatedPeak <- annotatePeakInBatch(peaksRD, AnnotationData = annoRD) as.data.frame(annotatedPeak)[1:4,1:11] bpeaksDF[1:4,] ################################################### ### code chunk number 15: Rchipseq.Rnw:425-428 ################################################### source("./data/Fct/rangeoverlapper.R") olRanges(query=gffgene, subject=as(as(bpeaks, "RangedData"), "GRanges"), output="df")[1:2,] as.data.frame(annotatedPeak)[c(2,5),1:11] # Corresponding result from ChIPpeakAnno ################################################### ### code chunk number 16: Rchipseq.Rnw:438-452 ################################################### peakranges <- GRanges(seqnames = Rle(bpeaksDF$space), ranges = IRanges(bpeaksDF$start, bpeaksDF$end), strand = Rle(strand("*")), peakIDs=paste("peak", seq(along=bpeaksDF[,1]), sep="_")) countDF <- data.frame(row.names=elementMetadata(peakranges)[,"peakIDs"]) peakranges <- split(peakranges) # Coerce to GRangesList for(i in samplespath) { aligns <- readBamGappedAlignments(i) # Substitute next two lines with this one. counts <- countOverlaps(peakranges, aligns) countDF <- cbind(countDF, counts) } colnames(countDF) <- samples rownames(countDF) <- gsub(".*=", "", rownames(countDF)) countDF[1:4,] write.table(countDF, "./results/countDF", quote=FALSE, sep="\t", col.names = NA) countDF <- read.table("./results/countDF") ################################################### ### code chunk number 17: Rchipseq.Rnw:460-469 ################################################### returnRPKM <- function(counts, ranges) { geneLengthsInKB <- sum(width(ranges))/1000 # Number of bases per sequence range in kbp millionsMapped <- sum(counts)/1e+06 # Factor for converting to million of mapped reads. rpm <- counts/millionsMapped # RPK: reads per kilobase of sequence range. rpkm <- rpm/geneLengthsInKB # RPKM: reads per kilobase of sequence range per million mapped reads. return(rpkm) } countDFrpkm <- apply(countDF, 2, function(x) returnRPKM(counts=x, ranges=peakranges)) countDFrpkm[1:4,] ################################################### ### code chunk number 18: Rchipseq.Rnw:477-479 ################################################### d <- cor(countDFrpkm, method="spearman") plot(hclust(dist(1-d))) # Sample tree ################################################### ### code chunk number 19: Rchipseq.Rnw:487-490 ################################################### source("./data/Fct/colAg.R") countDFrpkm_mean <- colAg(myMA=countDFrpkm, group=c(1,1,2,2), myfct=mean) countDFrpkm_mean[1:4,] ################################################### ### code chunk number 20: Rchipseq.Rnw:493-499 ################################################### countDFrpkm_mean <- cbind(countDFrpkm_mean, log2ratio=log2(countDFrpkm_mean[,1]/countDFrpkm_mean[,2])) countDFrpkm_mean <- countDFrpkm_mean[is.finite(countDFrpkm_mean[,3]), ] degs2fold <- countDFrpkm_mean[countDFrpkm_mean[,3] >= 1 | countDFrpkm_mean[,3] <= -1,] degs2fold[1:4,] write.table(degs2fold, "./results/degs2fold", quote=FALSE, sep="\t", col.names = NA) degs2fold <- read.table("./results/degs2fold") ################################################### ### code chunk number 21: Rchipseq.Rnw:507-519 ################################################### library(DESeq) countDF <- read.table("./results/countDF") conds <- targets$Factor cds <- newCountDataSet(countDF, conds) # Creates object of class CountDataSet derived from eSet class counts(cds)[1:4, ] # CountDataSet has similar accessor methods as eSet class. cds <- estimateSizeFactors(cds) # Estimates library size factors from count data. Alternatively, one can provide here the true library sizes with sizeFactors(cds) <- c(..., ...) cds <- estimateDispersions(cds, fitType="local") # Estimates the variance within replicates res <- nbinomTest(cds, "bgr1", "sig1") # Calls DEGs with nbinomTest res <- na.omit(res) res2fold <- res[res$log2FoldChange >= 1 | res$log2FoldChange <= -1,] res2foldpadj <- res2fold[res2fold$padj <= 0.2, ] # Here padj set very high for demo purpose res2foldpadj[1:2,1:8] ################################################### ### code chunk number 22: Rchipseq.Rnw:527-537 ################################################### library(edgeR) countDF <- read.table("./results/countDF") y <- DGEList(counts=countDF, group=conds) # Constructs DGEList object y <- estimateCommonDisp(y) # Estimates common dispersion y <- estimateTagwiseDisp(y) # Estimates tagwise dispersion et <- exactTest(y, pair=c("bgr1", "sig1")) # Computes exact test for the negative binomial distribution. topTags(et, n=4) edge <- as.data.frame(topTags(et, n=50000)) edge2fold <- edge[edge$logFC >= 1 | edge$logFC <= -1,] edge2foldpadj <- edge2fold[edge2fold$FDR <= 0.01, ] ################################################### ### code chunk number 23: Rchipseq.Rnw:546-555 ################################################### bothDF <- merge(res, countDFrpkm_mean, by.x=1, by.y=0, all=TRUE); bothDF <- na.omit(bothDF) cor(bothDF[,"log2FoldChange"], bothDF[,"log2ratio"], method="spearman") source("./data/Fct/overLapper.R") setlist <- list(edgeR=rownames(edge[order(edge$FDR),][1:20,]), DESeq=res[order(res$padj),][1:20,"id"], RPKM=rownames(degs2fold[order(-degs2fold$log2ratio),][1:20,])) OLlist <- overLapper(setlist=setlist, sep="_", type="vennsets") counts <- sapply(OLlist$Venn_List, length) vennPlot(counts=counts) ################################################### ### code chunk number 24: Rchipseq.Rnw:588-594 ################################################### library(Biostrings); library(cosmo); library(seqLogo) pseq <- getSeq(FaFile("./data/tair10chr.fasta"), as(as(bpeaksDF, "RangedData"), "GRanges")) names(pseq) <- paste(bpeaksDF$space, bpeaksDF$start, sep="_") write.XStringSet(pseq[1:8], "./results/pseq.fasta") # Note: reduced to 8 sequences to run quickly. res <- cosmo(seqs="./results/pseq.fasta", silent=TRUE) plot(res) ################################################### ### code chunk number 25: Exercise 2 ################################################### chr <- read.DNAStringSet("./data/tair10chr.fasta") pwm <- PWM(DNAStringSet(print(res@motifs)$motif)) peakN <- sapply(pseq, function(x) countPWM(pwm, x, min.score=0.9)) peak1kb_freq <- (sum(peakN)/sum(width(pseq))) * 1000 genomeN <- sapply(chr, function(x) countPWM(pwm, x, min.score=0.9)) genome1kb_freq <- (sum(genomeN)/sum(width(chr))) * 1000 ################################################### ### code chunk number 26: Rchipseq.Rnw:621-622 ################################################### sessionInfo()