### R code from vignette source 'Rchipseq.Rnw'

###################################################
### code chunk number 1: Rchipseq.Rnw:211-212 (eval = FALSE)
###################################################
## system("ln -s /R-pkgs/data data")


###################################################
### code chunk number 2: Rchipseq.Rnw:217-221 (eval = FALSE)
###################################################
## system("wget http://biocluster.ucr.edu/~tgirke/HTML_Presentations/Manuals/Rngsapps/chipseqBioc2012/data.zip")
## system("unzip data.zip")
## download.file("http://faculty.ucr.edu/~tgirke/HTML_Presentations/Manuals/Rngsapps/chipseqBioc2012/Rchipseq.pdf", "Rchipseq.pdf")
## download.file("http://faculty.ucr.edu/~tgirke/HTML_Presentations/Manuals/Rngsapps/chipseqBioc2012/Rchipseq.R", "Rchipseq.R")


###################################################
### code chunk number 3: Rchipseq.Rnw:237-250
###################################################
library(Rsubread); library(Rsamtools)
dir.create("results") # Note: all output data will be written to directory 'results'
buildindex(basename="./results/tair10chr.fasta", reference="./data/tair10chr.fasta") # Build indexed reference genome
targets <- read.delim("./data/targets.txt") # Import experiment design information
targets
input <- paste("./data/", targets[,3], sep="")
output <- paste("./results/", targets[,3], ".sam", sep="")
reference <- "./results/tair10chr.fasta"
for(i in seq(along=targets[,3])) {
        align(index=reference, readfile1=input[i], output_file=output[i], nthreads=8, indels=1, TH1=2)
        asBam(file=output[i], destination=gsub(".sam", "", output[i]), overwrite=TRUE, indexDestination=TRUE)
        unlink(output[i])
}


###################################################
### code chunk number 4: Rchipseq.Rnw:260-267
###################################################
library(rtracklayer); library(GenomicRanges)
samples <- as.character(targets$Fastq)
samplespath <- paste("./results/", samples, ".bam", sep="")
aligns <- readBamGappedAlignments(samplespath[3])
cov <- coverage(aligns)
islands <- slice(cov, lower = 15)
islands[[1]]


###################################################
### code chunk number 5: Rchipseq.Rnw:274-283
###################################################
plotCov <- function(mycov=cov, mychr=1, mypos=c(1,1000), mymain="Coverage", ...) {
    op <- par(mar=c(8,3,6,1))
    plot(as.numeric(mycov[[mychr]][mypos[1]:mypos[2]]), type="l",
    lwd=1, col="blue", ylab="", main=mymain, xlab="", xaxt="n", ...)
    axis(1, las = 2, at=seq(1,mypos[2]-mypos[1], length.out= 10),
    labels=as.integer(seq(mypos[1], mypos[2], length.out= 10)))
    par(op)
}
plotCov(mycov=cov, mychr="Chr1", mypos=c(1,120000)) # Remember: read data is trunctated to first 100kbp


###################################################
### code chunk number 6: Rchipseq.Rnw:291-298
###################################################
chip_signal_list <- sapply(samplespath, list)
for(i in seq(along=samplespath)) {
	aligns <- readBamGappedAlignments(samplespath[i])
	chip_signal_list[[i]] <- as(aligns, "GRanges")
}
chip_signal_list[["./results/SRR038845.fastq.bam"]][1:4,]
chip_signal_list <- sapply(names(chip_signal_list), function(x) resize(chip_signal_list[[x]], width = 200))


###################################################
### code chunk number 7: Rchipseq.Rnw:308-314
###################################################
Nreads <- sapply(names(chip_signal_list), function(x)  length(chip_signal_list[[x]]))
normfactor <- 10^6/Nreads
chip_signal_list <- sapply(names(chip_signal_list), function(x) coverage(chip_signal_list[[x]]) * normfactor[x])
chip_signal_list[["./results/SRR038845.fastq.bam"]][1:2,]
chip_peak_list <- sapply(names(chip_signal_list), function(x) slice(chip_signal_list[[x]], lower=5))
chip_peak_list[[1]][[1]][1:3]


###################################################
### code chunk number 8: Rchipseq.Rnw:322-337
###################################################
library(BayesPeak)
sig <- readBamGappedAlignments(samplespath[1])
bgr <- readBamGappedAlignments(samplespath[3])
sig <- as(as(sig, "GRanges"), "RangedData")
bgr <- as(as(bgr, "GRanges"), "RangedData")

raw.output <- bayespeak(treatment=sig, control=bgr, start = 1, end = 100000)   
# unreliable.jobs <- log(raw.output$QC$lambda1) < 1.5 # Removal of false positives due to overfitting.
# bpeaks <- as.data.frame(summarise.peaks(raw.output, method = "lowerbound", exclude.jobs = unreliable.jobs)) 
bpeaks <- as.data.frame(summarise.peaks(raw.output, method = "lowerbound"))
source("./data/Fct/chipseqFct.R") # Imports the rangeCoverage function.
sigcovDF <- rangeCoverage(summaryFct=viewMeans, myname="sig_", peaksIR=bpeaks[,1:3], sig=sig, readextend=1, normfactor=10^6/length(start(sig))) 
bgrcovDF <- rangeCoverage(summaryFct=viewMeans, myname="bgr_", peaksIR=bpeaks[,1:3], sig=bgr, readextend=1, normfactor=10^6/length(start(bgr))) 
bpeaksDF <- cbind(bpeaks, sigcovDF[,-1], bgrcovDF[,-1])
bpeaksDF[1:4,] 


###################################################
### code chunk number 9: Rchipseq.Rnw:344-345
###################################################
plotCov(mycov=chip_signal_list[[1]], mychr="Chr1", mypos=c(11000, 11600), ylim=c(0,200))


###################################################
### code chunk number 10: Rchipseq.Rnw:353-357
###################################################
simple_peak <- as.data.frame(as(chip_peak_list[[1]], "IRangesList"))
   # simple_peak <- as.data.frame(chip_peak_list[[1]])
commonpeaks <- subsetByOverlaps(as(bpeaks, "RangedData"), as(simple_peak, "RangedData"), minoverlap=100)
bpeaksDF[bpeaksDF$start %in% start(commonpeaks),][1:4,]


###################################################
### code chunk number 11: Exercise 1
###################################################
pubpeaks <- read.delim("./data/Kaufmann_peaks100k.txt") # Published peaks for first 100kbp on chromosomes.
pubpeaks <- pubpeaks[order(pubpeaks$space, pubpeaks$start),]
pubpeaks[1:4,]
# Import olRanges function, which accepts two GRranges (IRanges) objects
source("./data/Fct/rangeoverlapper.R")


###################################################
### code chunk number 12: Exercise 1
###################################################
myol1 <- subsetByOverlaps(as(pubpeaks, "RangedData"), as(simple_peak, "RangedData"), minoverlap=1) 
myol2 <- olRanges(query=as(as(pubpeaks, "RangedData"), "GRanges"), subject=as(as(simple_peak, "RangedData"), "GRanges"), output="df")
# myol2[myol2["OLpercQ"] >= 50,]
myol3 <- olRanges(query=as(as(pubpeaks, "RangedData"), "GRanges"), subject=as(as(bpeaks, "RangedData"), "GRanges"), output="df")
# myol3[myol3["OLpercQ"] >= 50,]


###################################################
### code chunk number 13: Rchipseq.Rnw:392-402
###################################################
library(rtracklayer); library(GenomicRanges); library(Rsamtools)
gff <- import.gff("./data/TAIR10_GFF3_trunc.gff", asRangedData=FALSE)
seqlengths(gff) <- end(ranges(gff[which(elementMetadata(gff)[,"type"]=="chromosome"),]))
subgene_index <- which(elementMetadata(gff)[,"type"] == "gene")
gffsub <- gff[subgene_index,] # Returns only gene ranges
strand(gffsub) <- "*" # For strand insensitive analysis
gffsub[1:4,1:2]
ids <- elementMetadata(gffsub)[, "group"]
gffgene <- gffsub
gffsub <- split(gffsub) # Coerce to GRangesList


###################################################
### code chunk number 14: Rchipseq.Rnw:409-417
###################################################
library(ChIPpeakAnno)
annoRD <- unlist(gffsub)
names(annoRD) <- gsub(".*=", "", elementMetadata(annoRD)[, "group"])
annoRD <- as(annoRD, "RangedData") 
peaksRD <- RangedData(space=bpeaksDF$space, IRanges(bpeaksDF$start, bpeaksDF$end))
annotatedPeak <- annotatePeakInBatch(peaksRD, AnnotationData = annoRD)
as.data.frame(annotatedPeak)[1:4,1:11]
bpeaksDF[1:4,]


###################################################
### code chunk number 15: Rchipseq.Rnw:425-428
###################################################
source("./data/Fct/rangeoverlapper.R")
olRanges(query=gffgene, subject=as(as(bpeaks, "RangedData"), "GRanges"), output="df")[1:2,]
as.data.frame(annotatedPeak)[c(2,5),1:11] # Corresponding result from ChIPpeakAnno


###################################################
### code chunk number 16: Rchipseq.Rnw:438-452
###################################################
peakranges <- GRanges(seqnames = Rle(bpeaksDF$space), ranges = IRanges(bpeaksDF$start, bpeaksDF$end), 
              strand = Rle(strand("*")), peakIDs=paste("peak", seq(along=bpeaksDF[,1]), sep="_"))
countDF <- data.frame(row.names=elementMetadata(peakranges)[,"peakIDs"])
peakranges <- split(peakranges) # Coerce to GRangesList
for(i in samplespath) {
        aligns <- readBamGappedAlignments(i) # Substitute next two lines with this one.
        counts <- countOverlaps(peakranges, aligns)
        countDF <- cbind(countDF, counts)
}
colnames(countDF) <- samples
rownames(countDF) <- gsub(".*=", "", rownames(countDF))
countDF[1:4,]
write.table(countDF, "./results/countDF", quote=FALSE, sep="\t", col.names = NA)
countDF <- read.table("./results/countDF")


###################################################
### code chunk number 17: Rchipseq.Rnw:460-469
###################################################
returnRPKM <- function(counts, ranges) {
        geneLengthsInKB <- sum(width(ranges))/1000 # Number of bases per sequence range in kbp
        millionsMapped <- sum(counts)/1e+06 # Factor for converting to million of mapped reads.
        rpm <- counts/millionsMapped # RPK: reads per kilobase of sequence range.
        rpkm <- rpm/geneLengthsInKB # RPKM: reads per kilobase of sequence range per million mapped reads.
        return(rpkm)
}
countDFrpkm <- apply(countDF, 2, function(x) returnRPKM(counts=x, ranges=peakranges))
countDFrpkm[1:4,]


###################################################
### code chunk number 18: Rchipseq.Rnw:477-479
###################################################
d <- cor(countDFrpkm, method="spearman")
plot(hclust(dist(1-d))) # Sample tree


###################################################
### code chunk number 19: Rchipseq.Rnw:487-490
###################################################
source("./data/Fct/colAg.R")
countDFrpkm_mean <- colAg(myMA=countDFrpkm, group=c(1,1,2,2), myfct=mean)
countDFrpkm_mean[1:4,]


###################################################
### code chunk number 20: Rchipseq.Rnw:493-499
###################################################
countDFrpkm_mean <- cbind(countDFrpkm_mean, log2ratio=log2(countDFrpkm_mean[,1]/countDFrpkm_mean[,2]))
countDFrpkm_mean <- countDFrpkm_mean[is.finite(countDFrpkm_mean[,3]), ]
degs2fold <- countDFrpkm_mean[countDFrpkm_mean[,3] >= 1 | countDFrpkm_mean[,3] <= -1,]
degs2fold[1:4,]
write.table(degs2fold, "./results/degs2fold", quote=FALSE, sep="\t", col.names = NA)
degs2fold <- read.table("./results/degs2fold")


###################################################
### code chunk number 21: Rchipseq.Rnw:507-519
###################################################
library(DESeq)
countDF <- read.table("./results/countDF")
conds <- targets$Factor
cds <- newCountDataSet(countDF, conds) # Creates object of class CountDataSet derived from eSet class
counts(cds)[1:4, ] # CountDataSet has similar accessor methods as eSet class.
cds <- estimateSizeFactors(cds) # Estimates library size factors from count data. Alternatively, one can provide here the true library sizes with sizeFactors(cds) <- c(..., ...)
cds <- estimateDispersions(cds, fitType="local") # Estimates the variance within replicates
res <- nbinomTest(cds, "bgr1", "sig1") # Calls DEGs with nbinomTest
res <- na.omit(res)
res2fold <- res[res$log2FoldChange >= 1 | res$log2FoldChange <= -1,]
res2foldpadj <- res2fold[res2fold$padj <= 0.2, ] # Here padj set very high for demo purpose
res2foldpadj[1:2,1:8]


###################################################
### code chunk number 22: Rchipseq.Rnw:527-537
###################################################
library(edgeR)
countDF <- read.table("./results/countDF")
y <- DGEList(counts=countDF, group=conds) # Constructs DGEList object
y <- estimateCommonDisp(y) # Estimates common dispersion
y <- estimateTagwiseDisp(y) # Estimates tagwise dispersion
et <- exactTest(y, pair=c("bgr1", "sig1")) # Computes exact test for the negative binomial distribution.
topTags(et, n=4)
edge <- as.data.frame(topTags(et, n=50000)) 
edge2fold <- edge[edge$logFC >= 1 | edge$logFC <= -1,]
edge2foldpadj <- edge2fold[edge2fold$FDR <= 0.01, ]


###################################################
### code chunk number 23: Rchipseq.Rnw:546-555
###################################################
bothDF <- merge(res, countDFrpkm_mean, by.x=1, by.y=0, all=TRUE); bothDF <- na.omit(bothDF)
cor(bothDF[,"log2FoldChange"], bothDF[,"log2ratio"], method="spearman")
source("./data/Fct/overLapper.R")
setlist <- list(edgeR=rownames(edge[order(edge$FDR),][1:20,]), 
	        DESeq=res[order(res$padj),][1:20,"id"], 
                RPKM=rownames(degs2fold[order(-degs2fold$log2ratio),][1:20,]))
OLlist <- overLapper(setlist=setlist, sep="_", type="vennsets")
counts <- sapply(OLlist$Venn_List, length)
vennPlot(counts=counts)


###################################################
### code chunk number 24: Rchipseq.Rnw:588-594
###################################################
library(Biostrings); library(cosmo); library(seqLogo)
pseq <- getSeq(FaFile("./data/tair10chr.fasta"), as(as(bpeaksDF, "RangedData"), "GRanges"))
names(pseq) <- paste(bpeaksDF$space, bpeaksDF$start, sep="_")
write.XStringSet(pseq[1:8], "./results/pseq.fasta") # Note: reduced to 8 sequences to run quickly.
res <- cosmo(seqs="./results/pseq.fasta", silent=TRUE)
plot(res) 


###################################################
### code chunk number 25: Exercise 2
###################################################
chr <- read.DNAStringSet("./data/tair10chr.fasta")
pwm <- PWM(DNAStringSet(print(res@motifs)$motif))
peakN <- sapply(pseq, function(x) countPWM(pwm, x, min.score=0.9)) 
peak1kb_freq <- (sum(peakN)/sum(width(pseq))) * 1000
genomeN <- sapply(chr, function(x) countPWM(pwm, x, min.score=0.9)) 
genome1kb_freq <- (sum(genomeN)/sum(width(chr))) * 1000


###################################################
### code chunk number 26: Rchipseq.Rnw:621-622
###################################################
sessionInfo()