########################
## ChIP-Seq Exercises ##
########################
## Workshop exercise code from Oct 30, 2010
## For updates see this site:
## http://manuals.bioinformatics.ucr.edu/home/ht-seq#TOC-Exercises

###################
## Read Coverage ##
###################
## Import sample ChIP-Seq data 'cstest', estimate its mean insert length with the SISSR method and extend reads accordingly.
library(chipseq); library(lattice)
data(cstest); cstest


## Calculate read coverage, estimate its minimum value for calling peaks at an FDR of 0.0001 and call the peaks accordingly.


## Determine the number of peaks for variable read extension values (e.g. 100 and 200) and coverage cutoffs (e.g. 10 and 20).


## Identify the coordinates of the peak with the highest coverage.
## Then create a coverage plot for this region with this plotting function:

plotCov <- function(mycov=cov, mychr=1, mypos=c(1,1000), mymain="Coverage", ...) {
    op <- par(mar=c(8,3,6,1))
    plot(as.numeric(mycov[[mychr]][mypos[1]:mypos[2]]), type="l",
    lwd=1, col="blue", ylab="", main=mymain, xlab="", xaxt="n", ...)
    axis(1, las = 2, at=seq(1,mypos[2]-mypos[1], length.out= 10),
    labels=as.integer(seq(mypos[1], mypos[2], length.out= 10)))
    par(op)
    }


## Add Coverage Information with the rangeCoverage function from: 
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/chipseqFct.R") # Imports the rangeCoverage function.


## Call peaks with different coverage cutoffs (e.g. 8 & 12) and intersect the peaks that overlap by at least by 100bp among the two sets.


###############
## SOLUTIONS ##
###############

##############
## Coverage ##
##############
## Import sample ChIP-Seq data 'cstest', estimate its mean insert length with the SISSR method and extend reads accordingly.
library(chipseq); library(lattice)
data(cstest); cstest
estimate.mean.fraglen(cstest$ctcf, method="SISSR")
ext <- resize(cstest$ctcf, width = 200)

## Calculate read coverage, estimate its minimum value for calling peaks at an FDR of 0.0001 and call the peaks accordingly.
cov <- coverage(ext)
peakCutoff(cov, fdr = 0.0001)
peaks <- slice(cov, lower = 8)
peaks.sum <- peakSummary(peaks)

## Determine the number of peaks for variable read extension values (e.g. 100 and 200) and coverage cutoffs (e.g. 10 and 20).
chipPar <- function(mydata=cstest$ctcf, myext, mycov) {
    ext <- resize(mydata, width = myext)
    cov <- coverage(ext)
    peaks <- slice(cov, lower = mycov)
    peaks.sum <- peakSummary(peaks)
    length(as.data.frame(peaks.sum)[,1])
}
chipPar(mydata=cstest$ctcf, myext=200, mycov=20)
par1 <- c(50, 200); names(par1) <- par1; par2 <- c(10, 20); names(par2) <- par2
sapply(par1, function(x) sapply(par2, function(y) chipPar(mydata=cstest$ctcf, myext=x, mycov=y)))

## Identify the coordinates of the peak with the highest coverage.
## Then create a coverage plot for this region with this plotting function:

plotCov <- function(mycov=cov, mychr=1, mypos=c(1,1000), mymain="Coverage", ...) {
    op <- par(mar=c(8,3,6,1))
    plot(as.numeric(mycov[[mychr]][mypos[1]:mypos[2]]), type="l",
    lwd=1, col="blue", ylab="", main=mymain, xlab="", xaxt="n", ...)
    axis(1, las = 2, at=seq(1,mypos[2]-mypos[1], length.out= 10),
    labels=as.integer(seq(mypos[1], mypos[2], length.out= 10)))
    par(op)
    }

highest <- as.data.frame(peaks.sum)
highest[rev(order(highest$max)),][1:4,]
plotCov(mycov=cov, mychr="chr11", mypos=c(3092584,3093614))


## Add Coverage Information with the rangeCoverage function from: 
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/chipseqFct.R") # Imports the rangeCoverage function.

highest <- highest[highest$space=="chr10",]
peaksIR <- IRanges(start=start(peaks.sum["chr10"]), end=end(peaks.sum["chr10"]))
sig <- RangedData(cstest$ctcf)["chr10"]
sigcovDF <- rangeCoverage(summaryFct=viewMaxs, myname="sig_", peaksIR=peaksIR, sig=sig, readextend=200, normfactor=10^6/length(start(sig))) # Returns a data frame with the maximum coverage information for each peak normalized per million reads (see 'normfactor' argument). The first column contains the total coverage and the other two the coverage for the positive and negative strands, respectively. To obtain mean coverage values, assign 'viewMeans' to the summaryFct argument.
peaksDF <- cbind(highest, sigcovDF[,-1]); peaksDF[1:4,] # Constructs a data frame containing the peaks called by PICS as well as the corresponding maximum coverage informati
on for each peak.

## Call peaks with different coverage cutoffs (e.g. 8 & 12) and intersect the peaks that overlap by at least by 100bp among the two sets.

peaks8 <- peakSummary(slice(cov, lower = 8))
peaks12 <- peakSummary(slice(cov, lower = 12))
commonpeaks <- subsetByOverlaps(peaks8, peaks12, minoverlap=100)
peaksDF[peaksDF$start %in% start(commonpeaks),][1:4,] # Returns the corresponding coverage information for the common peaks.