############################################
##          Package Installation          ##
############################################
# source("http://bioconductor.org/biocLite.R")
# biocLite()
# previously_installed <- row.names(installed.packages())  
# current <- c("affyio", "affycoretools", "GOstats", "Ruuid", "graph", "GO.db", "Category", "plier", "simpleaffy", "ath1121501.db", "ath1121501cdf", "ath1121501probe", "ricecdf", "riceprobe", "biomaRt")
# previously_installed[!previously_installed %in% current]
############################################

#######################################################
## (A) Download a sample set of Affymetrix cel files ##
#######################################################
## Right-click this link (http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/Samples/Workshop.zip) 
## and save its content to your computer: Workshop.zip. These sample CEL 
## files are from the GEO data set: GSE5621. After unpacking this file 
## archive one should see six *.cel files.

## (A.1) Generate RMA expression data, MAS5 P/M/A calls and export results to Excel
library(affy); library(ath1121501probe)
mydata <- ReadAffy() # Reads cel files in current working directory into affybatch object 'mydata'. 
print.data.frame(ath1121501probe[1:22,]) # Prints probe sequences and their positions for first two Affy IDs.
eset_rma <- rma(mydata) # Generates RMA expression values and stores them as ExpressionSet.
exprs(eset_rma)[1:4,] # Prints first 4 rows in data frame structure.
# exprs(eset_rma) <- 2^(exprs(eset_rma))[1:4,] # If needed unlog RMA expression values.
mydf <- 2^exprs(eset_rma)
myList <- tapply(colnames(mydf), c(1,1,2,2,3,3), list)
names(myList) <- sapply(myList, paste, collapse="_")
mymean <- sapply(myList, function(x) rowMeans(mydf[,x])) # Generic approach for calculating mean values for any sample combination.
eset_pma <- mas5calls(mydata) # Generates MAS 5.0 P/M/A calls.
my_frame <- data.frame(exprs(eset_rma), exprs(eset_pma), assayDataElement(eset_pma, "se.exprs")) # Combine RMA intensities, P/M/A calls plus their wilcoxon p-values in one data frame.
my_frame <- my_frame[, sort(names(my_frame))] # Sorts columns by cel file name.
write.table(my_frame, file="my_file.xls", sep="\t", col.names = NA) # Exports data to text file that can be imported into Excel.

## (A.2) Add annotation information
library("ath1121501.db") # Loads required annotation package.
## Constructs a data frame containing the gene IDs, gene symbols and descriptions for all probe sets on the chip.
Annot <- data.frame(ACCNUM=sapply(contents(ath1121501ACCNUM), paste, collapse=", "), 
                    SYMBOL=sapply(contents(ath1121501SYMBOL), paste, collapse=", "), 
                    DESC=sapply(contents(ath1121501GENENAME), paste, collapse=", ")) 
all <- merge(Annot, my_frame, by.x=0, by.y=0, all=T) # Merges everything with above expression data.
write.table(all, file="my_annot_file.xls", sep="\t", col.names = NA) # Exports data to text file that can be imported into Excel.

## (B.1) Visualization and quality control
d <- cor(2^exprs(eset_rma), method="pearson"); plot(hclust(dist(1-d))) # Generates a correlation matrix for all-against-all chip comparisons.
library(affyQCReport)
QCReport(mydata, file="ExampleQC.pdf") # Generates a comprehensive QC report for the AffyBatch object 'mydata' in PDF format. See affyQCReport for details.
eset <- eset_rma # To work with following commands.
## Continue with commands in section "Visualization and quality controls".

## (B.2) Simple Affy (command summary)
## Save this covdesc.txt to your R working directory.
## Generate expression data with RMA and MAS5.
## Filter each of the three data sets with the following parameters: 2-fold changes, present in all 4 chips and p-score less than 0.001.
## Write the results into separate files.
## Create scatter plots for the filtered data sets and save them to external image files.
## Identify the overlap of the significant changes between the RMA and MAS5 data.
## Perform simpleaffy QC checks: scaling factor, percent present calls, etc.

## (C.1) Identify differtially expressed genes (DEGs) with the limma package
## Commands from the affy Limma Section in manual. 
## cDNA microarray users can save and extract the SWIRL cDNA microarray sample 
## data. For a quick demonstration of the analysis of this data set, one can 
## copy&paste or source the following command-line summary into the R terminal: my_swirl_commands.txt.

## For affy analysis download this targets file: http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/affy_targets.txt
library(limma) # Loads limma library.
targets <- readTargets("affy_targets.txt") # Reads targets information from file 'affy_targets.txt' and assigns it to targets frame.
data <- ReadAffy(filenames=targets$FileName) # Reads CEL files (specified in 'targets') into AffyBatch object.
eset <- rma(data) # Normalizes data with 'rma' function and assigns them to ExpressionSet object (see above).
# exprs(eset) <- log2(exprs(eset)) # If eset contains absolute intensity values like MAS5 results, then they should be transformed to log2 (or loge) values for limma. RMA/GCRMA generate log2 values and MAS5 produces absolute values.
pData(eset) # Lists the analyzed file names.
write.exprs(eset, file="affy_all.xls") # Exports all affy expression values to tab delimited text file. The MAS 5.0 P/M/A calls can be retrieved with the simpleaffy package or with the affy package like this: 'eset <- mas5calls(data); write.exprs(eset, file="my_PMA.txt")'.
design <- model.matrix(~ -1+factor(c(1,1,2,2,3,3))) # Creates appropriate design matrix. Alternatively, such a design matrix can be created in any spreadsheet program and then imported into R.
colnames(design) <- c("group1", "group2", "group3") # Assigns column names.
fit <- lmFit(eset, design) # Fits a linear model for each gene based on the given series of arrays.
contrast.matrix <- makeContrasts(group2-group1, group3-group2, group3-group1, levels=design) # Creates appropriate contrast matrix to perform all pairwise comparisons. Alternatively, such a contrast matrix can be created in any spreadsheet program and then imported into R. For complex experiments one can also use this function to compute a contrast matrix with all possible pairwise comparisons.
fit2 <- contrasts.fit(fit, contrast.matrix) # Computes estimated coefficients and standard errors for a given set of contrasts.
fit2 <- eBayes(fit2) # Computes moderated t-statistics and log-odds of differential expression by empirical Bayes shrinkage of the standard errors towards a common value.
topTable(fit2, coef=1, adjust="fdr", sort.by="B", number=10) # Generates list of top 10 ('number=10') differentially expressed genes sorted by B-values ('sort.by=B') for each of the three comparison groups ('coef=1') in this sample set. The summary table contains the following information: logFC is the log2-fold change, the AveExpr is the average expression value accross all arrays and channels, the moderated t-statistic (t) is the logFC to its standard error, the P.Value is the associated p-value, the adj.P.Value is the p-value adjusted for multiple testing and the B-value (B) is the log-odds that a gene is differentially expressed (the-higher-the-better). Usually one wants to base gene selection on the adjusted P-value rather than the t- or B-values. More details on this can be found in the limma PDF manual (type 'limmaUsersGuide()') or on this FAQ page.
write.table(topTable(fit2, coef=1, adjust="fdr", sort.by="B", number=50000), file="limma_complete.xls", row.names=F, sep="\t") # Exports complete limma statistics table for first comparison group ('coef=1') to tab delimited text file.
results <- decideTests(fit2, p.value=0.05); vennDiagram(results) # Creates venn diagram of all changed genes with p-value equal or less than 0.05.
x <- topTable(fit2, coef=1, adjust="fdr", sort.by="P", number=50000); y <- x[x$adj.P.Val < 0.05,]; y; print("Number of genes in this list:"); length(y$ID) # Filters out candidates that have P-values < 0.05 in each group ('coef=1') and provides the number of candidates for each list. These numbers should be identical with the sum of the values in each circle of the above venn diagram.
x <- topTable(fit2, coef=1, adjust="fdr", sort.by="P", number=50000); y <- x[x$adj.P.Val < 0.01 & (x$logFC > 1 | x$logFC < -1) & x$AveExpr > 10,]; y; print("Number of genes in this list:"); length(y$ID) # Same as above but with complex filter: P-value < 0.01 AND at least 2-fold change AND expression value A > 10.
results <- decideTests(fit2, p.value=0.000005); heatDiagram(results, fit2$coef, primary=1) # This function plots heat diagram gene expression profiles for genes which are significantly differentially expressed in the primary condition (this is not a cluster analysis heat map). Genes are sorted by differential expression under the primary condition. The argument 'primary=1' selects the first contrast column in the 'results' matrix as primary condition. The plotted genes can be extracted like this 'results[results[,1]==1,]'. More information on this function can be found in the limma manual.


## (D.1) GO Term enrichment analysis with GOstats
library(ath1121501.db); library(GOstats)
affySample <- c("266592_at", "266703_at", "266199_at", "246949_at", "267370_at", "267115_s_at", "266489_at", "259845_at", "266295_at", "262632_at")
geneSample <- as.vector(unlist(mget(affySample, ath1121501ACCNUM, ifnotfound=NA)))
library(ath1121501cdf)
affyUniverse <- ls(ath1121501cdf)
geneUniverse <- as.vector(unlist(mget(affyUniverse, ath1121501ACCNUM, ifnotfound=NA)))
params <- new("GOHyperGParams", geneIds = geneSample, universeGeneIds = geneUniverse, annotation="ath1121501", 
               ontology = "MF", pvalueCutoff = 0.5, conditional = FALSE, testDirection = "over")
hgOver <- hyperGTest(params); summary(hgOver)
htmlReport(hgOver, file = "MyhyperGresult.html") 

## (D.2) GO batch analysis using the GOHyperGAll script
## Download and unzip the annotation objects for Arabidopsis (http://faculty.ucr.edu/%7Etgirke/Documents/R_BioCond/Samples/ArabSampleGOHyperGAll.zip) 
## into your working directory. Then continue with the following commands:
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/GOHyperGAll.txt") # Imports the GOHyperGAll functions.
loadData(); load(file="MF_node_affy_list"); load(file="BP_node_affy_list"); load(file="CC_node_affy_list") # Loads the downloaded annotation objects for Arabidopsis.
GOHyperGAll_result <- GOHyperGAll(gocat="MF", sample=unique(as.vector(GO_MF_DF[1:40,2])), Nannot=2); GOHyperGAll_result[1:10,-8] # Performs the enrichment test for provided set of gene identifiers.
CL_DF <- data.frame(geneID=GO_MF_DF[1:400,2], ClusterID=sort(rep(1:4,100)), ClusterSize=rep(100,400)) # Create sample data set for batch processing.
BatchResult <- GOCluster_Report(CL_DF=CL_DF, method="all", id_type="gene", CLSZ=10, cutoff=0.001, gocats=c("MF", "BP", "CC"), recordSpecGO=c("GO:0003674", "GO:0008150", "GO:0005575")); BatchResult[1:4,-10] # Performs all three GO analyses for many sample set at once. When the method argument is set to "slim" then the goSlim method is used.
write.table(BatchResult, file="GO_Batch.xls", sep="\t", col.names = NA) # Exports batch result to Excel file.

## (E.1) Clustering of differentially expressed genes
my_fct <- function(x) hclust(x, method="complete") # Creates function to perform hierarchical clustering (complete linkage).
heatmap(as.matrix(2^exprs(eset)[1:40,]), col = cm.colors(256), hclustfun=my_fct) # Plots heatmap with dendrograms.
## Replace in last step 'exprs(eset)[1:40,]' by matrix of differentially expressed genes from limma analysis.

## (F.1) Clean up this R script and execute it with the source() function
source("array.R") # Executes all of the above commands and generates the corresponding output files in the current working directory.