## Exercise Script for R Basics Workshop

## Import the tables into R
my_mw <- read.delim(file="http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/Samples/MolecularWeight_tair7.xls", header=T, sep="\t") 
my_target <- read.delim(file="http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/Samples/TargetP_analysis_tair7.xls", header=T, sep="\t") 

## Check tables and rename gene ID columns

my_mw[1:4,]; my_target[1:4,]; colnames(my_target)[1] <- "ID"; colnames(my_mw)[1] <- "ID" # Prints out the first four rows of the two data frames and renames their gene ID columns.

## Merge the two tables based on common ID field

my_mw_target <- merge(my_mw, my_target, by.x="ID", by.y="ID", all.x=T)

## Shorten one table before the merge and then remove the non-matching rows (NAs) in the merged file

my_mw_target2 <- merge(my_mw, my_target[1:40,], by.x="ID", by.y="ID", all.x=T) # To remove non-matching rows, use the argument setting 'all=F'.
na.omit(my_mw_target2) # Removes rows containing "NAs" (non-matching rows).

## Apply several combinatorial queries (filters) on the data set. For example:

my_mw_target[my_mw_target[,2]>100000 & my_mw_target[,4]=="C",] # Retrieves all records with MW greater than 100000 and targeting to chloroplasts.
dim(my_mw_target[my_mw_target[,2]>100000 & my_mw_target[,4]=="C", ]) # Determines the number of matches for the above query

## Use a regular expression in a substitute function to generate a separate ID column that lacks gene model extensions

my_mw_target3 <- data.frame(loci=gsub("\\..*", "", as.character(my_mw_target[,1]), perl = TRUE), my_mw_target)
my_mw_target3 <- data.frame(loci=gsub("(^.*)\\.\\d{1,}", "\\1", as.character(my_mw_target[,1]), perl = TRUE), my_mw_target) # Same as above but with backreference

## Calculate the number of gene models per loci with the table() function

my_mw_target4 <- cbind(my_mw_target3, Freq=table(my_mw_target3[,1])[as.character(my_mw_target3[,1])]) # The table() function counts the number of duplicates for each loci and cbind() appends the information to the data frame.
my_mw_target4[order(-my_mw_target4$Freq.Freq),][1:30,] # Sorts by count information.
length(unique(my_mw_target4[,1])) # Removes duplicates and prints the number of unique loci.
sum(!duplicated(my_mw_target4[,1])) # Gives the same result as previous command; remember: the function 'duplicated' returns a logical vector.
length(my_mw_target4[,2])-length(unique(my_mw_target4[,1])) # Calculates the number of duplicated entries.
sum(duplicated(my_mw_target4[,1])) # Generates the same result as the previous command.

## Perform a variety calculations across columns

data.frame(my_mw_target4, avg_AA_WT=(my_mw_target4[,3]/my_mw_target4[,4]))[1:40,] # Calculates the average AA weight per protein.
data.frame(my_mw_target4, mean=apply(my_mw_target4[,6:9], 1, mean))[1:40,] # Calculates the mean across several fields of each row.
data.frame(my_mw_target4, mean=apply(my_mw_target4[,6:9], 1, mean), stdev=apply(my_mw_target4[,6:9], 1, sd, na.rm=T))[1:40,] # Calculates the mean and standard deviation across several fields of each row.
# mean(my_mw_target4[,-c(1,2,5)], na.rm=T) # Calculates the mean for all numeric columns.
data.frame(my_mw_target4[1:40,], ttest=apply(my_mw_target4[1:40,6:9], 1, function(x) {t.test(x[1:2], x[3:4])$p.value} )) # Calculates a two-sample t-test for two slices in each row of the data frame and records the corresponding p-values. To calculate a paired t-test, add the argument 'paired=T'.

## Export data frame into Excel

write.table(my_mw_target4, file="my_file.xls", quote=F, sep="\t", col.names = NA) # Writes the data frame 'my_mw_target4' into a tab-delimited text file that can be opened with Excel.

## Plotting samples

pdf("box_plot.pdf"); boxplot(my_mw_target4[,3], my_mw_target4[,4], names=c("MW","CC"), col=c("blue","red"), main="Box Plot"); dev.off() # Generates box plots for MW and AA counts.
pdf("density_plot.pdf"); plot(density(my_mw_target4[,3]), col="blue", main="Density Plot"); dev.off() # Plots density distribution plots for MW.
pdf("bar_plot.pdf"); barplot(my_mw_target4[1:5,3], beside = F, col = c("red", "green", "blue", "magenta", "black"), names.arg=as.vector(my_mw_target4[1:5,1]), main="MW Bar Plot", ylim=c(0,230000)); dev.off() # Bar plot of MW for first five loci.
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/overLapper.R") # Imports the venn diagram function.
setlist <- list(A=sample(my_mw_target4[,1], 1000), B=sample(my_mw_target4[,1], 1100), C=sample(my_mw_target4[,1], 1200), D=sample(my_mw_target4[,1], 1300), E=sample(my_mw_target4[,1], 1400)) # Generates 5 sets of random IDs.
OLlist <- overLapper(setlist=setlist, sep="_", type="vennsets"); counts <- sapply(OLlist$Venn_List, length) # Plots a 5-way venn diagram for the random ID sets.
OLexport <- as.matrix(unlist(sapply(OLlist[[4]], paste, collapse=" "))); write.table(OLexport, file="OLexport.xls", col.names=F, quote=F, sep="\t") # Exports intersect data in tabular format to a file.
pdf("venn_plot.pdf"); vennPlot(counts=counts); dev.off() # Saves the plot image to a PDF file.