SI Text R –based script for the analysis of whole genome (8x15K) microarrays comparing a DDT resistant field population with two more susceptible West African colonies, Akron (Benin) and Ngoussou (Cameroon).
############################################################################LIMMA NORMALISATION OF RAW DATA - DDT 8x15k MICROARRAYS (three way # # comparison, looped design) #
###########################################################################
#Create a number of files for this process:
##Target file – listing the following for each array
#FileNameTreatmentGErepcy3cy5datearraynumberslide
##Spot Type file – containing info on each spot type and designating a #colour to the spot types for subsequent plots i.e.
#SpotTypeControlTypeProbeNameGeneNamecolour
#detoxtarget0DETOX_**brown
#WGtarget0CUST_**black
#CV0CV_A_90_P**yellow
#Brightcorner1GE_BrightCorner*orange
#Darkcorner1DarkCorner*grey
#positivecontrol1**green
#negativecontrol -1*NegativeControlred
##Design file – containing a row for every sample on all arrays; must be in #the same order the arrays will be combined in at a later step i.e. array #numbers and cy3(Green) cy5(Red) order.
#Array Dye SampleGroup
#US84700254_252705710011_S01re-b_GE2_105_Jan09_2_2.txt Cy51ddt
#US84700254_252705710011_S01re-b_GE2_105_Jan09_2_2.txt Cy34akron
#US84700254_252705710007_S01re-b_GE2_105_Jan09_1_2.txt Cy54akron
#US84700254_252705710007_S01re-b_GE2_105_Jan09_1_2.txt Cy38ngou
#US84700254_252705710007_S01re-b_GE2_105_Jan09_2_4.txt Cy58ngou
#US84700254_252705710007_S01re-b_GE2_105_Jan09_2_4.txt Cy32ddt
#US84700254_252705710011_S01re-b_GE2_105_Jan09_1_3.txt Cy52ddt
#US84700254_252705710011_S01re-b_GE2_105_Jan09_1_3.txt Cy36akron
#set working directory – in R using LIMMA & MAANOVA
setwd ("c:/Documents and Settings/LOCATION OF SCAN FILES")
getwd()
#open limma
source("
biocLite("limma")
library(limma)
#read in target file
targets<- readTargets ("DDTTargets.txt", row.names="arraynumber")
targets
#weight spots i.e. remove spots with signal below background signal using #a boolean formula "rIsWellAboveBG" 1=yes 0=no, R&G sig=1 R+G>1
myfun<-function (x) {
a=x[,"rIsWellAboveBG"] == 1
b=x[,"gIsWellAboveBG"] == 1
as.numeric(a+b >= 1)
}
#read in array files using weight function, default Limma setting for #Agilent arrays is foreground mean signal, background median signal
RG<- read.maimages(targets, source="agilent", wt.fun=myfun)
show(RG)
#check all genes and arrays read correctly,
dim(RG)
names(RG$genes)
#order genes so duplicates are next to each other index=order(RG$genes$ProbeName)
RG=RG[index,]
#read in SpotType file
spottypes<- readSpotTypes("DDTSpotTypes.txt")
spottypes
#match spot types to genes in the RG (read array) list ('status'=character #vector giving the control status of each spot on the array)
RG$genes$Status<- controlStatus(spottypes,RG)
#Create a pre-normalisation
MA-plot
#or to create png images to file
plotMA3by2(RG, prefix=”pre-normalised_MA”)
#plot pre-normalised signal intensities
plotDensities(RG)
#save plot image
Jpeg(‘pre-normalised_densities’)
plotDensities(RG)
dev.off()
#background correction
#method="none" then no correction is done, i.e., the background intensities #are treated as zero. The offset can be used to add a constant to the #intensities before log-transforming, so that the log-ratios are shrunk #towards zero at the lower intensities. This may eliminate or reverse the #usual 'fanning' of log-ratios at low intensities associated with local #background subtraction.
RG.b<- backgroundCorrect(RG, method="none", offset=50)
plotMA3by2(RG.b, prefix="backgroundnonoffset50_MA")
#plot background corrected signal intensities
plotDensities(RG.b)
#save plot image
Jpeg(‘backgroundnoneoffset50_densities’)
plotDensities(RG.b)
dev.off()
#weighting different spots to be more/less important during normalisation #i.e. weight differentially expressed genes 0 and genes not meant to be #differentially expressed 1
RG$weights=modifyWeights(RG$weights, RG$genes$Status, spottypes$SpotType, c (rep(1,3), rep(0,4)))
#within array normalisation method loess
MA=normalizeWithinArrays(RG.b, method="loess")
#plot MA for loess normalised data
plotMA3by2(MA,prefix="DDT_background_none_offset50_loess",col=spottypes$colour, zero.weights=TRUE, common.lim=TRUE)
#boxplot – check the distribution of the red and green ratios for each #array to
boxplot(MA$M[MA$weights==1]~col(MA$M)[MA$weights==1], names=colnames(MA$M), las=2, main="nobackgroundoffset50loesswithinDDT", ylab="M", xaxt="none", ylim=c(-10,10))
axis(1, at=c(1:length(targets[,1])), label=targets$Name, las=2)
#plot densities – check that the red and green dye distribution plotDensities(MA)
############################################################################CONVERSION TO MAANOVA FORMAT AND RUNNING MAANOVA #
###########################################################################
RG.corrected<- RG.MA(MA)
MetaRow=1
MetaCol=1
#bind all the arrays together horizontally
ghanaDDT1.raw=as.data.frame(cbind(RG.corrected[,1]$genes[c(7:11, 4)], MetaRow, MetaCol,RG.corrected[,1]$genes[1:2], RG.corrected[,1]$R, RG.corrected[,1]$G, 1-RG.corrected[,1]$weights))
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,2]$R, RG.corrected[,2]$G, 1-RG.corrected[,2]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,3]$R, RG.corrected[,3]$G, 1-RG.corrected[,3]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,4]$R, RG.corrected[,4]$G, 1-RG.corrected[,4]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,5]$R, RG.corrected[,5]$G, 1-RG.corrected[,5]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,6]$R, RG.corrected[,6]$G, 1-RG.corrected[,6]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,7]$R, RG.corrected[,7]$G, 1-RG.corrected[,7]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,8]$R, RG.corrected[,8]$G, 1-RG.corrected[,8]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,9]$R, RG.corrected[,9]$G, 1-RG.corrected[,9]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,10]$R, RG.corrected[,10]$G, 1-RG.corrected[,10]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,11]$R, RG.corrected[,11]$G, 1-RG.corrected[,11]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,12]$R, RG.corrected[,12]$G, 1-RG.corrected[,12]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,13]$R, RG.corrected[,13]$G, 1-RG.corrected[,13]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,14]$R, RG.corrected[,14]$G, 1-RG.corrected[,14]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,15]$R, RG.corrected[,15]$G, 1-RG.corrected[,15]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,16]$R, RG.corrected[,16]$G, 1-RG.corrected[,16]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,17]$R, RG.corrected[,17]$G, 1-RG.corrected[,17]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,18]$R, RG.corrected[,18]$G, 1-RG.corrected[,18]$weights))
ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)
#add suffix to the array columns bound in the above command to indicate #which values are red/green and what the weighting for each spot names(ghanaDDT1.raw)[seq(11,length(names(ghanaDDT1.raw)),3)]=paste(names(ghanaDDT1.raw)[seq(11,length(names(ghanaDDT1.raw)),3)],"R",sep=".")
names(ghanaDDT1.raw)[seq(12,length(names(ghanaDDT1.raw)),3)]=paste(names(ghanaDDT1.raw)[seq(12,length(names(ghanaDDT1.raw)),3)],"G",sep=".")
names(ghanaDDT1.raw)[seq(13,length(names(ghanaDDT1.raw)),3)]=paste(names(ghanaDDT1.raw)[seq(13,length(names(ghanaDDT1.raw)),3)],"W",sep=".")
ghanaDDT1.raw$ProbeName=substr(ghanaDDT1.raw$ProbeName, 1, 13)
#write a csv table of combined array info
write.csv(ghanaDDT1.raw, file="ghanaDDT1raw_bkgrd_none_offset50_loess.csv")
#import array experimental design
design=read.table("DDTdesignfile.txt", header=TRUE)
write.table(ghanaDDT1.raw, "ghanaDDT_bkgrd_none_offset50_loess.data.txt", row.names=FALSE, sep="\t", quote=FALSE)
#import data.txt and design file into j/MAANOVA (Java based interface) or continue in R and MAANOVA
#Note – can remove control spots at this point manually from the data file #are they are no longer required in analysis…
#Note – ensure order of arrays and dyes in the design file EXACTLY match the ordering in the data file
#JMAANOVA COMMANDS:
#read in data and Log transform (as recommended).
DDT_data_260410 <- read.madata(datafile="C:\\Documents and Settings\\saram\\My Documents\\microarrayscans\\8x15Kscans\\RE-EXTRACTION amended design file\\ghanaDDT_bkgrd_none_offset50_loessGENESonly.data.txt", designfile="C:\\Documents and Settings\\saram\\My Documents\\microarrayscans\\8x15Kscans\\RE-EXTRACTION amended design file\\DDTdesignfile.txt", arrayType="twoColor", header=TRUE, spotflag=TRUE, n.rep=1, metarow=7, metacol=8, row=9, col=10, probeid=1, intensity=11, log.trans=TRUE)
#FIT MIXED EFFECT MODEL (restricted maximum likelihood method used to fit #linear model as default setting) Given the data and formula, this function #fits the regression model for each gene and calculates the ANOVA #estimates, variance components for random terms, fitted values, etc. For a #mixed effect models, the output estimates will be BLUE and BLUP.
DDT_data_260410.fit <- fitmaanova(madata=DDT_data_260410, formula=~Array+Dye+Sample+Group, random=~Array+Sample, method="REML", verbose=TRUE, subCol=FALSE)
#F-TEST UNIVERSAL – tests all combinations of groups. As the experimental #size is small (<5 pools per group) permutation testing in not viable or #recommended therefore we will rely on tabulated pvalues hence n.perm=1.
DDT_data_260410.f_test_Group <- matest(data=DDT_data_260410, anovaobj=DDT_data_260410.fit, term="Group", test.type="ftest", MME.method="REML", n.perm=1, verbose=TRUE)
#CHECK CONTRAST MATRIX via ‘nameoftestfile.test(f or pairwise t)$Contrast’
DDT_data_260410.f_test_Group$Contrast
# [,1] [,2] [,3]
#[1,] 1 0 -1
#[2,] 0 1 -1
#PAIR-WISE T TEST all (especially important when have > 2 groups). Define #the pairwise test in the matrix within the J/MAANOVA window. Genes within #groups labelled ‘+1’ in the analysis will appear in the positive (RHS) of #the volcano plots and will be positive in the log FC values.
DDT_data_260410.ttest_all <- matest(data=DDT_data_260410, anovaobj=DDT_data_260410.fit, term="Group", test.type="ttest", Contrast=PairContrast(3), MME.method="REML", n.perm=1, verbose=TRUE)
#pairwise T test: defined comparison in jmaanova i.e.
> DDT_data_260410.akronplus1ngouminus1NEW <- matest(data=DDT_data_260410, anovaobj=DDT_data_260410.fit, term="Group", test.type="ttest", Contrast=new("matrix", nrow=1, ncol=3, byrow=TRUE, c(1.0, 0.0, -1.0)), MME.method="REML", n.perm=1, verbose=TRUE)
#ADJUST p-value i.e. correct for multiple testing (FDR adjustment) (DDT_data_260410.ddtplus1ngouminus1_FDR=adjPval(DDT_data_260410.ddtplus1ngouminus1, method="jsFDR")
#FDR results table
ind=seq(1, length(DDT_data_260410$probeid))
DDT.tab1=as.data.frame(cbind(ProbeID= DDT_data_260410.akronplus1ngouminus1NEW_FDR$obsAnova$probeid[ind], Gene= DDT_data_260410$genename[ind], Description= DDT_data_260410$description[ind], Status= DDT_data_260410$status[ind]))
DDT.tab1$logFC=DDT_data_260410.akronplus1ngouminus1NEW_FDR $obsAnova$Group[,2]- DDT_data_260410.akronplus1ngouminus1NEW_FDR $obsAnova$Group[,1]
DDT.tab1$Pval=DDT_data_260410.akronplus1ngouminus1NEW_FDR$Fs$Ptab
DDT.tab1$adjPval=DDT_data_260410.akronplus1ngouminus1NEW_FDR$Fs$adjPtab
DDT.tab1$Flag=DDT_data_260410.akronplus1ngouminus1NEW_FDR$obsAnova$flag
write.table(DDT.tab1, row.names=F, file="TOPTABLE_ DDT_data_260410.akronplus1ngouminus1NEW_FDR.txt", sep="\t", quote=F)
#check all objects created in J/MAANOVA via
objects()
#CREATE A RESULTS TABLE from single defined PAIRWISE comparisons.
#i.e. akron vs ddt (2v1 from matrix) DDT_data_260410.akronminus1ddtplus1
ind=seq(1, length(DDT_data_260410$probeid))
DDT.tab=as.data.frame(cbind(ProbeID=DDT_data_260410.akronminus1ddtplus1$obsAnova$probeid[ind], Gene=DDT_data_260410$genename[ind], Description=DDT_data_260410$description[ind], Status=DDT_data_260410$status[ind]))
DDT.tab$logFC=DDT_data_260410.akronminus1ddtplus1$obsAnova$Group[,2]- DDT_data_260410.akronminus1ddtplus1$obsAnova$Group[,1]
DDT.tab$Pval=DDT_data_260410.akronminus1ddtplus1$Fs$Ptab
DDT.tab$adjPval=DDT_data_260410.akronminus1ddtplus1$Fs$adjPtab
DDT.tab$Flag=DDT_data_260410.akronminus1ddtplus1$obsAnova$flag
write.table(DDT.tab, row.names=F, file="DDT_bkgrdnoneoff50loessGENESpairwiseDDTvsAKRON.txt", sep="\t", quote=F)
#R-COMMANDS MAANOVA
library(maanova)
ghanaDDT<-read.madata(datafile="ghanaDDT.data.txt", designfile="DDTdesignfile.txt", arrayType="twoColor", header=TRUE, probeid=1, genename=2, systematicname=3, description=4, status=5, sequence=6, metarow=7, metacol=8, row=9, col=10, intensity=11, spotflag=TRUE, avgreps=0, log.trans=TRUE)
ghanaDDT
(ghanaDDT)
gridcheck(ghanaDDT)
#LOAD PACKAGE
source("
biocLite("qvalue")
#FIT MIXED MODEL ANOVA (removed spot as no reps therefore no spot effect
ghanaDDT.fit<-fitmaanova(ghanaDDT, formula=~Array+Dye+Sample+Group, random=~Array+Sample, covariate=~1)
save (ghanaDDT.fit, file="ghanaDDTfit")
#RESIDUAL PLOT
resiplot(ghanaDDT, ghanaDDT.fit)
#resiplot(DDT_data_260410, DDT_data_260410.fit)
#TEST STATISTIC (defined the test type (global F-test)
testDDT.unadj <- matest(data=ghanaDDT, anovaobj=ghanaDDT.fit, term="Group", test.type="ftest", MME.method="REML", n.perm=1, verbose=TRUE)
#adjust p-values
DDTtest=adjPval(test.unadj, method="jsFDR")
#plot VOLCANO and save to file
volcano(test, method="fdrperm")
jpeg('volcano DDT1 perm 50')
plot(DDT1.tab$logFC, -log10(DDT1.tab$adjPval))
save.image()
save.image("DDTanalysis060410.Rhistory")
savehistory()
savehistory("DDTanalysis060410.Rhistory")