SI Text R –based script for the analysis of whole genome (8x15K) microarrays comparing a DDT resistant field population with two more susceptible West African colonies, Akron (Benin) and Ngoussou (Cameroon).

############################################################################LIMMA NORMALISATION OF RAW DATA - DDT 8x15k MICROARRAYS (three way # # comparison, looped design) #

###########################################################################

#Create a number of files for this process:

##Target file – listing the following for each array

#FileNameTreatmentGErepcy3cy5datearraynumberslide

##Spot Type file – containing info on each spot type and designating a #colour to the spot types for subsequent plots i.e.

#SpotTypeControlTypeProbeNameGeneNamecolour

#detoxtarget0DETOX_**brown

#WGtarget0CUST_**black

#CV0CV_A_90_P**yellow

#Brightcorner1GE_BrightCorner*orange

#Darkcorner1DarkCorner*grey

#positivecontrol1**green

#negativecontrol -1*NegativeControlred

##Design file – containing a row for every sample on all arrays; must be in #the same order the arrays will be combined in at a later step i.e. array #numbers and cy3(Green) cy5(Red) order.

#Array Dye SampleGroup

#US84700254_252705710011_S01re-b_GE2_105_Jan09_2_2.txt Cy51ddt

#US84700254_252705710011_S01re-b_GE2_105_Jan09_2_2.txt Cy34akron

#US84700254_252705710007_S01re-b_GE2_105_Jan09_1_2.txt Cy54akron

#US84700254_252705710007_S01re-b_GE2_105_Jan09_1_2.txt Cy38ngou

#US84700254_252705710007_S01re-b_GE2_105_Jan09_2_4.txt Cy58ngou

#US84700254_252705710007_S01re-b_GE2_105_Jan09_2_4.txt Cy32ddt

#US84700254_252705710011_S01re-b_GE2_105_Jan09_1_3.txt Cy52ddt

#US84700254_252705710011_S01re-b_GE2_105_Jan09_1_3.txt Cy36akron

#set working directory – in R using LIMMA & MAANOVA

setwd ("c:/Documents and Settings/LOCATION OF SCAN FILES")

getwd()

#open limma

source("

biocLite("limma")

library(limma)

#read in target file

targets<- readTargets ("DDTTargets.txt", row.names="arraynumber")

targets

#weight spots i.e. remove spots with signal below background signal using #a boolean formula "rIsWellAboveBG" 1=yes 0=no, R&G sig=1 R+G>1

myfun<-function (x) {

a=x[,"rIsWellAboveBG"] == 1

b=x[,"gIsWellAboveBG"] == 1

as.numeric(a+b >= 1)

}

#read in array files using weight function, default Limma setting for #Agilent arrays is foreground mean signal, background median signal

RG<- read.maimages(targets, source="agilent", wt.fun=myfun)

show(RG)

#check all genes and arrays read correctly,

dim(RG)

names(RG$genes)

#order genes so duplicates are next to each other index=order(RG$genes$ProbeName)

RG=RG[index,]

#read in SpotType file

spottypes<- readSpotTypes("DDTSpotTypes.txt")

spottypes

#match spot types to genes in the RG (read array) list ('status'=character #vector giving the control status of each spot on the array)

RG$genes$Status<- controlStatus(spottypes,RG)

#Create a pre-normalisation

MA-plot

#or to create png images to file

plotMA3by2(RG, prefix=”pre-normalised_MA”)

#plot pre-normalised signal intensities

plotDensities(RG)

#save plot image

Jpeg(‘pre-normalised_densities’)

plotDensities(RG)

dev.off()

#background correction

#method="none" then no correction is done, i.e., the background intensities #are treated as zero. The offset can be used to add a constant to the #intensities before log-transforming, so that the log-ratios are shrunk #towards zero at the lower intensities. This may eliminate or reverse the #usual 'fanning' of log-ratios at low intensities associated with local #background subtraction.

RG.b<- backgroundCorrect(RG, method="none", offset=50)

plotMA3by2(RG.b, prefix="backgroundnonoffset50_MA")

#plot background corrected signal intensities

plotDensities(RG.b)

#save plot image

Jpeg(‘backgroundnoneoffset50_densities’)

plotDensities(RG.b)

dev.off()

#weighting different spots to be more/less important during normalisation #i.e. weight differentially expressed genes 0 and genes not meant to be #differentially expressed 1

RG$weights=modifyWeights(RG$weights, RG$genes$Status, spottypes$SpotType, c (rep(1,3), rep(0,4)))

#within array normalisation method loess

MA=normalizeWithinArrays(RG.b, method="loess")

#plot MA for loess normalised data

plotMA3by2(MA,prefix="DDT_background_none_offset50_loess",col=spottypes$colour, zero.weights=TRUE, common.lim=TRUE)

#boxplot – check the distribution of the red and green ratios for each #array to

boxplot(MA$M[MA$weights==1]~col(MA$M)[MA$weights==1], names=colnames(MA$M), las=2, main="nobackgroundoffset50loesswithinDDT", ylab="M", xaxt="none", ylim=c(-10,10))

axis(1, at=c(1:length(targets[,1])), label=targets$Name, las=2)

#plot densities – check that the red and green dye distribution plotDensities(MA)

############################################################################CONVERSION TO MAANOVA FORMAT AND RUNNING MAANOVA #

###########################################################################

RG.corrected<- RG.MA(MA)

MetaRow=1

MetaCol=1

#bind all the arrays together horizontally

ghanaDDT1.raw=as.data.frame(cbind(RG.corrected[,1]$genes[c(7:11, 4)], MetaRow, MetaCol,RG.corrected[,1]$genes[1:2], RG.corrected[,1]$R, RG.corrected[,1]$G, 1-RG.corrected[,1]$weights))

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,2]$R, RG.corrected[,2]$G, 1-RG.corrected[,2]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,3]$R, RG.corrected[,3]$G, 1-RG.corrected[,3]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,4]$R, RG.corrected[,4]$G, 1-RG.corrected[,4]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,5]$R, RG.corrected[,5]$G, 1-RG.corrected[,5]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,6]$R, RG.corrected[,6]$G, 1-RG.corrected[,6]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,7]$R, RG.corrected[,7]$G, 1-RG.corrected[,7]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,8]$R, RG.corrected[,8]$G, 1-RG.corrected[,8]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,9]$R, RG.corrected[,9]$G, 1-RG.corrected[,9]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,10]$R, RG.corrected[,10]$G, 1-RG.corrected[,10]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,11]$R, RG.corrected[,11]$G, 1-RG.corrected[,11]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,12]$R, RG.corrected[,12]$G, 1-RG.corrected[,12]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,13]$R, RG.corrected[,13]$G, 1-RG.corrected[,13]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,14]$R, RG.corrected[,14]$G, 1-RG.corrected[,14]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,15]$R, RG.corrected[,15]$G, 1-RG.corrected[,15]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,16]$R, RG.corrected[,16]$G, 1-RG.corrected[,16]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,17]$R, RG.corrected[,17]$G, 1-RG.corrected[,17]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

ghanaDDT1.raw.new=as.data.frame(cbind(RG.corrected[,18]$R, RG.corrected[,18]$G, 1-RG.corrected[,18]$weights))

ghanaDDT1.raw=cbind(ghanaDDT1.raw, ghanaDDT1.raw.new)

#add suffix to the array columns bound in the above command to indicate #which values are red/green and what the weighting for each spot names(ghanaDDT1.raw)[seq(11,length(names(ghanaDDT1.raw)),3)]=paste(names(ghanaDDT1.raw)[seq(11,length(names(ghanaDDT1.raw)),3)],"R",sep=".")

names(ghanaDDT1.raw)[seq(12,length(names(ghanaDDT1.raw)),3)]=paste(names(ghanaDDT1.raw)[seq(12,length(names(ghanaDDT1.raw)),3)],"G",sep=".")

names(ghanaDDT1.raw)[seq(13,length(names(ghanaDDT1.raw)),3)]=paste(names(ghanaDDT1.raw)[seq(13,length(names(ghanaDDT1.raw)),3)],"W",sep=".")

ghanaDDT1.raw$ProbeName=substr(ghanaDDT1.raw$ProbeName, 1, 13)

#write a csv table of combined array info

write.csv(ghanaDDT1.raw, file="ghanaDDT1raw_bkgrd_none_offset50_loess.csv")

#import array experimental design

design=read.table("DDTdesignfile.txt", header=TRUE)

write.table(ghanaDDT1.raw, "ghanaDDT_bkgrd_none_offset50_loess.data.txt", row.names=FALSE, sep="\t", quote=FALSE)

#import data.txt and design file into j/MAANOVA (Java based interface) or continue in R and MAANOVA

#Note – can remove control spots at this point manually from the data file #are they are no longer required in analysis…

#Note – ensure order of arrays and dyes in the design file EXACTLY match the ordering in the data file

#JMAANOVA COMMANDS:

#read in data and Log transform (as recommended).

DDT_data_260410 <- read.madata(datafile="C:\\Documents and Settings\\saram\\My Documents\\microarrayscans\\8x15Kscans\\RE-EXTRACTION amended design file\\ghanaDDT_bkgrd_none_offset50_loessGENESonly.data.txt", designfile="C:\\Documents and Settings\\saram\\My Documents\\microarrayscans\\8x15Kscans\\RE-EXTRACTION amended design file\\DDTdesignfile.txt", arrayType="twoColor", header=TRUE, spotflag=TRUE, n.rep=1, metarow=7, metacol=8, row=9, col=10, probeid=1, intensity=11, log.trans=TRUE)

#FIT MIXED EFFECT MODEL (restricted maximum likelihood method used to fit #linear model as default setting) Given the data and formula, this function #fits the regression model for each gene and calculates the ANOVA #estimates, variance components for random terms, fitted values, etc. For a #mixed effect models, the output estimates will be BLUE and BLUP.

DDT_data_260410.fit <- fitmaanova(madata=DDT_data_260410, formula=~Array+Dye+Sample+Group, random=~Array+Sample, method="REML", verbose=TRUE, subCol=FALSE)

#F-TEST UNIVERSAL – tests all combinations of groups. As the experimental #size is small (<5 pools per group) permutation testing in not viable or #recommended therefore we will rely on tabulated pvalues hence n.perm=1.

DDT_data_260410.f_test_Group <- matest(data=DDT_data_260410, anovaobj=DDT_data_260410.fit, term="Group", test.type="ftest", MME.method="REML", n.perm=1, verbose=TRUE)

#CHECK CONTRAST MATRIX via ‘nameoftestfile.test(f or pairwise t)$Contrast’

DDT_data_260410.f_test_Group$Contrast

# [,1] [,2] [,3]

#[1,] 1 0 -1

#[2,] 0 1 -1

#PAIR-WISE T TEST all (especially important when have > 2 groups). Define #the pairwise test in the matrix within the J/MAANOVA window. Genes within #groups labelled ‘+1’ in the analysis will appear in the positive (RHS) of #the volcano plots and will be positive in the log FC values.

DDT_data_260410.ttest_all <- matest(data=DDT_data_260410, anovaobj=DDT_data_260410.fit, term="Group", test.type="ttest", Contrast=PairContrast(3), MME.method="REML", n.perm=1, verbose=TRUE)

#pairwise T test: defined comparison in jmaanova i.e.

> DDT_data_260410.akronplus1ngouminus1NEW <- matest(data=DDT_data_260410, anovaobj=DDT_data_260410.fit, term="Group", test.type="ttest", Contrast=new("matrix", nrow=1, ncol=3, byrow=TRUE, c(1.0, 0.0, -1.0)), MME.method="REML", n.perm=1, verbose=TRUE)

#ADJUST p-value i.e. correct for multiple testing (FDR adjustment) (DDT_data_260410.ddtplus1ngouminus1_FDR=adjPval(DDT_data_260410.ddtplus1ngouminus1, method="jsFDR")

#FDR results table

ind=seq(1, length(DDT_data_260410$probeid))

DDT.tab1=as.data.frame(cbind(ProbeID= DDT_data_260410.akronplus1ngouminus1NEW_FDR$obsAnova$probeid[ind], Gene= DDT_data_260410$genename[ind], Description= DDT_data_260410$description[ind], Status= DDT_data_260410$status[ind]))

DDT.tab1$logFC=DDT_data_260410.akronplus1ngouminus1NEW_FDR $obsAnova$Group[,2]- DDT_data_260410.akronplus1ngouminus1NEW_FDR $obsAnova$Group[,1]

DDT.tab1$Pval=DDT_data_260410.akronplus1ngouminus1NEW_FDR$Fs$Ptab

DDT.tab1$adjPval=DDT_data_260410.akronplus1ngouminus1NEW_FDR$Fs$adjPtab

DDT.tab1$Flag=DDT_data_260410.akronplus1ngouminus1NEW_FDR$obsAnova$flag

write.table(DDT.tab1, row.names=F, file="TOPTABLE_ DDT_data_260410.akronplus1ngouminus1NEW_FDR.txt", sep="\t", quote=F)

#check all objects created in J/MAANOVA via

objects()

#CREATE A RESULTS TABLE from single defined PAIRWISE comparisons.

#i.e. akron vs ddt (2v1 from matrix) DDT_data_260410.akronminus1ddtplus1

ind=seq(1, length(DDT_data_260410$probeid))

DDT.tab=as.data.frame(cbind(ProbeID=DDT_data_260410.akronminus1ddtplus1$obsAnova$probeid[ind], Gene=DDT_data_260410$genename[ind], Description=DDT_data_260410$description[ind], Status=DDT_data_260410$status[ind]))

DDT.tab$logFC=DDT_data_260410.akronminus1ddtplus1$obsAnova$Group[,2]- DDT_data_260410.akronminus1ddtplus1$obsAnova$Group[,1]

DDT.tab$Pval=DDT_data_260410.akronminus1ddtplus1$Fs$Ptab

DDT.tab$adjPval=DDT_data_260410.akronminus1ddtplus1$Fs$adjPtab

DDT.tab$Flag=DDT_data_260410.akronminus1ddtplus1$obsAnova$flag

write.table(DDT.tab, row.names=F, file="DDT_bkgrdnoneoff50loessGENESpairwiseDDTvsAKRON.txt", sep="\t", quote=F)

#R-COMMANDS MAANOVA

library(maanova)

ghanaDDT<-read.madata(datafile="ghanaDDT.data.txt", designfile="DDTdesignfile.txt", arrayType="twoColor", header=TRUE, probeid=1, genename=2, systematicname=3, description=4, status=5, sequence=6, metarow=7, metacol=8, row=9, col=10, intensity=11, spotflag=TRUE, avgreps=0, log.trans=TRUE)

ghanaDDT

(ghanaDDT)

gridcheck(ghanaDDT)

#LOAD PACKAGE

source("

biocLite("qvalue")

#FIT MIXED MODEL ANOVA (removed spot as no reps therefore no spot effect

ghanaDDT.fit<-fitmaanova(ghanaDDT, formula=~Array+Dye+Sample+Group, random=~Array+Sample, covariate=~1)

save (ghanaDDT.fit, file="ghanaDDTfit")

#RESIDUAL PLOT

resiplot(ghanaDDT, ghanaDDT.fit)

#resiplot(DDT_data_260410, DDT_data_260410.fit)

#TEST STATISTIC (defined the test type (global F-test)

testDDT.unadj <- matest(data=ghanaDDT, anovaobj=ghanaDDT.fit, term="Group", test.type="ftest", MME.method="REML", n.perm=1, verbose=TRUE)

#adjust p-values

DDTtest=adjPval(test.unadj, method="jsFDR")

#plot VOLCANO and save to file

volcano(test, method="fdrperm")

jpeg('volcano DDT1 perm 50')

plot(DDT1.tab$logFC, -log10(DDT1.tab$adjPval))

save.image()

save.image("DDTanalysis060410.Rhistory")

savehistory()

savehistory("DDTanalysis060410.Rhistory")