Applications of GTOM to Fly Protein-Protein Interaction Networks

Andy Yip, Steve Horvath

# STARTING THE R session:

# Open the R software by double clicking the corresponding icon

#To interact with the R software copy and paste the commands into the R console.

#Text after "#" is a comment and is automatically ignored by R.

# Set the working directory of the R session by using the following command.

setwd(“C:/Documents and Settings/shorvath/My Documents/ADAG/AndyYip/TutoriaGTOMscreeningFly”)

# Note that we use / instead of \ in the path.

# read in the R libraries.

library(sna) # this is needed for closeness

#library(MASS)

#library(class)

library(cluster)

#library(sma) # different from sna! this is needed for plot.mat below

library(impute) # needed for imputing missing value before principal component analysis

#Memory

# check the maximum memory that can be allocated

memory.size(TRUE)/1024

# increase the available memory

memory.limit(size=2048)

# read in the custom network functions

source("C:/Documents and Settings/shorvath/My Documents/RFunctions/NetworkFunctions.txt")


# Load the network data

datAnnotation=read.delim(“FlyProtein_Annotations.csv”,header=T,sep=",")

dim(datAnnotation)

[1] 1371 3

# This vector encodes network essentiality (essential=1 and 0 otherwise)

essentiality = datAnnotation$essentiality - 1;

AdjMat1 = read.csv("FlyProtein_ADJ1.csv", header=F)

AdjMat1 = as.matrix(AdjMat1)

rownames(AdjMat1)=datAnnotation$nodename;

colnames(AdjMat1)=datAnnotation$nodename;

# Note that there are a lot of proteins for which essentiality is not known.

sum(is.na(essentiality))

[1] 466

sum(is.na(AdjMat1))

[1] 0


# ======

# Check Scale Free Topology

# ======

Degree = apply(AdjMat1,2,sum)

# Let’s create a scale free topology plot.

# The black curve corresponds to scale free topology and

# the red curve corresponds to truncated scale free topology.

par(mfrow=c(1,1))

ScaleFreePlot1(Degree, AF1=paste(“Fly Protein Network”),truncated1=TRUE);


# ======

# Module Detection/TOMk Plots

# ======

# Restrict the analysis to the genes with high connectivity

DegCut = 1000

DegreeRank = rank(-Degree)

rest1 = DegreeRank <= DegCut

sum(rest1)

[1] 1042

# Comment: this is different from DegCut due to ties in the ranks...

# Remove any isolated nodes, they affect the correlation between different GTOM measures

conn.comp1 = component.dist(AdjMat1[rest1,rest1]) # need the SNA library

conn.comp1$csize

[1] 1042

# Thus, there is only 1 connected component

#rest1[rest1] = (conn.comp1$membership==1);

AdjMat1rest = AdjMat1[rest1,rest1]

Degree.rest = apply(AdjMat1rest,2,sum);

datAnnotation = datAnnotation[rest1,]

essentiality = essentiality[rest1]

rm(DegreeRank,AdjMat1); collect_garbage();

summary(Degree.rest)

Min. 1st Qu. Median Mean 3rd Qu. Max.

2.00 9.00 12.50 15.94 19.00 74.00

# Compute the distance measures

dissGTOM0 = 1 - AdjMat1rest; diag(dissGTOM0) = 0

# Compute the generalized TOM based dissimilarity

maxk = 3;

for (k in 1:maxk){

assign(paste(“dissGTOM”,k,sep=””), TOMkdist1(AdjMat1rest,k));}

collect_garbage();


# This creates a hierarchical clustering using the TOM matrix

hierGTOM0 = hclust(as.dist(dissGTOM0),method="average")

hierGTOM1 = hclust(as.dist(dissGTOM1),method="average")

hierGTOM2 = hclust(as.dist(dissGTOM2),method="average")

par(mfrow=c(1,3))

plot(hierGTOM0, main="Adjacency Matrix: GTOM0", labels=F, xlab=“”, sub=“”)

plot(hierGTOM1, main="Standard TOM Measure: GTOM1", labels=F, xlab=“”, sub=“”)

plot(hierGTOM2, main="New TOM Measure: GTOM2", labels=F, xlab=“”, sub=“”)

# This suggests a height cut-off of h1 for the GTOM1 but one should check the robustness of this choice

colorGTOM0=as.character(modulecolor2(hierGTOM0,h1=.95,minsize1=40))

colorGTOM1=as.character(modulecolor2(hierGTOM1,h1=.95,minsize1=40))

colorGTOM2=as.character(modulecolor2(hierGTOM2,h1=.6,minsize1=50))


table(colorGTOM0,essentiality)

essentiality

colorGTOM0 0 1

blue 9 37

brown 26 10

grey 374 115

turquoise 47 25

yellow 2 23

table(colorGTOM1, essentiality)

essentiality

colorGTOM1 0 1

black 25 5

blue 10 73

brown 28 11

green 0 22

grey 267 65

red 31 7

turquoise 56 24

yellow 41 3

table(colorGTOM2, essentiality)

essentiality

colorGTOM2 0 1

blue 69 35

brown 11 80

grey 209 65

turquoise 136 16

yellow 33 14


essentialityColor=ifelse(essentiality==1,”black”, “white”)

par(mfrow=c(4,3),mar=c(2,2,2,2))

plot(hierGTOM0, main="Adjacency Matrix: GTOM0", labels=F, xlab=“”, sub=“”)

plot(hierGTOM1, main="Standard TOM Measure: GTOM1", labels=F, xlab=“”, sub=“”)

plot(hierGTOM2, main="New TOM Measure: GTOM2", labels=F, xlab=“”, sub=“”)

hclustplot1(hierGTOM0,colorGTOM1, title1="")

hclustplot1(hierGTOM1,colorGTOM1, title1="Colored by GTOM1 modules")

hclustplot1(hierGTOM2,colorGTOM1, title1="")

hclustplot1(hierGTOM0,colorGTOM2, title1="")

hclustplot1(hierGTOM1,colorGTOM2, title1="Colored by GTOM2 modules")

hclustplot1(hierGTOM2,colorGTOM2, title1="")

hclustplot1(hierGTOM0, essentialityColor, title1="")

hclustplot1(hierGTOM1, essentialityColor,title1="Colored by essentiality")

hclustplot1(hierGTOM2, essentialityColor, title1="")


# How related are the distances?

datDist = data.frame(dissGTOM0=c(as.dist(dissGTOM0)), dissGTOM1=c(as.dist(dissGTOM1)), dissGTOM2=c(as.dist(dissGTOM2)), dissGTOM3=c(as.dist(dissGTOM3)))

set.seed(seed=1) # fix the seed so that the results are reproducible

datDist2 = datDist[sample(1:dim(datDist)[[1]],10000),]

dim(datDist2)

pairs(datDist2, upper.panel=panel.smooth, lower.panel=panel.cor, diag.panel=panel.hist)

# This creates a hierarchical clustering using the TOM and Correlation matrices

hierGTOM3 = hclust(as.dist(dissGTOM3),method="average")

plot(hierGTOM3, main="New TOM Measure: GTOM3", labels=F, xlab=“”, sub=“”)

colorGTOM3=as.character(modulecolor2(hierGTOM3,h1=0.03,minsize1=30))


# The function TOMplot2 creates a TOM plot where the top and left color bars can be different

# Inputs: distance measure, hierarchical (hclust) object, color label=colorTOP row, colorLeft row

TOMplot2(dissGTOM0, hierGTOM0, colorGTOM1, colorGTOM0);

TOMplot2(dissGTOM1, hierGTOM1, colorGTOM1, colorGTOM1);

TOMplot2(dissGTOM2, hierGTOM2, colorGTOM1, colorGTOM2);

TOMplot2(dissGTOM3, hierGTOM3, colorGTOM1, colorGTOM3);


# MDS plots using module color

cmdGTOM0 = cmdscale(dissGTOM0,k=3);

cmdGTOM1 = cmdscale(dissGTOM1,k=3);

cmdGTOM2 = cmdscale(dissGTOM2,k=3);

cmdGTOM3 = cmdscale(dissGTOM3,k=3);

par(mfrow=c(2,2))

symbol1 = c(17, 1)[factor(essentiality)]

plot(cmdGTOM0,col=as.character(colorGTOM1) , pch=symbol1, cex=1.1, xlab=“Dimension 1”, ylab=“Dimension 2” , main=”distGTOM0”)

plot(cmdGTOM1,col=as.character(colorGTOM1) , pch=symbol1, cex=1.1, xlab=“Dimension 1”, ylab=“Dimension 2” , main=”distGTOM1”)

plot(cmdGTOM2,col=as.character(colorGTOM1) , pch=symbol1, cex=1.1, xlab=“Dimension 1”, ylab=“Dimension 2” , main=”distGTOM2”)

plot(cmdGTOM3,col=as.character(colorGTOM1) , pch=symbol1, cex=1.1, xlab=“Dimension 1”, ylab=“Dimension 2”, main=”distGTOM3”)


The Use of GTOM for Screening for Neighboring Genes

To show a potential use of the GTOM measures, we carry out the following analysis.

Recall that for each gene the essentiality is an external gene significance variable based on knock out experiments. essentiality=1 means that the gene is essential for yeast survival, 0 otherwise.

We start with an essential gene with high connectivity, i.e. an essential hub gene.

Next we create the following lists.

List1: rank all genes according to their correlation with that essential gene.

List 2: rank all genes according to their TOM1 values with that gene

List 3: rank all genes according to their TOM2 value with that gene.

The goal is to show that Lists 2 and3 are often more enriched with essential genes than the other lists.

We will create the following plots.

x-axis rank of genes say 1:20, y-axis proportion of essential genes in the list up to this rank.

So, each line starts at point (1,1) because the first gene is essential.

At point x=10, the y-value equals the proportion of essential genes that are in the top 10 list of each screening procedure.

This results in figures that shows a) that the TOM measures are superior to the correlation measure and b) highlight the differences between TOM1 and TOM2.

GeneSignificance=essentiality

# Missing essentiality is assigned a label of -1

GeneSignificance[is.na(GeneSignificance)] = -1


# Goal: find the most highly connected essential genes

# Trick: if a gene is non-essential or essentiality is missing we assign it negative connectivity

Degree2 = Degree.rest

Degree2[GeneSignificance!=1 ] = - Degree2[GeneSignificance!=1 ]

DegreeOrder = order(Degree2,decreasing=T)

Degree2[DegreeOrder[1:10]] # each node is not counted as a neighbor of itself

#In the following we create 10 plots for the 10 most highly connected essential genes.

par(mfrow=c(2,5))

no.essentialhubs=10

# this is the number of neighbors that will be considered for each of the essential hub genes

no.neighbors=20

for (i in c(1: no.essentialhubs) ) {

# The following gene is the i-th most highly connected essential gene

Hub1 = DegreeOrder[i]

Temp0 = dissGTOM0[Hub1,]; # each node is counted as a neighbor of itself

Hub1ListGTOM0 = (Temp0==0) & (GeneSignificance!=-1);

Temp1 = dissGTOM1[Hub1,];

Temp1[Hub1] = -999;

Temp1[GeneSignificance==-1] = 999;

Hub1ListGTOM1 = c(order(Temp1, decreasing=F));

Temp2 = dissGTOM2[Hub1,];

Temp2[Hub1] = -999;

Temp2[GeneSignificance==-1] = 999;

Hub1ListGTOM2 = c(order(Temp2, decreasing=F));

Temp3 = dissGTOM3[Hub1,];

Temp3[Hub1] = -999;

Temp3[GeneSignificance==-1] = 999;

Hub1ListGTOM3 = c(order(Temp3, decreasing=F));

# This computes the proportion of essential genes among the most correlated genes.

Hub1PropEssGTOM0 = sum(Hub1ListGTOM0 & GeneSignificance==1)/sum(Hub1ListGTOM0);

Hub1PropEssGTOM1 = cumsum(GeneSignificance[Hub1ListGTOM1])/(1:length(Hub1ListGTOM1));

Hub1PropEssGTOM2 = cumsum(GeneSignificance[Hub1ListGTOM2])/(1:length(Hub1ListGTOM2));

Hub1PropEssGTOM3 = cumsum(GeneSignificance[Hub1ListGTOM3])/(1:length(Hub1ListGTOM3));

Hub1PropEss = data.frame(GTOM0=Hub1PropEssGTOM0, GTOM1=Hub1PropEssGTOM1, GTOM2=Hub1PropEssGTOM2, GTOM3=Hub1PropEssGTOM3)[1:no.neighbors,]

matplot(Hub1PropEss, type="l", lty=c(4,3,2,1), lwd=3, col=c(2,3,4,5), ylab="Proportion Essential", xlab="rank", main=paste(“essential hub number: “, i) )

legend(2,1, c("GTOM0", "GTOM1","GTOM2", “GTOM3”), col=c(2,3,4,5), lty=c(4,3,2,1), lwd=3, pch = "*",ncol = 1, cex=.8)

title("")

}# end of for loop

# Comment: Note that in many cases the degree measure outperforms the TOM based measures.

# But there is a lot of variability and it is more meaningful to summarize the results for multiple genes in a single plot.

# Toward this end, we will now average these findings.

# Averaging over all the top essential hubs

no.essentialhubs=30

no.neighbors=40

Hub1PropEssGTOM0=rep(0,no.neighbors)

Hub1PropEssGTOM1=rep(0,no.neighbors)

Hub1PropEssGTOM2=rep(0,no.neighbors)

Hub1PropEssGTOM3=rep(0,no.neighbors)

for (i in c(1:no.essentialhubs) ) {

# The following gene is the i-th most highly connected essential gene

Hub1 = DegreeOrder[i]

Temp0 = dissGTOM0[Hub1,]; # each node is counted as a neighbor of itself

Hub1ListGTOM0 = (Temp0==0) & (GeneSignificance!=-1);

Temp1 = dissGTOM1[Hub1,];

Temp1[Hub1] = -999;

Temp1[GeneSignificance==-1] = 999;

Hub1ListGTOM1 = c(order(Temp1, decreasing=F))[1:no.neighbors];

Temp2 = dissGTOM2[Hub1,];

Temp2[Hub1] = -999;

Temp2[GeneSignificance==-1] = 999;

Hub1ListGTOM2 = c(order(Temp2, decreasing=F))[1:no.neighbors];

Temp3 = dissGTOM3[Hub1,];

Temp3[Hub1] = -999;

Temp3[GeneSignificance==-1] = 999;

Hub1ListGTOM3 = c(order(Temp3, decreasing=F))[1:no.neighbors];

# This computes the proportion of essential genes among the most correlated genes.

# This computes the proportion of essential genes among the most correlated genes.

Hub1PropEssGTOM0 = Hub1PropEssGTOM0+ sum(Hub1ListGTOM0 & GeneSignificance==1)/sum(Hub1ListGTOM0)/no.essentialhubs;

Hub1PropEssGTOM1 = Hub1PropEssGTOM1+ cumsum(GeneSignificance[Hub1ListGTOM1])/(1:length(Hub1ListGTOM1))/no.essentialhubs;

Hub1PropEssGTOM2 = Hub1PropEssGTOM2+ cumsum(GeneSignificance[Hub1ListGTOM2])/(1:length(Hub1ListGTOM2))/no.essentialhubs;

Hub1PropEssGTOM3 = Hub1PropEssGTOM3+ cumsum(GeneSignificance[Hub1ListGTOM3])/(1:length(Hub1ListGTOM3))/no.essentialhubs;

}# end of for loop

Hub1PropEss = data.frame(GTOM0=Hub1PropEssGTOM0, GTOM1=Hub1PropEssGTOM1, GTOM2=Hub1PropEssGTOM2, GTOM3=Hub1PropEssGTOM3);

par(mfrow=c(1,1))

matplot(Hub1PropEss, type="l", lty=c(4,3,2,1), lwd=3, col=c(2,3,4,5), ylab="Proportion Essential", xlab="Rank", main=paste(“average over“, i, “essential hub genes “), cex.lab=1.5 ,cex.main=1.5 )

legend(5,1, c("GOM0","GTOM1","GTOM2", "GTOM3"), col=2:5, lty=c(4,3,2,1),lwd=3, pch = "",ncol = 1, cex=1.5)

title("")

The curve suggests that GTOM2 is better than the other measures. But to rigorously test it for a given rank, we will use the Wilcoxon test as described in the following

Now we test whether the difference at Rank=10 is significant

Rank1=10

# Averaging over all the top essential hubs

no.essentialhubs=30

no.neighbors=40

PropEssentialGTOM0=rep(NA,no.essentialhubs)

PropEssentialGTOM1=rep(NA,no.essentialhubs)

PropEssentialGTOM2=rep(NA,no.essentialhubs)

PropEssentialGTOM3=rep(NA,no.essentialhubs)

for (i in c(1:no.essentialhubs) ) {

# The following gene is the i-th most highly connected essential gene

Hub1 = DegreeOrder[i]

Temp0 = dissGTOM0[Hub1,]; # each node is counted as a neighbor of itself

Hub1ListGTOM0 = (Temp0==0) & (GeneSignificance!=-1);

Temp1 = dissGTOM1[Hub1,];

Temp1[Hub1] = -999;

Temp1[GeneSignificance==-1] = 999;

Hub1ListGTOM1 = c(order(Temp1, decreasing=F))[1:no.neighbors];

Temp2 = dissGTOM2[Hub1,];

Temp2[Hub1] = -999;

Temp2[GeneSignificance==-1] = 999;

Hub1ListGTOM2 = c(order(Temp2, decreasing=F))[1:no.neighbors];

Temp3 = dissGTOM3[Hub1,];

Temp3[Hub1] = -999;

Temp3[GeneSignificance==-1] = 999;

Hub1ListGTOM3 = c(order(Temp3, decreasing=F))[1:no.neighbors];

# This computes the proportion of essential genes among the most correlated genes.

# This computes the proportion of essential genes among the most correlated genes.

PropEssentialGTOM0[i]= sum(Hub1ListGTOM0 & GeneSignificance==1)/sum(Hub1ListGTOM0)

PropEssentialGTOM1[i]=c(cumsum(GeneSignificance[Hub1ListGTOM1])/(1:length(Hub1ListGTOM1)))[Rank1];

PropEssentialGTOM2[i]=c(cumsum(GeneSignificance[Hub1ListGTOM2])/(1:length(Hub1ListGTOM2)))[Rank1];

PropEssentialGTOM3[i]=c(cumsum(GeneSignificance[Hub1ListGTOM3])/(1:length(Hub1ListGTOM3)))[Rank1];

}# end of for loop

signif(wilcox.test(PropEssentialGTOM0- PropEssentialGTOM2)$p.value,2)

signif(wilcox.test(PropEssentialGTOM1-PropEssentialGTOM2)$p.value,2)

signif(wilcox.test(PropEssentialGTOM3-PropEssentialGTOM2)$p.value,2)

Output

> signif(wilcox.test(PropEssentialGTOM0- PropEssentialGTOM2)$p.value,2)

[1] 0.034

> signif(wilcox.test(PropEssentialGTOM1-PropEssentialGTOM2)$p.value,2)

[1] 0.015

> signif(wilcox.test(PropEssentialGTOM3-PropEssentialGTOM2)$p.value,2)

[1] 0.02

Discussion

We find that neighborhood analysis with GTOM2 leads to significantly better results than GTOM0 (Wilcoxon p-value=0.034), GTOM1 (p-value= 0.015) and GTOM3 (p-value=0.02).

While this example shows that GTOM2 measure can lead to superior results than the standard measures GTOM0 and GTOM1, there is no doubt that more comprehensive empirical analyses are needed to establish the usefulness and limitations of the GTOM2 measure.

THE END

17