Applications of GTOM to Fly Protein-Protein Interaction Networks
Andy Yip, Steve Horvath
# STARTING THE R session:
# Open the R software by double clicking the corresponding icon
#To interact with the R software copy and paste the commands into the R console.
#Text after "#" is a comment and is automatically ignored by R.
# Set the working directory of the R session by using the following command.
setwd(“C:/Documents and Settings/shorvath/My Documents/ADAG/AndyYip/TutoriaGTOMscreeningFly”)
# Note that we use / instead of \ in the path.
# read in the R libraries.
library(sna) # this is needed for closeness
#library(MASS)
#library(class)
library(cluster)
#library(sma) # different from sna! this is needed for plot.mat below
library(impute) # needed for imputing missing value before principal component analysis
#Memory
# check the maximum memory that can be allocated
memory.size(TRUE)/1024
# increase the available memory
memory.limit(size=2048)
# read in the custom network functions
source("C:/Documents and Settings/shorvath/My Documents/RFunctions/NetworkFunctions.txt")
# Load the network data
datAnnotation=read.delim(“FlyProtein_Annotations.csv”,header=T,sep=",")
dim(datAnnotation)
[1] 1371 3
# This vector encodes network essentiality (essential=1 and 0 otherwise)
essentiality = datAnnotation$essentiality - 1;
AdjMat1 = read.csv("FlyProtein_ADJ1.csv", header=F)
AdjMat1 = as.matrix(AdjMat1)
rownames(AdjMat1)=datAnnotation$nodename;
colnames(AdjMat1)=datAnnotation$nodename;
# Note that there are a lot of proteins for which essentiality is not known.
sum(is.na(essentiality))
[1] 466
sum(is.na(AdjMat1))
[1] 0
# ======
# Check Scale Free Topology
# ======
Degree = apply(AdjMat1,2,sum)
# Let’s create a scale free topology plot.
# The black curve corresponds to scale free topology and
# the red curve corresponds to truncated scale free topology.
par(mfrow=c(1,1))
ScaleFreePlot1(Degree, AF1=paste(“Fly Protein Network”),truncated1=TRUE);
# ======
# Module Detection/TOMk Plots
# ======
# Restrict the analysis to the genes with high connectivity
DegCut = 1000
DegreeRank = rank(-Degree)
rest1 = DegreeRank <= DegCut
sum(rest1)
[1] 1042
# Comment: this is different from DegCut due to ties in the ranks...
# Remove any isolated nodes, they affect the correlation between different GTOM measures
conn.comp1 = component.dist(AdjMat1[rest1,rest1]) # need the SNA library
conn.comp1$csize
[1] 1042
# Thus, there is only 1 connected component
#rest1[rest1] = (conn.comp1$membership==1);
AdjMat1rest = AdjMat1[rest1,rest1]
Degree.rest = apply(AdjMat1rest,2,sum);
datAnnotation = datAnnotation[rest1,]
essentiality = essentiality[rest1]
rm(DegreeRank,AdjMat1); collect_garbage();
summary(Degree.rest)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 9.00 12.50 15.94 19.00 74.00
# Compute the distance measures
dissGTOM0 = 1 - AdjMat1rest; diag(dissGTOM0) = 0
# Compute the generalized TOM based dissimilarity
maxk = 3;
for (k in 1:maxk){
assign(paste(“dissGTOM”,k,sep=””), TOMkdist1(AdjMat1rest,k));}
collect_garbage();
# This creates a hierarchical clustering using the TOM matrix
hierGTOM0 = hclust(as.dist(dissGTOM0),method="average")
hierGTOM1 = hclust(as.dist(dissGTOM1),method="average")
hierGTOM2 = hclust(as.dist(dissGTOM2),method="average")
par(mfrow=c(1,3))
plot(hierGTOM0, main="Adjacency Matrix: GTOM0", labels=F, xlab=“”, sub=“”)
plot(hierGTOM1, main="Standard TOM Measure: GTOM1", labels=F, xlab=“”, sub=“”)
plot(hierGTOM2, main="New TOM Measure: GTOM2", labels=F, xlab=“”, sub=“”)
# This suggests a height cut-off of h1 for the GTOM1 but one should check the robustness of this choice
colorGTOM0=as.character(modulecolor2(hierGTOM0,h1=.95,minsize1=40))
colorGTOM1=as.character(modulecolor2(hierGTOM1,h1=.95,minsize1=40))
colorGTOM2=as.character(modulecolor2(hierGTOM2,h1=.6,minsize1=50))
table(colorGTOM0,essentiality)
essentiality
colorGTOM0 0 1
blue 9 37
brown 26 10
grey 374 115
turquoise 47 25
yellow 2 23
table(colorGTOM1, essentiality)
essentiality
colorGTOM1 0 1
black 25 5
blue 10 73
brown 28 11
green 0 22
grey 267 65
red 31 7
turquoise 56 24
yellow 41 3
table(colorGTOM2, essentiality)
essentiality
colorGTOM2 0 1
blue 69 35
brown 11 80
grey 209 65
turquoise 136 16
yellow 33 14
essentialityColor=ifelse(essentiality==1,”black”, “white”)
par(mfrow=c(4,3),mar=c(2,2,2,2))
plot(hierGTOM0, main="Adjacency Matrix: GTOM0", labels=F, xlab=“”, sub=“”)
plot(hierGTOM1, main="Standard TOM Measure: GTOM1", labels=F, xlab=“”, sub=“”)
plot(hierGTOM2, main="New TOM Measure: GTOM2", labels=F, xlab=“”, sub=“”)
hclustplot1(hierGTOM0,colorGTOM1, title1="")
hclustplot1(hierGTOM1,colorGTOM1, title1="Colored by GTOM1 modules")
hclustplot1(hierGTOM2,colorGTOM1, title1="")
hclustplot1(hierGTOM0,colorGTOM2, title1="")
hclustplot1(hierGTOM1,colorGTOM2, title1="Colored by GTOM2 modules")
hclustplot1(hierGTOM2,colorGTOM2, title1="")
hclustplot1(hierGTOM0, essentialityColor, title1="")
hclustplot1(hierGTOM1, essentialityColor,title1="Colored by essentiality")
hclustplot1(hierGTOM2, essentialityColor, title1="")
# How related are the distances?
datDist = data.frame(dissGTOM0=c(as.dist(dissGTOM0)), dissGTOM1=c(as.dist(dissGTOM1)), dissGTOM2=c(as.dist(dissGTOM2)), dissGTOM3=c(as.dist(dissGTOM3)))
set.seed(seed=1) # fix the seed so that the results are reproducible
datDist2 = datDist[sample(1:dim(datDist)[[1]],10000),]
dim(datDist2)
pairs(datDist2, upper.panel=panel.smooth, lower.panel=panel.cor, diag.panel=panel.hist)
# This creates a hierarchical clustering using the TOM and Correlation matrices
hierGTOM3 = hclust(as.dist(dissGTOM3),method="average")
plot(hierGTOM3, main="New TOM Measure: GTOM3", labels=F, xlab=“”, sub=“”)
colorGTOM3=as.character(modulecolor2(hierGTOM3,h1=0.03,minsize1=30))
# The function TOMplot2 creates a TOM plot where the top and left color bars can be different
# Inputs: distance measure, hierarchical (hclust) object, color label=colorTOP row, colorLeft row
TOMplot2(dissGTOM0, hierGTOM0, colorGTOM1, colorGTOM0);
TOMplot2(dissGTOM1, hierGTOM1, colorGTOM1, colorGTOM1);
TOMplot2(dissGTOM2, hierGTOM2, colorGTOM1, colorGTOM2);
TOMplot2(dissGTOM3, hierGTOM3, colorGTOM1, colorGTOM3);
# MDS plots using module color
cmdGTOM0 = cmdscale(dissGTOM0,k=3);
cmdGTOM1 = cmdscale(dissGTOM1,k=3);
cmdGTOM2 = cmdscale(dissGTOM2,k=3);
cmdGTOM3 = cmdscale(dissGTOM3,k=3);
par(mfrow=c(2,2))
symbol1 = c(17, 1)[factor(essentiality)]
plot(cmdGTOM0,col=as.character(colorGTOM1) , pch=symbol1, cex=1.1, xlab=“Dimension 1”, ylab=“Dimension 2” , main=”distGTOM0”)
plot(cmdGTOM1,col=as.character(colorGTOM1) , pch=symbol1, cex=1.1, xlab=“Dimension 1”, ylab=“Dimension 2” , main=”distGTOM1”)
plot(cmdGTOM2,col=as.character(colorGTOM1) , pch=symbol1, cex=1.1, xlab=“Dimension 1”, ylab=“Dimension 2” , main=”distGTOM2”)
plot(cmdGTOM3,col=as.character(colorGTOM1) , pch=symbol1, cex=1.1, xlab=“Dimension 1”, ylab=“Dimension 2”, main=”distGTOM3”)
The Use of GTOM for Screening for Neighboring Genes
To show a potential use of the GTOM measures, we carry out the following analysis.
Recall that for each gene the essentiality is an external gene significance variable based on knock out experiments. essentiality=1 means that the gene is essential for yeast survival, 0 otherwise.
We start with an essential gene with high connectivity, i.e. an essential hub gene.
Next we create the following lists.
List1: rank all genes according to their correlation with that essential gene.
List 2: rank all genes according to their TOM1 values with that gene
List 3: rank all genes according to their TOM2 value with that gene.
The goal is to show that Lists 2 and3 are often more enriched with essential genes than the other lists.
We will create the following plots.
x-axis rank of genes say 1:20, y-axis proportion of essential genes in the list up to this rank.
So, each line starts at point (1,1) because the first gene is essential.
At point x=10, the y-value equals the proportion of essential genes that are in the top 10 list of each screening procedure.
This results in figures that shows a) that the TOM measures are superior to the correlation measure and b) highlight the differences between TOM1 and TOM2.
GeneSignificance=essentiality
# Missing essentiality is assigned a label of -1
GeneSignificance[is.na(GeneSignificance)] = -1
# Goal: find the most highly connected essential genes
# Trick: if a gene is non-essential or essentiality is missing we assign it negative connectivity
Degree2 = Degree.rest
Degree2[GeneSignificance!=1 ] = - Degree2[GeneSignificance!=1 ]
DegreeOrder = order(Degree2,decreasing=T)
Degree2[DegreeOrder[1:10]] # each node is not counted as a neighbor of itself
#In the following we create 10 plots for the 10 most highly connected essential genes.
par(mfrow=c(2,5))
no.essentialhubs=10
# this is the number of neighbors that will be considered for each of the essential hub genes
no.neighbors=20
for (i in c(1: no.essentialhubs) ) {
# The following gene is the i-th most highly connected essential gene
Hub1 = DegreeOrder[i]
Temp0 = dissGTOM0[Hub1,]; # each node is counted as a neighbor of itself
Hub1ListGTOM0 = (Temp0==0) & (GeneSignificance!=-1);
Temp1 = dissGTOM1[Hub1,];
Temp1[Hub1] = -999;
Temp1[GeneSignificance==-1] = 999;
Hub1ListGTOM1 = c(order(Temp1, decreasing=F));
Temp2 = dissGTOM2[Hub1,];
Temp2[Hub1] = -999;
Temp2[GeneSignificance==-1] = 999;
Hub1ListGTOM2 = c(order(Temp2, decreasing=F));
Temp3 = dissGTOM3[Hub1,];
Temp3[Hub1] = -999;
Temp3[GeneSignificance==-1] = 999;
Hub1ListGTOM3 = c(order(Temp3, decreasing=F));
# This computes the proportion of essential genes among the most correlated genes.
Hub1PropEssGTOM0 = sum(Hub1ListGTOM0 & GeneSignificance==1)/sum(Hub1ListGTOM0);
Hub1PropEssGTOM1 = cumsum(GeneSignificance[Hub1ListGTOM1])/(1:length(Hub1ListGTOM1));
Hub1PropEssGTOM2 = cumsum(GeneSignificance[Hub1ListGTOM2])/(1:length(Hub1ListGTOM2));
Hub1PropEssGTOM3 = cumsum(GeneSignificance[Hub1ListGTOM3])/(1:length(Hub1ListGTOM3));
Hub1PropEss = data.frame(GTOM0=Hub1PropEssGTOM0, GTOM1=Hub1PropEssGTOM1, GTOM2=Hub1PropEssGTOM2, GTOM3=Hub1PropEssGTOM3)[1:no.neighbors,]
matplot(Hub1PropEss, type="l", lty=c(4,3,2,1), lwd=3, col=c(2,3,4,5), ylab="Proportion Essential", xlab="rank", main=paste(“essential hub number: “, i) )
legend(2,1, c("GTOM0", "GTOM1","GTOM2", “GTOM3”), col=c(2,3,4,5), lty=c(4,3,2,1), lwd=3, pch = "*",ncol = 1, cex=.8)
title("")
}# end of for loop
# Comment: Note that in many cases the degree measure outperforms the TOM based measures.
# But there is a lot of variability and it is more meaningful to summarize the results for multiple genes in a single plot.
# Toward this end, we will now average these findings.
# Averaging over all the top essential hubs
no.essentialhubs=30
no.neighbors=40
Hub1PropEssGTOM0=rep(0,no.neighbors)
Hub1PropEssGTOM1=rep(0,no.neighbors)
Hub1PropEssGTOM2=rep(0,no.neighbors)
Hub1PropEssGTOM3=rep(0,no.neighbors)
for (i in c(1:no.essentialhubs) ) {
# The following gene is the i-th most highly connected essential gene
Hub1 = DegreeOrder[i]
Temp0 = dissGTOM0[Hub1,]; # each node is counted as a neighbor of itself
Hub1ListGTOM0 = (Temp0==0) & (GeneSignificance!=-1);
Temp1 = dissGTOM1[Hub1,];
Temp1[Hub1] = -999;
Temp1[GeneSignificance==-1] = 999;
Hub1ListGTOM1 = c(order(Temp1, decreasing=F))[1:no.neighbors];
Temp2 = dissGTOM2[Hub1,];
Temp2[Hub1] = -999;
Temp2[GeneSignificance==-1] = 999;
Hub1ListGTOM2 = c(order(Temp2, decreasing=F))[1:no.neighbors];
Temp3 = dissGTOM3[Hub1,];
Temp3[Hub1] = -999;
Temp3[GeneSignificance==-1] = 999;
Hub1ListGTOM3 = c(order(Temp3, decreasing=F))[1:no.neighbors];
# This computes the proportion of essential genes among the most correlated genes.
# This computes the proportion of essential genes among the most correlated genes.
Hub1PropEssGTOM0 = Hub1PropEssGTOM0+ sum(Hub1ListGTOM0 & GeneSignificance==1)/sum(Hub1ListGTOM0)/no.essentialhubs;
Hub1PropEssGTOM1 = Hub1PropEssGTOM1+ cumsum(GeneSignificance[Hub1ListGTOM1])/(1:length(Hub1ListGTOM1))/no.essentialhubs;
Hub1PropEssGTOM2 = Hub1PropEssGTOM2+ cumsum(GeneSignificance[Hub1ListGTOM2])/(1:length(Hub1ListGTOM2))/no.essentialhubs;
Hub1PropEssGTOM3 = Hub1PropEssGTOM3+ cumsum(GeneSignificance[Hub1ListGTOM3])/(1:length(Hub1ListGTOM3))/no.essentialhubs;
}# end of for loop
Hub1PropEss = data.frame(GTOM0=Hub1PropEssGTOM0, GTOM1=Hub1PropEssGTOM1, GTOM2=Hub1PropEssGTOM2, GTOM3=Hub1PropEssGTOM3);
par(mfrow=c(1,1))
matplot(Hub1PropEss, type="l", lty=c(4,3,2,1), lwd=3, col=c(2,3,4,5), ylab="Proportion Essential", xlab="Rank", main=paste(“average over“, i, “essential hub genes “), cex.lab=1.5 ,cex.main=1.5 )
legend(5,1, c("GOM0","GTOM1","GTOM2", "GTOM3"), col=2:5, lty=c(4,3,2,1),lwd=3, pch = "",ncol = 1, cex=1.5)
title("")
The curve suggests that GTOM2 is better than the other measures. But to rigorously test it for a given rank, we will use the Wilcoxon test as described in the following
Now we test whether the difference at Rank=10 is significant
Rank1=10
# Averaging over all the top essential hubs
no.essentialhubs=30
no.neighbors=40
PropEssentialGTOM0=rep(NA,no.essentialhubs)
PropEssentialGTOM1=rep(NA,no.essentialhubs)
PropEssentialGTOM2=rep(NA,no.essentialhubs)
PropEssentialGTOM3=rep(NA,no.essentialhubs)
for (i in c(1:no.essentialhubs) ) {
# The following gene is the i-th most highly connected essential gene
Hub1 = DegreeOrder[i]
Temp0 = dissGTOM0[Hub1,]; # each node is counted as a neighbor of itself
Hub1ListGTOM0 = (Temp0==0) & (GeneSignificance!=-1);
Temp1 = dissGTOM1[Hub1,];
Temp1[Hub1] = -999;
Temp1[GeneSignificance==-1] = 999;
Hub1ListGTOM1 = c(order(Temp1, decreasing=F))[1:no.neighbors];
Temp2 = dissGTOM2[Hub1,];
Temp2[Hub1] = -999;
Temp2[GeneSignificance==-1] = 999;
Hub1ListGTOM2 = c(order(Temp2, decreasing=F))[1:no.neighbors];
Temp3 = dissGTOM3[Hub1,];
Temp3[Hub1] = -999;
Temp3[GeneSignificance==-1] = 999;
Hub1ListGTOM3 = c(order(Temp3, decreasing=F))[1:no.neighbors];
# This computes the proportion of essential genes among the most correlated genes.
# This computes the proportion of essential genes among the most correlated genes.
PropEssentialGTOM0[i]= sum(Hub1ListGTOM0 & GeneSignificance==1)/sum(Hub1ListGTOM0)
PropEssentialGTOM1[i]=c(cumsum(GeneSignificance[Hub1ListGTOM1])/(1:length(Hub1ListGTOM1)))[Rank1];
PropEssentialGTOM2[i]=c(cumsum(GeneSignificance[Hub1ListGTOM2])/(1:length(Hub1ListGTOM2)))[Rank1];
PropEssentialGTOM3[i]=c(cumsum(GeneSignificance[Hub1ListGTOM3])/(1:length(Hub1ListGTOM3)))[Rank1];
}# end of for loop
signif(wilcox.test(PropEssentialGTOM0- PropEssentialGTOM2)$p.value,2)
signif(wilcox.test(PropEssentialGTOM1-PropEssentialGTOM2)$p.value,2)
signif(wilcox.test(PropEssentialGTOM3-PropEssentialGTOM2)$p.value,2)
Output
> signif(wilcox.test(PropEssentialGTOM0- PropEssentialGTOM2)$p.value,2)
[1] 0.034
> signif(wilcox.test(PropEssentialGTOM1-PropEssentialGTOM2)$p.value,2)
[1] 0.015
> signif(wilcox.test(PropEssentialGTOM3-PropEssentialGTOM2)$p.value,2)
[1] 0.02
Discussion
We find that neighborhood analysis with GTOM2 leads to significantly better results than GTOM0 (Wilcoxon p-value=0.034), GTOM1 (p-value= 0.015) and GTOM3 (p-value=0.02).
While this example shows that GTOM2 measure can lead to superior results than the standard measures GTOM0 and GTOM1, there is no doubt that more comprehensive empirical analyses are needed to establish the usefulness and limitations of the GTOM2 measure.
THE END
17