-
Notifications
You must be signed in to change notification settings - Fork 6
/
analyze_WGCNA.R
287 lines (234 loc) · 10.8 KB
/
analyze_WGCNA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
### WGCNA ###
# BiocManager::install("WGCNA",force = TRUE)
####
rm(list = ls())
options(stringsAsFactors = F)
library(WGCNA)
RNAseq_voom <- TPM # read.table('TCGA-BRCA-FPKM.symbol(1).txt',header =T )
## 1 因为WGCNA针对的是基因进行聚类,而一般我们的聚类是针对样本用hclust即可,所以这个时候需要转置
## median absolute deviation (mad),绝对中位差,选前8000个。
WGCNA_matrix = t(RNAseq_voom[order(apply(RNAseq_voom,1,mad), decreasing = T)[1:8000],])
datExpr0 <- WGCNA_matrix ## top 8000 mad genes
datExpr <- datExpr0
## 下面主要是为了防止临床表型与样本名字对不上
library(openxlsx)
datTraits <- read.xlsx('datTraits.xlsx')
sampleNames = rownames(datExpr)
traitRows = match(sampleNames, datTraits$sample)
rownames(datTraits) = datTraits[traitRows, 1]
## 2 选择合适“软阀值(soft thresholding power)”beta
powers = c(c(1:10), seq(from = 12, to=30, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
#设置网络构建参数选择范围,计算无尺度分布拓扑矩阵
# Plot the results:
##sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 = 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.8,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
## 3 构建共表达矩阵 ## 最重要的一步 !!!!!!
# 有了表达矩阵和估计好的最佳beta值,就可以直接构建共表达矩阵
## mergeCutHeight: 合并模块的阈值,越大模块越少
net = blockwiseModules(
datExpr,
power = sft$powerEstimate,
maxBlockSize = 6000,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = F,
verbose = 3
)
table(net$colors)
### 把输入的表达矩阵的几千个基因组归类成了几十个模块.算基因间的邻接性,根据邻接性计算基因间的相似性,
### 然后推出基因间的相异性系数,并据此得到基因间的系统聚类树。然后按照混合动态剪切树的标准,
### 设置每个基因模块最少的基因数目为30。
## 4 模块可视化
# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
table(mergedColors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
## assign all of the gene to their corresponding module
## hclust for the genes.
## 用不同的颜色来代表那些所有的模块,其中灰色默认是无法归类于任何模块的那些基因,
## 如果灰色模块里面的基因太多,那么前期对表达矩阵挑选基因的步骤可能就不太合适。
## step 5 (最重要的) 模块和性状的关系
## 这一步主要是针对于连续变量,如果是分类变量,需要转换成连续变量方可使用
table(datTraits$subtype)
if(T){
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
design=model.matrix(~0+ datTraits$subtype)
colnames(design)=levels(datTraits$subtype)
moduleColors <- labels2colors(net$colors)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0) ##不同颜色的模块的ME值矩 (样本vs模块)
moduleTraitCor = cor(MEs, design , use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
png("step5-Module-trait-relationships.png",width = 800,height = 1200,res = 120)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = colnames(design),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
# 除了上面的热图展现形状与基因模块的相关性外
# 还可以是条形图,但是只能是指定某个形状
# 或者自己循环一下批量出图。
Luminal = as.data.frame(design[,3]);
names(Luminal) = "Luminal"
y=Luminal
GS1=as.numeric(cor(y,datExpr, use="p"))
GeneSignificance=abs(GS1)
# Next module significance is defined as average gene significance.
ModuleSignificance=tapply(GeneSignificance,
moduleColors, mean, na.rm=T)
sizeGrWindow(8,7)
par(mfrow = c(1,1))
# 如果模块太多,下面的展示就不友好
# 不过,我们可以自定义出图。
plotModuleSignificance(GeneSignificance,moduleColors)
}
## 通过模块与各种表型的相关系数,可以很清楚的挑选自己感兴趣的模块进行下游分析了
## 6 感兴趣性状的模块的具体基因分析
## 1)首先计算模块与基因的相关性矩阵
# names (colors) of the modules
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))
## 算出每个模块跟基因的皮尔森相关系数矩阵
## MEs是每个模块在每个样本里面的值
## datExpr是每个基因在每个样本的表达量
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
## 2) 再计算性状与基因的相关性矩阵
## 只有连续型性状才能只有计算
## 这里把是否属于 Luminal 表型这个变量用0,1进行数值化。
Luminal = as.data.frame(design[,3])
names(Luminal) = "Luminal"
geneTraitSignificance = as.data.frame(cor(datExpr, Luminal, use = "p"))
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) = paste("GS.", names(Luminal), sep="")
names(GSPvalue) = paste("p.GS.", names(Luminal), sep="")
## 3) 最后把两个相关性矩阵联合起来,指定感兴趣模块进行分析
module = "brown"
column = match(module, modNames)
moduleGenes = moduleColors==module
sizeGrWindow(7, 7)
par(mfrow = c(1,1))
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for Luminal",
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
### step7:网络的可视化
## 选取部分基因做拓扑热图,随机选择400个基因画拓扑重叠热图,图中行和列都表示单个基因,深黄色和红色表示高度的拓扑重叠:
load(net$TOMFiles[1], verbose=T)
TOM <- as.matrix(TOM)
nSelect = 400
select = sample(nGenes, size = nSelect)
dissTOM = 1-TOM
# Transform dissTOM with a power to make moderately strong
selectTOM = dissTOM[select, select]
# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]
# Open a graphical window
sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^softPower
diag(plotDiss) = NA
TOMplot(plotDiss,
selectTree,
selectColors,
main = "Network heatmap plot, selected genes")
## 画模块和性状的关系
# Recalculate module eigengenes
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
## 只有连续型性状才能只有计算
## 这里把是否属于 Luminal 表型这个变量用0,1进行数值化。
Luminal = as.data.frame(design[,3])
names(Luminal) = "Luminal"
# Add the weight to existing module eigengenes
MET = orderMEs(cbind(MEs, Luminal))
# Plot the relationships among the eigengenes and the trait
sizeGrWindow(5,7.5)
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle
= 90)
# Plot the dendrogram
sizeGrWindow(6,6);
par(cex = 1.0)
## 模块的聚类图
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
plotHeatmaps = FALSE)
# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
par(cex = 1.0)
## 性状与模块热图
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
plotDendrograms = FALSE, xLabelsAngle = 90)
## step8:提取指定模块的基因名\
# 主要是关心具体某个模块内部的基因
if(T){
# Select module
module = "brown";
# Select module probes
probes = colnames(datExpr) ## 我们例子里面的probe就是基因
inModule = (moduleColors==module);
modProbes = probes[inModule];
head(modProbes)
# 如果使用WGCNA包自带的热图就很丑。
which.module="brown"
dat=datExpr[,moduleColors==which.module ]
plotMat(t(scale(dat)),nrgcols=30,rlabels=T,
clabels=T,rcols=which.module,
title=which.module )
datExpr[1:4,1:4]
dat=t(datExpr[,moduleColors==which.module ] )
library(pheatmap)
pheatmap(dat ,show_colnames =F,show_rownames = F) #对那些提取出来的1000个基因所在的每一行取出,组合起来为一个新的表达矩阵
n=t(scale(t(log(dat+1)))) # 'scale'可以对log-ratio数值进行归一化
n[n>2]=2
n[n< -2]= -2
n[1:4,1:4]
pheatmap(n,show_colnames =F,show_rownames = F)
group_list=datTraits$subtype
ac=data.frame(g=group_list)
rownames(ac)=colnames(n)
pheatmap(n,show_colnames =F,show_rownames = F,
annotation_col=ac )
# 可以很清晰的看到,所有的形状相关的模块基因
# 其实未必就不是差异表达基因。
}