-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathClusterCpGs.R
184 lines (148 loc) · 9.94 KB
/
ClusterCpGs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
##################################################################################################
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##################################################################################################
# Start
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Clustering 450K methylation CpGs probes written by Dr Reza Rafiee
# Research Associate, Northern Institute for Cancer Research, Newcastle University
# This script loads 20,000 methylation probes (from 450K methylation profiling) and doing clustering analysis
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
library(mclust) # Gaussian Mixture Modelling package for Model-Based Clustering, Classification, and Density Estimation
library(scatterplot3d)
library(pheatmap)
library(apcluster) # Affinity Propagation Clustering
load("~/20KBetaValues_51InfantSHH.RData") # 20,000 probes
length(colnames(BetaValues_51Samples_20K)) # n=51
# Performs a principal components analysis on the given data matrix and returns the results as an object of class prcomp
PCA_Comp_Scaled_Centered <- prcomp(t(BetaValues_51Samples_20K), center = TRUE, scale=T) # scale =T is appropriate for high-dimensional omic data
summary(PCA_Comp_Scaled_Centered)
# Importance of components:
# PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16
# Standard deviation 56.400 45.3408 35.12111 32.24443 28.50901 27.18614 24.98240 24.3309 22.87971 21.83193 21.25324 20.75756 20.46585 18.90885 18.24397 17.89305
# Proportion of Variance 0.159 0.1028 0.06167 0.05199 0.04064 0.03695 0.03121 0.0296 0.02617 0.02383 0.02259 0.02154 0.02094 0.01788 0.01664 0.01601
# Cumulative Proportion 0.159 0.2618 0.32351 0.37550 0.41614 0.45309 0.48430 0.5139 0.54007 0.56390 0.58649 0.60803 0.62897 0.64685 0.66349 0.67950
# PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30 PC31
# Standard deviation 17.63880 17.53269 17.04100 16.66391 16.16575 15.92072 15.55023 15.35452 15.11828 14.7654 14.57411 14.51679 14.25707 14.21835 14.03399
# Proportion of Variance 0.01556 0.01537 0.01452 0.01388 0.01307 0.01267 0.01209 0.01179 0.01143 0.0109 0.01062 0.01054 0.01016 0.01011 0.00985
# Cumulative Proportion 0.69506 0.71043 0.72495 0.73883 0.75190 0.76457 0.77666 0.78845 0.79988 0.8108 0.82140 0.83194 0.84210 0.85221 0.86205
# PC32 PC33 PC34 PC35 PC36 PC37 PC38 PC39 PC40 PC41 PC42 PC43 PC44 PC45 PC46 PC47
# Standard deviation 13.89703 13.62558 13.54381 13.2693 13.12656 12.8847 12.78211 12.46655 12.22380 11.93376 11.72942 11.5741 11.41000 11.26610 10.89133 10.6728
# Proportion of Variance 0.00966 0.00928 0.00917 0.0088 0.00862 0.0083 0.00817 0.00777 0.00747 0.00712 0.00688 0.0067 0.00651 0.00635 0.00593 0.0057
# Cumulative Proportion 0.87171 0.88099 0.89017 0.8990 0.90758 0.9159 0.92405 0.93182 0.93930 0.94642 0.95330 0.9600 0.96650 0.97285 0.97878 0.9845
# PC48 PC49 PC50 PC51
# Standard deviation 10.44080 10.25791 9.81097 6.577e-14
# Proportion of Variance 0.00545 0.00526 0.00481 0.000e+00
# Cumulative Proportion 0.98993 0.99519 1.00000 1.000e+00
#
# How many eigenvectors/components we need to use in this analysis?
# Taking the first k eigenvectors that capture at least 99% of the total variance
# creating a graphic visualization of the relationship between eigenvalues and number of factors
par(mfrow=c(1,1))
par(mar=c(5,4,4,5) + 0.1)
par(cex.axis=0.8)
plot(PCA_Comp_Scaled_Centered,main = "Variances vs. number of components", type = "l")
# biplot(prcomp(PCA_Comp_Scaled_Centered$x, scale = TRUE))
pairs(PCA_Comp_Scaled_Centered$x[,1:5], pch=19, col="black",log = "")
GMM_object_PCA <- Mclust(as.matrix(PCA_Comp_Scaled_Centered$x[,1:5]), G=1:4)
Best_Num_of_Clusters <- dim(GMM_object_PCA$z)[2]
cat("Model-based optimal number of clusters:", Best_Num_of_Clusters, "\n")
# model-based optimal number of clusters: 2 clusters
# show a matrix whose [i,k]th entry is the probability that observation i in the test data belongs to the kth class.
GMM_object_PCA$z
# Resampling-based Inference for Gaussian finite mixture models
# Bootstrap or jackknife estimation of standard errors and percentile bootstrap confidence intervals
# for the parameters of a Gaussian mixture model.
bootClust <- MclustBootstrap(GMM_object_PCA)
bootClust$modelName #[1] "VII"
summary(bootClust, what = "se")
summary(bootClust, what = "ci")
# plot(GMM_object_PCA)
plot(GMM_object_PCA, what = c("BIC", "classification", "uncertainty", "density"),
dimens = NULL, xlab = NULL, ylab = NULL, ylim = NULL, addEllipses = TRUE, main = TRUE)
#sort(GMM_object_PCA$z[,1])
#sort(GMM_object_PCA$z[,2])
Probability_Assignment <- GMM_object_PCA$z
#
# [,1] [,2]
# NMB113 1.161098e-18 1.0000000000
# NMB138 1.087764e-13 1.0000000000
# NMB143 3.095577e-15 1.0000000000
# NMB200 9.673815e-01 0.0326185108
# ...
bestmodelBIC <- mclustBIC(GMM_object_PCA$data)
# Top 3 models based on the BIC criterion:
# VII,2 VII,5 EII,5
# -2562.430 -2565.113 -2569.015
length(which(GMM_object_PCA$classification == 1)) # red colour, n=18
length(which(GMM_object_PCA$classification == 2)) # blue colour, n=33
which(GMM_object_PCA$classification == 1)
which(GMM_object_PCA$classification == 2)
# Group 1
# which(GMM_object_PCA$classification == 1)
# NMB200 NMB254 NMB272 NMB32 NMB324 NMB328 NMB363 NMB465 NMB471 NMB483 NMB497 NMB553 NMB594 NMB608 NMB621 NMB64 NMB676 NMB712
# 4 5 6 7 8 9 10 15 17 22 27 29 32 33 35 36 43 45
# Group 2
# which(GMM_object_PCA$classification == 2)
# NMB113 NMB138 NMB143 NMB364 NMB371 NMB379 NMB439 NMB466 NMB474 NMB477 NMB479 NMB482 NMB485 NMB486 NMB495 NMB496 NMB498 NMB554 NMB580 NMB612 NMB651 NMB667 NMB670
# 1 2 3 11 12 13 14 16 18 19 20 21 23 24 25 26 28 30 31 34 37 38 39
# NMB673 NMB674 NMB675 NMB690 NMB720 NMB726 NMB79 NMB798 NMB803 NMB873
# 40 41 42 44 46 47 48 49 50 51
#GMM_object_PCA$data[,1:3]
grpCols <- ifelse(GMM_object_PCA$classification == 1, "blue", "red")
# Add regression plane
shapes = c(16, 17, 18)
my.lm <- lm(GMM_object_PCA$data[,1] ~ GMM_object_PCA$data[,2] + GMM_object_PCA$data[,3])
s3d <- scatterplot3d(GMM_object_PCA$data[,1:3], pch = 17, type = "h", color = grpCols,angle=-225, scale.y=0.9, col.grid="black", grid=TRUE) # angle=-225, 280, 55, 75
s3d$plane3d(my.lm)
text(s3d$xyz.convert(GMM_object_PCA$data[,1:3]), labels = rownames(PCA_Comp_Scaled_Centered$x), cex= 0.8, col = "black")
# Visualising heatmap of PCA components
pheatmap(GMM_object_PCA$data[,1:5],color = colorRampPalette(c("navy", "white", "firebrick3"))(100),clustering_method = "ward.D2")
write.csv(Probability_Assignment, file="~/Probabilities_Assignment.csv")
################################################## AP clustering ###############################
# Affinity Propagation clustering is used to confirm an aggreement/consensus with GMM+EM results
# Affinity propagation (AP) is a relatively new clustering algorithm that has been introduced by
# Brendan J. Frey and Delbert Dueck. The authors themselves describe affinity propagation as
# follows:
# “An algorithm that identifies exemplars among data points and forms clusters of data
# points around these exemplars. It operates by simultaneously considering all data
# point as potential exemplars and exchanging messages between data points until a
# good set of exemplars and clusters emerges.”
AP_object_PCA <- apcluster(negDistMat(r=2), PCA_Comp_Scaled_Centered$x[,1:5], q=0.0)
cat("affinity propogation optimal number of clusters:", length(AP_object_PCA@clusters), "\n")
plot(AP_object_PCA,GMM_object_PCA$data[,1:5])
heatmap(AP_object_PCA)
show(AP_object_PCA)
AP_object_PCA
# APResult object
#
# Number of samples = 51
# Number of iterations = 129
# Input preference = -73914.21
# Sum of similarities = -234507
# Sum of preferences = -221742.6
# Net similarity = -456249.6
# Number of clusters = 3
# Exemplars:
# NMB371 NMB498 NMB553
# Clusters:
# Cluster 1, exemplar NMB371:
# NMB143 NMB371 NMB439 NMB485 NMB486 NMB554 NMB612 NMB670 NMB673 NMB674 NMB675
# Cluster 2, exemplar NMB498:
# NMB113 NMB138 NMB364 NMB379 NMB466 NMB474 NMB477 NMB479 NMB482 NMB495 NMB496 NMB498 NMB580 NMB651 NMB667 NMB690 NMB720 NMB726 NMB79 NMB798 NMB803 NMB873
# Cluster 3, exemplar NMB553:
# NMB200 NMB254 NMB272 NMB32 NMB324 NMB328 NMB363 NMB465 NMB471 NMB483 NMB497 NMB553 NMB594 NMB608 NMB621 NMB64 NMB676 NMB712
length(AP_object_PCA@clusters[[1]]) # n1=11 (red colours)
length(AP_object_PCA@clusters[[2]]) # n2=22 (green colours)
# n1+n2 = 33 (Group 2)
length(AP_object_PCA@clusters[[3]]) # n3=18 (Group 1; blue colours)
# We could use all 20k probes in this clustering
# AP_object_20k <- apcluster(negDistMat(r=2), t(BetaValues_51Samples_20K), q=0.01)
# cat("affinity propogation optimal number of clusters:", length(AP_object_20k@clusters), "\n")
# affinity propogation optimal number of clusters: 2
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# End
##################################################################################################
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##################################################################################################