-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandMatrix.py
179 lines (144 loc) · 7.64 KB
/
randMatrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rand score and hierarchical clustering processing
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
import scipy.cluster.hierarchy as sch
from sklearn.metrics import adjusted_rand_score
import graphs as plot
def calc_cluster_characteristics(k, data, clusteringMethod, **kwargs):
"""
define a function that will give us the relevant output for the input k
Args:
k: epsilon value (for DBSCAN), resolution parameter (for phenograph), or cut height (for hierarchical clustering)
data: UMAP embedding of data
clusteringMethod: 'DBSCAN' or 'HC'
"""
print('Parameter value:', k)
results = pd.Series(dtype = 'float64')
results['k'] = k
sampleMeta = kwargs.get('sampleMeta', None)
if clusteringMethod == 'DBSCAN':
results['communities'] = DBSCAN(eps=k, min_samples=2).fit(data).labels_
elif clusteringMethod == 'HC':
Y_cluster = sch.linkage(data.values.T, metric='correlation')
results['communities'], _ = labelHCLandrace(data, sampleMeta, Y_cluster, cutHeight = k, newVariety = False)
else:
print('invalid clustering method')
return results
def randScoreMatrix(data, ks, clusteringMethod, **kwargs):
"""
Matrix of adjusted rand scores for a range of parameter values
Args:
data: UMAP embedding of data
ks: list of parameter values to scan over
clusteringMethod: 'DBSCAN', 'phenograph', or 'HC'
"""
sampleMeta = kwargs.get('sampleMeta', None)
admixedCutoff = kwargs.get('admixedCutoff', None)
if sampleMeta is None:
cluster_chars_list = [calc_cluster_characteristics(k, data, clusteringMethod, admixedCutoff = admixedCutoff) for k in ks]
else:
cluster_chars_list = [calc_cluster_characteristics(k, data, clusteringMethod, sampleMeta = sampleMeta) for k in ks]
cluster_characteristics = pd.concat(cluster_chars_list, axis = 1).transpose()
#calculate rand score
n = len(ks)
Rand_indices = pd.DataFrame(np.zeros((n,n)), index = ks, columns = ks)
Rand_indices.index.name = 'k1'
Rand_indices.columns.name = 'k2'
for i in range(n):
for j in range(n):
Rand_indices.iloc[i, j] = adjusted_rand_score(
cluster_characteristics['communities'][i],
cluster_characteristics['communities'][j]
)
#plot
plt.figure(figsize = (8,8))
plt.imshow(Rand_indices, aspect='auto', interpolation='none', cmap='coolwarm',vmin=0,vmax = 1)
plt.gca().invert_yaxis()
plt.xticks(np.arange(len(ks)),ks)
plt.yticks(np.arange(len(ks)),ks)
plt.title('Adjusted Rand Score')
plt.colorbar()
if clusteringMethod == 'DBSCAN':
plt.xlabel('epsilon 1')
plt.ylabel('epsilon 2')
if clusteringMethod == 'HC':
plt.xlabel('cut height 1')
plt.ylabel('cut height 2')
plt.tight_layout()
def labelHCLandrace(clusterSubset, sampleMeta, Y_cluster, cutHeight, clusterNumber = 0, admixedCutoff = None, newVariety = True):
"""
Take a dendrogram and label samples that match a reference variety, otherwise label as admixed or a new variety
Args:
clusterSubset: processed SNP proportion data subset to a single DBSCAN cluster
Y_cluster: dendrogram for clusterSubset
clusterNumber: DBSCAN cluster number
cutHeight: cutoff value for cutting a dendrogram into clusters
sampleMeta: metadata paired with genotyping data
admixedCutoff: divergence cutoff to differentiate between admixtures and non reference varieties
"""
references = sampleMeta[(sampleMeta['reference'].notna())]
subClusterNumber = np.zeros(len(Y_cluster)+1) #zero is admixed
subClusterNames = [['Admixed']]
counter = 0
subClusters = sch.cut_tree(Y_cluster, height = cutHeight)
for cluster in np.unique(subClusters):
sampleIndex = np.where(subClusters == cluster)[0]
shortName = clusterSubset.columns[sampleIndex][np.isin(clusterSubset.columns[sampleIndex], references['short_name'].values.astype('str'))]
refInSubCluster = np.unique(sampleMeta[sampleMeta['short_name'].isin(shortName.astype('int'))]['reference'])
if (len(refInSubCluster) > 0): #if there's a reference in the cluster
if (len(refInSubCluster) > 1): #if multiple references, flatten
refInSubCluster = np.asarray(['+'.join(refInSubCluster)])
if not np.isin(refInSubCluster, subClusterNames): #add if it's a new reference
subClusterNames.append(list(refInSubCluster))
subClusterNumber[sampleIndex.tolist()] = np.where(subClusterNames == refInSubCluster)[0][0]
elif newVariety: #add new non reference varieties
if not admixedCutoff: #no admixedCutoff
subClusterNames.append(['Genetic entity-'+str(clusterNumber)+'-'+str(counter)])
subClusterNumber[sampleIndex.tolist()] = (len(subClusterNames)-1)
counter += 1
elif min(plot.homozygousDivergence(clusterSubset.values[:,sampleIndex])) < admixedCutoff: #at least one sample must be less than admixedCutoff
subClusterNames.append(['Genetic entity-'+str(clusterNumber)+'-'+str(counter)])
subClusterNumber[sampleIndex.tolist()] = (len(subClusterNames)-1)
counter += 1
return subClusterNumber, subClusterNames
def cutoffQuality(clusterSubset, sampleMeta, Y_cluster):
"""
Evaluate cut height values based on the number of varieties/cluster and technical replicates split up
Args:
clusterSubset: processed SNP proportion data
sampleMeta: metadata paired with genotyping data
Y_cluster: dendrogram for clusterSubset
"""
references = sampleMeta[(sampleMeta['reference'].notna())]
refRows = np.arange(len(clusterSubset.columns))[np.isin(clusterSubset.columns, references['short_name'].values.astype('str'))]
varNames = []
for i in clusterSubset.columns[refRows]:
varNames.append(references[references['short_name'] == int(i)]['reference'].values[0])
varNames = np.asarray(varNames)
cutList = np.linspace(min(Y_cluster[:,2]),max(Y_cluster[:,2]),50)
clusters = sch.cut_tree(Y_cluster, height=cutList)
df = pd.DataFrame(clusters)
repTogether = np.zeros(len(cutList)) #number of technical replicates in same cluster
test = pd.DataFrame(columns=df.columns, index=np.arange(len(np.unique(varNames)))) #number of clusters that contain more than one variety
for i, var in enumerate(np.unique(varNames)):
rows = refRows[np.where(varNames == var)]
repTogether += pd.Series({c: len(df.iloc[rows][c].unique()) for c in df}).values == 1
test.iloc[i] = pd.Series(df.iloc[rows][c].unique() for c in df.columns) #don't count the same variety multiple times
avgVarInCluster = []
for c in test.columns:
_, counts = np.unique(np.concatenate(test[c].values), return_counts=True)
avgVarInCluster.append(np.average(counts))
plt.figure()
plt.plot(cutList, repTogether, '-o', markersize = 2, label = 'Technical replicates together')
plt.plot(cutList, avgVarInCluster, '-^', markersize = 2, label = 'Avg varieties/cluster')
plt.legend(loc = 'upper left')
plt.xlabel('Cut height')
plt.ylabel('Number of varieties')
plt.tight_layout()
return repTogether, np.asarray(avgVarInCluster), len(np.unique(varNames)), cutList