# clustering_methods.py
from sklearn.cluster import DBSCAN, AffinityPropagation
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import random
import torch

from logger import CustomLogger

# Dimensionality of a single per-frame feature vector.
feature_dim = 2048

clmLogger = CustomLogger('clm')

def print_memory_usage(message=""):
    """Log current GPU memory usage (assumes a CUDA device is available)."""
    allocated = torch.cuda.memory_allocated() / (1024 ** 2)
    reserved = torch.cuda.memory_reserved() / (1024 ** 2)
    clmLogger.log(f"{message} - Allocated: {allocated:.2f} MB, Reserved: {reserved:.2f} MB")

def getClusterCenters(data, labels, batch_size, custom_distance):
    """Return the medoid index and medoid feature vector of each cluster.

    Distances are computed on the GPU in batches with `custom_distance`,
    which must accept two (batch, frames, feature_dim) tensors.
    """
    unique_labels = np.unique(labels)
    clmLogger.log("unique_labels {}".format(unique_labels))
    medoid_indexes = []
    with torch.no_grad():
        for label in unique_labels:
            clmLogger.log("------LABEL {}".format(label))
            cluster_indexes = np.where(labels == label)[0]
            num_points = cluster_indexes.shape[0]
            cluster_blob = torch.from_numpy(data[cluster_indexes].reshape(num_points, -1, feature_dim)).float()
            clmLogger.log('Cluster blob shape {}'.format(cluster_blob.shape))
            # Calculate the distance matrix; fill only the upper triangle
            # (direction doesn't matter) and mirror it for the row sums.
            distance_matrix = np.zeros((num_points, num_points))
            for i in range(num_points):
                for j in range(i + 1, num_points, batch_size):
                    end = min(j + batch_size, num_points)
                    batch_videos_j = cluster_blob[j:end].to('cuda')
                    batch_videos_i = cluster_blob[i].unsqueeze(0).expand(end - j, -1, -1).to('cuda')
                    distance = custom_distance(batch_videos_i, batch_videos_j).cpu().numpy()
                    distance_matrix[i, j:end] = distance
                    distance_matrix[j:end, i] = distance  # Symmetric matrix.
            # The medoid minimizes the summed distance to all other cluster members.
            total_distances = np.sum(distance_matrix, axis=1)
            medoid_index = np.argmin(total_distances)
            clmLogger.log("medoid_index {}".format(medoid_index))
            medoid_indexes.append(cluster_indexes[medoid_index])
    medoid_indexes = np.array(medoid_indexes)
    clmLogger.log('Medoid indexes {}'.format(medoid_indexes))
    return medoid_indexes, data[medoid_indexes]
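
# Illustrative usage sketch (not part of the original file): getClusterCenters
# expects `custom_distance` to take two (batch, frames, feature_dim) tensors and
# return a (batch,) tensor of distances. The mean-L2 metric, data shapes, and
# function name below are hypothetical stand-ins, and a CUDA device is assumed,
# as in the function above.
def _example_get_cluster_centers():
    def mean_l2_distance(a, b):
        # a, b: (batch, frames, feature_dim) tensors on the same device.
        return torch.linalg.norm(a - b, dim=-1).mean(dim=-1)

    data = np.random.rand(20, 8 * feature_dim).astype(np.float32)  # 20 videos, 8 frames each.
    labels = np.repeat([0, 1], 10)  # Two pretend clusters of 10 videos.
    idx, centers = getClusterCenters(data, labels, batch_size=4, custom_distance=mean_l2_distance)
    clmLogger.log("example medoid indexes {}".format(idx))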

def getClusterCentersWDistance(data, labels, precomp_distance):
    """Return each cluster's medoid using a precomputed pairwise distance matrix."""
    unique_labels = np.unique(labels)
    clmLogger.log("unique_labels {}".format(unique_labels))
    medoid_indexes = []
    for label in unique_labels:
        clmLogger.log("------LABEL {}".format(label))
        cluster_indexes = np.where(labels == label)[0]
        num_points = cluster_indexes.shape[0]
        distance_matrix = np.zeros((num_points, num_points))
        for i in range(num_points):
            for j in range(i + 1, num_points):
                d = precomp_distance[cluster_indexes[i], cluster_indexes[j]]
                distance_matrix[i, j] = d
                distance_matrix[j, i] = d  # Mirror so every row sum covers all pairs.
        total_distances = np.sum(distance_matrix, axis=1)
        medoid_index = np.argmin(total_distances)
        clmLogger.log("medoid_index {}".format(medoid_index))
        medoid_indexes.append(cluster_indexes[medoid_index])
    medoid_indexes = np.array(medoid_indexes)
    clmLogger.log('Medoid indexes {}'.format(medoid_indexes))
    return medoid_indexes, data[medoid_indexes]
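
# Illustrative usage sketch (not part of the original file): the precomputed
# variant only needs a full pairwise distance matrix aligned with `data`'s row
# order; plain Euclidean distances serve as a stand-in metric here.
def _example_get_cluster_centers_precomputed():
    data = np.random.rand(12, 4)
    labels = np.repeat([0, 1, 2], 4)
    precomp = np.linalg.norm(data[:, None, :] - data[None, :, :], axis=-1)
    idx, centers = getClusterCentersWDistance(data, labels, precomp)
    clmLogger.log("example medoid indexes {}".format(idx))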

def visualize_distance_matrix(distance_matrix, save_dir='results/distance_matrix.png'):
    plt.figure()
    plt.imshow(distance_matrix, cmap='viridis', interpolation='nearest')
    plt.colorbar(label='Distance')
    plt.title('Distance Matrix Heatmap')
    plt.xlabel('Data Point Index')
    plt.ylabel('Data Point Index')
    plt.savefig(save_dir)  # Save before show(), which clears the active figure.
    plt.show()

def visualize_clusters_with_pca(data, labels, title="DBSCAN Clustering", method='pca', save_dir='figures/clusters.png'):
    """
    Visualize clustered data with PCA or t-SNE, giving each label its own color.

    Parameters:
    - data: numpy array, the dataset (samples, 300, 1024)
    - labels: numpy array, the cluster labels from DBSCAN
    - method: 'pca' or 'tsne' for dimensionality reduction
    """
    clmLogger.log('Data size {}'.format(data.shape))
    flattened_data = data
    # flattened_data = data.reshape(data.shape[0], -1)
    # Apply dimensionality reduction (PCA or t-SNE).
    if method == 'pca':
        reducer = PCA(n_components=2)  # Reduce to 2 dimensions.
        reduced_data = reducer.fit_transform(flattened_data)
    elif method == 'tsne':
        # First PCA along the temporal dimension, then t-SNE on the rest.
        data_shortened = np.zeros((data.shape[0], 10, 1024))
        reducer = PCA(n_components=10)
        for i in range(len(data)):
            vid = data[i].T  # (features, time) so PCA reduces the temporal axis.
            reduced_vid = reducer.fit_transform(vid)  # (features, 10)
            data_shortened[i] = reduced_vid.T
        data_shortened = data_shortened.reshape(-1, 10 * 1024)
        perplexity_value = min(30, len(data_shortened) - 1)  # Perplexity must be < n_samples.
        reducer = TSNE(n_components=2, perplexity=perplexity_value, random_state=0)
        reduced_data = reducer.fit_transform(data_shortened)
    else:
        raise ValueError("Unknown method: use 'pca' or 'tsne'")
    # Generate a color map based on the number of unique labels.
    unique_labels = set(labels)
    colors = plt.cm.get_cmap('rainbow', len(unique_labels))
    # Plot each cluster with a different color; noise points (label == -1) are black.
    plt.figure()
    for label in unique_labels:
        if label == -1:
            color = 'k'
        else:
            color = colors(label / len(unique_labels))
        plt.scatter(reduced_data[labels == label, 0], reduced_data[labels == label, 1],
                    c=[color], label=f"Cluster {label}" if label != -1 else "Noise", s=50, edgecolors='k')
    plt.title(title)
    plt.legend()
    plt.savefig(save_dir)  # Save before show(), which clears the active figure.
    plt.show()
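
# Illustrative usage sketch (not part of the original file): synthetic video
# features shaped (samples, 300, 1024), matching the docstring above. The t-SNE
# path is exercised because it handles the 3-D input directly; the save_dir
# directory ('figures/') is assumed to exist.
def _example_visualize_with_tsne():
    data = np.random.rand(16, 300, 1024)
    labels = np.repeat([0, 1, -1, 2], 4)  # Includes -1 to show noise coloring.
    visualize_clusters_with_pca(data, labels, title="t-SNE example", method='tsne')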

def DBSCAN_clustering(data):
    # Note: DBSCAN does not produce actual cluster centers.
    clustering = DBSCAN(eps=3, min_samples=2).fit(data)
    clmLogger.log(clustering.labels_)  # -1 indicates noise points.
    return clustering

def custom_affinitypropagation(data, distance_matrix):
    # Convert distances to similarities (AffinityPropagation expects similarities).
    similarity_matrix = -distance_matrix
    # Apply Affinity Propagation clustering on the precomputed similarities.
    clustering = AffinityPropagation(verbose=True, affinity='precomputed').fit(similarity_matrix)
    # Log the number of clusters and the cluster labels.
    clmLogger.log("Number of clusters: {}".format(len(set(clustering.labels_))))
    clmLogger.log("Cluster labels: {}".format(clustering.labels_))
    clmLogger.log('Cluster indices: {} type {}'.format(clustering.cluster_centers_indices_, type(clustering.cluster_centers_indices_)))
    return clustering.labels_, clustering.cluster_centers_indices_, [data[i] for i in clustering.cluster_centers_indices_]
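
# Illustrative usage sketch (not part of the original file): any precomputed
# distance matrix works, since the function negates it into similarities;
# Euclidean distances are a stand-in for the project's video metric.
def _example_affinity_propagation():
    data = np.random.rand(30, 5)
    dist = np.linalg.norm(data[:, None, :] - data[None, :, :], axis=-1)
    labels, center_idx, centers = custom_affinitypropagation(data, dist)
    clmLogger.log("example cluster count: {}".format(len(set(labels))))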

def GMM(data):
    # Sweep component counts and keep the model with the lowest BIC.
    lowest_bic = np.inf
    best_gmm = None
    n_components_range = range(1, 6)
    for n_components in n_components_range:
        gmm = GaussianMixture(n_components=n_components)
        gmm.fit(data)
        bic = gmm.bic(data)
        if bic < lowest_bic:
            lowest_bic = bic
            best_gmm = gmm
    clmLogger.log("Best number of components: {}".format(best_gmm.n_components))
    return best_gmm
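
# Illustrative usage sketch (not part of the original file): GMM() sweeps
# 1-5 components and keeps the lowest-BIC fit, so two well-separated blobs
# should come back with roughly two components.
def _example_gmm_selection():
    data = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 5.0])
    best = GMM(data)
    clmLogger.log("example GMM components: {}".format(best.n_components))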

def custom_dbscan(X, eps=0.1, min_samples=2, custom_distance_func=None):
    """
    DBSCAN clustering with a custom distance function.

    Parameters:
    - X: The input data (numpy array).
    - eps: The maximum distance between two samples for them to be considered neighbors.
    - min_samples: The number of samples (or total weight) in a neighborhood for a point to be a core point.
    - custom_distance_func: A callable taking two points and returning their distance.

    Returns:
    - Cluster labels for each point in the dataset.
    """
    if custom_distance_func is None:
        raise ValueError("A custom distance function must be provided.")
    # Run DBSCAN with the custom distance function.
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=custom_distance_func)
    dbscan.fit(X)
    return dbscan.labels_


# Example custom distance: Manhattan (L1) distance.
def custom_manhattan_distance(x, y):
    return np.sum(np.abs(x - y), axis=-1)

def visualize_clusters(data, labels, title="DBSCAN Clustering"):
    """
    Visualize clustered 2-D data, giving each label its own color.

    Parameters:
    - data: numpy array, the dataset
    - labels: numpy array, the cluster labels from DBSCAN
    """
    # Start a fresh figure so repeated calls don't overlay each other.
    plt.figure()
    # Generate a color map based on the number of unique labels.
    unique_labels = set(labels)
    colors = plt.cm.get_cmap('rainbow', len(unique_labels))
    for label in unique_labels:
        # Color noise points (label == -1) black.
        if label == -1:
            color = 'k'
        else:
            color = colors(label / len(unique_labels))
        # Plot each cluster with a different color.
        plt.scatter(data[labels == label, 0], data[labels == label, 1],
                    c=[color], label=f"Cluster {label}" if label != -1 else "Noise", s=50, edgecolors='k')
    plt.title(title)
    plt.legend()
    plt.show()

def k_medoids(distance_matrix, k, max_iter=100):
    """
    k-medoids clustering with a precomputed distance matrix.

    Parameters:
    - distance_matrix: Precomputed distance matrix (n x n).
    - k: Number of clusters (medoids).
    - max_iter: Maximum number of iterations.

    Returns:
    - medoids: Final medoid indices.
    - labels: Cluster labels for each point.
    """
    n = distance_matrix.shape[0]  # Number of data points.

    # Step 1: Initialize k medoids randomly.
    medoids = random.sample(range(n), k)

    # Assign each point to its nearest medoid.
    def assign_labels(medoids):
        return np.argmin(distance_matrix[:, medoids], axis=1)

    # Step 2: Initial assignment.
    labels = assign_labels(medoids)

    # Step 3: Iteratively update the medoids.
    for _ in range(max_iter):
        new_medoids = medoids.copy()
        for i in range(k):
            cluster_indices = np.where(labels == i)[0]  # Points assigned to the i-th medoid.
            if len(cluster_indices) == 0:
                continue
            # Find the cluster point with the minimum total distance to the rest of its cluster.
            min_total_distance = np.inf
            best_medoid = medoids[i]
            for candidate in cluster_indices:
                total_distance = np.sum(distance_matrix[candidate, cluster_indices])
                if total_distance < min_total_distance:
                    min_total_distance = total_distance
                    best_medoid = candidate
            new_medoids[i] = best_medoid
        # Step 4: Check for convergence (medoids no longer change).
        if np.array_equal(medoids, new_medoids):
            break
        medoids = new_medoids
        labels = assign_labels(medoids)
    return medoids, labels
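
# Illustrative usage sketch (not part of the original file): k_medoids only
# needs an (n x n) distance matrix, so any metric can be precomputed up front.
def _example_k_medoids():
    points = np.random.rand(40, 2)
    dist = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)
    medoids, labels = k_medoids(dist, k=3)
    clmLogger.log("example medoids: {} labels: {}".format(medoids, labels))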

if __name__ == "__main__":
    # Simple smoke test on random 2-D data.
    # X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]])  # Tiny hand-made alternative.
    X = np.random.rand(100, 2)
    clmLogger.log(X[0])
    cluster = DBSCAN_clustering(X)
    cluster2_labels = custom_dbscan(X, 3, 2, custom_manhattan_distance)
    clmLogger.log("Cluster1 labels: {}".format(cluster.labels_))
    clmLogger.log("Cluster2 labels: {}".format(cluster2_labels))
    visualize_clusters(X, cluster2_labels, "Custom DBSCAN")
    visualize_clusters(X, cluster.labels_)