topics.py
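"""Topic discovery over message embeddings.

Pipeline: load cached sentence embeddings, reduce them with PCA (50-D) and
t-SNE (2-D), cluster the 2-D projection with HDBSCAN, dump the messages of
each cluster to JSON, and plot the clusters with KDE contours.
"""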
import os
import numpy as np
import seaborn as sns
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import hdbscan
from scipy.stats import gaussian_kde
from tqdm import tqdm
import orjson
from utils import get_messages
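# Intermediate artefacts are cached as .npy files under cache/, so re-runs skip
# the expensive steps.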
cache_folder = 'cache'
os.makedirs(cache_folder, exist_ok=True)
def save_or_load(filename, calculation_function=None, *args, **kwargs):
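    """Load `filename` from the cache if it exists; otherwise compute it with
    `calculation_function(*args, **kwargs)`, cache the result, and return it."""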
    filepath = os.path.join(cache_folder, filename)
    if os.path.exists(filepath):
        print(f"Loading {filename} from file...")
        data = np.load(filepath)
    else:
        if calculation_function is None:
            raise ValueError("calculation_function must be provided if file does not exist.")
        print(f"Calculating {filename}...")
        data = calculation_function(*args, **kwargs)
        np.save(filepath, data)
    return data
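# cache/embeddings.npy must already exist (it is produced by a separate embedding
# step); no calculation_function is passed, so save_or_load raises if it is missing.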
embeddings = save_or_load('embeddings.npy')
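# First reduce the embeddings to 50 dimensions with PCA, a common preprocessing
# step that speeds up t-SNE and suppresses noise in the raw embeddings.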
def perform_pca(data):
    pca = PCA(n_components=50)
    return pca.fit_transform(data)
embeddings_pca = save_or_load('embeddings_pca.npy', perform_pca, embeddings)
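# Project the 50-D PCA output to 2-D with t-SNE (fixed random_state for
# reproducibility); this 2-D embedding is used for both clustering and plotting.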
def perform_tsne(data):
    tsne = TSNE(n_components=2, random_state=42, verbose=1)
    return tsne.fit_transform(data)
embeddings_reduced = save_or_load('embeddings_tsne.npy', perform_tsne, embeddings_pca)
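# Cluster the 2-D embedding with HDBSCAN; points that cannot be assigned to any
# cluster are labelled -1 (noise).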
def perform_hdbscan(data):
    clusterer = hdbscan.HDBSCAN(min_cluster_size=30, gen_min_span_tree=True)
    return clusterer.fit_predict(data)
cluster_labels = save_or_load('cluster_labels.npy', perform_hdbscan, embeddings_reduced)
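# -1 is the HDBSCAN noise label and is excluded from the cluster count.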
unique_labels = np.unique(cluster_labels)
num_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
print(f"Number of clusters: {num_clusters}")
def find_best_min_cluster_size(data, min_size_start, min_size_end, step=1):
    best_min_cluster_size = None
    best_silhouette_score = -1
    for min_cluster_size in range(min_size_start, min_size_end + 1, step):
        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, gen_min_span_tree=True)
        cluster_labels = clusterer.fit_predict(data)
        filtered_labels = cluster_labels[cluster_labels != -1]
        filtered_data = data[cluster_labels != -1]
        if len(np.unique(filtered_labels)) > 1:
            score = silhouette_score(filtered_data, filtered_labels)
            print(f"min_cluster_size: {min_cluster_size}, silhouette_score: {score}")
            if score > best_silhouette_score:
                best_silhouette_score = score
                best_min_cluster_size = min_cluster_size
    return best_min_cluster_size, best_silhouette_score
# best_size, best_score = find_best_min_cluster_size(embeddings_reduced, 5, 50, step=5)
# print(f"The best min_cluster_size is {best_size} with a silhouette score of {best_score}")
# The best min_cluster_size is 30 with a silhouette score of 0.6569266
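# Write the raw text of every cluster to cache/cluster_texts/cluster_<label>.json.
# Assumption: get_messages() (from the local utils module) returns the message texts
# in the same order as the rows of the embedding matrix, so index i links a message
# to its embedding and cluster label.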
messages = get_messages()
cluster_texts_folder = os.path.join(cache_folder, 'cluster_texts')
os.makedirs(cluster_texts_folder, exist_ok=True)
for cluster_label in np.unique(cluster_labels):
    if cluster_label == -1:
        continue
    indices_in_cluster = np.where(cluster_labels == cluster_label)[0]
    cluster_messages = []
    for index in indices_in_cluster:
        message = messages[int(index)]
        cluster_messages.append({'id': int(index), 'text': message})
    cluster_file_path = os.path.join(cluster_texts_folder, f'cluster_{cluster_label}.json')
    with open(cluster_file_path, 'wb') as cluster_file:
        cluster_file.write(orjson.dumps(cluster_messages))
    print(f"Stored messages for Cluster {cluster_label} in {cluster_file_path}")
plt.figure(figsize=(10, 8))
palette = sns.color_palette('bright', np.unique(cluster_labels).max() + 1)
for i, color in tqdm(enumerate(palette), desc='Plotting clusters', total=len(palette)):
    points = embeddings_reduced[cluster_labels == i]
    if len(points) == 0:
        continue
    sns.scatterplot(x=points[:, 0], y=points[:, 1], color=color, label=f'Cluster {i}')
    try:
        kde = gaussian_kde(points.T)
        x_min, x_max = points[:, 0].min() - 1, points[:, 0].max() + 1
        y_min, y_max = points[:, 1].min() - 1, points[:, 1].max() + 1
        x, y = np.mgrid[x_min:x_max:100j, y_min:y_max:100j]
        z = kde(np.vstack([x.ravel(), y.ravel()]))
        z = z.reshape(x.shape)
        plt.contour(x, y, z, colors=[color], linewidths=2)
    except np.linalg.LinAlgError:
        # Gaussian KDE fails when the cluster's covariance is singular; skip the contour.
        pass
noise_points = embeddings_reduced[cluster_labels == -1]
if len(noise_points) > 0:
    sns.scatterplot(x=noise_points[:, 0], y=noise_points[:, 1], color=(0.5, 0.5, 0.5), label='Noise')
plt.title('HDBSCAN clustering with t-SNE reduced embeddings')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.legend()
plt.show()