# ts_customgrid_clustering.py
import multiprocessing
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rich import print
from rich.progress import BarColumn, Progress, TimeRemainingColumn
from tslearn.clustering import TimeSeriesKMeans
from tslearn.piecewise import SymbolicAggregateApproximation
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from music import MusicDB
mpl.rcParams["figure.dpi"] = 300
savefig_options = dict(format="png", dpi=300, bbox_inches="tight")
def do_sax_kmeans(params):
    """Cluster a SAX-approximated dataset with k-means under both the
    Euclidean and the DTW metric, returning the inertia (SSE) of each fit."""
    plots = False
    # unpack
    df, k = params

    # SAX approximation: compress each series to 130 segments over a
    # 20-symbol alphabet
    sax = SymbolicAggregateApproximation(n_segments=130, alphabet_size_avg=20)
    ts_sax = sax.fit_transform(df)

    km = TimeSeriesKMeans(
        n_clusters=k, metric="euclidean", max_iter=50, random_state=5138
    )
    km.fit(ts_sax)

    km_dtw = TimeSeriesKMeans(
        n_clusters=k, metric="dtw", max_iter=50, random_state=5138
    )
    km_dtw.fit(ts_sax)

    if plots:
        # centroids: cluster_centers_ has shape (k, sz, 1); squeeze and
        # transpose so each centroid is drawn as one line
        plt.plot(np.squeeze(km.cluster_centers_).T)
        plt.show()
        # mean of the input series assigned to each cluster
        for i in range(k):
            plt.plot(np.mean(df[np.where(km.labels_ == i)[0]], axis=0))
        plt.show()

    return (
        k,
        round(km.inertia_, 4),  # SSE under the Euclidean metric
        round(km_dtw.inertia_, 4),  # SSE under DTW (silhouette is an alternative; see the note below)
    )
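# Note: the original comments considered silhouette as an alternative quality
# measure to inertia. tslearn ships a silhouette_score that accepts
# time-series datasets and a "dtw" metric. A minimal sketch, not part of the
# grid search in this script:
#
#   from tslearn.clustering import silhouette_score
#   sil_euc = silhouette_score(ts_sax, km.labels_, metric="euclidean")
#   sil_dtw = silhouette_score(ts_sax, km_dtw.labels_, metric="dtw")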
if __name__ == "__main__":
    """
    On the dataset created, compute clustering based on Euclidean/Manhattan
    and DTW distances and compare the results. To perform the clustering you
    can choose among different distance functions and clustering algorithms.
    Remember that you can reduce the dimensionality through approximation.
    Analyze the clusters and highlight similarities and differences.
    """
    musi = MusicDB()

    """
    Disabled exploratory code: three earlier approaches, kept for reference.
    Method 1 additionally needs `import seaborn as sbn` and a precomputed
    `cost_matrix`/`path` pair; method 3 needs
    `from tslearn.piecewise import PiecewiseAggregateApproximation`.

    if method_chosen == 1:
        # plot the DTW cost matrix with the warping path overlaid
        fig, ax = plt.subplots(figsize=(12, 8))
        ax = sbn.heatmap(
            cost_matrix, annot=True, square=True, linewidths=0.1, cmap="YlGnBu", ax=ax
        )
        ax.invert_yaxis()
        # Get the warp path in x and y directions
        path_x = [p[0] for p in path]
        path_y = [p[1] for p in path]
        # Align the path from the center of each cell
        path_xx = [x + 0.5 for x in path_x]
        path_yy = [y + 0.5 for y in path_y]
        ax.plot(path_xx, path_yy, color="blue", linewidth=3, alpha=0.2)
        fig.savefig("ex1_heatmap.png", **savefig_options)
    elif method_chosen == 2:
        # plain k-means on the first 200 samples of each series
        km = TimeSeriesKMeans(
            n_clusters=8, metric="euclidean", max_iter=5, random_state=1149
        )
        km.fit(musi.df.iloc[:, :200])
        print(km.cluster_centers_.shape)
        plt.plot(np.squeeze(km.cluster_centers_).T)
        plt.show()
        print(km.inertia_)
    elif method_chosen == 3:
        # k-means with PAA approximation
        segments = 10
        n_clusters = 8
        percent_rows_to_use = 1
        rows_to_use = round(len(musi.df) * percent_rows_to_use / 100)
        x = musi.df.iloc[:rows_to_use, :]
        paa = PiecewiseAggregateApproximation(n_segments=segments)
        X_paa = paa.fit_transform(x)
        plt.plot(np.squeeze(X_paa).T)
        plt.show()
        km = TimeSeriesKMeans(
            n_clusters=n_clusters, metric="euclidean", max_iter=5, random_state=0
        )
        km.fit(X_paa)
        # plt.plot(np.squeeze(km.cluster_centers_).T)
        # plt.show()
        for i in range(n_clusters):
            plt.plot(np.mean(x.iloc[np.where(km.labels_ == i)[0]], axis=0))
        plt.show()
    """
    # k-means with SAX, grid search over k, multiprocessing
    k = 11  # exclusive upper bound of the grid over the number of clusters
    x = musi.df
    # Rescale: SAX assumes standard-normal input, since its breakpoints are
    # quantiles of a standard Gaussian
    scaler = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0)
    ts = scaler.fit_transform(x)
    # build param collection (pass the scaled series, not the raw ones)
    param_collection = []
    for ki in range(3, k):
        param_collection.append((ts, ki))
    # results accumulator
    pl_results = []
    # completion counts at which progress is checkpointed to disk
    event_steps = set(int(len(param_collection) * i / 100) for i in range(100))
    event_steps.add(1)
    # progress reporting
    progress = Progress(
        "[progress.description]{task.description}",
        BarColumn(),
        "{task.completed} of {task.total}",
        "[progress.percentage]{task.percentage:>3.0f}%",
        TimeRemainingColumn(),
    )
    # populate results; periodically checkpoint the completed count to a file
    completed = 0
    with progress:
        task_id = progress.add_task("[cyan]KMeans…", total=len(param_collection))
        with multiprocessing.Pool() as pool:
            for one_result in pool.imap_unordered(do_sax_kmeans, param_collection):
                pl_results.append(one_result)
                progress.advance(task_id)
                # track completion ourselves instead of reading rich's
                # private _tasks attribute
                completed += 1
                if completed in event_steps:
                    with open("data/tokens/progress.txt", "w") as f:
                        f.write(str(completed))
    # collect results into a dataframe
    dfm = pd.DataFrame(pl_results, columns=["k", "sse euclidean", "sse dtw"])
    # report the best configurations under each metric
    print(dfm.sort_values(by="sse euclidean").iloc[:20, :])
    print(dfm.sort_values(by="sse dtw").iloc[:20, :])
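    # A possible follow-up, not in the original script: choose k at the elbow
    # of the two SSE curves, e.g.
    #
    #   dfm.sort_values("k").plot(x="k", y=["sse euclidean", "sse dtw"], marker="o")
    #   plt.show()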