-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathm4advcl_Clustering_bis.py
96 lines (78 loc) · 2.13 KB
/
m4advcl_Clustering_bis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from pyclustering.cluster import xmeans
from sklearn.preprocessing import LabelEncoder
import utils
# Load the track metadata through the project helper.
# NOTE(review): the meaning of dummies/buckets/fill/outliers is defined by
# utils.load_tracks, not in this script -- confirm there before changing them.
df = utils.load_tracks(
    "data/tracks.csv", dummies=True, buckets="continuous", fill=True, outliers=True
)
# Work on the first 100 rows only. The copy makes the truncated frame
# independent of the loaded one, so the column assignments made further down
# do not write into a slice (assumes load_tracks returns a pandas DataFrame).
df = df.head(100).copy()
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
df.info()
# Disabled snippet, kept as a bare string literal so it never executes.
# If re-enabled it would print the frame's shape and drop the
# ("track", "language_code") and ("album", "type") columns; note that
# ("album", "type") is label-encoded and clustered on later in this script,
# so dropping it would break those steps.
"""
print(df.shape)
column2drop = [
("track", "language_code"),
("album", "type"),
]
df.drop(column2drop, axis=1, inplace=True)
"""
# Replace each listed column with integer codes from a LabelEncoder, in place,
# so the column can be used in the feature matrix. Every fitted encoder is
# stored under its column key (e.g. for a later inverse_transform).
label_encoders = {}
column2encode = [
    ("album", "type"),
]
for col in column2encode:
    # One encoder per column: each column has its own label vocabulary.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# Columns that make up the feature matrix, in the exact order they appear as
# columns of X. ("album", "type") is usable here because it is label-encoded
# earlier in the script.
numeric_columns = [
    ("album", "comments"),
    ("album", "favorites"),
    ("album", "listens"),
    ("artist", "comments"),
    ("album", "type"),
    ("artist", "favorites"),
    ("track", "comments"),
    ("track", "duration"),
    ("track", "favorites"),
    ("track", "interest"),
    ("track", "listens"),
]
# Candidate columns currently left out of the feature matrix:
#   ("album", "date_created"), ("artist", "date_created"),
#   ("track", "date_created"), ("artist", "active_year_end"),
#   ("artist", "wikipedia_page"), ("track", "composer"),
#   ("track", "information"), ("track", "lyricist"),
#   ("track", "publisher"), ("album", "engineer"),
#   ("album", "information"), ("artist", "bio"),
#   ("album", "producer"), ("artist", "website")

# Pull the selected columns out as a plain array (rows = tracks kept in df,
# columns = numeric_columns) and show what will be clustered.
feature_frame = df[numeric_columns]
X = feature_frame.values
print("dataset:", X.shape)
print(X)
# Cluster the feature matrix with pyclustering's X-Means, using the library's
# default settings (no initial centers or other options are supplied).
xm = xmeans.xmeans(X)
xm.process()

# `clusters` holds one entry per cluster; each entry is used further down as a
# set of row indices into X. `centers` holds one point per cluster, indexed
# further down with the same column positions as X.
clusters, centers = xm.get_clusters(), xm.get_centers()
print("Clusters: ", clusters)
print("Centers: ", centers)
# Visual Guidotti
# print("score", score)
# Scatter the clusters on two of the clustered features.
# The column positions must be looked up in numeric_columns, because X (and
# therefore every center) only contains those columns, in that order. Looking
# them up in df.columns gave positions in the full frame, which point at the
# wrong feature or fall outside X entirely.
i = numeric_columns.index(("album", "listens"))
j = numeric_columns.index(("track", "favorites"))
sns.set()
# One scatter call per cluster, so each cluster gets the next colour of the
# default colour cycle. The former cmap= argument was dropped: without c=,
# matplotlib ignores it and only emits a warning.
for indexes in clusters:
    plt.scatter(X[indexes, i], X[indexes, j], alpha=0.4)
# Centers live in the same feature space as X, so the same i/j apply.
for c in centers:
    plt.scatter(c[i], c[j], s=100, edgecolors="k")
plt.xlabel("album,listens")
plt.ylabel("track,favourites")
plt.title("Visualizing centeroids and centers")
plt.show()