-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering.py
43 lines (31 loc) · 1.05 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json
import code
import numpy as np
from sklearn.cluster import AgglomerativeClustering, KMeans
from collections import defaultdict, Counter
# Number of clusters requested from the clustering estimator.
number_of_clusters = 4

# Load the per-author statistics (a mapping keyed by author). JSON text is
# UTF-8 by specification, so the encoding is given explicitly instead of
# relying on the platform's locale default, which may misdecode or reject
# non-ASCII characters.
with open("data.json", encoding="utf-8") as f:
    j = json.load(f)

# Colour terms used as features; the order of this list determines the
# column order of each author's feature row.
all_colors = [
    'black', 'white', 'grey', 'brown', 'red', 'blue',
    'yellow', 'green', 'orange', 'violet', 'gold', 'silver',
]
def div(x, y):
    """Return ``x / y``, or ``0`` if the division raises ZeroDivisionError."""
    try:
        quotient = x / y
    except ZeroDivisionError:
        # A zero denominator is treated as "no share" rather than an error.
        return 0
    return quotient
# Feature extraction: one row per selected author, holding the share of each
# colour in `all_colors` among that author's total colour count.
ls, authors = [], []
for author in j:
    # Skip non-German authors and authors with 10000 or more words.
    # NOTE(review): this keeps only authors with FEWER than 10000 words --
    # confirm that an upper bound (not a minimum size) is intended.
    if j[author]['lang'] != 'de' or j[author]['word_count'] >= 10000:
        continue
    counts = j[author]['color_counts']
    total = sum(counts.values())
    # div() yields 0 when `total` is zero, so such an author gets an all-zero row.
    ls.append([div(counts.get(color, 0), total) for color in all_colors])
    authors.append(author)

X = np.array(ls)

# Alternative estimator: k = KMeans(number_of_clusters)
k = AgglomerativeClustering(n_clusters=number_of_clusters)
k.fit(X)

# Group the author ids by the cluster label assigned to them.
clusters = defaultdict(list)
for aid, cid in zip(authors, k.labels_):
    clusters[cid].append(aid)

# Open an interactive console over the current namespace for inspection.
code.interact(local=locals())