-
Notifications
You must be signed in to change notification settings - Fork 1
/
clusters 2.txt
101 lines (67 loc) · 3.14 KB
/
clusters 2.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Load the raw synopses file ("name||synopsis" per line), keep only
# well-formed lines, and write the movie names to test123.txt.
# NOTE(review): the input path is machine-specific; consider making it
# configurable (CLI arg or config file).
with open(r'C:\Users\NADA LABIB\Desktop\Analysis-of-Egyptian-Movies\synopses.txt',
          'r', encoding='utf-8') as file:
    s = file.read()
synopses = s.split('\n')
synopses = list(filter(None, synopses))  # drop empty lines
# Keep only lines with exactly one '||' separator: "name||synopsis".
filtered_synopses = [synopsis for synopsis in synopses
                     if len(synopsis.split('||')) == 2]
# Write just the movie names (the part before '||'), one per line.
with open('test123.txt', 'w', encoding='utf-8') as file:
    for item in filtered_synopses:
        file.write(item.split('||')[0].strip())
        file.write('\n')
# Train a Doc2Vec model on the synopses, then cluster the learned
# document vectors with KMeans and report the silhouette score.
window = 10  # Doc2Vec context window size
# One TaggedDocument per movie: words = whitespace-tokenized synopsis,
# tag = the movie name (the part before '||').
documents = [
    TaggedDocument(text.strip().split(), [name.strip()])
    for name, text in (entry.split('||') for entry in filtered_synopses)
]
model = Doc2Vec(documents, vector_size=500, window=window, min_count=5,
                workers=4, epochs=20)
# Look up each trained document vector by its movie-name tag, in input order.
# NOTE(review): `model.docvecs` is deprecated in gensim 4 (renamed `model.dv`);
# kept here for compatibility with the gensim version this script targets.
vectors = [model.docvecs[entry.split('||')[0].strip()]
           for entry in filtered_synopses]
kmeans = KMeans(n_clusters=4, random_state=0).fit(vectors)
m = metrics.silhouette_score(vectors, kmeans.labels_, metric='euclidean')
print(m, ' ', window)
# Imports for the silhouette-analysis section below (adapted from the
# scikit-learn silhouette example).
# FIX: `silhouette_score` is called unqualified below but was never
# imported; add it alongside `silhouette_samples`.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
print(__doc__)
# Generating the sample data from make_blobs
# This particular setting has one distinct cluster and 3 clusters placed close
# together.
range_n_clusters = [2, 3, 4, 5, 6]  # candidate cluster counts to evaluate
# Silhouette analysis: for each candidate cluster count, refit KMeans on the
# document vectors and plot the per-sample silhouette profile per cluster.
for n_clusters in range_n_clusters:
    # One figure per candidate k; ax2 is created but not drawn on here.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    # Silhouette coefficients lie in [-1, 1]; small left margin for labels.
    ax1.set_xlim([-0.1, 1])
    # Vertical room: all samples plus a 10-unit gap between cluster bands.
    ax1.set_ylim([0, len(vectors) + (n_clusters + 1) * 10])
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(vectors)
    # Mean silhouette over all samples for this value of k.
    silhouette_avg = silhouette_score(vectors, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    print(vectors)  # NOTE(review): debug output; consider removing
    # Per-sample silhouette coefficients.
    sample_silhouette_values = silhouette_samples(vectors, cluster_labels)
    y_lower = 10  # bottom edge of the first cluster's band
    for i in range(n_clusters):
        # Sorted silhouette values of the samples assigned to cluster i.
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        # Distinct color per cluster from the nipy_spectral colormap.
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Label each cluster band with its index at mid-height.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10  # 10-unit gap before the next band
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # Vertical reference line at the average silhouette score.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
    plt.show()