-
Notifications
You must be signed in to change notification settings - Fork 2
/
kmeansclass.py
75 lines (58 loc) · 2.15 KB
/
kmeansclass.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from nltk.cluster import KMeansClusterer
import nltk
from sklearn import cluster
from sklearn import metrics
from inspect import getmembers, isfunction
# Toy training corpus. Each entry is written as a plain sentence and split on
# whitespace, giving one list of word tokens per sentence.
sentences = [
    text.split()
    for text in (
        "this is the good machine learning book",
        "this is another book",
        "one more book",
        "this is the new post",
        "this is about machine learning post",
        "and this is the last post",
    )
]
# Tokens of a sentence that is not part of the training corpus.
new_sentence = "another machine learning book".split()
# Train a Word2Vec model on the toy corpus. min_count=1 is passed so that
# words occurring only once are not dropped from the vocabulary.
model = Word2Vec(sentences, min_count=1)
print(model)
print(getmembers(model))

# Word2Vec has no infer_vector() (that is a Doc2Vec method), so the new
# sentence is represented by the mean of its word vectors instead.
# Indexing model.wv with a list of words returns one row per word; a word
# missing from the training vocabulary would raise KeyError here.
new_tokens = model.wv[new_sentence].mean(axis=0)
print("most similar sentence")
# Lists the vocabulary words whose vectors are closest to the sentence vector.
print(model.wv.similar_by_vector(new_tokens))

# Word-level queries go through model.wv (the KeyedVectors object); the
# model-level shortcuts model.similarity / model.most_similar / model[...]
# are not available on current gensim releases.
print(model.wv.similarity('this', 'is'))
print(model.wv.similarity('post', 'book'))
print("Most similar example:")
print(model.wv.most_similar(positive=['machine'], negative=[], topn=2))
print("the")
print(model.wv['the'])
# Vocabulary words in the model's index order. gensim >= 4 exposes the list
# as index_to_key; gensim 3.x called it index2word.
try:
    words = list(model.wv.index_to_key)
except AttributeError:
    words = list(model.wv.index2word)

# Matrix of word vectors built from `words`, so row i of X is by construction
# the vector of words[i]. X is a plain array and carries no vocabulary itself.
X = model.wv[words]

print("vocab X")
# Mapping from each word to its row index in X.
print({word: row for row, word in enumerate(words)})
print("vocab model.wv.vocab")
print(words)
print(len(words))

NUM_CLUSTERS = 3
# NLTK k-means using cosine distance; repeats=25 asks the clusterer for
# 25 randomised trials.
kclusterer = KMeansClusterer(
    NUM_CLUSTERS,
    distance=nltk.cluster.util.cosine_distance,
    repeats=25,
)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
print("assigned_clusters")
print(assigned_clusters)

# Report the cluster assigned to every vocabulary word.
for word, cluster_id in zip(words, assigned_clusters):
    print(word + ":" + str(cluster_id))
# Cluster the same vectors a second time, now with scikit-learn's KMeans.
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
kmeans.fit(X)
labels, centroids = kmeans.labels_, kmeans.cluster_centers_

# Print each fitted attribute under its heading.
for heading, value in (
    ("Cluster id labels for inputted data", labels),
    ("Centroids data", centroids),
):
    print(heading)
    print(value)

print("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print(kmeans.score(X))

# Silhouette score of the labelling, computed with Euclidean distance.
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
print("Silhouette_score: ")
print(silhouette_score)