word2vector.py
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import matplotlib.pyplot as plt
import re
import pandas as pd
from sklearn.manifold import TSNE
# Optional: train a Word2Vec model on the IMDB Dataset instead of using the pretrained vectors loaded below.
# lines = []
# with open("IMDB Dataset.csv", "r", encoding="UTF-8") as f:
#     lines = f.readlines()
# tokenized_sentences = []
# processed = 0
# max_process_count = 5000
# for line in lines:
#     sentences = line.split(".")
#     for sentence in sentences:
#         words = re.split(r'\W', sentence)   # split on non-word characters
#         words = list(filter(None, words))   # drop empty tokens
#         tokenized_sentences.append(words)
#         processed += 1
#         if processed > max_process_count:
#             break
#     if processed > max_process_count:
#         break
# print(tokenized_sentences[0])
# model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)
# model.save("word2vec.model")
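# If the training block above is enabled, the saved model can later be reloaded and
# queried roughly like this (a sketch using the standard gensim API; 'good' is just an
# example query word, so this stays commented out alongside the block above):
# trained = Word2Vec.load("word2vec.model")
# print(trained.wv.most_similar('good', topn=5))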
print("Loading Word2Vec model, please wait.")
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# model = Word2Vec.load("GoogleNews-vectors-negative300.bin")  # would fail: the .bin file is word2vec C format, not a gensim-saved model
# model.train([["hello", "world"]], total_examples=1, epochs=1)  # further training needs a full Word2Vec model, not KeyedVectors
print("Word2Vec loading complete!")
# vector = model['yes']                      # get the numpy vector of a single word
# sims = model.most_similar('yes', topn=10)  # get the most similar words
# print(sims)
# Analogy-style query: get a list of (word, similarity) pairs close to 'yes' and
# 'absolutely' but far from 'no'.
result = model.most_similar(positive=['yes', 'absolutely'], negative=['no'], topn=100)
print(result)
result_words = [s[0] for s in result]
print(result_words)
# vocab = list(model.key_to_index)[:10]
X = model[result_words]
print(X)
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
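# Note: t-SNE is stochastic, so the 2-D layout differs between runs. An optional tweak
# (not in the original script) is to pass a fixed seed for a reproducible plot, e.g.:
# tsne = TSNE(n_components=2, random_state=42)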
df = pd.DataFrame(X_tsne, index=result_words, columns=['x', 'y'])
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y'])
for word, pos in df.iterrows():
    ax.annotate(word, pos)
plt.show()
# Cosine similarity between two individual words.
king_prince_sim = model.similarity('king', 'prince')
print(king_prince_sim)
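# Illustrative sketch (extra to the original flow): the classic word-vector analogy,
# vector('king') - vector('man') + vector('woman') should land near 'queen'.
analogy = model.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)
print(analogy)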
# result = model.most_similar(negative=['man'])
# print(result)
# Optional: interactively score user input against 'king' and 'queen'.
# vocab = list(model.index_to_key)
# while True:
#     user_input = input("Input the words: ")
#     words = re.split(r'\W', user_input)
#     words = list(filter(None, words))
#     king_score = 0
#     queen_score = 0
#     for word in words:
#         if word in vocab:
#             king_score += model.similarity('king', word)
#             queen_score += model.similarity('queen', word)
#     print("Similarity scores: King: " + str(king_score) + ", Queen: " + str(queen_score))
# print(vector)