-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmake_word_vectors.py
53 lines (37 loc) · 1.56 KB
/
make_word_vectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#Usage : python make_word_vectors.py INPUT_FILE OUTPUT_MODEL_DIR
# Trains word2vec embeddings (skip-gram and CBOW) from a one-sentence-per-line
# corpus file and writes the vectors as text-format files into OUTPUT_MODEL_DIR.
import gensim
import os, sys, codecs
# Positional CLI arguments (raises IndexError if either is missing).
INPUT_FILE = sys.argv[1]   # corpus path: one sentence per line, whitespace-separated tokens
OUTPUT_DIR = sys.argv[2]   # directory that receives the trained vector files
# Create a Sentences iterator over the file
class MySentences(object):
    """Memory-friendly, re-iterable sentence stream over a corpus file.

    Each call to ``__iter__`` reopens the file, so gensim's word2vec can make
    the multiple training passes it needs without the corpus ever being held
    in RAM. Yields one tokenized sentence (list of str) per line.
    """
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # Explicit utf-8: the encoding was previously unspecified here, while
        # the (commented-out) in-memory loader in this script reads utf-8 —
        # this makes decoding consistent and deterministic across locales.
        with codecs.open(self.filename, 'r', encoding='utf-8') as f:
            for line in f:
                # One sentence per line; tokens split on any whitespace.
                yield line.strip().split()
# Build the corpus as a lazy, re-iterable stream so training generalises to
# huge data later, even on low-RAM machines (word2vec makes multiple passes).
sentences = MySentences(INPUT_FILE)
print(sentences)

# Train both word2vec variants on the same corpus.
# min_count=1 keeps every token because the data we have is small, for now.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` and this call
# uses the gensim 3.x API — confirm against the pinned gensim version.
model_skipgram = gensim.models.Word2Vec(sentences, min_count=1, size=100, sg=1, window=5)
print("Training skipgram done...")
model_cbow = gensim.models.Word2Vec(sentences, min_count=1, size=100, sg=0, window=5)
print("Training cbow done...")

# Save the vectors in portable text format inside OUTPUT_DIR.
# os.path.join tolerates a trailing '/' on OUTPUT_DIR, so no manual trimming.
skipgram_path = os.path.join(OUTPUT_DIR, 'skipgram_w5_100.txt')
cbow_path = os.path.join(OUTPUT_DIR, 'cbow_w5_100.txt')
model_skipgram.wv.save_word2vec_format(skipgram_path, binary=False)
model_cbow.wv.save_word2vec_format(cbow_path, binary=False)

# Sanity check: reload the skip-gram vectors we just wrote and print one
# token's vector. Previously this loaded './skipgram_w5_100.txt' from the
# current working directory instead of OUTPUT_DIR, which failed whenever the
# script was run from anywhere but the output directory itself.
new_model = gensim.models.KeyedVectors.load_word2vec_format(skipgram_path)
print(new_model['lugal'])