-
Notifications
You must be signed in to change notification settings - Fork 14
/
embeddings.py
executable file
·241 lines (203 loc) · 10.9 KB
/
embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/usr/bin/python3
# coding: utf-8
import argparse
import multiprocessing
import os
import random as rn
import sys
import time

import networkx as nx
import numpy as np
import tensorflow as tf
from keras import Input
from keras import backend
from keras import optimizers
from keras import regularizers
from keras.callbacks import TensorBoard, EarlyStopping
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.layers.merge import dot
from keras.models import Model
from smart_open import smart_open

import helpers
# This script trains word embeddings on pairs of words and their similarities.
# A possible source of such data is Wordnet and its shortest paths.
# Example cmd for running this script:
# python3 embeddings.py --input_file jcn-semcor.tsv.gz --vsize 300 --bsize 100 --lrate 0.001
# --vocab_file synsets_vocab.json.gz --neighbor_count 3 --use_neighbors --epochs 15
def str2bool(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` (or no type at all) treats every non-empty
    string, including ``"False"``, as true. This converter maps the usual
    spellings explicitly instead.

    :param value: a ``bool`` (returned unchanged) or a string such as
        ``"True"``, ``"false"``, ``"1"``, ``"no"``; matching is case-insensitive.
    :return: the corresponding ``bool``.
    :raises argparse.ArgumentTypeError: if the token is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string stays false, as it was under the old ``bool('')`` conversion.
    if token in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % (value,))


def build_arg_parser():
    """Build the command-line parser of the training script.

    The boolean options (``--fix_seeds``, ``--use_neighbors``, ``--regularize``)
    accept both the bare flag form (``--use_neighbors``) and an explicit value
    (``--use_neighbors True`` / ``--regularize False``).

    :return: a configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description='Learning graph embeddings with path2vec')
    parser.add_argument('--input_file', required=True,
                        help='tab-separated gzipped file '
                             'with training pairs and their similarities')
    parser.add_argument('--vsize', type=int, default=300, help='vector size')
    parser.add_argument('--bsize', type=int, default=100, help='batch size')
    parser.add_argument('--lrate', type=float, default=0.001, help='learning rate')
    # If the vocabulary file is not provided, it will be inferred from the training set
    # (can be painfully slow for large datasets)
    parser.add_argument('--vocab_file',
                        help='[optional] gzipped JSON file with the vocabulary (list of words)')
    parser.add_argument('--fix_seeds', type=str2bool, nargs='?', const=True, default=True,
                        help='fix seeds to ensure repeatability')
    parser.add_argument('--use_neighbors', type=str2bool, nargs='?', const=True, default=False,
                        help='whether or not to use the neighbor nodes-based regularizer '
                             '(currently works for the WordNet graph only)')
    parser.add_argument('--neighbor_count', type=int, default=3,
                        help='number of adjacent nodes to consider for regularization')
    parser.add_argument('--negative_count', type=int, default=3,
                        help='number of negative samples')
    parser.add_argument('--epochs', type=int, default=10, help='number of training epochs')
    parser.add_argument('--regularize', type=str2bool, nargs='?', const=True, default=False,
                        help='L1 regularization of embeddings')
    parser.add_argument('--name', default='graph_emb',
                        help='Run name, to be used in the file name')
    parser.add_argument('--l1factor', type=float, default=1e-10,
                        help='L1 regularizer coefficient')
    parser.add_argument('--beta', type=float, default=0.01,
                        help='neighbors-based regularizer first coefficient')
    parser.add_argument('--gamma', type=float, default=0.01,
                        help='neighbors-based regularizer second coefficient')
    parser.add_argument('--full_graph',
                        help='[optional] Path to an edge list file of the source '
                             'graph, used for nearest neighbors regularization. '
                             'If not present, WordNet graph is assumed.')
    parser.add_argument('--train_size', type=int,
                        help='Number of pairs in the training set '
                             '(if absent, will be calculated on the fly)')
    return parser


def count_steps(pair_count, batch_size):
    """Return how many batches make up one epoch, as an integer.

    Plain ``/`` yields a float under Python 3, whereas Keras documents
    ``steps_per_epoch`` as an integer. The result is rounded up so that a
    trailing partial batch still counts as a step.

    :param pair_count: number of training pairs.
    :param batch_size: number of pairs per batch.
    :return: ``ceil(pair_count / batch_size)`` as an ``int``.
    """
    return (pair_count + batch_size - 1) // batch_size


def output_prefix(trainfile):
    """Derive the prefix of all output names from the training file path.

    Only the file name is cut at its first dot; the directory part is kept
    intact. Splitting the whole path at the first dot breaks on inputs such as
    ``./pairs.tsv.gz`` or ``/home/user.name/pairs.tsv.gz``.

    :param trainfile: path to the training file, e.g. ``data/jcn-semcor.tsv.gz``.
    :return: the same path without the file extensions, e.g. ``data/jcn-semcor``.
    """
    directory, basename = os.path.split(trainfile)
    # Fall back to the full name for dot-files, whose first component is empty.
    stem = basename.split('.')[0] or basename
    return os.path.join(directory, stem)


if __name__ == "__main__":
    parser = build_arg_parser()
    args = parser.parse_args()

    trainfile = args.input_file  # Gzipped file with pairs and their similarities
    embedding_dimension = args.vsize  # vector size (for example, 20)
    batch_size = args.bsize  # number of pairs in a batch (for example, 10)
    learn_rate = args.lrate  # Learning rate
    neighbors_count = args.neighbor_count
    negative = args.negative_count
    run_name = args.name
    l1_factor = args.l1factor
    beta = args.beta
    gamma = args.gamma

    if args.fix_seeds:
        # fix seeds for repeatability of experiments
        np.random.seed(42)
        rn.seed(12345)
        tf.set_random_seed(2)
        # The session is restricted to a single thread per op pool as part of
        # the repeatability setup.
        session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                      inter_op_parallelism_threads=1)
        sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
        backend.set_session(sess)

    wordpairs = helpers.Wordpairs(trainfile)

    if not args.vocab_file:
        print('Building vocabulary from the training set...', file=sys.stderr)
        no_train_pairs, vocab_dict, inverted_vocabulary = helpers.build_vocabulary(wordpairs)
        print('Building vocabulary finished', file=sys.stderr)
    else:
        vocabulary_file = args.vocab_file  # JSON file with the ready-made vocabulary
        print('Loading vocabulary from file', vocabulary_file, file=sys.stderr)
        vocab_dict, inverted_vocabulary = helpers.vocab_from_file(vocabulary_file)
        if args.train_size:
            no_train_pairs = int(args.train_size)
        else:
            # No size given: make one full pass over the training set to count the pairs.
            print('Counting the number of pairs in the training set...')
            no_train_pairs = sum(1 for _ in wordpairs)
            print('Number of pairs in the training set:', no_train_pairs)

    # Optional user-supplied graph (tab-separated edge list, one edge per line).
    # It stays None when no file is given.
    full_graph = None
    if args.full_graph:
        full_graph = nx.Graph()
        # The context manager closes the reader even if parsing fails midway.
        with smart_open(args.full_graph, 'r') as reader:
            for line in reader:
                line = line.strip()
                if not line:
                    continue
                elements = line.split('\t')
                # Lines that do not have exactly two columns are skipped.
                if len(elements) == 2:
                    entity1 = elements[0].lower().strip()
                    entity2 = elements[1].lower().strip()
                    full_graph.add_edge(entity1, entity2)

    # NOTE(review): the returned map is not used anywhere below in this script;
    # the call is kept in case helpers relies on it - confirm against helpers.py.
    neighbors_dict = helpers.build_neighbors_map(vocab_dict, full_graph)

    vocab_size = len(vocab_dict)

    # valid_size = 4  # Number of random words to log their nearest neighbours after each epoch
    # valid_examples = np.random.choice(vocab_size, valid_size, replace=False)

    # But for now we will just use a couple of known WordNet pairs to log their similarities:
    # Gold similarities:
    # measure.n.02    fundamental_quantity.n.01        0.930846519882644
    # person.n.01     lover.n.03       0.22079177574204348
    # valid_examples = ['measure.n.02', 'fundamental_quantity.n.01', 'person.n.01', 'lover.n.03']

    # A single embedding layer is shared by all inputs; the L1 penalty on its
    # weights is attached only when requested (None is the Keras default).
    embeddings_regularizer = regularizers.l1(l1_factor) if args.regularize else None
    word_embedding_layer = Embedding(vocab_size, embedding_dimension, input_length=1,
                                     name='Word_embeddings',
                                     embeddings_regularizer=embeddings_regularizer)

    # Model has 2 inputs: current word index, context word index
    word_index = Input(shape=(1,), name='Word')
    context_index = Input(shape=(1,), name='Context')

    # With the neighbors-based regularizer there are additionally
    # `neighbors_count` inputs for the word and as many for the context.
    w_neighbors_indices = []
    c_neighbors_indices = []
    if args.use_neighbors:
        for n in range(neighbors_count):
            w_neighbors_indices.append(Input(shape=(1,), dtype='int32'))
            c_neighbors_indices.append(Input(shape=(1,), dtype='int32'))

    # All the inputs are processed through the embedding layer
    word_embedding = word_embedding_layer(word_index)
    word_embedding = Flatten(name='word_vector')(word_embedding)
    context_embedding = word_embedding_layer(context_index)
    context_embedding = Flatten(name='context_vector')(context_embedding)

    w_neighbor_embeds = []
    c_neighbor_embeds = []
    if args.use_neighbors:
        for n in range(neighbors_count):
            w_neighbor_embeds.append(Flatten()(word_embedding_layer(w_neighbors_indices[n])))
            c_neighbor_embeds.append(Flatten()(word_embedding_layer(c_neighbors_indices[n])))

    # The current word embedding is multiplied (dot product) with the context word embedding
    word_context_product = dot([word_embedding, context_embedding], axes=1, normalize=True,
                               name='word2context')

    # Normalized dot products of the word / the context with each of their
    # neighbors; they are handed to the custom loss below.
    reg1_output = []
    reg2_output = []
    if args.use_neighbors:
        for n in range(neighbors_count):
            reg1_output.append(dot([word_embedding, w_neighbor_embeds[n]], axes=1,
                                   normalize=True))
            reg2_output.append(dot([context_embedding, c_neighbor_embeds[n]], axes=1,
                                   normalize=True))

    # Input order: word, context, all word neighbors, then all context neighbors.
    # Both neighbor lists are empty when the regularizer is switched off.
    inputs_list = [word_index, context_index] + w_neighbors_indices + c_neighbors_indices

    # Creating the model itself...
    keras_model = Model(inputs=inputs_list, outputs=[word_context_product])

    # Assigning attributes:
    # keras_model.vexamples = valid_examples
    keras_model.ivocab = inverted_vocabulary
    keras_model.vsize = vocab_size

    adam = optimizers.Adam(lr=learn_rate)

    keras_model.compile(optimizer=adam,
                        loss=helpers.custom_loss(reg1_output, reg2_output, beta, gamma),
                        metrics=['mse'])

    # summary() prints the table itself and returns None, so it is not wrapped in print().
    keras_model.summary()
    print('Batch size:', batch_size)

    # Common prefix of the TensorBoard log directory and of the saved vectors file.
    train_name = '{}_embeddings_vsize{}_bsize{}_lr{}_nn-{}{}_reg-{}'.format(
        output_prefix(trainfile), embedding_dimension, batch_size,
        str(learn_rate).split('.')[-1], args.use_neighbors, args.neighbor_count,
        args.regularize)

    # create a secondary validation model to run our similarity checks during training
    # (in case you work with non-WordNet graph, modify this accordingly!)
    # similarity = dot([word_embedding, context_embedding], axes=1, normalize=True)
    # validation_model = Model(inputs=[word_index, context_index], outputs=[similarity])
    # sim_cb = helpers.SimilarityCallback(validation_model=validation_model)

    loss_plot = TensorBoard(log_dir=train_name + '_logs', write_graph=False)
    earlystopping = EarlyStopping(monitor='loss', min_delta=0.0001, patience=1, verbose=1,
                                  mode='auto')

    # How many times per epoch we will ask the batch generator to yield a batch?
    steps = count_steps(no_train_pairs, batch_size)

    # Let's start training!
    start = time.time()
    history = keras_model.fit_generator(
        helpers.batch_generator(wordpairs, vocab_dict, vocab_size, negative, batch_size,
                                args.use_neighbors, neighbors_count),
        callbacks=[loss_plot, earlystopping], steps_per_epoch=steps,
        epochs=args.epochs, workers=1, verbose=2)
    end = time.time()
    print('Training took:', int(end - start), 'seconds', file=sys.stderr)

    # Saving the resulting vectors:
    filename = train_name + '_' + run_name + '.vec.gz'
    helpers.save_word2vec_format(filename, vocab_dict, word_embedding_layer.get_weights()[0])

    backend.clear_session()