#!/usr/bin/env python
# coding: utf-8
# # the architecture of this code is adapted from https://github.com/SNUDerek/NER_bLSTM-CRF
# # bidirectional-LSTM-CRF in Keras
#
# this is a bidirectional LSTM-CRF model for NER, inspired by:
#
# Huang, Xu, Yu: *Bidirectional LSTM-CRF Models for Sequence Tagging* (2015)
#
# ...though this is becoming a common architecture for sequence labeling in NLP.
from bert_embedding.bert import BertEmbedding
import numpy as np
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers.wrappers import Bidirectional
from keras.layers import concatenate, Input, LSTM, Dropout, Embedding
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
from gensim.models import Word2Vec
from keras_tqdm import TQDMNotebookCallback
from embedding import load_vocab
import feature_namelist
import sys
import re
import tensorflow as tf
from embeddings.elmo import ELMoEmbedding
# ### limit GPU usage for multi-GPU systems
#
# comment this out if using a single-GPU or CPU-only system
# restrict GPU usage here
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# ## define hyperparameters
# network hyperparameters
MAX_LENGTH = 30
MAX_VOCAB = 25000 # see preprocessing.ipynb
WORDEMBED_SIZE = 300 # see data_preprocessing.ipynb
POS_EMBED_SIZE = 100 # see data_preprocessing.ipynb
NPOS_EMBED_SIZE = 100 # see data_preprocessing.ipynb
HIDDEN_SIZE = 400 # LSTM Nodes/Features/Dimension
BATCH_SIZE = 64
DROPOUTRATE = 0.25
MAX_EPOCHS = 20 # max training epochs (no early stopping is configured below; see the optional callback sketch before model.fit)
# load data from npys (see preprocessing.ipynb)
print("loading data...\n")
vocab = list(np.load('../encoded/vocab.npy'))
sentence_text = list(np.load('../encoded/sentence_text.npy'))
sentence_post = list(np.load('../encoded/sentence_post.npy'))
sentence_npost = list(np.load('../encoded/sentence_npost.npy'))
sentence_ners = list(np.load('../encoded/sentence_ners.npy'))
sentence_text_idx = np.load('../encoded/sentence_text_idx.npy')
sentence_post_idx = np.load('../encoded/sentence_post_idx.npy')
sentence_npost_idx = np.load('../encoded/sentence_post_n_idx.npy')
sentence_ners_idx = np.load('../encoded/sentence_ners_idx.npy')
word2idx = np.load('../encoded/word2idx.npy').item()
idx2word = np.load('../encoded/idx2word.npy').item()
pos2idx = np.load('../encoded/pos2idx.npy').item()
npos2idx = np.load('../encoded/npos2idx.npy').item()
idx2pos = np.load('../encoded/idx2pos.npy').item()
idx2npos = np.load('../encoded/idx2npos.npy').item()
ner2idx = np.load('../encoded/ner2idx.npy').item()
idx2ner = np.load('../encoded/idx2ner.npy').item()
train_idx = np.load('../encoded/train_idx.npy')
test_idx = np.load('../encoded/test_idx.npy')
X_train_sents = np.load('../encoded/X_train_sents.npy')
X_test_sents = np.load('../encoded/X_test_sents.npy')
X_train_pos = np.load('../encoded/X_train_pos.npy')
X_train_npos = np.load('../encoded/X_train_npos.npy')
X_test_pos = np.load('../encoded/X_test_pos.npy')
X_test_npos = np.load('../encoded/X_test_npos.npy')
y_train_ner = np.load('../encoded/y_train_ner.npy')
y_test_ner = np.load('../encoded/y_test_ner.npy')
X_test_features = np.load('../encoded/X_test_features.npy')
X_train_features = np.load('../encoded/X_train_features.npy')
X_train_sents_bert = np.load('../encoded/X_train_sents_bert.npy')
X_test_sents_bert = np.load('../encoded/X_test_sents_bert.npy')
# load embedding data
w2v_vocab, _ = load_vocab('embeddings/text_mapping.json')
w2v_model = Word2Vec.load('embeddings/text_embeddings.gensimmodel')
w2v_pvocab, _ = load_vocab('embeddings/pos_mapping.json')
w2v_pmodel = Word2Vec.load('embeddings/pos_embeddings.gensimmodel')
w2v_npvocab, _ = load_vocab('embeddings/npos_mapping.json')
w2v_npmodel = Word2Vec.load('embeddings/npos_embeddings.gensimmodel')
# ## pad sequences
#
# we must pad the input and output sequences to a fixed length because of TensorFlow's static-graph representation.
# zero-pad the sequences to max length
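# e.g. (an illustration of the Keras API only):
#   sequence.pad_sequences([[3, 1, 2]], maxlen=5, padding='post', truncating='post')
#   -> array([[3, 1, 2, 0, 0]]); sequences longer than maxlen are truncated from the end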
print("zero-padding sequences...\n")
X_train_sents = sequence.pad_sequences(X_train_sents, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_sents = sequence.pad_sequences(X_test_sents, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_train_pos = sequence.pad_sequences(X_train_pos, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_pos = sequence.pad_sequences(X_test_pos, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_train_npos = sequence.pad_sequences(X_train_npos, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_npos = sequence.pad_sequences(X_test_npos, maxlen=MAX_LENGTH, truncating='post', padding='post')
y_train_ner = sequence.pad_sequences(y_train_ner, maxlen=MAX_LENGTH, truncating='post', padding='post')
y_test_ner = sequence.pad_sequences(y_test_ner, maxlen=MAX_LENGTH, truncating='post', padding='post')
print(X_train_pos.shape)
print(X_train_features.shape)
X_train_features = sequence.pad_sequences(X_train_features, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_features = sequence.pad_sequences(X_test_features, maxlen=MAX_LENGTH, truncating='post', padding='post')
print(X_train_features.shape)
print(X_train_sents_bert.shape)
X_test_sents_bert = sequence.pad_sequences(X_test_sents_bert, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_train_sents_bert = sequence.pad_sequences(X_train_sents_bert, maxlen=MAX_LENGTH, truncating='post', padding='post')
print(X_train_sents_bert.shape)
# expand X_features to (samples, MAX_LENGTH, 1) so it can serve as the auxiliary input
X_train_features = np.expand_dims(X_train_features, axis=2)
X_test_features = np.expand_dims(X_test_features, axis=2)
# get the sizes of the pos-tag, nltk pos-tag and ner-tag vocabularies
TAG_VOCAB = len(list(idx2pos.keys()))
NTAG_VOCAB = len(list(idx2npos.keys()))
NER_VOCAB = len(list(idx2ner.keys()))
# reshape labels to (samples, MAX_LENGTH, 1) for the CRF layer (sparse_target=True)
y_train_ner = y_train_ner[:, :, np.newaxis]
y_test_ner = y_test_ner[:, :, np.newaxis]
# ## pre-load the pretrained embeddings
#
# As shown in previous studies such as Ma & Hovy (2016), initializing the embedding layer with pretrained vectors improves network performance. Here we initialize each embedding matrix to zeros, then copy in vectors from the pretrained model (when available; a word may be missing depending on the `Word2Vec` vocabulary parameters).
# create embedding matrices from custom pretrained word2vec embeddings
word_embedding_matrix = np.zeros((MAX_VOCAB, WORDEMBED_SIZE))
c = 0
for word in word2idx.keys():
    # get the word vector from the embedding model
    # if it's there (check against vocab list)
    if word in w2v_vocab:
        c += 1
        # get the word vector
        word_vector = w2v_model[word]
        # slot it in at the proper index
        word_embedding_matrix[word2idx[word]] = word_vector
print("added", c, "vectors")
pos_embedding_matrix = np.zeros((TAG_VOCAB, POS_EMBED_SIZE))
c = 0
for word in pos2idx.keys():
    # get the word vector from the embedding model
    # if it's there (check against vocab list)
    if word in w2v_pvocab:
        c += 1
        # get the word vector
        word_vector = w2v_pmodel[word]
        # slot it in at the proper index
        pos_embedding_matrix[pos2idx[word]] = word_vector
print("added", c, "vectors")
npos_embedding_matrix = np.zeros((NTAG_VOCAB, NPOS_EMBED_SIZE))
c = 0
for word in npos2idx.keys():
    # get the word vector from the npos embedding model
    # if it's there (check against vocab list)
    if word in w2v_npvocab:
        c += 1
        # get the word vector
        word_vector = w2v_npmodel[word]
        # slot it in at the proper index
        npos_embedding_matrix[npos2idx[word]] = word_vector
print("added", c, "vectors")
# define model
# text layers : dense embedding > dropout > bi-LSTM
txt_input = Input(shape=(MAX_LENGTH,), name='txt_input')
txt_embed = Embedding(MAX_VOCAB, WORDEMBED_SIZE, input_length=MAX_LENGTH,
                      weights=[word_embedding_matrix],
                      name='txt_embedding', trainable=True, mask_zero=True)(txt_input)
txt_drpot = Dropout(DROPOUTRATE, name='txt_dropout')(txt_embed)
# pos layers : dense embedding > dropout > bi-LSTM
pos_input = Input(shape=(MAX_LENGTH,), name='pos_input')
pos_embed = Embedding(TAG_VOCAB, POS_EMBED_SIZE, input_length=MAX_LENGTH,
                      weights=[pos_embedding_matrix],
                      name='pos_embedding', trainable=True, mask_zero=True)(pos_input)
pos_drpot = Dropout(DROPOUTRATE, name='pos_dropout')(pos_embed)
# nltk pos layers : dense embedding > dropout > bi-LSTM
npos_input = Input(shape=(MAX_LENGTH,), name='npos_input')
npos_embed = Embedding(NTAG_VOCAB, NPOS_EMBED_SIZE, input_length=MAX_LENGTH,
                       weights=[npos_embedding_matrix],
                       name='npos_embedding', trainable=True, mask_zero=True)(npos_input)
npos_drpot = Dropout(DROPOUTRATE, name='npos_dropout')(npos_embed)
# bert layer (defined here but not connected to the model graph below;
# X_train_sents_bert / X_test_sents_bert are likewise loaded but unused)
bert_input = Input(shape=(MAX_LENGTH, 768), name='bert_input')
bert_drpot = Dropout(DROPOUTRATE, name='bert_drpot')(bert_input)
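# a hedged sketch (not done in this script) of how the BERT features above could be used:
# concatenate bert_drpot into the merge layer further down, e.g.
#   mrg_cncat = concatenate([mrg_lstml, txt_drpot, bert_drpot, auxiliary_input], axis=2)
# and add X_train_sents_bert / X_test_sents_bert to the model.fit / model.predict input lists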
# ELMo layer
emlo_input = Input(shape=(MAX_LENGTH,), dtype=tf.int64, name='emlo_input')
emlo_embed = ELMoEmbedding(idx2word=idx2word,
                           output_mode="elmo", name='emlo_embedding', trainable=True)(emlo_input)  # these two are interchangeable
# sentence_embedding = Embedding(len(idx2word), 1024, input_length=MAX_SEQUENCE_LENGTH, trainable=False)(sentence_input)  # these two are interchangeable
# add auxiliary layer
auxiliary_input = Input(shape=(MAX_LENGTH,1), name='aux_input') #(None, 30, 1)
# merged layers : merge (concat, average...) word and pos > bi-LSTM > bi-LSTM
mrg_cncat = concatenate([emlo_embed, pos_drpot], axis=2)
mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='mrg_bidirectional_1')(mrg_cncat)
# extra LSTM layer, if wanted; dropout sits between the two bi-LSTMs
mrg_drpot = Dropout(DROPOUTRATE, name='mrg_dropout')(mrg_lstml)
mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='mrg_bidirectional_2')(mrg_drpot)
# merge the BLSTM output and the external inputs
# mrg_cncat = concatenate([mrg_lstml, txt_drpot, npos_drpot, auxiliary_input], axis=2)
mrg_cncat = concatenate([mrg_lstml, txt_drpot, auxiliary_input], axis=2)
# final linear chain CRF layer
crf = CRF(NER_VOCAB, sparse_target=True)
mrg_chain = crf(mrg_cncat)
# note: with the npos-free concatenation above, npos_input / npos_drpot do not feed the output;
# the commented alternatives (here and at mrg_cncat) show the npos-enabled variant
model = Model(inputs=[txt_input, emlo_input, pos_input, npos_input, auxiliary_input], outputs=mrg_chain)
# model = Model(inputs=[txt_input, emlo_input, pos_input, auxiliary_input], outputs=mrg_chain)
model.compile(optimizer='adam',
              loss=crf.loss_function,
              metrics=[crf.accuracy])
model.summary()
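# optional (a hedged sketch, not in the original run): the hyperparameter section mentions an
# early-stop condition, but none is configured; one could be added like this, with the
# monitor/patience values below being assumptions rather than tuned choices.
# from keras.callbacks import EarlyStopping
# early_stop = EarlyStopping(monitor='loss', patience=3)
# ...and then pass callbacks=[early_stop] to model.fit() below.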
history = model.fit([X_train_sents, X_train_sents, X_train_pos, X_train_npos, X_train_features], y_train_ner,
                    batch_size=BATCH_SIZE,
                    epochs=MAX_EPOCHS,
                    verbose=2)
hist_dict = history.history
# save the model
# because we are using keras-contrib, we must save the weights this way and load them back
# into an identically built network (see decoding.ipynb)
save_load_utils.save_all_weights(model, '../model/nltkposcrf_model.h5')
np.save('../model/nltkhist_dict.npy', hist_dict)
print("models saved!\n")
preds = model.predict([X_test_sents, X_test_sents, X_test_pos, X_test_npos, X_test_features])
preds = np.argmax(preds, axis=-1)
print(preds.shape)
print(preds[:5])
trues = np.squeeze(y_test_ner, axis=-1)
print(trues.shape)
s_preds = [[idx2ner[t] for t in s] for s in preds]
s_trues = [[idx2ner[t] for t in s] for s in trues]
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
def bio_classification_report(y_true, y_pred):
    """
    from scrapinghub's python-crfsuite example
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
    tagset = set(lb.classes_) - {'O', 'PAD'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
print(bio_classification_report(s_trues, s_preds))
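# the confusion_matrix import above is otherwise unused; this token-level view is an added
# hedged sketch, not part of the original evaluation
flat_true = list(chain.from_iterable(s_trues))
flat_pred = list(chain.from_iterable(s_preds))
label_order = sorted(set(flat_true))
print(label_order)
print(confusion_matrix(flat_true, flat_pred, labels=label_order))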