# import your libraries here
import nltk
from gensim.models import Word2Vec
import re
import gensim
import codecs
import pandas as pd
import numpy as np
import csv
# Importing utility functions from Keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import Sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.layers import Embedding
from keras import metrics
################ README - NEURAL NETWORK MODEL IMPLEMENTATION ###################
"""
Authors: Rae Johanek & Christina Pathrose
CS4120 Final Project
This file builds and trains the Keras Sequential model. To run, use the following command format:
./keras_model.py
The dataset used for this project features poems from the Poetry Foundation and can be found here:
https://www.kaggle.com/tgdivy/poetry-foundation-poems
This file requires the following:
1. Poetry dataset (PoetryFoundationData.csv)
2. Pretrained word embeddings file (GoogleNews-vectors-negative300.bin.gz)
"""
########################### IMPLEMENTATION ######################################
# CONSTANTS:
NGRAM = 3  # size of the n-gram language model to train
EMBEDDINGS_SIZE = 200
# PRE-PROCESSING:
START = "<s>"
END = "</s>"
sent_delim=" . "
line_delim="\n"
def tokenize(sentences):
"""
    Break each of the given sentences into tokens and normalize the text.
"""
sentences_tokenized = []
# lowercase the text, remove unwanted punctuation, tokenize each sentence
# and add the list of tokens to the list of tokenized sentences
    for sent in sentences:
# pre-process to remove punctuation
sent_no_punc = re.sub('[!?.,;]', '', sent)
# tokenize
sent_tokens = sent_no_punc.lower().split()
# add start and end tokens
sent_tokens.insert(0, START)
sent_tokens.append(END)
# add the list of tokens to the list of sentences
sentences_tokenized.append(sent_tokens)
return sentences_tokenized
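# Illustrative example of tokenize (assumed input and output):
#   tokenize(["Dog bone, stapler."])
#   -> [["<s>", "dog", "bone", "stapler", "</s>"]]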
def pre_pend(sentences):
""" Add another sentence start token to the beginning of each sentence"""
pre_pended = []
for sentence in sentences:
new = sentence.copy()
new.insert(0, START)
pre_pended.append(new)
return pre_pended
# read in data from csv file
def pre_process(filePath, n):
    """Read the poems from the csv file and return them as tokenized sentences."""
    data = pd.read_csv(filePath)
poems = []
for row in data.iterrows():
# row[0] is the id
# row[1] is the info: index at [0], title at [1], poem at [2], poet at [3], and tags at [4]
poems.append(row[1][2])
poems_joined_sentences = sent_delim.join(poems)
poems_joined_lines = line_delim.join(poems)
# example of line: Dog bone, stapler,
lines = tokenize(poems_joined_sentences.split('\n'))
lines = [x for x in lines if len(x) > 2]
# example of sentence: Dog bone, stapler, cribbage board, garlic press because this window is loose—lacks suction, lacks grip
sentences = tokenize(re.split('[.!?]', poems_joined_lines))
sentences = [x for x in sentences if len(x) > 2]
return sentences
# Initializing a Tokenizer
# It is used to vectorize a text corpus. Here, it just creates a mapping from word to a unique index.
# (Note: Keras Tokenizer indexing starts from 1)
# given a list of tokenized sentences, returns a word-to-index map and its inverse
def encode_text(text):
tokenizer = Tokenizer()
# generate sequences for all words in text
tokenizer.fit_on_texts(text)
encoded = tokenizer.texts_to_sequences(text)
# get word to sequence map
word_to_encoded_map = tokenizer.word_index
    encoded_to_word_map = {v: k for k, v in word_to_encoded_map.items()}
return (word_to_encoded_map, encoded_to_word_map)
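# Illustrative example of encode_text (assumed behavior; actual indices depend on word frequency):
#   word_to_idx, idx_to_word = encode_text([["<s>", "dog", "bone", "</s>"]])
#   word_to_idx is a dict such as {"<s>": 1, "dog": 2, ...}; idx_to_word inverts it.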
def make_n_grams(tokens):
"""Creates tuples of n length representing the n-grams in the given tokens
Parameters:
tokens (list of str): the list of individual tokens to be grouped into n-grams
Returns:
n_grams (list of tuples of str): the list of n-grams created from the given tokens
"""
n_grams = []
if NGRAM == 1 :
return tokens
else:
for i in range(NGRAM - 1, len(tokens)):
n_gram = [tokens[j] for j in range(i - (NGRAM - 1), i + 1)]
n_grams.append(tuple(n_gram))
return n_grams
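# Illustrative example of make_n_grams with NGRAM = 3:
#   make_n_grams(["<s>", "dog", "bone", "stapler", "</s>"])
#   -> [("<s>", "dog", "bone"), ("dog", "bone", "stapler"), ("bone", "stapler", "</s>")]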
def generate_training_samples(word_to_encoded, data):
    '''
    Takes the word-to-index map and the tokenized sentences and
    generates the training samples out of them.
    Parameters:
        word_to_encoded: map from word to index (not used directly here)
        data: the list of tokenized sentences
    return:
        list of lists in the format [[all n-1 grams with last words excluded], [all last words of the n-grams]]
    '''
# list of all n-grams for the text in the encoded list
data_as_one_list = [token for sublist in data for token in sublist]
all_ngrams = make_n_grams(data_as_one_list)
all_n_minus_1_grams = []
all_last_word_in_grams = []
for ngram in all_ngrams:
all_n_minus_1_grams.append(ngram[:-1])
all_last_word_in_grams.append(ngram[-1])
return [all_n_minus_1_grams, all_last_word_in_grams]
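# Illustrative example of generate_training_samples (assumed behavior) with NGRAM = 3:
#   X, y = generate_training_samples(word_to_idx, [["<s>", "dog", "bone", "</s>"]])
#   X -> [("<s>", "dog"), ("dog", "bone")]
#   y -> ["bone", "</s>"]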
def read_embeddings(model_poems):
    '''Loads the embeddings trained earlier.
    Parameters:
        model_poems: the Word2Vec model trained on the poems
    Returns:
        the word-to-embedding map (KeyedVectors) of the model
    '''
    wv_poem = model_poems.wv
    return wv_poem
def pre_process_words_to_embeddings(embeddings, words):
""" Conver word lists to a new concatenated embedding represnting the embedding of each word"""
word_embeddings = []
for word in words:
word_embeddings.append((embeddings[word]).ravel())
return np.concatenate(word_embeddings)
################ DATAGENERATOR CLASS ###################
class DataGenerator(Sequence):
'Generates data for Keras'
def __init__(self, X, y, batch_size, embeddings, word_to_encoded):
'Initialization'
        self.batch_size = batch_size
self.outputs = y
self.inputs = X
self.embeddings = embeddings
self.word_to_encoded = word_to_encoded
self.on_epoch_end()
def __len__(self):
'Denotes the number of batches per epoch'
return int(np.floor(len(self.inputs) / self.batch_size))
def __getitem__(self, index):
'Generate one batch of data'
# Generate indexes of the batch
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
# Find list of X and y values
list_X = [self.inputs[j] for j in indexes]
list_y = [self.outputs[k] for k in indexes]
# Generate data
return self.__data_generation(list_X, list_y)
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.inputs))
    def __data_generation(self, X: list, y: list):
#X : n-1 grams
#y: last token in the gram
vocab_size = len(self.word_to_encoded)
n_1_grams_embeddings = np.empty((self.batch_size, EMBEDDINGS_SIZE*(NGRAM - 1)))
last_tokens = np.empty((self.batch_size))
for i in range(len(X)):
words = X[i]
word_embeddings = []
for word in words:
word_embeddings.append((self.embeddings[word]).ravel())
concatenated = np.concatenate(word_embeddings)
n_1_grams_embeddings[i] = concatenated
            # shift the 1-based Keras Tokenizer index to 0-based so the one-hot width matches vocab_size
            last_tokens[i] = self.word_to_encoded[y[i]] - 1
one_hot = to_categorical(last_tokens, vocab_size)
return n_1_grams_embeddings, one_hot
# The DataGenerator above plays the role of a Python generator for model.fit:
# it yields batches of concatenated embeddings together with the labels that
# go with them, encoded as one-hot vectors (see the to_categorical function).
# https://wiki.python.org/moin/Generators
# https://realpython.com/introduction-to-python-generators/
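# Illustrative example (assuming NGRAM = 3, EMBEDDINGS_SIZE = 200, and a batch size of 1024):
#   gen = DataGenerator(X, y, 1024, embeddings, word_to_idx)
#   batch_X, batch_y = gen[0]
#   batch_X.shape -> (1024, 400)          # (batch_size, (NGRAM - 1) * EMBEDDINGS_SIZE)
#   batch_y.shape -> (1024, vocab_size)   # one-hot labels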
# makes the neural network language model
def make_neural_language(embedding_matrix):
model = Sequential()
# hidden layer
model.add(Dense(500, activation="relu", input_dim=EMBEDDINGS_SIZE*(NGRAM - 1)))
# output layer
model.add(Dense(len(embedding_matrix), activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam",
metrics=["accuracy"])
    model.summary()
return model
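# The model maps the concatenated embeddings of the previous NGRAM - 1 words
# (a 400-dimensional input for NGRAM = 3 and EMBEDDINGS_SIZE = 200) to a softmax
# distribution over the vocabulary, so each training target is a one-hot vector.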
# generate a sequence from the model
def generate_seq(model: Sequential,
embeddings,
seed: list,
length: int):
    '''
    Parameters:
        model: the trained neural network
        embeddings: the word embeddings (KeyedVectors)
        seed: [w1, w2, ... w(n-1)], the words to start generating from
        length: the number of words to generate
    Returns: string sentence
    '''
# seed: initial list of words or could just be [<s>]
# generate next word(s) in sequence
# given current sequence, use model to predict what comes next
# take input, run it through the model (apply nonlinearity),
starting = np.expand_dims(pre_process_words_to_embeddings(embeddings, seed), axis=0)
    previous_word = seed[-1]
sentence = []
while len(sentence) < length:
prob_dist = model.predict(starting).ravel()
new_word = np.random.choice(embeddings.index_to_key, p=prob_dist)
        # don't include newline or sentence boundary tokens
        while new_word in ("\n", START, END):
            new_word = np.random.choice(embeddings.index_to_key, p=prob_dist)
sentence.append(new_word)
starting = np.expand_dims(pre_process_words_to_embeddings(embeddings, [previous_word, new_word]), axis=0)
previous_word = new_word
return " ".join(sentence)
# generates sentences given model, embeddings, number of sentences, sentence length:
def generate_sentences(model, embeddings, num_sentences=50, sentence_length=20):
""" create the the specified number of sentences at the specified length"""
sentences = []
for i in range(num_sentences):
sentences.append(generate_seq(model, embeddings, ["splashing", "boiling"], sentence_length))
return sentences
######################## main method for training ##################################
def main():
"""
Train and save the Neural Network Model
"""
# PRE-PROCESS the training data:
training_data = pre_process("PoetryFoundationData.csv", 3)
# Create Word Embeddings with Training Data:
model_poems = Word2Vec(sentences=training_data, vector_size=EMBEDDINGS_SIZE, window=5, min_count=1, sg=1)
# save Word2Vec model:
model_poems.save("word2vec.model")
# print out the vocabulary size
#print('Vocab size {}'.format(len(model_poems.wv)))
# PRE-TRAINED WORD EMBEDDINGS:
W2V_PATH="GoogleNews-vectors-negative300.bin.gz"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)
model_w2v.save("pretrained_embeddings.model")
(word_to_encoded, encoded_to_word) = encode_text(training_data)
    # [X: the (n-1)-gram word tuples, y: the last word of each n-gram]
samples = generate_training_samples(word_to_encoded, training_data)
    embeddings = read_embeddings(model_poems)
    num_sequences_per_batch = 1024  # batch size
steps_per_epoch = len(samples[0])//num_sequences_per_batch # Number of batches per epoch
#print(steps_per_epoch)
train_generator = DataGenerator(samples[0], samples[1], num_sequences_per_batch, embeddings, word_to_encoded)
sample1 = train_generator.__getitem__(0) # this is how you get data out of generators
    print(sample1[0].shape) # (batch_size, (NGRAM - 1) * EMBEDDINGS_SIZE) = (1024, 400)
# code to create a feedforward neural language model
# with a set of given word embeddings
nn_model_poems = make_neural_language(embeddings)
# Start training the model
nn_model_poems.fit(train_generator,
steps_per_epoch=steps_per_epoch,
epochs=1)
#nn_model_poems.save("./poems_model")
if __name__ == '__main__':
main()