inputHandler.py
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec
import numpy as np
import gc
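
# NOTE: these imports assume standalone Keras 2.x (`keras.preprocessing`) and
# gensim 3.x. gensim 4.x renamed the Word2Vec `size` argument used below to
# `vector_size`, and newer Keras/TensorFlow releases have moved the
# preprocessing utilities out of `keras.preprocessing`.
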
def train_word2vec(documents, embedding_dim):
    """
    Train a Word2Vec model over the training documents.
    Args:
        documents (list): list of tokenized documents (each a list of tokens)
        embedding_dim (int): output word vector size
    Returns:
        word_vectors (gensim KeyedVectors): dict-like mapping from words to their vectors
    """
    model = Word2Vec(documents, min_count=1, size=embedding_dim)
    word_vectors = model.wv
    del model
    return word_vectors
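
# Illustrative usage sketch (hypothetical toy corpus, not called anywhere in
# this module): train_word2vec expects pre-tokenized documents, i.e. lists of
# tokens rather than raw strings.
def _demo_train_word2vec():
    docs = [["what", "is", "your", "age"],
            ["how", "old", "are", "you"],
            ["where", "do", "you", "live"]]
    word_vectors = train_word2vec(docs, embedding_dim=50)
    print(word_vectors["you"].shape)  # -> (50,)
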
def create_embedding_matrix(tokenizer, word_vectors, embedding_dim):
    """
    Create an embedding matrix mapping word indexes to their respective vectors.
    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object containing word indexes
        word_vectors (dict): dict-like mapping from words to their vectors
        embedding_dim (int): dimension of the word vectors
    Returns:
        embedding_matrix (np.ndarray): matrix of shape (vocab_size + 1, embedding_dim),
                                       where row i holds the vector of the word with index i
    """
    nb_words = len(tokenizer.word_index) + 1
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    print("Embedding matrix shape: %s" % str(embedding_matrix.shape))
    for word, i in word_index.items():
        try:
            embedding_vector = word_vectors[word]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        except KeyError:
            print("vector not found for word - %s" % word)
    print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix
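
# Illustrative sketch (an assumption about downstream use, not part of this
# module): the matrix returned above is typically handed to a Keras Embedding
# layer as frozen initial weights, with the row count and vector size read off
# its shape.
def _demo_embedding_layer(embedding_matrix, max_sequence_length):
    from keras.layers import Embedding
    return Embedding(input_dim=embedding_matrix.shape[0],
                     output_dim=embedding_matrix.shape[1],
                     weights=[embedding_matrix],
                     input_length=max_sequence_length,
                     trainable=False)
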
def word_embed_meta_data(documents, embedding_dim):
    """
    Build the tokenizer and embedding matrix for the given documents.
    Args:
        documents (list): list of documents (raw strings)
        embedding_dim (int): embedding dimension
    Returns:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        embedding_matrix (np.ndarray): matrix mapping word indexes to their vectors
    """
    documents = [x.lower().split() for x in documents]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    word_vector = train_word2vec(documents, embedding_dim)
    embedding_matrix = create_embedding_matrix(tokenizer, word_vector, embedding_dim)
    del word_vector
    gc.collect()
    return tokenizer, embedding_matrix

def create_train_dev_set(tokenizer, sentences_pair, is_similar, max_sequence_length, validation_split_ratio):
    """
    Create the training and validation datasets.
    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        sentences_pair (list): list of tuples of sentence pairs
        is_similar (list): labels indicating whether the respective sentences in sentence1 and sentence2
                           are the same or not (1 if same, else 0)
        max_sequence_length (int): max sequence length to which sentences are padded
        validation_split_ratio (float): fraction of the training data held out as validation data
    Returns:
        train_data_1 (np.array): input features for the training set from sentences1
        train_data_2 (np.array): input features for the training set from sentences2
        labels_train (np.array): similarity labels for the training data
        leaks_train (np.array): leak features for the training data
        val_data_1 (np.array): input features for the validation set from sentences1
        val_data_2 (np.array): input features for the validation set from sentences2
        labels_val (np.array): similarity labels for the validation data
        leaks_val (np.array): leak features for the validation data
    """
    sentences1 = [x[0].lower() for x in sentences_pair]
    sentences2 = [x[1].lower() for x in sentences_pair]
    train_sequences_1 = tokenizer.texts_to_sequences(sentences1)
    train_sequences_2 = tokenizer.texts_to_sequences(sentences2)
    # Hand-crafted "leak" features per pair: number of unique tokens in each
    # sentence and the size of their overlap.
    leaks = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
             for x1, x2 in zip(train_sequences_1, train_sequences_2)]

    train_padded_data_1 = pad_sequences(train_sequences_1, maxlen=max_sequence_length)
    train_padded_data_2 = pad_sequences(train_sequences_2, maxlen=max_sequence_length)
    train_labels = np.array(is_similar)
    leaks = np.array(leaks)

    # Shuffle, then carve the last `dev_idx` examples out as the validation split.
    shuffle_indices = np.random.permutation(np.arange(len(train_labels)))
    train_data_1_shuffled = train_padded_data_1[shuffle_indices]
    train_data_2_shuffled = train_padded_data_2[shuffle_indices]
    train_labels_shuffled = train_labels[shuffle_indices]
    leaks_shuffled = leaks[shuffle_indices]
    dev_idx = max(1, int(len(train_labels_shuffled) * validation_split_ratio))

    del train_padded_data_1
    del train_padded_data_2
    gc.collect()

    train_data_1, val_data_1 = train_data_1_shuffled[:-dev_idx], train_data_1_shuffled[-dev_idx:]
    train_data_2, val_data_2 = train_data_2_shuffled[:-dev_idx], train_data_2_shuffled[-dev_idx:]
    labels_train, labels_val = train_labels_shuffled[:-dev_idx], train_labels_shuffled[-dev_idx:]
    leaks_train, leaks_val = leaks_shuffled[:-dev_idx], leaks_shuffled[-dev_idx:]

    return train_data_1, train_data_2, labels_train, leaks_train, val_data_1, val_data_2, labels_val, leaks_val
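
# Illustrative sketch (an assumption about the consuming model, not part of
# this module): a two-branch model with an extra leak-feature input would
# consume the eight arrays returned above roughly like this, where `model` is
# a hypothetical compiled Keras model taking three inputs.
def _demo_fit(model, tokenizer, sentences_pair, is_similar,
              max_sequence_length=20, validation_split_ratio=0.1):
    (train_data_1, train_data_2, labels_train, leaks_train,
     val_data_1, val_data_2, labels_val, leaks_val) = create_train_dev_set(
        tokenizer, sentences_pair, is_similar, max_sequence_length, validation_split_ratio)
    model.fit([train_data_1, train_data_2, leaks_train], labels_train,
              validation_data=([val_data_1, val_data_2, leaks_val], labels_val),
              epochs=5, batch_size=64)
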
def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
    """
    Create the test dataset.
    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        test_sentences_pair (list): list of tuples of test sentence pairs
        max_sequence_length (int): max sequence length to which sentences are padded
    Returns:
        test_data_1 (np.array): input features for the test set from sentences1
        test_data_2 (np.array): input features for the test set from sentences2
        leaks_test (np.array): leak features for the test data
    """
    test_sentences1 = [x[0].lower() for x in test_sentences_pair]
    test_sentences2 = [x[1].lower() for x in test_sentences_pair]

    test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sequences_1, test_sequences_2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test
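
if __name__ == "__main__":
    # Small smoke test on hypothetical toy data (not from the original file):
    # builds the tokenizer and embedding matrix, then the train/dev and test
    # splits, and prints the resulting shapes.
    pairs = [("how old are you", "what is your age"),
             ("where do you live", "what is your address"),
             ("how old are you", "where do you live"),
             ("what is your age", "what is your address")]
    labels = [1, 1, 0, 0]
    sentences = [s for pair in pairs for s in pair]

    tok, emb_matrix = word_embed_meta_data(sentences, embedding_dim=50)
    print("vocab size:", len(tok.word_index), "embedding matrix:", emb_matrix.shape)

    (x1_train, x2_train, y_train, leaks_train,
     x1_val, x2_val, y_val, leaks_val) = create_train_dev_set(
        tok, pairs, labels, max_sequence_length=10, validation_split_ratio=0.25)
    print("train:", x1_train.shape, "val:", x1_val.shape, "leaks:", leaks_train.shape)

    x1_test, x2_test, leaks_test = create_test_data(tok, pairs, max_sequence_length=10)
    print("test:", x1_test.shape, leaks_test.shape)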