-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathformat_data.py
125 lines (112 loc) · 5.84 KB
/
format_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import copy
import numpy as np
import tensorflow as tf
def get_one_hot_vector(position, size):
    """Return a plain-Python one-hot list of length `size` with a 1 at index `position`."""
    one_hot = [0] * size
    one_hot[position] = 1
    return one_hot
def format_data(data_list, seq_width, src2id, target2id):
    """
    Convert sentences of (source, target) word pairs into padded id matrices.

    :param data_list: list of sentences; each sentence is a sequence of
        (source_word, target_word) pairs
    :param seq_width: fixed sequence length; longer sentences are truncated,
        shorter ones are zero-padded
    :param src2id: mapping from source words to ids (must contain "UNK")
    :param target2id: mapping from target words to ids (must contain "UNK")
    :return: triple (input_data, labels, seq_length) of numpy arrays with
        shapes [N, seq_width], [N, seq_width] and [N] respectively, where
        seq_length holds the pre-padding length of each sentence
    """
    input_data = np.empty([len(data_list), seq_width], dtype=int)
    labels = np.empty([len(data_list), seq_width], dtype=int)
    seq_length = np.empty([len(data_list)], dtype=int)
    for count, sent in enumerate(data_list):
        # Truncate to the matrix width. (A hard-coded cap of 50 previously
        # produced negative padding — and a shape error — for seq_width < 50.)
        if len(sent) > seq_width:
            sent = sent[:seq_width]
        # Source-word ids, unknown words mapped to "UNK", zero-padded to seq_width.
        input_padded = [src2id.get(word, src2id["UNK"]) for word, _ in sent] \
            + (seq_width - len(sent)) * [0]
        input_data[count] = np.asarray(input_padded)
        # Target-word ids (second element of each pair), same padding scheme.
        labels_padded = [target2id.get(word, target2id["UNK"]) for _, word in sent] \
            + (seq_width - len(sent)) * [0]
        labels[count] = np.asarray(labels_padded)
        seq_length[count] = len(sent)
    return input_data, labels, seq_length
def format_data_fullsoftmax(data_list, seq_width, src2id, target2id):
    """
    Like format_data, but labels are one-hot distributions over the target vocab.

    :param data_list: list of sentences; each sentence is a sequence of
        (source_word, target_word) pairs
    :param seq_width: fixed sequence length; longer sentences are truncated,
        shorter ones are padded with all-zero distributions
    :param src2id: mapping from source words to ids (must contain "UNK")
    :param target2id: mapping from target words to ids
    :return: triple (input_data, labels, seq_length) with shapes
        [N, seq_width], [N, seq_width, len(target2id)] and [N]
    """
    num_classes = len(target2id)
    input_data = np.empty([len(data_list), seq_width], dtype=int)
    labels = np.empty([len(data_list), seq_width, num_classes], dtype=int)
    seq_length = np.empty([len(data_list)], dtype=int)
    for count, sent in enumerate(data_list):
        # Truncate to the matrix width. (A hard-coded cap of 50 previously
        # produced negative padding — and a shape error — for seq_width < 50.)
        if len(sent) > seq_width:
            sent = sent[:seq_width]
        input_padded = [src2id.get(word, src2id["UNK"]) for word, _ in sent] \
            + (seq_width - len(sent)) * [0]
        input_data[count] = np.asarray(input_padded)
        # One-hot per position; padding rows stay all-zero. Unknown target
        # words fall back to class 0 (matches the previous behaviour).
        sent_labels = np.zeros([seq_width, num_classes], dtype=int)
        for pos, (_, word) in enumerate(sent):
            sent_labels[pos, target2id.get(word, 0)] = 1
        labels[count] = sent_labels
        seq_length[count] = len(sent)
    return input_data, labels, seq_length
def format_data_embeddings(data_list, seq_width, src2id, target2id, target_embeddings, embedding_size):
    """
    Like format_data, but labels are dense embedding vectors for the target side.

    :param data_list: list of sentences; each sentence is a sequence of word pairs
    :param seq_width: fixed sequence length; longer sentences are truncated,
        shorter ones are zero-padded
    :param src2id: mapping from source words to ids (must contain "UNK")
    :param target2id: mapping from target words to ids (must contain "UNK")
    :param target_embeddings: embedding matrix indexable by target ids
    :param embedding_size: dimensionality of one embedding vector
    :return: quadruple (input_data, labels, labels_ids, seq_length) with shapes
        [N, seq_width], [N, seq_width, embedding_size], [N, seq_width], [N]
    """
    input_data = np.empty([len(data_list), seq_width], dtype=int)
    labels = np.empty([len(data_list), seq_width, embedding_size], dtype=float)
    labels_ids = np.empty([len(data_list), seq_width], dtype=int)
    seq_length = np.empty([len(data_list)], dtype=int)
    empty_embedding = np.zeros([embedding_size], dtype=int)
    for count, sent in enumerate(data_list):
        # Truncate to the matrix width. (A hard-coded cap of 50 previously
        # produced negative padding — and a shape error — for seq_width < 50.)
        if len(sent) > seq_width:
            sent = sent[:seq_width]
        input_padded = [src2id.get(word, src2id["UNK"]) for word, _ in sent] \
            + (seq_width - len(sent)) * [0]
        input_data[count] = np.asarray(input_padded)
        # NOTE(review): labels here are looked up by the FIRST element of each
        # pair, whereas format_data/format_data_fullsoftmax use the SECOND
        # (target) element — confirm this asymmetry is intended.
        labels_padded = [target_embeddings[target2id.get(word, target2id["UNK"])]
                         for word, _ in sent] \
            + (seq_width - len(sent)) * [empty_embedding]
        labels[count] = np.asarray(labels_padded)
        # Plain target ids, needed for the accuracy calculations.
        labels_ids_list = [target2id.get(word, target2id["UNK"]) for word, _ in sent] \
            + (seq_width - len(sent)) * [0]
        labels_ids[count] = np.asarray(labels_ids_list)
        seq_length[count] = len(sent)
    return input_data, labels, labels_ids, seq_length
def format_data_app(data_list, seq_width, src2id):
    """
    Convert sentences of plain source words (no target side) into padded id matrices.

    :param data_list: list of sentences; each sentence is a sequence of words
    :param seq_width: fixed sequence length; longer sentences are truncated,
        shorter ones are zero-padded
    :param src2id: mapping from source words to ids (must contain "UNK")
    :return: tuple (input_data, seq_length) of numpy arrays with shapes
        [N, seq_width] and [N], where seq_length holds the pre-padding
        length of each sentence
    """
    input_data = np.empty([len(data_list), seq_width], dtype=int)
    seq_length = np.empty([len(data_list)], dtype=int)
    for count, sent in enumerate(data_list):
        # Truncate to the matrix width. (A hard-coded cap of 50 previously
        # produced negative padding — and a shape error — for seq_width < 50.)
        if len(sent) > seq_width:
            sent = sent[:seq_width]
        input_padded = [src2id.get(word, src2id["UNK"]) for word in sent] \
            + (seq_width - len(sent)) * [0]
        input_data[count] = np.asarray(input_padded)
        seq_length[count] = len(sent)
    return input_data, seq_length