import logging
import os
import re
import matplotlib
import tensorflow as tf
from CustomSchedule import CustomSchedule
from hparams import basic_param
from Transformer import Transformer
import matplotlib.pyplot as plt
from load_dictionary import load_dictionary
from load_dataset import load_dataset
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
logging.basicConfig(level=logging.ERROR)
def evaluate(inp_sentence, en_dict, zh_dict, transformer, max_length):
    # The <start> and <end> tokens that wrap the English sentence
    start_token = [en_dict.vocab_size]
    end_token = [en_dict.vocab_size + 1]
    # inp_sentence is a string; the subword tokenizer turns it into a sequence of
    # subword indices, and we add BOS / EOS at the front and back
    inp_sentence = start_token + en_dict.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)
    # Just like in the video, the input the Decoder consumes at the first time
    # step is a sequence containing only a single Chinese <start> token
    decoder_input = [zh_dict.vocab_size]
    output = tf.expand_dims(decoder_input, 0)  # add the batch dimension
    # Auto-regressive decoding: generate one Chinese token at a time, append the
    # prediction to the input, and feed it back into the Transformer
    for i in range(max_length):
        # A new mask has to be produced for every newly generated token
        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input, output, False)
        # Take the distribution at the last position and treat its argmax as the
        # model's newest predicted token
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        # Stop and return as soon as the <end> token appears: the model has
        # finished generating its result
        if tf.equal(predicted_id, zh_dict.vocab_size + 1):
            return tf.squeeze(output, axis=0), attention_weights
        # Append the newly predicted Chinese index to the output sequence so the
        # Decoder can attend to the latest `predicted_id` when it generates the
        # next token
        output = tf.concat([output, predicted_id], axis=-1)
    # Drop the batch dimension and return the predicted sequence of Chinese indices
    return tf.squeeze(output, axis=0), attention_weights
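# Illustrative sketch (comments only, nothing here is executed): how the decoder
# input grows during the greedy loop in `evaluate` above, with hypothetical token
# ids 52 and 17 standing in for real predictions:
#   step 0: output == [[<start>]]            # <start> = zh_dict.vocab_size
#   step 1: output == [[<start>, 52]]
#   step 2: output == [[<start>, 52, 17]]
#   step 3: predicted_id == <end>            # <end> = zh_dict.vocab_size + 1
#           -> return tf.squeeze(output, axis=0) without appending <end>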
def plot_attention_weights(attention, sentence, result, layer_name, en_dict, zh_dict, max_len_tar=18):
    zhfont = matplotlib.font_manager.FontProperties(fname='SimHei.ttf')
    plt.style.use("seaborn-whitegrid")
    fig = plt.figure(figsize=(17, 7))
    sentence = en_dict.encode(sentence)
    # Only show the first `max_len_tar` Chinese tokens so the plot does not get
    # too crowded
    if max_len_tar:
        predicted_seq = result[:max_len_tar]
    else:
        predicted_seq = result
        max_len_tar = len(result)
    # Take the attention weights of MHA 1 or MHA 2 inside one particular Decoder
    # layer and drop the batch dimension
    attention_weights = tf.squeeze(attention[layer_name], axis=0)
    # (num_heads, tar_seq_len, inp_seq_len)
    # Plot the attention weights of every head
    for head in range(attention_weights.shape[0]):
        ax = fig.add_subplot(2, 4, head + 1)
        # [Note] the attention weights are transposed so that the fairly long
        # English subwords can be shown on the y-axis
        attn_map = np.transpose(attention_weights[head][:max_len_tar, :])
        ax.matshow(attn_map, cmap='viridis')  # (inp_seq_len, tar_seq_len)
        fontdict = {"fontproperties": zhfont}
        ax.set_xticks(range(max(max_len_tar, len(predicted_seq))))
        ax.set_xlim(-0.5, max_len_tar - 1.5)
        ax.set_yticks(range(len(sentence) + 2))
        ax.set_xticklabels([zh_dict.decode([i]) for i in predicted_seq
                            if i < zh_dict.vocab_size],
                           fontdict=fontdict, fontsize=18)
        ax.set_yticklabels(
            ['<start>'] + [en_dict.decode([i]) for i in sentence] + ['<end>'],
            fontdict=fontdict)
        ax.set_xlabel('Head {}'.format(head + 1))
        ax.tick_params(axis="x", labelsize=12)
        ax.tick_params(axis="y", labelsize=12)
    plt.tight_layout()
    plt.show()
    plt.close(fig)
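# For reference, a rough sketch of the `attention` dict that
# `plot_attention_weights` expects (the layer name and the head count of 8 are
# assumptions based on the 2x4 subplot grid above, not guaranteed by this file):
#   attention["decoder_layer4_dec_enc"].shape == (1, num_heads, tar_seq_len, inp_seq_len)
# After tf.squeeze(..., axis=0), each head's (tar_seq_len, inp_seq_len) map
# becomes one subplot.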
def translate(sentence, en_dict, zh_dict, transformer, max_length, plot='decoder_layer4_dec_enc'):
    result, attention_weights = evaluate(sentence, en_dict, zh_dict, transformer, max_length)
    print("attention_weights.keys():")
    for layer_name, attn in attention_weights.items():
        print(f"{layer_name}.shape: {attn.shape}")
    predicted_sentence = zh_dict.decode([i for i in result
                                         if i < zh_dict.vocab_size])
    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
    if plot:
        plot_attention_weights(attention_weights, sentence, result, plot, en_dict, zh_dict)
def encode(en_t, zh_t):
    # Because the dictionary indices start from 0, we can use
    # en_dict.vocab_size as the index of BOS and
    # en_dict.vocab_size + 1 as the index of EOS
    en_indices = [en_dict.vocab_size] + en_dict.encode(
        en_t.numpy()) + [en_dict.vocab_size + 1]
    # Same idea for Chinese, using the last index of the Chinese dictionary + 1
    zh_indices = [zh_dict.vocab_size] + zh_dict.encode(
        zh_t.numpy()) + [zh_dict.vocab_size + 1]
    return en_indices, zh_indices
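# Example of what `encode` yields (the subword ids 17, 8, 245 are hypothetical):
#   en_t = b"hello world" -> [en_dict.vocab_size, 17, 8, 245, en_dict.vocab_size + 1]
# i.e. BOS, the English subword indices, then EOS, and likewise for zh_t.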
def tf_encode(en_t, zh_t):
    # Inside `tf_encode`, `en_t` and `zh_t` are not yet Eager Tensors;
    # they only become eager inside `tf.py_function`
    # Also, since the indices are all integers, we use `tf.int64`
    return tf.py_function(encode, [en_t, zh_t], [tf.int64, tf.int64])
def filter_max_length(en, zh, max_length=40):
    # en and zh are the English and Chinese index sequences respectively
    return tf.logical_and(tf.size(en) <= max_length,
                          tf.size(zh) <= max_length)
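# Minimal sketch of how `filter_max_length` behaves with the default
# max_length=40 (the sizes below are made up for illustration):
#   tf.size(en) == 38, tf.size(zh) == 35 -> kept
#   tf.size(en) == 45, tf.size(zh) == 30 -> dropped (English side too long)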
if __name__ == '__main__':
    p = basic_param()
    train_examples, test_examples, _ = load_dataset()
    test_dataset = (test_examples
                    .map(tf_encode)
                    .filter(filter_max_length)
                    )
    en_dict, zh_dict = load_dictionary()
    output_dir = "nmt"
    checkpoint_path = os.path.join(output_dir, "checkpoints")
    log_dir = os.path.join(output_dir, 'logs')
    # train_perc = 20
    run_id = f"{p.num_layers}layers_{p.d_model}d_{p.num_heads}heads_{p.dff}dff_{p.train_perc}train_perc"
    checkpoint_path = os.path.join(checkpoint_path, run_id)
    # log_dir = os.path.join(log_dir, run_id)
    learning_rate = CustomSchedule(p.d_model)
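    # Assuming CustomSchedule follows the warmup schedule from the original
    # Transformer paper ("Attention Is All You Need"), the learning rate is
    #   lrate = d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5))
    # i.e. it grows linearly during warmup and then decays with the inverse
    # square root of the step number.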
    p.add_param('input_vocab_size', 8137)
    p.add_param('target_vocab_size', 4203)
    p.add_param('learning_rate', learning_rate)
    transformer = Transformer(p.num_layers, p.d_model, p.num_heads, p.dff, p.input_vocab_size, p.target_vocab_size,
                              p.dropout_rate)
    optimizer = tf.keras.optimizers.Adam(p.learning_rate, beta_1=0.9, beta_2=0.98,
                                         epsilon=1e-9)
    # tf.train.Checkpoint bundles everything we want to save, which makes storing
    # and restoring convenient. Typically you want to save the model as well as
    # the optimizer state
    ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
    if ckpt_manager.latest_checkpoint:
        print("exist")
        ckpt.restore(ckpt_manager.latest_checkpoint)
        last_epoch = int(ckpt_manager.latest_checkpoint.split("-")[-1])
        print(f'Loaded latest checkpoint; the model has already been trained for {last_epoch} epochs.')
        # transformer = tf.saved_model.load('models/transformer_base')
        print("...Model loaded...")
    print('-' * 20)
    translate("China, India, and others have enjoyed continuing growth.", en_dict, zh_dict, transformer,
              p.max_length)
    # score_list = []
    # for (x, (inp, tar)) in enumerate(test_dataset):
    #     # for x in range(p.batch_size):
    #     print('batch id:', x)
    #     sentence = en_dict.decode([i for i in inp if i < en_dict.vocab_size])
    #     print('English sentence: ', sentence)
    #     print('Encoded input:', inp)
    #     print('Real output: ', tar)
    #     result, _ = evaluate(sentence, en_dict, zh_dict, transformer, p.max_length)
    #     print('Prediction: ', result)
    #     print('Prediction sentence: ', zh_dict.decode([i for i in result if i < zh_dict.vocab_size]))
    #     target = tar.numpy().tolist()
    #     result = result.numpy().tolist()
    #     # print(tar)
    #     # print(result)
    #     score = sentence_bleu([target], result)
    #     print('BLEU: ', score)
    #     score_list.append(score)
    #
    # print(sum(score_list) / len(score_list))
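    # A minimal sketch of the same evaluation using NLTK's corpus-level BLEU
    # instead of averaging sentence_bleu scores; left commented out like the
    # block above. It assumes test_dataset yields (inp, tar) index pairs and,
    # as in the loop above, compares subword index sequences directly.
    # from nltk.translate.bleu_score import corpus_bleu
    # references, hypotheses = [], []
    # for inp, tar in test_dataset:
    #     sentence = en_dict.decode([i for i in inp if i < en_dict.vocab_size])
    #     result, _ = evaluate(sentence, en_dict, zh_dict, transformer, p.max_length)
    #     references.append([tar.numpy().tolist()])
    #     hypotheses.append(result.numpy().tolist())
    # print('Corpus BLEU:', corpus_bleu(references, hypotheses))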
    # ckpt_manager looks in checkpoint_path for anything that matches what is
    # defined in ckpt. When saving, only the 5 most recent checkpoints are kept;
    # older ones are deleted automatically
    # ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)