pred_nmt
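# Inference script for the Keras English-German NMT model: it rebuilds the
# tokenizers from the cleaned dataset, loads the trained model, and translates
# German sentences typed at the prompt into English.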
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
# from nltk.translate.bleu_score import corpus_bleu
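# load the trained encoder-decoder model (model.h5 is assumed to be the file
# saved by the companion training script)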
model = load_model('model.h5')
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)
# translate each source sequence and print the prediction (BLEU scoring is commented out)
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        print(translation)
        # raw_target, raw_src = raw_dataset[i]
        # if i < 10:
        #     print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # actual.append([raw_target.split()])
        # predicted.append(translation.split())
    # calculate BLEU score
    # print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    # print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    # print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
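# (the English tokenizer is only needed to map predicted integer indices back to words)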
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
# ger_tokenizer = create_tokenizer(dataset[:, 1])
# ger_vocab_size = len(ger_tokenizer.word_index) + 1
# ger_length = max_length(dataset[:, 1])
#
# trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
# testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
ger_tokenizer1 = create_tokenizer(dataset[:, 1])
ger_vocab_size1 = len(ger_tokenizer1.word_index) + 1
ger_length1 = max_length(dataset[:, 1])
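# the German tokenizer and max length are rebuilt so typed input is encoded the same
# way as the training data (assuming the same cleaned dataset was used for training)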
# interactive translation loop: type a German sentence, or exit() to stop
question = ""
while question != "exit()":
    question = input()
    if question == "exit()":
        break
    list_Sent = [question]
    trainX1 = encode_sequences(ger_tokenizer1, ger_length1, list_Sent)
    evaluate_model(model, eng_tokenizer, trainX1, train)
    # evaluate_model(model, eng_tokenizer, [trainX[0]], train)
# test on some test sequences
# print('test')
# evaluate_model(model, eng_tokenizer, testX, test)
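# Example session (assuming model.h5 and the english-german .pkl files sit in the
# working directory): run the script, type a German sentence drawn from the dataset's
# vocabulary at the prompt, and the predicted English translation is printed;
# type exit() to quit.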