-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdecode-cover
112 lines (93 loc) · 5.18 KB
/
decode-cover
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
import optparse
import sys
import models
import numpy as np
from collections import namedtuple
# Command-line options. Every option has a default, so the script can be run
# with no arguments.
optparser = optparse.OptionParser()
optparser.add_option("-i", "--input", dest="input", default="data/input", help="File containing sentences to translate (default=data/input)")
optparser.add_option("-t", "--translation-model", dest="tm", default="data/tm", help="File containing translation model (default=data/tm)")
optparser.add_option("-l", "--language-model", dest="lm", default="data/lm", help="File containing ARPA-format language model (default=data/lm)")
# sys.maxsize is available on both Python 2 and Python 3 (sys.maxint is
# Python 2 only); used as a slice bound below it means "no limit".
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=sys.maxsize, type="int", help="Number of sentences to decode (default=no limit)")
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=20, type="int", help="Limit on number of translations to consider per phrase (default=20)")
optparser.add_option("-s", "--stack-size", dest="s", default=250, type="int", help="Maximum stack size (default=250)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

# Translation model and language model, built from the paths given above.
# NOTE(review): opts.k is presumably the per-phrase translation limit applied
# inside models.TM -- confirm against models.py.
tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)

# French input sentences, one per line, each stored as a tuple of
# whitespace-separated words. The file is closed as soon as it has been read.
with open(opts.input) as input_file:
    french = [tuple(line.strip().split()) for line in input_file.readlines()[:opts.num_sents]]

# If a word has no single-word entry in the translation model, translate it
# as itself with a score of 0.0. The vocabulary is gathered in a single pass
# rather than by repeatedly concatenating the sentence tuples.
for word in set(word for sentence in french for word in sentence):
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, 0.0)]

# A hypothesis stores its score, language-model state, back-pointer, the
# phrase used in its last step, the coverage vector over source positions
# (1 = translated) and the source span [start, end) translated last.
hypothesis = namedtuple("hypothesis", "logprob, lm_state, predecessor, phrase, coverage, start, end")
def coverage_hypothesis(h, f, start, end):
    """Extend hypothesis ``h`` by translating the source span ``f[start:end]``.

    Args:
        h: the hypothesis to extend. ``h.coverage`` is indexed by source
            position and holds 1 for every word already translated.
        f: the source sentence, as a tuple of words.
        start: index of the first source word of the span (inclusive).
        end: index one past the last source word of the span (exclusive).

    Returns:
        ``None`` when the span has no entry in the module-level translation
        model ``tm`` or overlaps a word that ``h`` already covers. Otherwise
        a list with one new hypothesis per translation listed in
        ``tm[f[start:end]]``, each pointing back to ``h``.
    """
    source_phrase = f[start:end]
    # if this phrase does not exist in tm, we can't translate it
    if source_phrase not in tm:
        return None
    # If a portion of this phrase has been translated, don't retranslate.
    # Compare by value: an identity test against the literal 1 only works
    # through CPython's small-integer caching.
    if any(h.coverage[i] == 1 for i in range(start, end)):
        return None
    # Updated coverage vector, stored as a tuple so it can be part of a
    # dictionary key.
    covered = list(h.coverage)
    for i in range(start, end):
        covered[i] = 1
    covered = tuple(covered)
    # Whether the sentence is fully covered is the same for every
    # translation of this span, so work it out once.
    is_complete = sum(covered) == len(f)
    # expand hypothesis for phrase translations
    new_hypotheses = []
    for phrase in tm[source_phrase]:
        # translation-model score plus a language-model score per word
        logprob = h.logprob + phrase.logprob
        lm_state = h.lm_state
        for word in phrase.english.split():
            (lm_state, word_logprob) = lm.score(lm_state, word)
            logprob += word_logprob
        # add the end-of-sentence score once every source word is covered
        if is_complete:
            logprob += lm.end(lm_state)
        new_hypotheses.append(hypothesis(logprob, lm_state, h, phrase, covered, start, end))
    return new_hypotheses
def _candidate_spans(h, sentence_length):
    """Yield the (start, end) spans lying wholly before or wholly after the span h translated last."""
    # spans that end at or before the start of the last translated phrase
    for i in range(0, h.start):
        for j in range(i + 1, h.start + 1):
            yield (i, j)
    # spans that begin at or after the end of the last translated phrase
    for i in range(h.end, sentence_length):
        for j in range(i + 1, sentence_length + 1):
            yield (i, j)


def extract_english(h):
    """Return the English phrases on h's predecessor chain, earliest first, each followed by a space."""
    phrases = []
    # The initial hypothesis has no predecessor and contributes no text.
    while h.predecessor is not None:
        phrases.append(h.phrase.english)
        h = h.predecessor
    return "".join("%s " % english for english in reversed(phrases))


# decode
sys.stderr.write("Decoding %s...\n" % (opts.input,))
for f in french:
    # stacks[n] holds the hypotheses that cover exactly n source words
    stacks = [{} for _ in range(len(f) + 1)]
    # create an initial hypothesis (nothing covered) and add it to the stack
    cover = (0,) * len(f)
    initial_hypothesis = hypothesis(0.0, lm.begin(), None, None, cover, 0, 0)
    # a hypothesis is identified by the last span it translated and its coverage
    stacks[0][((0, 0), cover)] = initial_hypothesis
    for stack in stacks[:-1]:
        # expand only the opts.s highest-scoring hypotheses of each stack
        for h in sorted(stack.values(), key=lambda h: -h.logprob)[:opts.s]:
            for (i, j) in _candidate_spans(h, len(f)):
                new_hypotheses = coverage_hypothesis(h, f, i, j)
                if not new_hypotheses:
                    continue
                for new_hypothesis in new_hypotheses:
                    new_key = ((new_hypothesis.start, new_hypothesis.end), new_hypothesis.coverage)
                    target_stack = stacks[sum(new_hypothesis.coverage)]
                    # keep the hypothesis if its key is new or it beats the stored score
                    if new_key not in target_stack or target_stack[new_key].logprob < new_hypothesis.logprob:
                        target_stack[new_key] = new_hypothesis
    # the best hypothesis covering the whole sentence is the translation
    winner = max(stacks[-1].values(), key=lambda h: h.logprob)
    print(extract_english(winner))