# decode-cover

#!/usr/bin/env python

import optparse
import sys
import models
import numpy as np
from collections import namedtuple


# Parse the command-line options.
optparser = optparse.OptionParser()
optparser.add_option("-i", "--input", dest="input", default="data/input",
                     help="File containing sentences to translate (default=data/input)")
optparser.add_option("-t", "--translation-model", dest="tm", default="data/tm",
                     help="File containing translation model (default=data/tm)")
optparser.add_option("-l", "--language-model", dest="lm", default="data/lm",
                     help="File containing ARPA-format language model (default=data/lm)")
# sys.maxsize is available on both Python 2.6+ and Python 3 (sys.maxint is
# Python 2 only); it is used purely as a "no limit" slice bound below.
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=sys.maxsize, type="int",
                     help="Number of sentences to decode (default=no limit)")
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=20, type="int",
                     help="Limit on number of translations to consider per phrase (default=20)")
optparser.add_option("-s", "--stack-size", dest="s", default=250, type="int",
                     help="Maximum stack size (default=250)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,
                     help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

# Load the translation model and the language model from the given paths.
# NOTE(review): opts.k is presumably the per-phrase translation limit applied
# inside models.TM -- confirm against the models module.
tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)

# French input sentences: one tuple of whitespace-separated tokens per line,
# truncated to the first opts.num_sents lines.  The with-block guarantees the
# file handle is closed once the lines have been read.
with open(opts.input) as input_file:
    french = [tuple(line.strip().split())
              for line in input_file.readlines()[:opts.num_sents]]

# If a word has no single-word entry in the translation model, translate it
# as itself with log-probability 0.0.  The vocabulary is collected with a
# generator instead of sum(french, ()) to avoid quadratic tuple concatenation.
for word in set(word for sentence in french for word in sentence):
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, 0.0)]

# Decoder hypothesis record, including the source coverage vector.
hypothesis = namedtuple(
    "hypothesis",
    [
        "logprob",      # accumulated log-probability of the partial translation
        "lm_state",     # language-model state after the last English word
        "predecessor",  # hypothesis this one extends (None for the initial one)
        "phrase",       # translation-model entry applied last (None initially)
        "coverage",     # tuple of 0/1 flags, one per source word
        "start",        # start index of the last translated source span
        "end",          # end index (exclusive) of the last translated source span
    ],
)

def coverage_hypothesis(h, f, start, end):
    """Extend hypothesis ``h`` by translating the source span ``f[start:end]``.

    Args:
        h: hypothesis to extend; its ``logprob``, ``lm_state`` and
            ``coverage`` fields are read.
        f: source sentence as a tuple of tokens.
        start: index of the first source word of the span (inclusive).
        end: index one past the last source word of the span.

    Returns:
        A list holding one new hypothesis per entry of ``tm[f[start:end]]``,
        or ``None`` when the span has no entry in the translation model or
        overlaps a source position that ``h`` has already covered.
    """
    source_span = f[start:end]

    # if this phrase does not exist in tm, we can't translate it
    if source_span not in tm:
        return None

    # If any part of the span has already been translated, don't retranslate.
    # Compare by value: the previous `is 1` identity test only worked because
    # CPython happens to cache small integers.
    if any(h.coverage[i] == 1 for i in range(start, end)):
        return None

    # Build the updated coverage vector with the span marked as covered.
    covered = list(h.coverage)
    for i in range(start, end):
        covered[i] = 1
    covered = tuple(covered)

    # Whether the extended hypothesis covers the whole sentence does not
    # depend on which translation is chosen, so compute it once.
    is_complete = sum(covered) == len(f)

    # Expand the hypothesis once per candidate translation of the span.
    new_hypotheses = []
    for phrase in tm[source_span]:
        # translation-model score plus language-model score of each word
        logprob = h.logprob + phrase.logprob
        lm_state = h.lm_state
        for word in phrase.english.split():
            (lm_state, word_logprob) = lm.score(lm_state, word)
            logprob += word_logprob

        # add the end-of-sentence score only for complete translations
        logprob += lm.end(lm_state) if is_complete else 0.0

        new_hypotheses.append(
            hypothesis(logprob, lm_state, h, phrase, covered, start, end))
    return new_hypotheses

def _candidate_spans(h, sentence_length):
    """Yield the (start, end) source spans considered after hypothesis ``h``.

    Spans lying entirely before the last translated phrase are yielded first,
    then spans lying entirely after it; within each group they are ordered by
    start index, then end index.
    """
    # range of phrases before hypothesis
    for i in range(0, h.start):
        for j in range(i + 1, h.start + 1):
            yield (i, j)
    # range of phrases after hypothesis
    for i in range(h.end, sentence_length):
        for j in range(i + 1, sentence_length + 1):
            yield (i, j)


def _recombine(stacks, new_hypotheses):
    """Add each new hypothesis to the stack matching its number of covered words.

    A hypothesis is identified by the last span it translated together with
    its coverage vector; an existing entry with the same key is replaced only
    when the new hypothesis has a strictly higher log-probability.
    """
    for new_hypothesis in new_hypotheses:
        new_key = ((new_hypothesis.start, new_hypothesis.end), new_hypothesis.coverage)
        target_stack = stacks[sum(new_hypothesis.coverage)]
        if new_key not in target_stack or target_stack[new_key].logprob < new_hypothesis.logprob:
            target_stack[new_key] = new_hypothesis


def extract_english(h):
    """Return the English output of ``h``, each phrase followed by one space.

    The predecessor chain is walked iteratively, so the number of phrases in
    a translation is not bounded by the interpreter's recursion limit.
    """
    phrases = []
    while h.predecessor is not None:
        phrases.append(h.phrase.english)
        h = h.predecessor
    phrases.reverse()
    return "".join("%s " % english for english in phrases)


# decode
sys.stderr.write("Decoding %s...\n" % (opts.input,))
for f in french:
    # stacks[n] holds the hypotheses that cover exactly n source words
    stacks = [{} for _ in range(len(f) + 1)]

    # create an initial hypothesis (nothing covered) and add it to stack 0
    cover = tuple(0 for _ in f)
    initial_hypothesis = hypothesis(0.0, lm.begin(), None, None, cover, 0, 0)

    # hypothesis identified by the last phrase it translated and its coverage
    stacks[0][((0, 0), cover)] = initial_hypothesis

    # Expand the opts.s best hypotheses of every stack except the last one.
    # New hypotheses always cover more words than their predecessor, so they
    # land in a later stack and never modify the one being expanded.
    for stack in stacks[:-1]:
        best_first = sorted(stack.values(), key=lambda hyp: -hyp.logprob)
        for h in best_first[:opts.s]:
            for (i, j) in _candidate_spans(h, len(f)):
                new_hypotheses = coverage_hypothesis(h, f, i, j)
                if new_hypotheses:
                    _recombine(stacks, new_hypotheses)

    # the best complete hypothesis is the translation of this sentence
    winner = max(stacks[-1].values(), key=lambda hyp: hyp.logprob)
    print(extract_english(winner))