Skip to content

Commit

Permalink
Support augmentation (#13)
Browse files Browse the repository at this point in the history
- Add support for dictionary-based augmentation
- Fix bugs
  • Loading branch information
patrickphat authored Sep 6, 2020
1 parent 9167792 commit 68952ec
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 33 deletions.
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@

setuptools.setup(
name='urbamt',
version='0.0.1-b2',
version='0.0.1-b3',
author="Patrick Phat Nguyen",
author_email="[email protected]",
description="Universal Rule-based Machine Translation Toolkit (URBaMT)",
description="URBaMT: Universal Rule-based Machine Translation Toolkit",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/urbamt/urbamt",
packages=setuptools.find_packages(exclude=['docs', 'tests']),
packages=setuptools.find_packages(exclude=['docs', 'tests', 'experiments']),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand Down
27 changes: 6 additions & 21 deletions urbamt/translator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Dict, List
from .utils.tree_manipulation import translate_tree_grammar
from .utils.tree_manipulation import translate_trees_grammar
from .utils.misc import remove_trailing_space
import nltk
from nltk.parse.chart import BottomUpLeftCornerChartParser as Parser
Expand Down Expand Up @@ -65,27 +65,12 @@ def translate(self, sentences: List[str] or str, allow_multiple_translation = Fa
for sentence in sentences:
sentence = self.__process_text_input(sentence)
trees = self.parser.parse(sentence.split())
list_trees = [tree for tree in trees]

trans_sentence = translate_trees_grammar(list_trees, self.src_to_tgt_grammar, self.src_to_tgt_dictionary)

# Flag to check if there are trees in generator (grammar matched)
translated = False

for t in trees:
translated = True

# Translate grammar
trans_gram_sentence = translate_tree_grammar(t,self.src_to_tgt_grammar)

# Translate words
trans_lang_sentence = ' '.join([self.src_to_tgt_dictionary.get(word,word) for word in trans_gram_sentence.split()])

translated_sentences.append(trans_lang_sentence)

# Get 1 sentence only, will support multi sentence
break

if translated == False:
failed_sentences.append(sentence)

translated_sentences.append(trans_sentence)

# String to display failed sentence
failed_sentences = '\n'.join(failed_sentences)

Expand Down
89 changes: 80 additions & 9 deletions urbamt/utils/tree_manipulation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import nltk
from nltk import ParentedTree as PTree
from typing import List
import random

def tree_to_ptree(tree: nltk.Tree):
tree_str = tree.__str__()
Expand All @@ -13,26 +15,37 @@ def get_grammar(tree: nltk.Tree):
grammar += f" {sub.label}"


def swap_tree_given_left(left_tree: nltk.Tree, displacement: List[int], new_words: List[str] = None):
    """
    Reorder a node and its right siblings within their parent node.

    Args:
        left_tree: left-most node of the group to reorder. It is used through
            `parent()` and `right_sibling()`, so it is expected to be a
            ParentedTree node.
        displacement: for every position of the target order, the index of
            the source node to place there, or -1 to insert a new word.
        new_words: words to insert, in order, one per -1 entry of
            `displacement`. The list is not modified.

    Returns:
        The parent node, modified in place.

    Raises:
        IndexError: if `displacement` refers to a node that was not gathered,
            or contains more -1 entries than there are `new_words`.
    """
    parent_tree = left_tree.parent()

    # Work on a copy so the caller's list is left untouched
    pending_words = list(new_words) if new_words is not None else []

    # Gather the left-most node plus one right sibling per non-new entry,
    # stopping as soon as the siblings run out
    nodes = [left_tree]
    right_tree = left_tree.right_sibling()
    for disp in displacement:
        # disp = -1 indicates a new word, which has no source node
        if disp == -1:
            continue
        # No sibling left (e.g. a single-child parent): nothing more to gather
        if right_tree is None:
            break
        nodes.append(right_tree)
        right_tree = right_tree.right_sibling()

    # Detach every gathered node from the parent
    for node in nodes:
        parent_tree.remove(node)

    # Re-attach the nodes following the new displacement
    next_word = 0
    for disp in displacement:
        if disp == -1:
            # New word: wrap it in its own 'NEW' node
            parent_tree.append(PTree('NEW', [pending_words[next_word]]))
            next_word += 1
        else:
            parent_tree.append(nodes[disp])

    return parent_tree

Expand Down Expand Up @@ -60,24 +73,82 @@ def build_grammar_str_from_left_most(tree: nltk.Tree):


def translate_tree_grammar(tree: nltk.Tree, grammar_substitutions: dict):
    """
    Apply grammar substitutions to a parse tree and return the reordered sentence.

    Args:
        tree: parse tree of the source sentence.
        grammar_substitutions: maps a source grammar string
            (e.g. "NP -> JJ NP") to its target grammar string.

    Returns:
        A tuple `(sentence, num_subs)`: the leaves of the rewritten tree
        joined by single spaces, and the number of substitutions applied.
    """
    # Number of substitutions done
    num_subs = 0

    # Convert tree to ParentedTree
    ptree = tree_to_ptree(tree)

    # Traverse through subtrees.
    # NOTE(review): swap_tree_given_left edits the tree while it is being
    # traversed; this ordering is kept exactly as in the original code.
    for sub in ptree.subtrees():
        # Create grammar string from left-most node. E.g: NP -> JJ NP,
        # in this case, JJ is left-most node
        grammar_str = build_grammar_str_from_left_most(sub)

        # Dictionary keys are unique, so at most one rule can match
        if grammar_str not in grammar_substitutions:
            continue
        num_subs += 1

        # Calculate displacement between 2 grammar strings
        disp, new_words = calculate_displacement(grammar_str, grammar_substitutions[grammar_str])

        # Change tree nodes positions thanks to new displacement
        swap_tree_given_left(sub, disp, new_words)

    translated_grammar_sentence = " ".join(ptree.leaves())
    return translated_grammar_sentence, num_subs
def translate_sentence_words(sentence, src_to_tgt_dictionary):
    """
    Translate a sentence word by word using a dictionary.

    Args:
        sentence: whitespace-separated source words.
        src_to_tgt_dictionary: maps a source word to either a single target
            word or a list of candidate target words.

    Returns:
        The translated words joined by single spaces. Words missing from the
        dictionary, or mapped to an empty candidate list, are kept unchanged.
        When a word maps to several candidates one is picked at random.
    """
    words_list = []

    for word in sentence.split():
        target_word = src_to_tgt_dictionary.get(word, word)

        if isinstance(target_word, list):
            # random.choice raises IndexError on an empty list, so an empty
            # candidate list is treated like a missing dictionary entry
            target_word = random.choice(target_word) if target_word else word

        words_list.append(target_word)

    return ' '.join(words_list)

def translate_trees_grammar(list_trees: "List[nltk.Tree]", src_to_tgt_grammar, src_to_tgt_dictionary):
    """
    Translate every candidate parse tree and return the best translation.

    Args:
        list_trees: candidate parse trees of one source sentence.
        src_to_tgt_grammar: maps source grammar strings to target ones.
        src_to_tgt_dictionary: maps source words to target words.

    Returns:
        The translated sentence with the highest number of grammar
        substitutions. On a tie the first such translation produced wins.

    Raises:
        ValueError: if `list_trees` is empty, i.e. no parse tree matched
            the sentence.
    """
    if not list_trees:
        # Without this guard max() fails with an opaque "empty sequence" error
        raise ValueError("Cannot translate: no parse tree was found for the sentence")

    # Translated sentence mapped to the number of grammar substitutions found
    trans_map = {}

    for tree in list_trees:
        # Translate grammar
        trans_gram_sentence, num_subs = translate_tree_grammar(tree, src_to_tgt_grammar)

        # Translate words
        trans_lang_sentence = translate_sentence_words(trans_gram_sentence, src_to_tgt_dictionary)

        # Identical sentences share one entry; the latest count is kept
        trans_map[trans_lang_sentence] = num_subs

    # Return translation that has the most substitutions
    return max(trans_map, key=trans_map.get)

def calculate_displacement(src_grammar, tgt_grammar):
    """
    Compute how the right-hand side of a grammar rule is reordered.

    Args:
        src_grammar: source rule, e.g. "NP -> JJ NN".
        tgt_grammar: target rule, e.g. "NP -> NN JJ".

    Returns:
        A tuple `(displacement, new_words)`. `displacement` holds, for each
        symbol on the target right-hand side, its index on the source
        right-hand side, or -1 when the symbol does not occur in the source.
        `new_words` lists those unmatched symbols in target order.

    Raises:
        ValueError: if either rule does not contain "->".
    """
    src_tokens = src_grammar.split()
    tgt_tokens = tgt_grammar.split()

    # Keep only the right-hand side of each rule
    src_symbols = src_tokens[src_tokens.index("->") + 1:]
    tgt_symbols = tgt_tokens[tgt_tokens.index("->") + 1:]

    # Every source position of each symbol, in left-to-right order
    positions = {}
    for index, symbol in enumerate(src_symbols):
        positions.setdefault(symbol, []).append(index)

    # How many occurrences of each symbol have been matched so far
    consumed = {}

    displacement = []
    new_words = []

    for symbol in tgt_symbols:
        candidates = positions.get(symbol)

        if candidates is None:
            # Not part of the source rule: this is a new word
            displacement.append(-1)
            new_words.append(symbol)
            continue

        used = consumed.get(symbol, 0)
        if used < len(candidates):
            # Match repeated symbols (e.g. "NN NN") to distinct source nodes
            displacement.append(candidates[used])
            consumed[symbol] = used + 1
        else:
            # More occurrences in the target than in the source: fall back
            # to the first source occurrence
            displacement.append(candidates[0])

    return displacement, new_words

0 comments on commit 68952ec

Please sign in to comment.