From 9167792b078b25e14dff73000d185370fcc59b19 Mon Sep 17 00:00:00 2001 From: Patrick Nguyen Date: Wed, 2 Sep 2020 23:39:10 +0700 Subject: [PATCH] Remove recursive parser (#12) * remove recursive parser * fix :bug: --- setup.py | 2 +- urbamt/translator.py | 35 ++++++++++++++++++++----------- urbamt/utils/tree_manipulation.py | 1 - 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index bf4e40a..41481d9 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='urbamt', - version='0.0.1-b1', + version='0.0.1-b2', author="Patrick Phat Nguyen", author_email="me@patrickphat.com", description="Universal Rule-based Machine Translation Toolkit (URBaMT)", diff --git a/urbamt/translator.py b/urbamt/translator.py index 5320baa..3ba9186 100644 --- a/urbamt/translator.py +++ b/urbamt/translator.py @@ -2,7 +2,7 @@ from .utils.tree_manipulation import translate_tree_grammar from .utils.misc import remove_trailing_space import nltk -from nltk import RecursiveDescentParser as Parser +from nltk.parse.chart import BottomUpLeftCornerChartParser as Parser class URBAMT_Translator: """""" @@ -11,7 +11,6 @@ def __init__(self, src_to_tgt_grammar: Dict, src_to_tgt_dictionary: Dict): """Initialize the translator - Args: src_grammar (str): source language grammar written in nltk style E.g: src_grammar = \""" @@ -28,14 +27,10 @@ def __init__(self, JJ -> 'good' NN -> 'school' \""" - src_to_tgt_grammar (Dict): Transition between source grammar and target grammar as a dict E.g: src2target_grammar = { "NP1 -> JJ NN": "NP1 -> NN JJ" } - - - src_to_tgt_dictionary (Dict): Dictionary of word-by-word transition from src language to target language E.g: en_to_vi_dict = { "I":"tôi", @@ -54,24 +49,28 @@ def __init__(self, def __process_text_input(txt): return remove_trailing_space(txt) - def translate(self, sentences: List[str] or str): + def translate(self, sentences: List[str] or str, allow_multiple_translation = False): """Translate a list of sentences - Args: sentences (List[str]): A list of str-typed sentences - Returns: List[str]: A list of translated sentences """ if isinstance(sentences,str): sentences = [sentences] - translated_sentence = [] + translated_sentences = [] + failed_sentences = [] + for sentence in sentences: sentence = self.__process_text_input(sentence) trees = self.parser.parse(sentence.split()) + # Flag to check if there are trees in generator (grammar matched) + translated = False + for t in trees: + translated = True # Translate grammar trans_gram_sentence = translate_tree_grammar(t,self.src_to_tgt_grammar) @@ -79,6 +78,18 @@ def translate(self, sentences: List[str] or str): # Translate words trans_lang_sentence = ' '.join([self.src_to_tgt_dictionary.get(word,word) for word in trans_gram_sentence.split()]) - translated_sentence.append(trans_lang_sentence) + translated_sentences.append(trans_lang_sentence) + + # Get 1 sentence only, will support multi sentence + break + + if translated == False: + failed_sentences.append(sentence) + + # String to display failed sentence + failed_sentences = '\n'.join(failed_sentences) + + if len(failed_sentences) > 0: + raise ValueError(f"Please check your grammar again, failed to translated these sentence \n {failed_sentences}") - return translated_sentence \ No newline at end of file + return translated_sentences \ No newline at end of file diff --git a/urbamt/utils/tree_manipulation.py b/urbamt/utils/tree_manipulation.py index 85a2ec7..0e8c3d4 100644 --- a/urbamt/utils/tree_manipulation.py +++ b/urbamt/utils/tree_manipulation.py @@ -75,7 +75,6 @@ def calculate_displacement(src_grammar, tgt_grammar): src_grammar_lst = src_grammar.split() tgt_grammar_lst = tgt_grammar.split() - print(src_grammar_lst.index("->")) src_grammar_lst = src_grammar_lst[src_grammar_lst.index("->")+1:] tgt_grammar_lst = tgt_grammar_lst[tgt_grammar_lst.index("->")+1:] displacement = []