Skip to content

Commit

Permalink
Restructure main lark parser (WiP)
Browse files Browse the repository at this point in the history
  • Loading branch information
khoidt committed Nov 26, 2024
1 parent 9ad365b commit eedec78
Show file tree
Hide file tree
Showing 19 changed files with 227 additions and 162 deletions.
9 changes: 0 additions & 9 deletions ebl/atf_importer/domain/atf_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,6 @@
# Remove. Use `legacy_atf_visitor` instead


class StripSigns(Visitor):
    """Visitor that blanks the ``$`` marker of a legacy uncertain-sign prefix."""

    def ebl_atf_text_line__legacy_uncertain_sign_prefix(self, tree: Tree) -> None:
        """Replace a first child equal to ``"$"`` with the empty string.

        Nodes whose ``data`` is not the legacy uncertain-sign prefix rule are
        left untouched.
        """
        is_prefix_node = tree.data == "ebl_atf_text_line__legacy_uncertain_sign_prefix"
        if not is_prefix_node:
            return
        if tree.children[0] == "$":
            tree.children[0] = ""


class DepthFirstSearch(Visitor):
def visit_topdown(self, tree: Tree, result: str) -> str:
if not hasattr(tree, "data"):
Expand Down
41 changes: 19 additions & 22 deletions ebl/atf_importer/domain/atf_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,20 @@


class AtfPreprocessor(AtfPreprocessorBase):
def convert_lines(self, file: str, filename: str) -> List[Dict[str, Any]]:
legacy_visitor = LegacyAtfVisitor()

def convert_lines_from_string(self, text: str) -> List[Dict[str, Any]]:
    """Split ``text`` on newline characters and convert the resulting lines."""
    raw_lines = text.split("\n")
    return self._convert_lines(raw_lines)

def convert_lines_from_path(self, path: str, filename: str) -> List[Dict[str, Any]]:
    """Log which file is being converted, read its lines from ``path`` and convert them.

    ``filename`` is only used for the log message.
    """
    banner = Util.print_frame(f'Converting: "{filename}.atf"')
    self.logger.info(banner)
    return self._convert_lines(self.read_lines_from_path(path))

lines = self.read_lines(file)
def _convert_lines(self, lines: List[str]) -> List[Dict[str, Any]]:
processed_lines = []
for line in lines:
result = self.process_line(line)
if self.stop_preprocessing:
break
processed_lines.append(
{
"c_line": result[0],
Expand All @@ -28,6 +33,10 @@ def convert_lines(self, file: str, filename: str) -> List[Dict[str, Any]]:
self.logger.info(Util.print_frame("Preprocessing finished"))
return processed_lines

def get_line_tree_data(self, tree: Tree) -> Tuple[Tree, List[Any], str, List[Any]]:
    """Bundle a parsed line as ``(tree, words, rule name, [])``.

    The words come from ``self.serialize_words(tree)``, the rule name is
    ``tree.data`` and the last element is always a fresh empty list.
    """
    return (tree, self.serialize_words(tree), tree.data, [])

def process_line(
self, original_atf_line: str
) -> Tuple[Optional[str], Optional[List[Any]], Optional[str], Optional[List[Any]]]:
Expand All @@ -37,11 +46,6 @@ def process_line(
try:
if atf_line.startswith("#lem"):
raise Exception("Special handling for #lem lines.")
if atf_line.startswith("@translation") or atf_line.startswith("@("):
# ToDo: Handle translations
# @translation labeled en project
# @(t.e. 1)
return self.parse_and_convert_line("")
return self.check_original_line(original_atf_line)
except Exception:
atf_line = (
Expand All @@ -51,17 +55,13 @@ def process_line(
)
return self.parse_and_convert_line(atf_line)

def check_original_line(self, atf: str) -> Tuple[str, List[Any], str, List[Any]]:
print("! check_original_line.")
def check_original_line(
self, atf: str
) -> Tuple[Optional[Tree], List[Any], str, List[Any]]:
if self.style == 2 and atf[0] == "#" and atf[1] == " ":
atf = atf.replace("#", "#note:")
atf = atf.replace("# note:", "#note:")
# input(f"! before parse:\n{atf}")
tree = self.ebl_parser.parse(atf)
# print(tree.pretty())
# input(f"! after parse:\n{self.line_tree_to_string(tree)}")
# input("! before transform")
# input("! after transform")
tree = self.transform_legacy_atf(tree)
self.logger.info("Line successfully parsed")
self.logger.debug(f"Parsed line as {tree.data}")
Expand All @@ -71,10 +71,9 @@ def check_original_line(self, atf: str) -> Tuple[str, List[Any], str, List[Any]]
return self.get_line_tree_data(tree)

def transform_legacy_atf(self, tree: Tree) -> Tree:
    """Run the shared legacy visitor over ``tree`` and return that same tree.

    The tree is visited exactly once, by ``self.legacy_visitor``. An info
    message is logged when the visitor reports ``legacy_found``.
    """
    # NOTE(review): the visitor instance is shared; whether ``legacy_found``
    # is reset between lines is not visible from here -- confirm.
    self.legacy_visitor.visit(tree)
    if self.legacy_visitor.legacy_found:
        self.logger.info("Legacy line successfully parsed")
    return tree

Expand All @@ -90,10 +89,8 @@ def parse_and_convert_line(
return tree
elif tree.data == "lem_line":
result = self.convert_lem_line(atf, tree)
elif tree.data == "text_line":
result = self.get_line_tree_data(tree)
else:
result = self.unused_line(tree)
result = self.get_line_tree_data(tree)
except Exception:
self.logger.error(traceback.format_exc())
self.logger.error(f"Could not convert line: {atf}", "unparsable_lines")
Expand Down
32 changes: 1 addition & 31 deletions ebl/atf_importer/domain/atf_preprocessor_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,8 @@
from typing import Tuple, Optional, List, Any
from lark import Lark, Tree
from ebl.atf_importer.domain.atf_conversions import (
# ConvertLineDividers,
# ConvertLineJoiner,
StripSigns,
GetLemmaValuesAndGuidewords,
GetWords,
LineSerializer,
)

opening_half_bracket = {"⌈", "⸢"}
Expand Down Expand Up @@ -114,7 +110,6 @@ def __init__(self, logdir: str, style: int) -> None:
self.logger.setLevel(logging.DEBUG)
self.skip_next_lem_line = False
self.unused_lines = unused_lines
self.stop_preprocessing = False
self.logdir = logdir
self.style = style
self.open_found = False
Expand Down Expand Up @@ -152,16 +147,6 @@ def do_cdli_replacements(self, atf: str) -> str:
def reorder_bracket_punctuation(self, atf: str) -> str:
    """Move a run of ``?``/``!`` that directly follows ``]`` to just before it."""

    def flags_before_bracket(found):
        # Emit the captured flags first, then the closing bracket.
        return found.group(1) + "]"

    return re.sub(r"\]([\?!]+)", flags_before_bracket, atf)

def unused_line(
    self, tree
) -> Tuple[Optional[str], Optional[List[Any]], str, Optional[List[Any]]]:
    """Return the result tuple for a line that is not converted further.

    When ``tree.data`` is listed in ``self.unused_lines`` the first element is
    the serialized line; otherwise a warning is logged and the first element
    is ``None``. The word and lemma slots are always ``None``.
    """
    if tree.data not in self.unused_lines:
        self.logger.warning(
            f"Attempting to process a line not marked as unused: {tree.data}"
        )
        return (None, None, tree.data, None)
    serialized_line = self.line_tree_to_string(tree)
    return (serialized_line, None, tree.data, None)

def convert_lem_line(
self, atf: str, tree: Tree
) -> Tuple[Optional[str], Optional[List[Any]], str, Optional[List[Any]]]:
Expand All @@ -179,12 +164,6 @@ def convert_lem_line(
)
return atf, lemmas_and_guidewords_array, tree.data, []

def line_tree_to_string(self, tree: Tree) -> str:
    """Serialize ``tree`` with a ``LineSerializer`` and strip spaces from both ends."""
    # ToDo: Remove
    serializer = LineSerializer()
    serializer.visit_topdown(tree)
    serialized = serializer.line
    return serialized.strip(" ")

def serialize_words(self, tree: Tree) -> List[Any]:
words_serializer = GetWords()
words_serializer.result = []
Expand All @@ -196,16 +175,7 @@ def serizalize_lemmas_and_guidewords(self, tree: Tree) -> List[Any]:
lemmas_and_guidewords_serializer.visit(tree)
return lemmas_and_guidewords_serializer.result

def get_line_tree_data(self, tree: Tree) -> Tuple[str, List[Any], str, List[Any]]:
    """Serialize a parsed line and collect its words.

    Runs the ``StripSigns`` visitor over ``tree``, serializes the tree with
    ``self.line_tree_to_string`` and gathers the words with
    ``self.serialize_words``.

    Returns ``(converted line, words, tree.data, [])``.
    """
    StripSigns().visit(tree)  # ToDo: Move
    converted_line = self.line_tree_to_string(tree)
    # Log the intermediate result instead of calling ``input()``: a stdin read
    # blocks every conversion and fails under pytest's output capturing.
    self.logger.debug(f"converted line: {converted_line}")
    words = self.serialize_words(tree)
    return (converted_line, words, tree.data, [])

def read_lines_from_path(self, file: str) -> List[str]:
    """Read the UTF-8 text file at ``file`` and return its content split on newlines.

    ``newline=""`` disables newline translation, so the returned strings are
    the same as with the previous binary-mode ``codecs.open`` read (a carriage
    return before a newline is kept).
    """
    with open(file, encoding="utf8", newline="") as f:
        return f.read().split("\n")


# Former name of ``read_lines_from_path``; kept so existing callers keep working.
read_lines = read_lines_from_path

Expand Down
63 changes: 57 additions & 6 deletions ebl/atf_importer/domain/legacy_atf_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,23 +233,74 @@ def ebl_atf_text_line__VALUE_CHARACTER(self, token: Token) -> Token:


class LegacyColumnTransformer(LegacyTransformer):
# Work-in-progress transformer for `ebl_atf_at_line` nodes; every callback
# below sets `legacy_found` to True.
# NOTE(review): `column_number` and `prefix` are each set twice (as class
# attributes here and again inside `__init__`) -- this looks like two
# revisions shown side by side; confirm which one is intended.
column_number = 1
prefix = "ebl_atf_at_line"

# ToDo: Fix this. It seems like this transformer is not being called
# for some unclear reason. The error is probably not here, but in the
# visitor or elsewhere.
# All methods with `test` are just for testing.

def __init__(self, **kwargs) -> None:
# NOTE(review): this `prefix` is a local that is never read (hence the noqa).
prefix = "ebl_atf_at_line" # noqa: F841
super().__init__(**kwargs)
self.column_number = 1

@v_args(inline=True)
def ebl_atf_at_line__legacy_column(self, column: Tree) -> Tree:
# Marks the line as legacy, bumps the column counter and returns a node.
# NOTE(review): `input()` blocks on stdin; the print/input pairs in this
# class are debugging aids and need to go before this is used unattended.
self.legacy_found = True
print("!!!!", column, self.column_number)
print("!!!!", column)
input()
self.column_number += 1
# NOTE(review): two consecutive returns -- as written the second one
# (building a "legacy_column" tree with a fixed INT "1") is unreachable.
return column # self.to_tree("bla_bla", [])
return self.to_tree(
"legacy_column",
# [self.to_token("INT", str(self.column_number)), self.to_tree("status", [])],
[self.to_token("INT", "1"), self.to_tree("status", [])],
)

# The remaining callbacks are probes: each flags `legacy_found`, prints its
# argument, waits on `input()` and returns the argument unchanged.
@v_args(inline=True)
def ebl_atf_at_line__INT(self, test: Tree) -> Tree:
self.legacy_found = True
print("!!!!", test)
input()
return test

@v_args(inline=True)
def ebl_atf_at_line__ebl_atf_common__INT(self, test: Tree) -> Tree:
self.legacy_found = True
print("!!!!", test)
input()
return test

@v_args(inline=True)
def ebl_atf_common__INT(self, test: Tree) -> Tree:
self.legacy_found = True
print("!!!!", test)
input()
return test

@v_args(inline=True)
def ebl_atf_at_line__face(self, test: Tree) -> Tree:
self.legacy_found = True
print("!!!!", test)
input()
return test

@v_args(inline=True)
def ebl_atf_at_line__LCASE_LETTER(self, test: Tree) -> Tree:
self.legacy_found = True
print("!!!!", test)
input()
return test

@v_args(inline=True)
def ebl_atf_at_line__status(self, test: Tree) -> Tree:
self.legacy_found = True
print("!!!!", test)
input()
return test

@v_args(inline=True)
# NOTE(review): two `def` headers in a row -- presumably the callback was
# renamed from `..._at_line_value` to `..._SURFACE`; confirm and keep one.
def ebl_atf_at_line__at_line_value(self, test: Tree) -> Tree:
def ebl_atf_at_line__SURFACE(self, test: Tree) -> Tree:
self.legacy_found = True
print("!!!!", test, self.column_number)
print("!!!!", test)
input()
return test
7 changes: 5 additions & 2 deletions ebl/atf_importer/domain/legacy_atf_visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def __init__(self):
self.legacy_found = False
for suffix, transformers in self.nodes_to_visit.items():
prefix = self.text_line_prefix
if "at_line" in suffix:
if suffix in ["at_line"]:
prefix = self.at_line_prefix
self._set_rules(suffix, transformers, prefix)

Expand All @@ -76,6 +76,7 @@ def _set_rules(
transformers: Sequence[Tuple[LegacyTransformer, str]],
prefix: str,
) -> None:
print(f"{prefix}__{suffix}")
setattr(
self,
f"{prefix}__{suffix}",
Expand All @@ -92,7 +93,9 @@ def _method(tree: Tree) -> Tree:

return _method

def _transform(self, tree: Tree, transformer: LegacyTransformer, replace: str):
def _transform(
self, tree: Tree, transformer: LegacyTransformer, replace: str
) -> None:
transformer.clear()
transformer.current_tree = tree
transformed_tree = transformer.transform(tree)
Expand Down
6 changes: 3 additions & 3 deletions ebl/corpus/domain/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ebl.corpus.domain.manuscript import Manuscript
from ebl.errors import DataError
from ebl.transliteration.domain.dollar_line import DollarLine
from ebl.transliteration.domain.atf_parsers.lark_parser import CHAPTER_PARSER
from ebl.transliteration.domain.atf_parsers.lark_parser import CHAPTER_PARSER, PARATEXT_PARSER
from ebl.transliteration.domain.atf_parsers.lark_parser_errors import PARSE_ERRORS
from ebl.transliteration.domain.note_line import NoteLine

Expand All @@ -14,12 +14,12 @@ def parse_chapter(
atf: str, manuscripts: Iterable[Manuscript], start: Optional[str] = None
) -> Sequence[Line]:
try:
tree = CHAPTER_PARSER.parse(atf, start=start)
tree = CHAPTER_PARSER.parse(atf)
return ChapterTransformer(manuscripts).transform(tree)
except PARSE_ERRORS as error:
raise DataError(error) from error


def parse_paratext(atf: str) -> Union[NoteLine, DollarLine]:
    """Parse ``atf`` with the paratext parser and transform the resulting tree.

    The input is parsed once, with ``PARATEXT_PARSER``; the tree is then
    passed through a ``ChapterTransformer`` built with no manuscripts.
    """
    tree = PARATEXT_PARSER.parse(atf)
    return ChapterTransformer(()).transform(tree)
28 changes: 28 additions & 0 deletions ebl/tests/atf_importer/test_atf_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,30 @@

# ToDo: All transformers should be tested

TRANSLATION_LEGACY = """
@obverse
1. a-na
2. a-bi-ya
@translation en labelled
@label(o 1-o 2)
To my father
"""

TRANSLATION_EXPECTED = """
@obverse
1. a-na
# tr.en.(o 2): To my father
2. a-bi-ya
"""

PARSE_AND_TRANSFORM_LEGACY = [
("", ""),
("@column", "@column 1"),
("@column", "@column 2"),
("@face a", "@face a"),
("@obverse", "@obverse"),
("@reverse", "@reverse"),
("$ obverse broken", "$ obverse broken"),
("$ single ruling", "$ single ruling"),
("1. a'", "1. aʾ"),
("1′. A", "1'. A"),
Expand Down Expand Up @@ -82,6 +99,17 @@
]


def test_legacy_translation():
    """Converting the legacy translation sample must equal converting the expected sample."""
    atf_preprocessor = AtfPreprocessor("../logs", 0)
    converted_legacy = atf_preprocessor.convert_lines_from_string(TRANSLATION_LEGACY)
    converted_expected = atf_preprocessor.convert_lines_from_string(
        TRANSLATION_EXPECTED
    )

    # No debug prints needed: pytest reports both operands when the assert fails.
    assert converted_legacy == converted_expected


@pytest.mark.parametrize(
"legacy_line,ebl_line",
[
Expand Down
4 changes: 3 additions & 1 deletion ebl/transliteration/domain/atf_parsers/lark_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@
ATF_GRAMMAR_PATH, **kwargs_lark, start="translation_line"
)
PARATEXT_PARSER = Lark.open(ATF_GRAMMAR_PATH, **kwargs_lark, start="paratext")
# Built once, without the temporary ``debug=True`` flag.
CHAPTER_PARSER = Lark.open(ATF_GRAMMAR_PATH, **kwargs_lark, start="chapter")
LINE_PARSER = Lark.open(ATF_GRAMMAR_PATH, **kwargs_lark)


Expand Down
Loading

0 comments on commit eedec78

Please sign in to comment.