diff --git a/ebl/atf_importer/domain/atf_conversions.py b/ebl/atf_importer/domain/atf_conversions.py
index d177aebf5..cc6734c9b 100644
--- a/ebl/atf_importer/domain/atf_conversions.py
+++ b/ebl/atf_importer/domain/atf_conversions.py
@@ -5,15 +5,6 @@
 # Remove. Use `legacy_atf_visitor` instead
 
 
-class StripSigns(Visitor):
-    def ebl_atf_text_line__legacy_uncertain_sign_prefix(self, tree: Tree) -> None:
-        if (
-            tree.data == "ebl_atf_text_line__legacy_uncertain_sign_prefix"
-            and tree.children[0] == "$"
-        ):
-            tree.children[0] = ""
-
-
 class DepthFirstSearch(Visitor):
     def visit_topdown(self, tree: Tree, result: str) -> str:
         if not hasattr(tree, "data"):
diff --git a/ebl/atf_importer/domain/atf_preprocessor.py b/ebl/atf_importer/domain/atf_preprocessor.py
index d2341baaf..00d579b47 100644
--- a/ebl/atf_importer/domain/atf_preprocessor.py
+++ b/ebl/atf_importer/domain/atf_preprocessor.py
@@ -8,15 +8,20 @@
 
 
 class AtfPreprocessor(AtfPreprocessorBase):
-    def convert_lines(self, file: str, filename: str) -> List[Dict[str, Any]]:
+    legacy_visitor = LegacyAtfVisitor()
+
+    def convert_lines_from_string(self, text: str) -> List[Dict[str, Any]]:
+        return self._convert_lines(text.split("\n"))
+
+    def convert_lines_from_path(self, path: str, filename: str) -> List[Dict[str, Any]]:
         self.logger.info(Util.print_frame(f'Converting: "{filename}.atf"'))
+        lines = self.read_lines_from_path(path)
+        return self._convert_lines(lines)
 
-        lines = self.read_lines(file)
+    def _convert_lines(self, lines: List[str]) -> List[Dict[str, Any]]:
         processed_lines = []
         for line in lines:
             result = self.process_line(line)
-            if self.stop_preprocessing:
-                break
             processed_lines.append(
                 {
                     "c_line": result[0],
@@ -28,6 +33,10 @@ def convert_lines(self, file: str, filename: str) -> List[Dict[str, Any]]:
         self.logger.info(Util.print_frame("Preprocessing finished"))
         return processed_lines
 
+    def get_line_tree_data(self, tree: Tree) -> Tuple[Tree, List[Any], str, List[Any]]:
+        words = self.serialize_words(tree)
+        return (tree, words, tree.data, [])
+
     def process_line(
         self, original_atf_line: str
     ) -> Tuple[Optional[str], Optional[List[Any]], Optional[str], Optional[List[Any]]]:
@@ -37,11 +46,6 @@ def process_line(
         try:
             if atf_line.startswith("#lem"):
                 raise Exception("Special handling for #lem lines.")
-            if atf_line.startswith("@translation") or atf_line.startswith("@("):
-                # ToDo: Handle translations
-                # @translation labeled en project
-                # @(t.e. 1)
-                return self.parse_and_convert_line("")
             return self.check_original_line(original_atf_line)
         except Exception:
             atf_line = (
@@ -51,17 +55,13 @@ def process_line(
         )
         return self.parse_and_convert_line(atf_line)
 
-    def check_original_line(self, atf: str) -> Tuple[str, List[Any], str, List[Any]]:
-        print("! check_original_line.")
+    def check_original_line(
+        self, atf: str
+    ) -> Tuple[Optional[Tree], List[Any], str, List[Any]]:
         if self.style == 2 and atf[0] == "#" and atf[1] == " ":
             atf = atf.replace("#", "#note:")
             atf = atf.replace("# note:", "#note:")
-        # input(f"! before parse:\n{atf}")
         tree = self.ebl_parser.parse(atf)
-        # print(tree.pretty())
-        # input(f"! after parse:\n{self.line_tree_to_string(tree)}")
-        # input("! before transform")
-        # input("! after transform")
         tree = self.transform_legacy_atf(tree)
         self.logger.info("Line successfully parsed")
         self.logger.debug(f"Parsed line as {tree.data}")
@@ -71,10 +71,9 @@ def check_original_line(self, atf: str) -> Tuple[str, List[Any], str, List[Any]]:
         return self.get_line_tree_data(tree)
 
     def transform_legacy_atf(self, tree: Tree) -> Tree:
-        visitor = LegacyAtfVisitor()
-        visitor.visit(tree)
+        self.legacy_visitor.visit(tree)
         # print('!!!! visitor.legacy_found', visitor.legacy_found)
-        if visitor.legacy_found:
+        if self.legacy_visitor.legacy_found:
             self.logger.info("Legacy line successfully parsed")
         return tree
 
@@ -90,10 +89,8 @@ def parse_and_convert_line(
                 return tree
             elif tree.data == "lem_line":
                 result = self.convert_lem_line(atf, tree)
-            elif tree.data == "text_line":
-                result = self.get_line_tree_data(tree)
             else:
-                result = self.get_line_tree_data(tree)
+                result = self.get_line_tree_data(tree)
         except Exception:
             self.logger.error(traceback.format_exc())
             self.logger.error(f"Could not convert line: {atf}", "unparsable_lines")
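Note: the preprocessor's entry point is now split into `convert_lines_from_string` and `convert_lines_from_path`, so callers (and the new translation test further down) can feed ATF without touching the filesystem. A minimal usage sketch, assuming the `AtfPreprocessor("../logs", 0)` construction used in the existing tests; the sample ATF and the path are illustrative only:

```python
from ebl.atf_importer.domain.atf_preprocessor import AtfPreprocessor

preprocessor = AtfPreprocessor("../logs", 0)

# In-memory input: no file has to exist on disk.
for processed in preprocessor.convert_lines_from_string("@obverse\n1. a-na"):
    print(processed["c_line"])

# File-based input: reads the ATF from disk and logs the given filename.
processed_lines = preprocessor.convert_lines_from_path("example.atf", "example")
```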
after transform") tree = self.transform_legacy_atf(tree) self.logger.info("Line successfully parsed") self.logger.debug(f"Parsed line as {tree.data}") @@ -71,10 +71,9 @@ def check_original_line(self, atf: str) -> Tuple[str, List[Any], str, List[Any]] return self.get_line_tree_data(tree) def transform_legacy_atf(self, tree: Tree) -> Tree: - visitor = LegacyAtfVisitor() - visitor.visit(tree) + self.legacy_visitor.visit(tree) # print('!!!! visitor.legacy_found', visitor.legacy_found) - if visitor.legacy_found: + if self.legacy_visitor.legacy_found: self.logger.info("Legacy line successfully parsed") return tree @@ -90,10 +89,8 @@ def parse_and_convert_line( return tree elif tree.data == "lem_line": result = self.convert_lem_line(atf, tree) - elif tree.data == "text_line": - result = self.get_line_tree_data(tree) else: - result = self.unused_line(tree) + result = self.get_line_tree_data(tree) except Exception: self.logger.error(traceback.format_exc()) self.logger.error(f"Could not convert line: {atf}", "unparsable_lines") diff --git a/ebl/atf_importer/domain/atf_preprocessor_base.py b/ebl/atf_importer/domain/atf_preprocessor_base.py index 3ac1ea1b5..fbf5628dd 100644 --- a/ebl/atf_importer/domain/atf_preprocessor_base.py +++ b/ebl/atf_importer/domain/atf_preprocessor_base.py @@ -4,12 +4,8 @@ from typing import Tuple, Optional, List, Any from lark import Lark, Tree from ebl.atf_importer.domain.atf_conversions import ( - # ConvertLineDividers, - # ConvertLineJoiner, - StripSigns, GetLemmaValuesAndGuidewords, GetWords, - LineSerializer, ) opening_half_bracket = {"⌈", "⸢"} @@ -114,7 +110,6 @@ def __init__(self, logdir: str, style: int) -> None: self.logger.setLevel(logging.DEBUG) self.skip_next_lem_line = False self.unused_lines = unused_lines - self.stop_preprocessing = False self.logdir = logdir self.style = style self.open_found = False @@ -152,16 +147,6 @@ def do_cdli_replacements(self, atf: str) -> str: def reorder_bracket_punctuation(self, atf: str) -> str: return re.sub(r"\]([\?!]+)", lambda match: match.group(1) + "]", atf) - def unused_line( - self, tree - ) -> Tuple[Optional[str], Optional[List[Any]], str, Optional[List[Any]]]: - if tree.data in self.unused_lines: - return (self.line_tree_to_string(tree), None, tree.data, None) - self.logger.warning( - f"Attempting to process a line not marked as unused: {tree.data}" - ) - return (None, None, tree.data, None) - def convert_lem_line( self, atf: str, tree: Tree ) -> Tuple[Optional[str], Optional[List[Any]], str, Optional[List[Any]]]: @@ -179,12 +164,6 @@ def convert_lem_line( ) return atf, lemmas_and_guidewords_array, tree.data, [] - def line_tree_to_string(self, tree: Tree) -> str: - # ToDo: Remove - line_serializer = LineSerializer() - line_serializer.visit_topdown(tree) - return line_serializer.line.strip(" ") - def serialize_words(self, tree: Tree) -> List[Any]: words_serializer = GetWords() words_serializer.result = [] @@ -196,16 +175,7 @@ def serizalize_lemmas_and_guidewords(self, tree: Tree) -> List[Any]: lemmas_and_guidewords_serializer.visit(tree) return lemmas_and_guidewords_serializer.result - def get_line_tree_data(self, tree: Tree) -> Tuple[str, List[Any], str, List[Any]]: - # ConvertLineDividers().visit(tree) - # ConvertLineJoiner().visit(tree) - StripSigns().visit(tree) # ToDo: Move - converted_line = self.line_tree_to_string(tree) - input(f"converted line: {converted_line}") - words = self.serialize_words(tree) - return (converted_line, words, tree.data, []) - - def read_lines(self, file: str) -> List[str]: + def 
diff --git a/ebl/atf_importer/domain/legacy_atf_visitor.py b/ebl/atf_importer/domain/legacy_atf_visitor.py
index 4ff515407..dcbf7147b 100644
--- a/ebl/atf_importer/domain/legacy_atf_visitor.py
+++ b/ebl/atf_importer/domain/legacy_atf_visitor.py
@@ -66,7 +66,7 @@ def __init__(self):
         self.legacy_found = False
         for suffix, transformers in self.nodes_to_visit.items():
             prefix = self.text_line_prefix
-            if "at_line" in suffix:
+            if suffix in ["at_line"]:
                 prefix = self.at_line_prefix
             self._set_rules(suffix, transformers, prefix)
 
@@ -76,6 +76,7 @@ def _set_rules(
         self,
         suffix: str,
         transformers: Sequence[Tuple[LegacyTransformer, str]],
         prefix: str,
     ) -> None:
+        print(f"{prefix}__{suffix}")
         setattr(
             self,
             f"{prefix}__{suffix}",
@@ -92,7 +93,9 @@ def _method(tree: Tree) -> Tree:
 
         return _method
 
-    def _transform(self, tree: Tree, transformer: LegacyTransformer, replace: str):
+    def _transform(
+        self, tree: Tree, transformer: LegacyTransformer, replace: str
+    ) -> None:
         transformer.clear()
         transformer.current_tree = tree
         transformed_tree = transformer.transform(tree)
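Note: `LegacyAtfVisitor` wires its handlers dynamically — `_set_rules` builds one method per `{prefix}__{suffix}` rule name via `setattr`, and `lark.visitors.Visitor` then finds those methods by `tree.data` while walking the tree. A stripped-down sketch of that pattern; the handler mapping and rule name below are illustrative:

```python
from lark import Tree
from lark.visitors import Visitor


class DynamicRuleVisitor(Visitor):
    # Registers one visit method per fully prefixed rule name at runtime.

    def __init__(self, handlers) -> None:
        # handlers: {prefixed_rule_name: callable(Tree) -> None}
        for rule_name, handler in handlers.items():
            setattr(self, rule_name, self._wrap(handler))

    @staticmethod
    def _wrap(handler):
        def _method(tree: Tree) -> Tree:
            handler(tree)
            return tree

        return _method


visitor = DynamicRuleVisitor(
    {"ebl_atf_at_line__legacy_column": lambda tree: print("visiting", tree.data)}
)
visitor.visit(Tree("ebl_atf_at_line__legacy_column", []))
```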
diff --git a/ebl/corpus/domain/parser.py b/ebl/corpus/domain/parser.py
index 1297fd6d3..dd97c7e36 100644
--- a/ebl/corpus/domain/parser.py
+++ b/ebl/corpus/domain/parser.py
@@ -5,7 +5,7 @@
 from ebl.corpus.domain.manuscript import Manuscript
 from ebl.errors import DataError
 from ebl.transliteration.domain.dollar_line import DollarLine
-from ebl.transliteration.domain.atf_parsers.lark_parser import CHAPTER_PARSER
+from ebl.transliteration.domain.atf_parsers.lark_parser import CHAPTER_PARSER, PARATEXT_PARSER
 from ebl.transliteration.domain.atf_parsers.lark_parser_errors import PARSE_ERRORS
 from ebl.transliteration.domain.note_line import NoteLine
 
@@ -14,12 +14,12 @@ def parse_chapter(
     atf: str, manuscripts: Iterable[Manuscript], start: Optional[str] = None
 ) -> Sequence[Line]:
     try:
-        tree = CHAPTER_PARSER.parse(atf, start=start)
+        tree = CHAPTER_PARSER.parse(atf)
         return ChapterTransformer(manuscripts).transform(tree)
     except PARSE_ERRORS as error:
         raise DataError(error) from error
 
 
 def parse_paratext(atf: str) -> Union[NoteLine, DollarLine]:
-    tree = CHAPTER_PARSER.parse(atf, start="paratext")
+    tree = PARATEXT_PARSER.parse(atf)
     return ChapterTransformer(()).transform(tree)
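Note: `parse_paratext` now uses the dedicated `PARATEXT_PARSER` instead of re-parsing with a different start symbol, and `parse_chapter` no longer forwards its `start` argument (the parameter itself is still accepted). The two ways Lark offers to handle multiple entry points, sketched with a toy grammar whose rule names are illustrative:

```python
from lark import Lark

GRAMMAR = """
chapter_like: "chapter"
paratext_like: "paratext"
"""

# Variant used in this diff: one dedicated parser per start symbol.
chapter_parser = Lark(GRAMMAR, start="chapter_like")
paratext_parser = Lark(GRAMMAR, start="paratext_like")
print(paratext_parser.parse("paratext").data)

# Alternative: a single parser declaring several start symbols, chosen at parse time.
multi_start = Lark(GRAMMAR, start=["chapter_like", "paratext_like"])
print(multi_start.parse("chapter", start="chapter_like").data)
```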
diff --git a/ebl/tests/atf_importer/test_atf_preprocessor.py b/ebl/tests/atf_importer/test_atf_preprocessor.py
index 325d29a6e..b8be250dc 100644
--- a/ebl/tests/atf_importer/test_atf_preprocessor.py
+++ b/ebl/tests/atf_importer/test_atf_preprocessor.py
@@ -4,6 +4,22 @@
 
 # ToDo: All transformers should be tested
 
+TRANSLATION_LEGACY = """
+@obverse
+1. a-na
+2. a-bi-ya
+@translation en labelled
+@label(o 1-o 2)
+To my father
+"""
+
+TRANSLATION_EXPECTED = """
+@obverse
+1. a-na
+# tr.en.(o 2): To my father
+2. a-bi-ya
+"""
+
 PARSE_AND_TRANSFORM_LEGACY = [
     ("", ""),
     ("@column", "@column 1"),
@@ -11,6 +27,7 @@
     ("@face a", "@face a"),
     ("@obverse", "@obverse"),
     ("@reverse", "@reverse"),
+    ("$ obverse broken", "$ obverse broken"),
     ("$ single ruling", "$ single ruling"),
     ("1. a'", "1. aʾ"),
     ("1′. A", "1'. A"),
@@ -82,6 +99,17 @@
 ]
 
 
+def test_legacy_translation():
+    atf_preprocessor = AtfPreprocessor("../logs", 0)
+    legacy_tree = atf_preprocessor.convert_lines_from_string(TRANSLATION_LEGACY)
+    expected_tree = atf_preprocessor.convert_lines_from_string(TRANSLATION_EXPECTED)
+    print("RESULT:\n", legacy_tree)  # .pretty())
+    print("EXPECTED:\n", expected_tree)  # .pretty())
+    # input()  # <- With `task test`: "OSError: pytest: reading from stdin while output is captured!"
+
+    assert legacy_tree == expected_tree
+
+
 @pytest.mark.parametrize(
     "legacy_line,ebl_line",
     [
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser.py b/ebl/transliteration/domain/atf_parsers/lark_parser.py
index cfe1e7746..d734d2259 100644
--- a/ebl/transliteration/domain/atf_parsers/lark_parser.py
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser.py
@@ -43,7 +43,9 @@
     ATF_GRAMMAR_PATH, **kwargs_lark, start="translation_line"
 )
 PARATEXT_PARSER = Lark.open(ATF_GRAMMAR_PATH, **kwargs_lark, start="paratext")
-CHAPTER_PARSER = Lark.open(ATF_GRAMMAR_PATH, **kwargs_lark, start="chapter")
+CHAPTER_PARSER = Lark.open(
+    ATF_GRAMMAR_PATH, **kwargs_lark, start="chapter", debug=True
+)  # ToDo: Remove debug
 LINE_PARSER = Lark.open(ATF_GRAMMAR_PATH, **kwargs_lark)
 
 
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark
index eb9bf7b9b..e7f565418 100644
--- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark
@@ -1,10 +1,13 @@
-%import common.CR
-%import common.LF
-%import common.WS_INLINE
-%import .ebl_atf_abbreviations (PROVENANCE, PERIOD, TYPE)
-%import .ebl_atf_text_line (text_line, any_word, note_line, parallel_line, translation_line, labels, manuscript_line, markup)
+%import .ebl_atf_text_line (text_line, any_word)
+%import .ebl_atf_parallel_line (parallel_line)
+%import .ebl_atf_manuscript_line (manuscript_line, paratext)
+%import .ebl_atf_translation_line (translation_line)
+%import .ebl_atf_note_line (note_line, markup)
 %import .ebl_atf_dollar_line (dollar_line)
 %import .ebl_atf_at_line (at_line)
+%import .ebl_atf_chapter (chapter)
+%import .ebl_atf_empty_line (empty_line)
+%import .ebl_atf_common (labels)
 %import .oracc_atf_lem_line (lem_line)
 
 ?start: line
@@ -19,26 +22,4 @@
       | control_line
       | lem_line
 
-empty_line: /\s+/?
-!control_line.-2: ("=:" | "&" | "#") /.+/?
-
-?paratext: note_line | dollar_line
-
-chapter: chapter_line (_NEWLINE _NEWLINE+ chapter_line)*
-
-chapter_line: [chapter_translation] line_variant (_NEWLINE line_variant)*
-chapter_translation: (translation_line _NEWLINE)+
-
-line_variant: reconstruction (_NEWLINE manuscript_line)*
-
-reconstruction: text_line [_NEWLINE note_line] (_NEWLINE parallel_line)*
-
-manuscript_line: _WHITE_SPACE? siglum [" " labels] manuscript_text paratext_line*
-?manuscript_text: " " text_line | empty_line
-?paratext_line: _NEWLINE _WHITE_SPACE? paratext
-_WHITE_SPACE: WS_INLINE
-_NEWLINE: (CR? LF)
-
-siglum: [PROVENANCE] PERIOD [TYPE] [DISAMBIQUATOR]
-    | "Std" [DISAMBIQUATOR] -> standard_text_siglum
-DISAMBIQUATOR: /[\S]+/
+!control_line.-2: ("=:" | "&" | "#") /.+/?
\ No newline at end of file
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark
index 9cf07e11e..519978fdf 100644
--- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark
@@ -1,9 +1,9 @@
-%import .ebl_atf_common (INT, LCASE_LETTER)
+%import .ebl_atf_common (INT)
 %import .ebl_atf_common (status)
 %import .ebl_atf_common (object, OBJECT, generic_object, fragment)
 %import .ebl_atf_common (surface, SURFACE, generic_surface, face, edge)
 %import .ebl_atf_common (seal, free_text)
-%import .ebl_atf_text_line(_markup)
+%import .ebl_atf_note_line(_markup)
 
 ?at_line: "@" at_line_value
 
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_chapter.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_chapter.lark
new file mode 100644
index 000000000..dd7fd03e0
--- /dev/null
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_chapter.lark
@@ -0,0 +1,19 @@
+%import common.CR
+%import common.LF
+
+%import .ebl_atf_manuscript_line (manuscript_line)
+%import .ebl_atf_text_line (text_line)
+%import .ebl_atf_note_line (note_line)
+%import .ebl_atf_parallel_line (parallel_line)
+%import .ebl_atf_translation_line (translation_line)
+
+chapter: chapter_line (_NEWLINE _NEWLINE+ chapter_line)*
+
+chapter_line: [chapter_translation] line_variant (_NEWLINE line_variant)*
+chapter_translation: (translation_line _NEWLINE)+
+
+line_variant: reconstruction (_NEWLINE manuscript_line)*
+
+reconstruction: text_line [_NEWLINE note_line] (_NEWLINE parallel_line)*
+
+_NEWLINE: (CR? LF)
\ No newline at end of file
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark
index 3caa9e266..60147fe8d 100644
--- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark
@@ -1,6 +1,8 @@
 %import common.DIGIT
 %import common.INT
 %import common.LCASE_LETTER
+%import common.LETTER
+
 %import .legacy_atf (LEGACY_VALUE_CHARACTER_ACCENTED, LEGACY_VALUE_CHARACTER)
 %import .legacy_atf (LEGACY_LOGOGRAM_CHARACTER_ACCENTED)
 
@@ -71,6 +73,10 @@ LEFT: "l.e."
 RIGHT: "r.e."
 TOP: "t.e."
 
+?line_number: line_number_range | single_line_number
+line_number_range: single_line_number "-" single_line_number
+single_line_number: [LETTER "+"] INT [PRIME | LEGACY_PRIME] [LETTER]
+
 status: (PRIME | LEGACY_PRIME | UNCERTAIN | CORRECTION | COLLATION | NO_LONGER_VISIBLE)*
 PRIME: "'"
 LEGACY_PRIME: "′" | "’"
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_empty_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_empty_line.lark
new file mode 100644
index 000000000..4eeaad054
--- /dev/null
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_empty_line.lark
@@ -0,0 +1 @@
+empty_line: /\s+/?
\ No newline at end of file
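Note: the `line_number` rules now live in `ebl_atf_common.lark` so that text, translation and parallel lines can share them. A stand-alone sketch of the shared rule; `LEGACY_PRIME` is dropped here so the snippet has no dependency on `legacy_atf.lark`:

```python
from lark import Lark

# Inlined, simplified copy of the shared line-number rules (illustrative only).
LINE_NUMBER_GRAMMAR = r"""
%import common.INT
%import common.LETTER

?line_number: line_number_range | single_line_number
line_number_range: single_line_number "-" single_line_number
single_line_number: [LETTER "+"] INT [PRIME] [LETTER]
PRIME: "'"
"""

parser = Lark(LINE_NUMBER_GRAMMAR, start="line_number")
print(parser.parse("2'"))
print(parser.parse("1-3"))
```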
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_manuscript_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_manuscript_line.lark
new file mode 100644
index 000000000..6ae8f9da1
--- /dev/null
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_manuscript_line.lark
@@ -0,0 +1,24 @@
+%import common.CR
+%import common.LF
+%import common.WS_INLINE
+
+%import .ebl_atf_common (labels)
+%import .ebl_atf_text_line (text_line)
+%import .ebl_atf_note_line (note_line)
+%import .ebl_atf_empty_line (empty_line)
+%import .ebl_atf_dollar_line (dollar_line)
+
+%import .ebl_atf_abbreviations (PROVENANCE, PERIOD, TYPE)
+
+
+manuscript_line: _WHITE_SPACE? siglum [" " labels] manuscript_text paratext_line*
+?manuscript_text: " " text_line | empty_line
+?paratext_line: _NEWLINE _WHITE_SPACE? paratext
+siglum: [PROVENANCE] PERIOD [TYPE] [DISAMBIQUATOR]
+    | "Std" [DISAMBIQUATOR] -> standard_text_siglum
+DISAMBIQUATOR: /[\S]+/
+
+_WHITE_SPACE: WS_INLINE
+_NEWLINE: (CR? LF)
+
+?paratext: note_line | dollar_line
\ No newline at end of file
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_note_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_note_line.lark
new file mode 100644
index 000000000..01ca4c206
--- /dev/null
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_note_line.lark
@@ -0,0 +1,14 @@
+%import .ebl_atf_text_line (text)
+
+note_line: "#note: " _markup
+markup: _markup
+_markup: (emphasis_part | language_part | bibliography_part | string_part | url_part)+
+language_part: "@" LANGUAGE "{" text "}"
+emphasis_part: "@i{" note_text "}"
+bibliography_part: "@bib{" escaped_text ("@" escaped_text)? "}"
+escaped_text: (/[^@{}\\\n\r]/ | ("\\" /[@{}\\]/))+
+string_part: note_text
+note_text: /[^@{}\n\r]+/
+LANGUAGE: "akk" | "sux" | "es"
+url_part: "@url{" url "}" ("{" note_text "}")?
+url: /[^}]+/
\ No newline at end of file
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_parallel_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_parallel_line.lark
new file mode 100644
index 000000000..00b2025d0
--- /dev/null
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_parallel_line.lark
@@ -0,0 +1,22 @@
+
+%import common.INT
+%import .ebl_atf_abbreviations (PERIOD)
+%import .ebl_atf_common (CAPITAL_ROMAN_NUMERAL)
+%import .ebl_atf_common (object_label, surface_label, column_label)
+%import .ebl_atf_common (line_number)
+
+?parallel_line: parallel_fragment | parallel_text | parallel_composition
+parallel_fragment: PARALLEL_LINE_PREFIX [CF] "F " museum_number " " [DUPLICATES " "] [object_label " "] [surface_label " "] [column_label " "] line_number
+!museum_number: /./+ "." /[^.]/+ ("." /[^.]/+)?
+DUPLICATES: "&d"
+parallel_text: PARALLEL_LINE_PREFIX [CF] text_id " " [chapter_name " "] line_number
+text_id: GENRE " " CATEGORY "." INT
+GENRE: "Lex" | "Med" | "L" | "D" | "Mag"
+CATEGORY: "0" | CAPITAL_ROMAN_NUMERAL
+chapter_name: STAGE " " [quoted_string " "] quoted_string
+quoted_string: "\"" /./+ "\""
+
+STAGE: PERIOD | "SB"
+parallel_composition: PARALLEL_LINE_PREFIX [CF] "(" /./+ " " line_number ")"
+CF: "cf. "
+PARALLEL_LINE_PREFIX: "// "
\ No newline at end of file
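Note: `ebl_atf_note_line.lark` now owns the markup rules that the note, translation and at-line grammars import. A trimmed-down, self-contained sketch of the markup parsing; `language_part`, `bibliography_part` and `url_part` are left out so the snippet needs no cross-module imports:

```python
from lark import Lark

NOTE_GRAMMAR = r"""
note_line: "#note: " _markup
_markup: (emphasis_part | string_part)+
emphasis_part: "@i{" note_text "}"
string_part: note_text
note_text: /[^@{}\n\r]+/
"""

parser = Lark(NOTE_GRAMMAR, start="note_line")
print(parser.parse("#note: see @i{ibid.} p. 12").pretty())
```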
" +PARALLEL_LINE_PREFIX: "// " \ No newline at end of file diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark index 577fc34cf..8f1bdf8f2 100644 --- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark +++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark @@ -1,58 +1,12 @@ -%import common.INT -%import common.LETTER -%import common.LCASE_LETTER -%import .ebl_atf_common (free_text) -%import .ebl_atf_common (seal) %import .ebl_atf_common (AKKADIAN_ALPHABET, GREEK_ALPHABET) %import .ebl_atf_common (VALUE_CHARACTER, LOGOGRAM_CHARACTER) -%import .ebl_atf_common (labels) -%import .ebl_atf_common (CAPITAL_ROMAN_NUMERAL) -%import .ebl_atf_common (PRIME, LEGACY_PRIME, UNCERTAIN, CORRECTION, COLLATION, NO_LONGER_VISIBLE) -%import .ebl_atf_common (object_label, surface_label, column_label) -%import .ebl_atf_abbreviations (PERIOD) +%import .ebl_atf_common (UNCERTAIN, CORRECTION, COLLATION, NO_LONGER_VISIBLE) +%import .ebl_atf_common (line_number) %import .legacy_atf (LEGACY_OPEN_HALF_BRACKET, LEGACY_CLOSE_HALF_BRACKET) %import .legacy_atf (LEGACY_ORACC_JOINER, LEGACY_ORACC_DISH_DIVIDER) %import .legacy_atf (legacy_uncertain_sign_prefix) -note_line: "#note: " _markup -markup: _markup - -_markup: (emphasis_part | language_part | bibliography_part | string_part | url_part)+ -language_part: "@" LANGUAGE "{" text "}" -emphasis_part: "@i{" note_text "}" -bibliography_part: "@bib{" escaped_text ("@" escaped_text)? "}" -escaped_text: (/[^@{}\\\n\r]/ | ("\\" /[@{}\\]/))+ -string_part: note_text -note_text: /[^@{}\n\r]+/ -LANGUAGE: "akk" | "sux" | "es" -url_part: "@url{" url "}" ("{" note_text "}")? -url: /[^}]+/ - -?parallel_line: parallel_fragment | parallel_text | parallel_composition -parallel_fragment: PARALLEL_LINE_PREFIX [CF] "F " museum_number " " [DUPLICATES " "] [object_label " "] [surface_label " "] [column_label " "] line_number -!museum_number: /./+ "." /[^.]/+ ("." /[^.]/+)? -DUPLICATES: "&d" -parallel_text: PARALLEL_LINE_PREFIX [CF] text_id " " [chapter_name " "] line_number -text_id: GENRE " " CATEGORY "." INT -GENRE: "Lex" | "Med" | "L" | "D" | "Mag" -CATEGORY: "0" | CAPITAL_ROMAN_NUMERAL -chapter_name: STAGE " " [quoted_string " "] quoted_string -quoted_string: "\"" /./+ "\"" -STAGE: PERIOD | "SB" -parallel_composition: PARALLEL_LINE_PREFIX [CF] "(" /./+ " " line_number ")" -CF: "cf. " -PARALLEL_LINE_PREFIX: "// " - - -translation_line: "#tr" [ "." TRANSLATION_LANGUAGE ] [".(" translation_extent ")"] ": " _markup -TRANSLATION_LANGUAGE: LCASE_LETTER~2 -translation_extent: [labels " "] line_number - - text_line: line_number "." 
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_translation_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_translation_line.lark
new file mode 100644
index 000000000..0eed7ad8b
--- /dev/null
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_translation_line.lark
@@ -0,0 +1,8 @@
+%import common.LCASE_LETTER
+
+%import .ebl_atf_common (line_number, labels)
+%import .ebl_atf_note_line (_markup)
+
+translation_line: "#tr" [ "." TRANSLATION_LANGUAGE ] [".(" translation_extent ")"] ": " _markup
+TRANSLATION_LANGUAGE: LCASE_LETTER~2
+translation_extent: [labels " "] line_number
\ No newline at end of file
diff --git a/ebl/transliteration/domain/labels.py b/ebl/transliteration/domain/labels.py
index ce941493c..302f1e47f 100644
--- a/ebl/transliteration/domain/labels.py
+++ b/ebl/transliteration/domain/labels.py
@@ -173,13 +173,13 @@ def labels(self, children) -> Sequence[Label]:
         return tuple(children)
 
     @v_args(inline=True)
-    def ebl_atf_text_line__column_label(
+    def ebl_atf_common__column_label(
         self, numeral: Token, status: Sequence[Status]
     ) -> ColumnLabel:
         return ColumnLabel.from_label(numeral, status)  # pyre-ignore[6]
 
     @v_args(inline=True)
-    def ebl_atf_text_line__surface_label(
+    def ebl_atf_common__surface_label(
         self, surface: Token, status: Sequence[Status]
     ) -> SurfaceLabel:
         return SurfaceLabel.from_label(
@@ -188,18 +188,12 @@ def ebl_atf_text_line__surface_label(
         )
 
     @v_args(inline=True)
-    def ebl_atf_text_line__object_label(
+    def ebl_atf_common__object_label(
         self, object_: Token, status: Sequence[Status]
     ) -> ObjectLabel:
         return ObjectLabel.from_object(Object(object_), status)
 
-    @v_args(inline=True)
-    def ebl_atf_text_line__ebl_atf_common__object_label(
-        self, object_: Token, status: Sequence[Status]
-    ) -> ObjectLabel:
-        return ObjectLabel.from_object(Object(object_), status)
-
-    def ebl_atf_text_line__ebl_atf_common__status(
+    def ebl_atf_common__status(
         self, children: Iterable[Token]
     ) -> Sequence[Status]:
         return tuple(Status(token) for token in children)
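Note: the renamed callbacks in `labels.py` follow directly from the grammar move. Lark namespaces the `tree.data` of rules defined inside an imported grammar module with that module's name, so after relocating the label rules they surface as `ebl_atf_common__*` rather than `ebl_atf_text_line__*` (or the doubly prefixed `ebl_atf_text_line__ebl_atf_common__*`). A small demonstration of that prefixing, written against throw-away grammar files; the file names, rule names and the exact printed values are assumptions based on current Lark behaviour:

```python
import pathlib
import tempfile

from lark import Lark

tmp = pathlib.Path(tempfile.mkdtemp())
# `status` is defined inside the imported module and only referenced by `label`.
(tmp / "mod_common.lark").write_text('label: status\nstatus: "!"\n')
# The main grammar imports only `label`, much like ebl_atf.lark imports `labels`.
(tmp / "main.lark").write_text('%import .mod_common (label)\nstart: label\n')

tree = Lark.open(str(tmp / "main.lark")).parse("!")
label = tree.children[0]
print(label.data)              # explicitly imported rule keeps its plain name: "label"
print(label.children[0].data)  # nested rule gets the module prefix: "mod_common__status"
```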