diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/calendars/hebrew/hebrew.lark index b55ec3f..7e7b107 100644 --- a/src/undate/converters/calendars/hebrew/hebrew.lark +++ b/src/undate/converters/calendars/hebrew/hebrew.lark @@ -11,23 +11,23 @@ hebrew_date: day month year | month year | year // PGP dates use qualifiers like "first decade of" (for beginning of month) // "first third of", seasons (can look for more examples) -// Hebrew calendar starts with year 1 in 3761 BCE +// Hebrew calendar starts with year 1 in 3761 BCE year: /\d+/ // months month: month_1 | month_2 - | month_3 - | month_4 - | month_5 - | month_6 - | month_7 - | month_8 - | month_9 - | month_10 - | month_11 - | month_12 - | month_13 + | month_3 + | month_4 + | month_5 + | month_6 + | month_7 + | month_8 + | month_9 + | month_10 + | month_11 + | month_12 + | month_13 // months have 29 or 30 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /30/ diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py index a6d2888..8e39aeb 100644 --- a/src/undate/converters/calendars/hebrew/transformer.py +++ b/src/undate/converters/calendars/hebrew/transformer.py @@ -22,14 +22,14 @@ def hebrew_date(self, items): value = int(child.children[0]) parts[str(child.data)] = value - # initialize and return an undate with islamic year, month, day and - # islamic calendar + # initialize and return an undate with year, month, day in the + # Hebrew calendar return HebrewUndate(**parts) - # year translation is not needed since we want a tree with name year - # this is equivalent to a no-op - # def year(self, items): - # return Tree(data="year", children=[items[0]]) + def year(self, items): + # join parts into one string (extra token in combined parser? TODO confirm) + value = "".join([str(i) for i in items]) + return Tree(data="year", children=[value]) def month(self, items): # month has a nested tree for the rule and the value diff --git 
a/src/undate/converters/calendars/hijri/transformer.py b/src/undate/converters/calendars/hijri/transformer.py index b575df9..45b4558 100644 --- a/src/undate/converters/calendars/hijri/transformer.py +++ b/src/undate/converters/calendars/hijri/transformer.py @@ -28,8 +28,17 @@ def hijri_date(self, items): - # year translation is not needed since we want a tree with name year - # this is equivalent to a no-op - # def year(self, items): - # return Tree(data="year", children=[items[0]]) + # year and day are rebuilt explicitly so that every parsed part is + # joined into a single value under a tree named for the rule + def year(self, items): + # combine multiple parts into a single string + # (for some reason we're getting an anonymous token in combined parser) + value = "".join([str(i) for i in items]) + return Tree(data="year", children=[value]) + + def day(self, items): + # combine multiple parts into a single string + # (for some reason we're getting an anonymous token in combined parser) + value = "".join([str(i) for i in items]) + return Tree(data="day", children=[value]) def month(self, items): # month has a nested tree for the rule and the value diff --git a/src/undate/converters/combined.lark b/src/undate/converters/combined.lark new file mode 100644 index 0000000..eb559d4 --- /dev/null +++ b/src/undate/converters/combined.lark @@ -0,0 +1,32 @@ +%import common.WS +%ignore WS + +start: (edtf__start | hebrew__hebrew_date | hijri__hijri_date ) + +// Renaming of the import variables is required, as they receive the namespace of this file. 
+// See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565 + +// relative import from edtf/edtf.lark +// NOTE: this results in a prefix of edtf__edtf__ +%import .edtf.edtf.edtf -> edtf__start + +// relative import from calendars/hebrew/hebrew.lark +%import .calendars.hebrew.hebrew.hebrew_date -> hebrew__hebrew_date +%import .calendars.hebrew.hebrew.day -> hebrew__day +%import .calendars.hebrew.hebrew.month -> hebrew__month +%import .calendars.hebrew.hebrew.year -> hebrew__year + +// relative import from calendars/hijri/hijri.lark +%import .calendars.hijri.hijri.hijri_date -> hijri__hijri_date +%import .calendars.hijri.hijri.day -> hijri__day +%import .calendars.hijri.hijri.month -> hijri__month +%import .calendars.hijri.hijri.year -> hijri__year + + + +// override hebrew date to omit year-only, since year without calendar is ambiguous +// NOTE: potentially support year with calendar label +%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year + +// same for hijri date, year alone is ambiguous +%override hijri__hijri_date: hijri__day hijri__month hijri__year | hijri__month hijri__year diff --git a/src/undate/converters/combined.py b/src/undate/converters/combined.py new file mode 100644 index 0000000..7f41afc --- /dev/null +++ b/src/undate/converters/combined.py @@ -0,0 +1,25 @@ +from lark import Lark +from lark.visitors import Transformer, merge_transformers + +from undate.converters.edtf.transformer import EDTFTransformer +from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer +from undate.converters.calendars.hijri.transformer import HijriDateTransformer + + +class CombinedDateTransformer(Transformer): + def start(self, children): + return children + + +# NOTE: year-only Hebrew/Hijri dates are not supported in combined parser +# because calendar is ambiguous, unless we want to add a calendar indicator + +combined_transformer = merge_transformers( + CombinedDateTransformer(), 
+ edtf__edtf=EDTFTransformer(), # nested prefix due to nested import path + hebrew=HebrewDateTransformer(), + hijri=HijriDateTransformer(), +) + + +parser = Lark.open("combined.lark", rel_to=__file__, strict=True) diff --git a/src/undate/converters/edtf/transformer.py b/src/undate/converters/edtf/transformer.py index d5bcfcb..5268700 100644 --- a/src/undate/converters/edtf/transformer.py +++ b/src/undate/converters/edtf/transformer.py @@ -66,7 +66,10 @@ def day_unspecified(self, items): def date_level1(self, items): return self.date(items) - # year (including negative years) use default transformation + def year(self, items): + # combine parts (numeric & unknown) into a single string + value = "".join(self.get_values(items)) + return Tree(data="year", children=[value]) def year_fivedigitsplus(self, items): # strip off the leading Y and convert to integer diff --git a/tests/test_converters/test_combined_parser.py b/tests/test_converters/test_combined_parser.py new file mode 100644 index 0000000..b90ca73 --- /dev/null +++ b/tests/test_converters/test_combined_parser.py @@ -0,0 +1,35 @@ +import pytest + +from undate.converters.combined import parser, combined_transformer + +from undate.undate import Undate, UndateInterval + +# for now, just test that valid dates parse and transform as expected + +testcases = [ + # EDTF + ("1984", Undate(1984)), + ("201X", Undate("201X")), + ("20XX", Undate("20XX")), + ("2004-XX", Undate(2004, "XX")), + ("1000/2000", UndateInterval(Undate(1000), Undate(2000))), + # Hebrew / Anno Mundi calendar + ("Tammuz 4816", Undate(4816, 4, calendar="Hebrew")), + # Islamic / Hijri calendar + ("Jumādā I 1243", Undate(1243, 5, calendar="Hijri")), + ("7 Jumādā I 1243", Undate(1243, 5, 7, calendar="Hijri")), + ("14 Rabīʿ I 901", Undate(901, 3, 14, calendar="Hijri")), +] + + +@pytest.mark.parametrize("date_string,expected", testcases) +def test_transform(date_string, expected): + transformer = combined_transformer + # parse the input string, then transform to undate object 
+ parsetree = parser.parse(date_string) + print(parsetree) + # since the same unknown date is not considered strictly equal, + # compare object representations + transformed_date = transformer.transform(parsetree) + print(transformed_date) + assert repr(transformed_date[0]) == repr(expected)