Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preliminary combined date parser #112

Draft
wants to merge 1 commit into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions src/undate/converters/calendars/hebrew/hebrew.lark
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,23 @@ hebrew_date: day month year | month year | year
// PGP dates use qualifiers like "first decade of" (for beginning of month)
// "first third of", seasons (can look for more examples)

// Hebrew calendar starts with year 1 in 3761 BCE
// Hebrew calendar starts with year 1 in 3761 BCE
year: /\d+/

// months
month: month_1
| month_2
| month_3
| month_4
| month_5
| month_6
| month_7
| month_8
| month_9
| month_10
| month_11
| month_12
| month_13
| month_3
| month_4
| month_5
| month_6
| month_7
| month_8
| month_9
| month_10
| month_11
| month_12
| month_13
// months have 29 or 30 days; we do not expect leading zeroes
day: /[1-9]/ | /[12][0-9]/ | /30/

Expand Down
12 changes: 6 additions & 6 deletions src/undate/converters/calendars/hebrew/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ def hebrew_date(self, items):
value = int(child.children[0])
parts[str(child.data)] = value

# initialize and return an undate with islamic year, month, day and
# islamic calendar
# initialize and return an undate with year, month, day in
# hebrew calendar
return HebrewUndate(**parts)

# year translation is not needed since we want a tree with name year
# this is equivalent to a no-op
# def year(self, items):
# return Tree(data="year", children=[items[0]])
def year(self, items):
    """Collapse the parsed year parts into one string inside a ``year`` tree."""
    # stringify every part and concatenate them in their original order
    combined = "".join(map(str, items))
    return Tree(data="year", children=[combined])

def month(self, items):
# month has a nested tree for the rule and the value
Expand Down
13 changes: 11 additions & 2 deletions src/undate/converters/calendars/hijri/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,17 @@ def hijri_date(self, items):

# year needs an explicit translation here: the parsed parts are combined
# into a single string and returned as a tree with name year
# def year(self, items):
# return Tree(data="year", children=[items[0]])
def year(self, items):
    """Return a ``year`` tree whose single child is the joined year text."""
    # NOTE: in the combined parser an anonymous token shows up among the
    # items (cause unknown), so all parts are concatenated into one string
    year_text = "".join(str(part) for part in items)
    return Tree(data="year", children=[year_text])

def day(self, items):
    """Return a ``day`` tree whose single child is the joined day text."""
    # NOTE: in the combined parser an anonymous token shows up among the
    # items (cause unknown), so all parts are concatenated into one string
    pieces = [str(part) for part in items]
    return Tree(data="day", children=["".join(pieces)])

def month(self, items):
# month has a nested tree for the rule and the value
Expand Down
32 changes: 32 additions & 0 deletions src/undate/converters/combined.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Combined date grammar: accepts a date written in any one of the
// imported formats (EDTF, Hebrew calendar, Hijri calendar).

// whitespace between tokens is ignored
%import common.WS
%ignore WS

// a combined date is exactly one of the imported date formats
start: (edtf__start | hebrew__hebrew_date | hijri__hijri_date )

// Renaming of the import variables is required, as they receive the namespace of this file.
// See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565

// relative import from edtf/edtf.lark
// NOTE: this results in a prefix of edtf__edtf__
%import .edtf.edtf.edtf -> edtf__start

// relative import from calendars/hebrew/hebrew.lark
// (day, month and year are imported by name so they can be referenced
// in the hebrew date override below)
%import .calendars.hebrew.hebrew.hebrew_date -> hebrew__hebrew_date
%import .calendars.hebrew.hebrew.day -> hebrew__day
%import .calendars.hebrew.hebrew.month -> hebrew__month
%import .calendars.hebrew.hebrew.year -> hebrew__year

// relative import from calendars/hijri/hijri.lark
// (day, month and year are imported by name so they can be referenced
// in the hijri date override below)
%import .calendars.hijri.hijri.hijri_date -> hijri__hijri_date
%import .calendars.hijri.hijri.day -> hijri__day
%import .calendars.hijri.hijri.month -> hijri__month
%import .calendars.hijri.hijri.year -> hijri__year



// override hebrew date to omit year-only, since year without calendar is ambiguous
// NOTE: potentially support year with calendar label
%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year

// same for hijri date, year alone is ambiguous
%override hijri__hijri_date: hijri__day hijri__month hijri__year | hijri__month hijri__year
25 changes: 25 additions & 0 deletions src/undate/converters/combined.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from lark import Lark
from lark.visitors import Transformer, merge_transformers

from undate.converters.edtf.transformer import EDTFTransformer
from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer
from undate.converters.calendars.hijri.transformer import HijriDateTransformer


class CombinedDateTransformer(Transformer):
    """Transformer for the ``start`` rule of the combined date grammar."""

    def start(self, children):
        """Hand back the children of the ``start`` rule unchanged."""
        return children


# NOTE: we can't support year-only dates in combined parser because calendar
# is ambiguous, unless we want to add a calendar indicator

# Build a single transformer for the combined grammar: the positional
# argument is the base transformer, and each keyword argument names the
# rule prefix that the given format-specific transformer is attached to.
combined_transformer = merge_transformers(
CombinedDateTransformer(),
edtf__edtf=EDTFTransformer(), # nested prefix due to nested import path
hebrew=HebrewDateTransformer(),
hijri=HijriDateTransformer(),
)


# Load the combined grammar from combined.lark, located relative to this
# module's file.
# NOTE(review): strict=True presumably makes lark reject potential grammar
# ambiguities/collisions at load time — confirm against the Lark options docs
parser = Lark.open("combined.lark", rel_to=__file__, strict=True)
5 changes: 4 additions & 1 deletion src/undate/converters/edtf/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ def day_unspecified(self, items):
def date_level1(self, items):
return self.date(items)

# year (including negative years) use default transformation
def year(self, items):
    """Merge the year's parts (numeric & unknown) into one ``year`` tree."""
    # get_values supplies the individual pieces; join them into one string
    parts = self.get_values(items)
    return Tree(data="year", children=["".join(parts)])

def year_fivedigitsplus(self, items):
# strip off the leading Y and convert to integer
Expand Down
35 changes: 35 additions & 0 deletions tests/test_converters/test_combined_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pytest

from undate.converters.combined import parser, combined_transformer

from undate.undate import Undate, UndateInterval

# for now, just test that valid dates can be parsed

# Each entry pairs an input date string with the object it is expected to
# be transformed into; entries are grouped by the format of the input.
testcases = [
# EDTF
("1984", Undate(1984)),
("201X", Undate("201X")),
("20XX", Undate("20XX")),
("2004-XX", Undate(2004, "XX")),
("1000/2000", UndateInterval(Undate(1000), Undate(2000))),
# Hebrew / Anno Mundi calendar
("Tammuz 4816", Undate(4816, 4, calendar="Hebrew")),
# Islamic / Hijri calendar
("Jumādā I 1243", Undate(1243, 5, calendar="Hijri")),
("7 Jumādā I 1243", Undate(1243, 5, 7, calendar="Hijri")),
("14 Rabīʿ I 901", Undate(901, 3, 14, calendar="Hijri")),
]


@pytest.mark.parametrize("date_string,expected", testcases)
def test_transform(date_string, expected):
    """Parse a sample date string and check the transformed result's repr."""
    # first build the parse tree for the input
    tree = parser.parse(date_string)
    print(tree)
    # then convert the tree; the date of interest is the first item
    result = combined_transformer.transform(tree)
    print(result)
    # the same unknown date is not considered strictly equal,
    # so the object representations are compared instead
    assert repr(result[0]) == repr(expected)