Skip to content

Commit

Permalink
inspire utils
Browse files Browse the repository at this point in the history
  • Loading branch information
ErnestaP committed Jun 5, 2024
1 parent d64392e commit d77cc9d
Show file tree
Hide file tree
Showing 13 changed files with 379 additions and 11 deletions.
2 changes: 1 addition & 1 deletion dags/aps/aps_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from common.exceptions import EmptyOutputFromPreviousTask
from common.scoap3_s3 import Scoap3Repository
from common.utils import create_or_update_article, upload_json_to_s3
from inspire_utils.record import get_value
from common.inspire_utils.inspire_utils import get_value
from structlog import get_logger

logger = get_logger()
Expand Down
2 changes: 1 addition & 1 deletion dags/aps/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from common.parsing.json_extractors import CustomExtractor, NestedValueExtractor
from common.parsing.parser import IParser
from common.utils import construct_license
from inspire_utils.record import get_value
from common.inspire_utils.inspire_utils import get_value
from structlog import get_logger

logger = get_logger()
Expand Down
298 changes: 298 additions & 0 deletions dags/common/inspire_utils/date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Utils to handle dates in INSPIRE."""

from __future__ import absolute_import, division, print_function

import datetime
import itertools
from functools import total_ordering

import six
from babel import dates
from dateutil.parser import parse as parse_date


@total_ordering
@six.python_2_unicode_compatible
class PartialDate(object):
    """Class for representing a partial date.

    The standard constructor assumes that all date parts are known (or not
    present) and have already been converted to `int` s. For more flexibility,
    see :ref:`PartialDate.from_parts` and :ref:`PartialDate.parse`.

    Two `PartialDate` s can be compared and a more complete date is considered
    smaller than the same date with parts removed. Comparing with an object
    that has no ``year``/``month``/``day`` attributes follows the standard
    protocol (``NotImplemented``) instead of raising ``AttributeError``.

    Raises:
        TypeError: when the date parts are not `int` s or `None`, or when a
            day is given without a month.
        ValueError: when the date is not valid.
    """

    def __init__(self, year, month=None, day=None):
        well_typed = all(
            isinstance(part, int) or part is None for part in (year, month, day)
        )
        if not well_typed:
            raise TypeError(
                "arguments to {classname} must be of type int or None".format(
                    classname=type(self).__name__
                )
            )
        if year is None or year < 1000:
            raise ValueError("year must be an int >= 1000")
        if day and not month:
            raise TypeError("month must not be None if day is not None")
        # delegate validation of number of months/days to datetime: missing
        # parts are replaced by 1 so that a full date can be constructed
        completion = (part or 1 for part in (year, month, day))
        datetime.date(*completion)

        self.year = year
        self.month = month
        self.day = day

    def __repr__(self):
        return (
            "PartialDate(year={self.year}, month={self.month}, day={self.day})".format(
                self=self
            )
        )

    def __eq__(self, other):
        # Duck-typed on purpose: anything exposing year/month/day compares as
        # before; anything else is handed back to Python via NotImplemented
        # (so ``PartialDate(...) == None`` is False rather than an error).
        try:
            return (
                self.year == other.year
                and self.month == other.month
                and self.day == other.day
            )
        except AttributeError:
            return NotImplemented

    def __lt__(self, other):
        # A missing month/day is replaced by 99 so that a partial date sorts
        # after every more complete date sharing the same known parts.
        try:
            other_key = (other.year, other.month or 99, other.day or 99)
        except AttributeError:
            return NotImplemented
        self_key = (self.year, self.month or 99, self.day or 99)

        return self_key < other_key

    def __str__(self):
        return self.pprint()

    @classmethod
    def loads(cls, string):
        """Load a date from a string in a record.

        The string is split on ``-`` and each part is converted to `int`
        before being passed to the constructor. A month or day part that is
        shorter than two characters, or equal to ``00``, is rejected.

        This can also be used to validate a date.

        Raises:
            ValueError: when the month or day part is malformed, a part is
                not a number, or the resulting date is not valid.

        Examples:
            >>> PartialDate.loads('1686-06')
            PartialDate(year=1686, month=6, day=None)
            >>> PartialDate.loads('1686-42')
            Traceback (most recent call last):
            ...
            ValueError: month must be in 1..12
        """

        date_parts = string.split("-")

        if len(date_parts) >= 2 and (len(date_parts[1]) < 2 or date_parts[1] == "00"):
            raise ValueError("Month must be in MM format")
        if len(date_parts) == 3 and (len(date_parts[2]) < 2 or date_parts[2] == "00"):
            raise ValueError("Day must be in DD format")
        parts = (int(part) for part in date_parts)

        return cls(*parts)

    def dumps(self):
        """Dump the date for serialization into the record.

        Returns:
            str: normalized date, in the form ``YYYY-MM-DD``, ``YYYY-MM`` or
            ``YYYY`` (depending on the information present in the date)
        """
        # stop at the first missing part: a day without a month is never kept
        non_empty = itertools.takewhile(bool, (self.year, self.month, self.day))
        # XXX: this only handles dates after 1000, which should be sufficient
        formatted = ("{:02d}".format(part) for part in non_empty)
        date = "-".join(formatted)

        return date

    @classmethod
    def parse(cls, date, **kwargs):
        """Parse a date given in arbitrary format.

        This attempts to parse the input date, given in an arbitrary format.

        Args:
            date(str): date to normalize
            **kwargs: these are passed to the `dateutil.parser.parse` function
                which is used internally to parse the date. Most notably, the
                `yearfirst` and `dayfirst` flags can be used if the ordering
                of the date parts is known.

        Returns:
            PartialDate: an object holding the parsed date.

        Raises:
            ValueError: when the date cannot be parsed or no year is present.

        Examples:
            >>> PartialDate.parse('30 Jun 1686')
            PartialDate(year=1686, month=6, day=30)
        """
        # In order to detect partial dates, parse twice with different defaults
        # and compare the results: a part that differs between the two results
        # came from the default, i.e. it was absent from the input.
        default_date1 = datetime.datetime(1, 1, 1)
        default_date2 = datetime.datetime(2, 2, 2)

        parsed_date1 = parse_date(date, default=default_date1, **kwargs)
        parsed_date2 = parse_date(date, default=default_date2, **kwargs)

        has_year = parsed_date1.year == parsed_date2.year
        has_month = parsed_date1.month == parsed_date2.month
        has_day = parsed_date1.day == parsed_date2.day

        if has_year:
            year = parsed_date1.year
        else:
            raise ValueError("date does not contain a year")
        month = parsed_date1.month if has_month else None
        day = parsed_date1.day if has_day else None

        return cls(year, month, day)

    @classmethod
    def from_parts(cls, year, month=None, day=None):
        """Build a PartialDate from its parts.

        Unlike the standard constructor, the parts don't have to be `int` s but
        can be strings containing textual month information. The non-empty
        leading parts are joined with ``-`` and handed to
        :ref:`PartialDate.parse` with ``yearfirst=True``.

        Examples:
            >>> PartialDate.from_parts('1686', 'June', '30')
            PartialDate(year=1686, month=6, day=30)
        """
        # XXX: 0 is not a valid year/month/day
        non_empty = itertools.takewhile(
            bool, (str(part) if part else None for part in (year, month, day))
        )
        return cls.parse("-".join(non_empty), yearfirst=True)

    def pprint(self):
        """Pretty print the date.

        The output contains only the parts that are present: year, month and
        year, or month, day and year.

        Examples:
            >>> PartialDate(1686, 6, 30).pprint()
            u'Jun 30, 1686'
        """
        if not self.month:
            return dates.format_date(
                datetime.date(self.year, 1, 1), "yyyy", locale="en"
            )
        if not self.day:
            return dates.format_date(
                datetime.date(self.year, self.month, 1), "MMM, yyyy", locale="en"
            )
        return dates.format_date(
            datetime.date(self.year, self.month, self.day), "MMM d, yyyy", locale="en"
        )


def normalize_date(date, **kwargs):
    """Normalize a date so that it is schema-compliant.

    Thin convenience wrapper around :ref:`PartialDate`: the input is parsed
    with ``PartialDate.parse`` and serialized back with ``dumps``. Use
    :ref:`PartialDate` directly when more control is required.

    Note:
        A ``None`` input yields ``None`` rather than an exception, which
        keeps ``DoJSON`` code simple since it already strips ``None``
        values at the end.

    Args:
        date(str): date to normalize
        **kwargs: forwarded to ``PartialDate.parse`` (and from there to
            `dateutil.parser.parse`). Most notably, the `yearfirst` and
            `dayfirst` flags can be used if the ordering of the date parts
            is known.

    Returns:
        str: normalized date, in the form ``YYYY-MM-DD``, ``YYYY-MM`` or
        ``YYYY`` (depending on the information present in the date).

    Raises:
        ValueError: when the date cannot be parsed or no year is present.

    Examples:
        >>> normalize_date(None)
        >>> normalize_date('30 Jun 1686')
        '1686-06-30'
    """
    if date is not None:
        parsed = PartialDate.parse(date, **kwargs)
        return parsed.dumps()

    return None


def format_date(date):
    """Render a schema-compliant date string in a human-friendly form.

    Convenience wrapper around :ref:`PartialDate`: the string is loaded with
    ``PartialDate.loads`` and rendered with ``pprint``. Use
    :ref:`PartialDate` directly when more features are needed.
    """
    partial = PartialDate.loads(date)
    return partial.pprint()


def earliest_date(dates):
    """Pick the earliest of several schema-compliant dates.

    Convenience wrapper around :ref:`PartialDate`: every entry is loaded with
    ``PartialDate.loads``, the smallest one is selected and serialized back
    with ``dumps``. Use :ref:`PartialDate` directly when more features are
    needed.

    Args:
        dates(list): date strings among which the oldest/earliest is chosen

    Returns:
        str: Earliest date from provided list

    Raises:
        ValueError: when ``dates`` is empty or an entry cannot be loaded.
    """
    loaded = [PartialDate.loads(date) for date in dates]
    oldest = min(loaded)
    return oldest.dumps()


def fill_missing_date_parts(date):
    """Complete a schema-compliant date by setting a missing month/day to 1.

    Useful to avoid errors when saving to DB. ``None`` is passed through
    unchanged.

    Args:
        date(str): date string accepted by ``PartialDate.loads``, or ``None``

    Returns:
        str: the serialized date with month and day filled in, or ``None``
        when ``date`` is ``None``.
    """
    if date is None:
        return None

    partial = PartialDate.loads(date)
    # keep existing parts, default the absent ones to the first month/day
    partial.month = partial.month or 1
    partial.day = partial.day or 1
    return partial.dumps()
71 changes: 71 additions & 0 deletions dags/common/inspire_utils/inspire_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import re

from six import string_types

SPLIT_KEY_PATTERN = re.compile(r"\.|\[")


def get_value(record, key, default=None):
    """Return item as `dict.__getitem__` but using 'smart queries'.

    ``key`` is first tried as a plain key of ``record``. If that fails it is
    split on ``.`` and ``[`` and resolved step by step:

    * a step applied to a dict is a regular key lookup;
    * a step ending in ``]`` is a list index (``a[0]``), a slice (``a[1:3]``)
      or, with ``n``, the last element (``a[n]``);
    * any other step applied to a list is applied to every element, and the
      results of the elements that have it are collected in a list.

    As soon as a step cannot be resolved (missing key, index out of range,
    or an intermediate value such as ``None``, a string or a number that
    cannot be traversed) ``default`` is returned.

    Args:
        record(dict or list): the structure to query. A top-level list is
            wrapped so that it can be queried like the value of a key.
        key(str): the query.
        default: value returned when the query cannot be resolved.

    Returns:
        the value found for ``key``, or ``default``.

    Examples:
        >>> get_value({'a': [{'b': 1}, {'b': 2}]}, 'a.b')
        [1, 2]
        >>> get_value({'a': [{'b': 1}, {'b': 2}]}, 'a[n].b')
        2
        >>> get_value({'a': []}, 'a[0].b') is None
        True

    .. note::
        Accessing one value in a normal way, meaning d['a'], is almost as
        fast as accessing a regular dictionary. But using the special
        name convention is a bit slower than using the regular access:

        .. code-block:: python

            >>> %timeit x = dd['a[0].b']
            100000 loops, best of 3: 3.94 us per loop
            >>> %timeit x = dd['a'][0]['b']
            1000000 loops, best of 3: 598 ns per loop
    """

    def getitem(k, v):
        """Resolve one step ``k`` on ``v``; raise KeyError if impossible."""
        if isinstance(v, string_types):
            # never index or iterate into the characters of a string
            raise KeyError(k)
        if isinstance(v, dict):
            return v[k]
        if "]" in k:
            # Work around for list indexes and slices: drop the closing
            # bracket and let "n" stand for the last element.
            index = k[:-1].replace("n", "-1")
            try:
                subscript = int(index)
            except ValueError:
                bounds = [
                    int(bound.strip()) if bound.strip() else None
                    for bound in index.split(":")
                ]
                subscript = slice(*bounds)
            try:
                return v[subscript]
            except (IndexError, TypeError):
                # index out of range, or ``v`` is not subscriptable (e.g.
                # None): signal a failed lookup instead of handing back a
                # value that the next step would try to traverse.
                raise KeyError(k)

        try:
            elements = iter(v)
        except TypeError:
            # None or a scalar where a list was expected
            raise KeyError(k)

        found = []
        for element in elements:
            try:
                found.append(getitem(k, element))
            except KeyError:
                continue
        return found

    # Wrap a top-level list in a dict
    if isinstance(record, list):
        record = {"record": record}
        key = ".".join(["record", key])

    # Check if we are using python regular keys
    try:
        return record[key]
    except KeyError:
        pass

    value = record
    for step in SPLIT_KEY_PATTERN.split(key):
        try:
            value = getitem(step, value)
        except KeyError:
            return default
    return value
2 changes: 1 addition & 1 deletion dags/common/parsing/json_extractors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from common.parsing.extractor import IExtractor
from inspire_utils.record import get_value
from common.inspire_utils.inspire_utils import get_value


class NestedValueExtractor(IExtractor):
Expand Down
Loading

0 comments on commit d77cc9d

Please sign in to comment.