From d77cc9da70a9e53d13aa12d6b41bcce66780e0f3 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Wed, 5 Jun 2024 12:29:57 +0300 Subject: [PATCH] inspore utils --- dags/aps/aps_process_file.py | 2 +- dags/aps/parser.py | 2 +- dags/common/inspire_utils/date.py | 298 +++++++++++++++++++++ dags/common/inspire_utils/inspire_utils.py | 71 +++++ dags/common/parsing/json_extractors.py | 2 +- dags/common/utils.py | 2 +- dags/elsevier/elsevier_file_processing.py | 2 +- dags/hindawi/hindawi_file_processing.py | 2 +- dags/iop/iop_process_file.py | 2 +- dags/iop/parser.py | 2 +- dags/oup/oup_process_file.py | 2 +- dags/springer/springer_process_file.py | 2 +- requirements.txt | 1 - 13 files changed, 379 insertions(+), 11 deletions(-) create mode 100644 dags/common/inspire_utils/date.py create mode 100644 dags/common/inspire_utils/inspire_utils.py diff --git a/dags/aps/aps_process_file.py b/dags/aps/aps_process_file.py index 49ae03c8..8641e80d 100644 --- a/dags/aps/aps_process_file.py +++ b/dags/aps/aps_process_file.py @@ -9,7 +9,7 @@ from common.exceptions import EmptyOutputFromPreviousTask from common.scoap3_s3 import Scoap3Repository from common.utils import create_or_update_article, upload_json_to_s3 -from inspire_utils.record import get_value +from common.inspire_utils.inspire_utils import get_value from structlog import get_logger logger = get_logger() diff --git a/dags/aps/parser.py b/dags/aps/parser.py index 6d10b10a..d51ed4d5 100644 --- a/dags/aps/parser.py +++ b/dags/aps/parser.py @@ -3,7 +3,7 @@ from common.parsing.json_extractors import CustomExtractor, NestedValueExtractor from common.parsing.parser import IParser from common.utils import construct_license -from inspire_utils.record import get_value +from common.inspire_utils.inspire_utils import get_value from structlog import get_logger logger = get_logger() diff --git a/dags/common/inspire_utils/date.py b/dags/common/inspire_utils/date.py new file mode 100644 index 00000000..d25f4633 --- /dev/null +++ 
b/dags/common/inspire_utils/date.py @@ -0,0 +1,298 @@ +# -*- coding: utf-8 -*- +# +# This file is part of INSPIRE. +# Copyright (C) 2014-2017 CERN. +# +# INSPIRE is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# INSPIRE is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>. +# +# In applying this license, CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. + +"""Utils to handle dates in INSPIRE.""" + +from __future__ import absolute_import, division, print_function + +import datetime +import itertools +from functools import total_ordering + +import six +from babel import dates +from dateutil.parser import parse as parse_date + + +@total_ordering +@six.python_2_unicode_compatible +class PartialDate(object): + """Class for representing a partial date. + + The standard constructor assumes that all date parts are known (or not + present) and have already been converted to `int` s. For more flexibility, + see :ref:`PartialDate.from_parts` and :ref:`PartialDate.parse`. + + Two `PartialDate` s can be compared and a more complete date is considered + smaller than the same date with parts removed. + + Raises: + TypeError: when the date parts are not `int` s or `None`. + ValueError: when the date is not valid. 
+ + """ + + def __init__(self, year, month=None, day=None): + well_typed = all( + isinstance(part, int) or part is None for part in (year, month, day) + ) + if not well_typed: + raise TypeError( + "arguments to {classname} must be of type int or None".format( + classname=type(self).__name__ + ) + ) + if year is None or year < 1000: + raise ValueError("year must be an int >= 1000") + if day and not month: + raise TypeError("month must not be None if day is not None") + # delegate validation of number of months/days to datetime + completion = (part or 1 for part in (year, month, day)) + datetime.date(*completion) + + self.year = year + self.month = month + self.day = day + + def __repr__(self): + return ( + "PartialDate(year={self.year}, month={self.month}, day={self.day})".format( + self=self + ) + ) + + def __eq__(self, other): + return ( + self.year == other.year + and self.month == other.month + and self.day == other.day + ) + + def __lt__(self, other): + self_month = self.month or 99 + self_day = self.day or 99 + other_month = other.month or 99 + other_day = other.day or 99 + + return (self.year, self_month, self_day) < (other.year, other_month, other_day) + + def __str__(self): + return self.pprint() + + @classmethod + def loads(cls, string): + """Load a date from a string in a record. + + This can also be used to validate a date. + + Examples: + >>> PartialDate.loads('1686-06') + PartialDate(year=1686, month=6, day=None) + >>> PartialDate.loads('1686-42') + Traceback (most recent call last): + ... 
+ ValueError: month must be in 1..12 + + """ + + date_parts = string.split("-") + + if len(date_parts) >= 2 and (len(date_parts[1]) < 2 or date_parts[1] == "00"): + raise ValueError("Month must be in MM format") + if len(date_parts) == 3 and (len(date_parts[2]) < 2 or date_parts[2] == "00"): + raise ValueError("Day must be in DD format") + parts = (int(part) for part in date_parts) + + return cls(*parts) + + def dumps(self): + """Dump the date for serialization into the record. + + Returns: + str: normalized date, in the form ``YYYY-MM-DD``, ``YYYY-MM`` or + ``YYYY`` (depending on the information present in the date) + + """ + non_empty = itertools.takewhile(bool, (self.year, self.month, self.day)) + # XXX: this only handles dates after 1000, which should be sufficient + formatted = ("{:02d}".format(part) for part in non_empty) + date = "-".join(formatted) + + return date + + @classmethod + def parse(cls, date, **kwargs): + """Parse a date given in arbitrary format. + + This attempts to parse the input date, given in an arbitrary format. + + Args: + date(str): date to normalize + **kwargs: these are passed to the `dateutil.parser.parse` function + which is used internally to parse the date. Most notably, the + `yearfirst` and `dayfirst` flags can be used if the ordering + of the date parts is known. + + Returns: + PartialDate: an object holding the parsed date. + + Raises: + ValueError: when the date cannot be parsed or no year is present. + + Examples: + >>> PartialDate.parse('30 Jun 1686') + PartialDate(year=1686, month=6, day=30) + + """ + # In order to detect partial dates, parse twice with different defaults + # and compare the results. 
+ default_date1 = datetime.datetime(1, 1, 1) + default_date2 = datetime.datetime(2, 2, 2) + + parsed_date1 = parse_date(date, default=default_date1, **kwargs) + parsed_date2 = parse_date(date, default=default_date2, **kwargs) + + has_year = parsed_date1.year == parsed_date2.year + has_month = parsed_date1.month == parsed_date2.month + has_day = parsed_date1.day == parsed_date2.day + + if has_year: + year = parsed_date1.year + else: + raise ValueError("date does not contain a year") + month = parsed_date1.month if has_month else None + day = parsed_date1.day if has_day else None + + return cls(year, month, day) + + @classmethod + def from_parts(cls, year, month=None, day=None): + """Build a PartialDate from its parts. + + Unlike the standard constructor, the parts don't have to be `int` s but + can be strings containing textual month information. + + Examples: + >>> PartialDate.from_parts('1686', 'June', '30') + PartialDate(year=1686, month=6, day=30) + + """ + # XXX: 0 is not a valid year/month/day + non_empty = itertools.takewhile( + bool, (str(part) if part else None for part in (year, month, day)) + ) + return cls.parse("-".join(non_empty), yearfirst=True) + + def pprint(self): + """Pretty print the date. + + Examples: + >>> PartialDate(1686, 6, 30).pprint() + u'Jun 30, 1686' + + """ + if not self.month: + return dates.format_date( + datetime.date(self.year, 1, 1), "yyyy", locale="en" + ) + if not self.day: + return dates.format_date( + datetime.date(self.year, self.month, 1), "MMM, yyyy", locale="en" + ) + return dates.format_date( + datetime.date(self.year, self.month, self.day), "MMM d, yyyy", locale="en" + ) + + +def normalize_date(date, **kwargs): + """Normalize a date to be schema-compliant. + + This is a convenience wrapper around :ref:`PartialDate`, which should be + used instead if more features are needed. 
+ + Note: + When ``date`` is ``None`` this returns ``None`` instead of raising + an exception because this makes ``DoJSON``'s code simpler, as it + already knows how to strip ``None`` values at the end. + + Args: + date(str): date to normalize + **kwargs: these are passed to the `dateutil.parser.parse` function + that is used internally to parse the date. Most notably, the + `yearfirst` and `dayfirst` flags can be used if the ordering + of the date parts is known. + + Returns: + str: normalized date, in the form ``YYYY-MM-DD``, ``YYYY-MM`` or + ``YYYY`` (depending on the information present in the date). + + Raises: + ValueError: when the date cannot be parsed or no year is present. + + Examples: + >>> normalize_date(None) + >>> normalize_date('30 Jun 1686') + '1686-06-30' + + """ + if date is None: + return + + return PartialDate.parse(date, **kwargs).dumps() + + +def format_date(date): + """Format a schema-compliant date string in a human-friendly format. + + This is a convenience wrapper around :ref:`PartialDate`, which should be + used instead if more features are needed. + """ + return PartialDate.loads(date).pprint() + + +def earliest_date(dates): + """Return the earliest among the schema-compliant dates. + + This is a convenience wrapper around :ref:`PartialDate`, which should be + used instead if more features are needed. + + Args: + dates(list): List of dates from which oldest/earliest one will be returned + Returns: + str: Earliest date from provided list + """ + min_date = min(PartialDate.loads(date) for date in dates) + return min_date.dumps() + + +def fill_missing_date_parts(date): + """Sets missing day and/or month to 1. 
Useful to avoid errors when saving to DB.""" + + if date is None: + return + + date_obj = PartialDate.loads(date) + if not date_obj.month: + date_obj.month = 1 + if not date_obj.day: + date_obj.day = 1 + return date_obj.dumps() diff --git a/dags/common/inspire_utils/inspire_utils.py b/dags/common/inspire_utils/inspire_utils.py new file mode 100644 index 00000000..af01a61b --- /dev/null +++ b/dags/common/inspire_utils/inspire_utils.py @@ -0,0 +1,71 @@ +import re + +from six import string_types + +SPLIT_KEY_PATTERN = re.compile(r"\.|\[") + + +def get_value(record, key, default=None): + """Return item as `dict.__getitem__` but using 'smart queries'. + + .. note:: + + Accessing one value in a normal way, meaning d['a'], is almost as + fast as accessing a regular dictionary. But using the special + name convention is a bit slower than using the regular access: + .. code-block:: python + >>> %timeit x = dd['a[0].b'] + 100000 loops, best of 3: 3.94 us per loop + >>> %timeit x = dd['a'][0]['b'] + 1000000 loops, best of 3: 598 ns per loop + """ + + def getitem(k, v, default): + if isinstance(v, string_types): + raise KeyError + elif isinstance(v, dict): + return v[k] + elif "]" in k: + k = k[:-1].replace("n", "-1") + # Work around for list indexes and slices + try: + return v[int(k)] + except IndexError: + return default + except ValueError: + return v[ + slice( + *map( + lambda x: int(x.strip()) if x.strip() else None, + k.split(":"), + ) + ) + ] + else: + tmp = [] + for inner_v in v: + try: + tmp.append(getitem(k, inner_v, default)) + except KeyError: + continue + return tmp + + # Wrap a top-level list in a dict + if isinstance(record, list): + record = {"record": record} + key = ".".join(["record", key]) + + # Check if we are using python regular keys + try: + return record[key] + except KeyError: + pass + + keys = SPLIT_KEY_PATTERN.split(key) + value = record + for k in keys: + try: + value = getitem(k, value, default) + except KeyError: + return default + return value 
diff --git a/dags/common/parsing/json_extractors.py b/dags/common/parsing/json_extractors.py index 20253537..25f2362e 100644 --- a/dags/common/parsing/json_extractors.py +++ b/dags/common/parsing/json_extractors.py @@ -1,5 +1,5 @@ from common.parsing.extractor import IExtractor -from inspire_utils.record import get_value +from common.inspire_utils.inspire_utils import get_value class NestedValueExtractor(IExtractor): diff --git a/dags/common/utils.py b/dags/common/utils.py index 34399b82..688378b1 100644 --- a/dags/common/utils.py +++ b/dags/common/utils.py @@ -9,7 +9,7 @@ from ftplib import error_perm from io import StringIO from stat import S_ISDIR, S_ISREG -from inspire_utils.record import get_value +from common.inspire_utils.inspire_utils import get_value import backoff import pycountry diff --git a/dags/elsevier/elsevier_file_processing.py b/dags/elsevier/elsevier_file_processing.py index 258a8f6f..f5cd8eeb 100644 --- a/dags/elsevier/elsevier_file_processing.py +++ b/dags/elsevier/elsevier_file_processing.py @@ -11,7 +11,7 @@ ) from elsevier.parser import ElsevierParser from elsevier.repository import ElsevierRepository -from inspire_utils.record import get_value +from common.inspire_utils.inspire_utils import get_value from structlog import get_logger logger = get_logger() diff --git a/dags/hindawi/hindawi_file_processing.py b/dags/hindawi/hindawi_file_processing.py index 103ff671..9731d88e 100644 --- a/dags/hindawi/hindawi_file_processing.py +++ b/dags/hindawi/hindawi_file_processing.py @@ -9,7 +9,7 @@ from common.utils import create_or_update_article, upload_json_to_s3 from hindawi.parser import HindawiParser from hindawi.repository import HindawiRepository -from inspire_utils.record import get_value +from common.inspire_utils.inspire_utils import get_value from structlog import get_logger logger = get_logger() diff --git a/dags/iop/iop_process_file.py b/dags/iop/iop_process_file.py index f280b2ae..61406253 100644 --- a/dags/iop/iop_process_file.py +++ 
b/dags/iop/iop_process_file.py @@ -8,7 +8,7 @@ from common.exceptions import EmptyOutputFromPreviousTask from common.scoap3_s3 import Scoap3Repository from common.utils import create_or_update_article, upload_json_to_s3 -from inspire_utils.record import get_value +from common.inspire_utils.inspire_utils import get_value from iop.parser import IOPParser from iop.repository import IOPRepository from structlog import get_logger diff --git a/dags/iop/parser.py b/dags/iop/parser.py index 4a8ec1ca..03c7646f 100644 --- a/dags/iop/parser.py +++ b/dags/iop/parser.py @@ -21,7 +21,7 @@ parse_country_from_value ) from idutils import is_arxiv -from inspire_utils.date import PartialDate +from common.inspire_utils.date import PartialDate from structlog import get_logger diff --git a/dags/oup/oup_process_file.py b/dags/oup/oup_process_file.py index ac8481d5..cdd6c9a2 100644 --- a/dags/oup/oup_process_file.py +++ b/dags/oup/oup_process_file.py @@ -12,7 +12,7 @@ parse_without_names_spaces, upload_json_to_s3, ) -from inspire_utils.record import get_value +from common.inspire_utils.inspire_utils import get_value from jsonschema import validate from oup.parser import OUPParser from oup.repository import OUPRepository diff --git a/dags/springer/springer_process_file.py b/dags/springer/springer_process_file.py index 83d82565..f44349f3 100644 --- a/dags/springer/springer_process_file.py +++ b/dags/springer/springer_process_file.py @@ -9,7 +9,7 @@ from common.exceptions import EmptyOutputFromPreviousTask from common.scoap3_s3 import Scoap3Repository from common.utils import create_or_update_article, upload_json_to_s3 -from inspire_utils.record import get_value +from common.inspire_utils.inspire_utils import get_value from jsonschema import validate from springer.parser import SpringerParser from springer.repository import SpringerRepository diff --git a/requirements.txt b/requirements.txt index de52da92..619aa6da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ paramiko 
boto3 backoff bleach==6.0.0 -inspire-utils==3.0.25 idutils==1.2.1 furl==2.1.3 busypie==0.4.5