From 25de8ea70134251d665cae558d6dcafc55896579 Mon Sep 17 00:00:00 2001 From: Miguel Garcia Garcia Date: Fri, 23 Aug 2024 16:16:34 +0200 Subject: [PATCH] global: fix parser imports and add missing deps * ref: https://github.com/cern-sis/issues-inspire/issues/546 --- inspire_schemas/parsers/arxiv.py | 14 +-- inspire_schemas/parsers/elsevier.py | 2 +- inspire_schemas/parsers/jats.py | 2 +- inspire_schemas/parsers/utils.py | 140 ++++++++++++++++++++++++++ setup.py | 2 + tests/unit/test_parsers_arxiv.py | 2 +- tests/unit/test_parsers_author_xml.py | 2 +- tests/unit/test_parsers_crossref.py | 2 +- tests/unit/test_parsers_elsevier.py | 2 +- tests/unit/test_parsers_jats.py | 2 +- 10 files changed, 156 insertions(+), 14 deletions(-) create mode 100644 inspire_schemas/parsers/utils.py diff --git a/inspire_schemas/parsers/arxiv.py b/inspire_schemas/parsers/arxiv.py index 191e87e5..154f1654 100644 --- a/inspire_schemas/parsers/arxiv.py +++ b/inspire_schemas/parsers/arxiv.py @@ -29,13 +29,6 @@ import six from inspire_utils.dedupers import dedupe_list from inspire_utils.helpers import maybe_int -from inspire_utils.utils import ( - CONFERENCE_WORDS, - THESIS_WORDS, - coll_cleanforthe, - get_node, - split_fullname, -) from pylatexenc.latex2text import ( EnvironmentTextSpec, LatexNodes2Text, @@ -44,6 +37,13 @@ ) from inspire_schemas.api import LiteratureBuilder +from inspire_schemas.parsers.utils import ( + CONFERENCE_WORDS, + THESIS_WORDS, + coll_cleanforthe, + get_node, + split_fullname, +) from inspire_schemas.utils import classify_field, normalize_arxiv_category RE_CONFERENCE = re.compile( diff --git a/inspire_schemas/parsers/elsevier.py b/inspire_schemas/parsers/elsevier.py index 7985da21..30fcb18b 100644 --- a/inspire_schemas/parsers/elsevier.py +++ b/inspire_schemas/parsers/elsevier.py @@ -28,9 +28,9 @@ import six from inspire_utils.date import PartialDate from inspire_utils.helpers import maybe_int, remove_tags -from inspire_utils.utils import get_node from inspire_schemas.api import LiteratureBuilder, ReferenceBuilder +from inspire_schemas.parsers.utils import get_node DOCTYPE_MAPPING = { "abs": "abstract", diff --git a/inspire_schemas/parsers/jats.py b/inspire_schemas/parsers/jats.py index ae6c9f61..d6397aa4 100644 --- a/inspire_schemas/parsers/jats.py +++ b/inspire_schemas/parsers/jats.py @@ -30,9 +30,9 @@ from idutils import normalize_orcid from inspire_utils.date import PartialDate from inspire_utils.helpers import maybe_int, remove_tags -from inspire_utils.utils import get_node from inspire_schemas.api import LiteratureBuilder, ReferenceBuilder +from inspire_schemas.parsers.utils import get_node from inspire_schemas.utils import split_page_artid JOURNAL_TITLES_MAPPING = {"Physics": "APS Physics"} diff --git a/inspire_schemas/parsers/utils.py b/inspire_schemas/parsers/utils.py new file mode 100644 index 00000000..6c6b67d3 --- /dev/null +++ b/inspire_schemas/parsers/utils.py @@ -0,0 +1,140 @@ +from __future__ import ( + absolute_import, + division, + print_function, +) + +import re + +from scrapy.selector import Selector + +RE_FOR_THE = re.compile( + r'\b(?:for|on behalf of|representing)\b', + re.IGNORECASE, +) +INST_PHRASES = ['for the development', ] + + +def get_node(text, namespaces=None): + """Get a scrapy selector for the given text node.""" + node = Selector(text=text, type="xml") + if namespaces: + for ns in namespaces: + node.register_namespace(ns[0], ns[1]) + return node + + +def coll_cleanforthe(coll): + """ Cleanup collaboration, try to find author """ + author = None + + if any(phrase for phrase in INST_PHRASES if phrase in coll.lower()): + # don't touch it, doesn't look like a collaboration + return coll, author + + coll = coll.strip('.; ') + + if RE_FOR_THE.search(coll): + # get strings leading and trailing 'for the' + (lead, trail) = RE_FOR_THE.split(coll, maxsplit=1) + if re.search(r'\w', lead): + author = lead.strip() + if re.search(r'\w', trail): + coll = trail + + coll = re.sub('(?i)^ *the ', '', coll) + coll = re.sub('(?i) *collaborations? *', '', coll) + coll = coll.strip() + + return coll, author + + +def split_fullname(author, switch_name_order=False): + """Split an author name to surname and given names. + + It accepts author strings with and without comma separation. + As default surname is first in case of comma separation, otherwise last. + Multi-part surnames are incorrectly detected in strings without comma + separation. + """ + if not author: + return "", "" + + if "," in author: + fullname = [n.strip() for n in author.split(',')] + surname_first = True + else: + fullname = [n.strip() for n in author.split()] + surname_first = False + + if switch_name_order: + surname_first = not surname_first + + if surname_first: + surname = fullname[0] + given_names = " ".join(fullname[1:]) + else: + surname = fullname[-1] + given_names = " ".join(fullname[:-1]) + + return surname, given_names + + +CONFERENCE_WORDS = [ + 'colloquium', + 'colloquiums', + 'conf', + 'conference', + 'conferences', + 'contrib', + 'contributed', + 'contribution', + 'contributions', + 'forum', + 'lecture', + 'lectures', + 'meeting', + 'meetings', + 'pres', + 'presented', + 'proc', + 'proceeding', + 'proceedings', + 'rencontre', + 'rencontres', + 'school', + 'schools', + 'seminar', + 'seminars', + 'symp', + 'symposium', + 'symposiums', + 'talk', + 'talks', + 'workshop', + 'workshops' +] + +THESIS_WORDS = [ + 'diploma', + 'diplomarbeit', + 'diplome', + 'dissertation', + 'doctoraal', + 'doctoral', + 'doctorat', + 'doctorate', + 'doktorarbeit', + 'dottorato', + 'habilitationsschrift', + 'hochschule', + 'inauguraldissertation', + 'memoire', + 'phd', + 'proefschrift', + 'schlussbericht', + 'staatsexamensarbeit', + 'tesi', + 'thesis', + 'travail' +] diff --git a/setup.py b/setup.py index 25df2db4..41778c82 100644 --- a/setup.py +++ b/setup.py @@ -223,6 +223,8 @@ def do_setup(): # requests requires a urllib3 version <1.26 but not 1.25.0 and 1.25.1 # we pin it down here to solve dependency problems 'urllib3>=1.21.1,<1.26,!=1.25.0,!=1.25.1', + 'scrapy', + 'pylatexenc', ], tests_require=tests_require, extras_require=extras_require, diff --git a/tests/unit/test_parsers_arxiv.py b/tests/unit/test_parsers_arxiv.py index 6e1938e0..91c425d1 100644 --- a/tests/unit/test_parsers_arxiv.py +++ b/tests/unit/test_parsers_arxiv.py @@ -26,7 +26,7 @@ print_function, ) -from inspire_utils.parsers.arxiv import ArxivParser +from inspire_schemas.parsers.arxiv import ArxivParser def test_latex_to_unicode_handles_arxiv_escape_sequences(): diff --git a/tests/unit/test_parsers_author_xml.py b/tests/unit/test_parsers_author_xml.py index 56236c8d..379547b6 100644 --- a/tests/unit/test_parsers_author_xml.py +++ b/tests/unit/test_parsers_author_xml.py @@ -26,7 +26,7 @@ print_function, ) -from inspire_utils.parsers.author_xml import AuthorXMLParser +from inspire_schemas.parsers.author_xml import AuthorXMLParser def test_parsing_author_xml(): diff --git a/tests/unit/test_parsers_crossref.py b/tests/unit/test_parsers_crossref.py index 396cd59a..641efd3c 100644 --- a/tests/unit/test_parsers_crossref.py +++ b/tests/unit/test_parsers_crossref.py @@ -31,8 +31,8 @@ import pytest import yaml from fixtures import get_test_suite_path -from inspire_utils.parsers.crossref import CrossrefParser +from inspire_schemas.parsers.crossref import CrossrefParser from inspire_schemas.utils import validate diff --git a/tests/unit/test_parsers_elsevier.py b/tests/unit/test_parsers_elsevier.py index ad6cf0ee..32e9f6b8 100644 --- a/tests/unit/test_parsers_elsevier.py +++ b/tests/unit/test_parsers_elsevier.py @@ -32,8 +32,8 @@ import yaml from deepdiff import DeepDiff from fixtures import get_test_suite_path -from inspire_utils.parsers.elsevier import ElsevierParser +from inspire_schemas.parsers.elsevier import ElsevierParser from inspire_schemas.utils import validate diff --git a/tests/unit/test_parsers_jats.py b/tests/unit/test_parsers_jats.py index 8ecbddb8..3876a483 100644 --- a/tests/unit/test_parsers_jats.py +++ b/tests/unit/test_parsers_jats.py @@ -32,8 +32,8 @@ import yaml from deepdiff import DeepDiff from fixtures import get_test_suite_path -from inspire_utils.parsers.jats import JatsParser +from inspire_schemas.parsers.jats import JatsParser from inspire_schemas.utils import validate