From 5c811c14ca378c2071de89be813881d59d0010f4 Mon Sep 17 00:00:00 2001 From: Harris Tzovanakis Date: Tue, 27 Feb 2024 14:30:15 +0100 Subject: [PATCH] common: add journal mapping * Adds `journal_title` mappings. * ref: cern-sis/issues-scoap3#308 --- dags/common/constants.py | 2 + dags/common/parsing/generic_parsing.py | 8 ++- .../common/parsing/test_generic_parsing.py | 56 +++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/dags/common/constants.py b/dags/common/constants.py index 30262864..c4a8dc4e 100644 --- a/dags/common/constants.py +++ b/dags/common/constants.py @@ -12,3 +12,5 @@ WHITE_SPACES = re.compile(r"[\n\t]{1,}" + r"\s{2,}") CDATA_PATTERN = re.compile(r"<\?CDATA(.*)\?>") FN_REGEX = re.compile(r"") + +JOURNAL_MAPPING = {"PLB": "Physics Letters B", "NUPHB": "Nuclear Physics B"} diff --git a/dags/common/parsing/generic_parsing.py b/dags/common/parsing/generic_parsing.py index 377f71a8..25bba2dc 100644 --- a/dags/common/parsing/generic_parsing.py +++ b/dags/common/parsing/generic_parsing.py @@ -1,6 +1,8 @@ import re from datetime import date +from dags.common.constants import JOURNAL_MAPPING + def take_first(arr): try: @@ -82,9 +84,13 @@ def parse_thesis_supervisors(value): def publication_info(article): if "publication_info" in article: return article["publication_info"] + + journal_title = article.get("journal_title", "") + journal_title = JOURNAL_MAPPING.get(journal_title, journal_title) + return [ { - "journal_title": article.get("journal_title", ""), + "journal_title": journal_title, "journal_volume": article.get("journal_volume", ""), "year": int(article.get("journal_year", 0)) or "", "journal_issue": article.get("journal_issue", ""), diff --git a/tests/units/common/parsing/test_generic_parsing.py b/tests/units/common/parsing/test_generic_parsing.py index 35759fce..6ab268a3 100644 --- a/tests/units/common/parsing/test_generic_parsing.py +++ b/tests/units/common/parsing/test_generic_parsing.py @@ -226,6 +226,62 @@ def test_parse_thesis_supervisors(test_input, expected): ], id="Some values populated", ), + param( + { + "journal_title": "NUPHB", + "journal_volume": "Test Value", + "journal_year": "2022", + "journal_issue": "Test Value", + "journal_artid": "", + "journal_fpage": "", + "journal_lpage": "", + "journal_doctype": "", + "pubinfo_freetext": "", + "another_field": "Test Another Field", + }, + [ + { + "journal_title": "Nuclear Physics B", + "journal_volume": "Test Value", + "year": 2022, + "journal_issue": "Test Value", + "artid": "", + "page_start": "", + "page_end": "", + "material": "", + "pubinfo_freetext": "", + } + ], + id="Test journal title NUPHB to Nuclear Physics B", + ), + param( + { + "journal_title": "PLB", + "journal_volume": "Test Value", + "journal_year": "2022", + "journal_issue": "Test Value", + "journal_artid": "", + "journal_fpage": "", + "journal_lpage": "", + "journal_doctype": "", + "pubinfo_freetext": "", + "another_field": "Test Another Field", + }, + [ + { + "journal_title": "Physics Letters B", + "journal_volume": "Test Value", + "year": 2022, + "journal_issue": "Test Value", + "artid": "", + "page_start": "", + "page_end": "", + "material": "", + "pubinfo_freetext": "", + } + ], + id="Test journal title PLB to Physics Letters B", + ), param( { "journal_title": "Test Value",