From 9c48acb7fe5f6567369ccc3ba14cfcf6c31990c7 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Thu, 18 Jan 2024 15:47:39 +0100 Subject: [PATCH 1/7] common: added countries mapping --- dags/common/countries_mapping.py | 174 +++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 dags/common/countries_mapping.py diff --git a/dags/common/countries_mapping.py b/dags/common/countries_mapping.py new file mode 100644 index 00000000..61004047 --- /dev/null +++ b/dags/common/countries_mapping.py @@ -0,0 +1,174 @@ +COUNTRIES_DEFAULT_MAPPING = { + "INFN": "Italy", + "Democratic People's Republic of Korea": "North Korea", + "DPR Korea": "North Korea", + "DPR. Korea": "North Korea", + "CERN": "CERN", + "European Organization for Nuclear Research": "CERN", + "KEK": "Japan", + "DESY": "Germany", + "FERMILAB": "USA", + "FNAL": "USA", + "SLACK": "USA", + "Stanford Linear Accelerator Center": "USA", + "Joint Institute for Nuclear Research": "JINR", + "JINR": "JINR", + "Northern Cyprus": "Turkey", + "North Cyprus": "Turkey", + "New Mexico": "USA", + "South China Normal University": "China", + "Hong Kong China": "Hong Kong", + "Hong-Kong China": "Hong Kong", + "Hong Kong, China": "Hong Kong", + "Hong Kong": "Hong Kong", + "Hong-Kong": "Hong Kong", + "Algeria": "Algeria", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belgique": "Belgium", + "Bangladesh": "Bangladesh", + "Brazil": "Brazil", + "Brasil": "Brazil", + "Benin": "Benin", + "Bulgaria": "Bulgaria", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Canada": "Canada", + "Chile": "Chile", + "ROC": "Taiwan", + "R.O.C": "Taiwan", + "Republic of China": "Taiwan", + "China (PRC)": "China", + "PR China": "China", + "China": "China", + "People's Republic of China": "China", + "Republic of China": "China", + "Colombia": "Colombia", + "Costa Rica": "Costa Rica", + 
"Cuba": "Cuba", + "Croatia": "Croatia", + "Cyprus": "Cyprus", + "Czech Republic": "Czech Republic", + "Czech": "Czech Republic", + "Czechia": "Czech Republic", + "Denmark": "Denmark", + "Egypt": "Egypt", + "Estonia": "Estonia", + "Ecuador": "Ecuador", + "Finland": "Finland", + "France": "France", + "Germany": "Germany", + "Deutschland": "Germany", + "Greece": "Greece", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Italia": "Italy", + "Japan": "Japan", + "Jamaica": "Jamaica", + "Korea": "South Korea", + "Republic of Korea": "South Korea", + "South Korea": "South Korea", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macedonia": "Macedonia", + "Mexico": "Mexico", + "Monaco": "Monaco", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Netherlands": "Netherlands", + "The Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Zealand": "New Zealand", + "Norway": "Norway", + "Oman": "Oman", + "Sultanate of Oman": "Oman", + "Pakistan": "Pakistan", + "Panama": "Panama", + "Philipines": "Philipines", + "Poland": "Poland", + "Portugalo": "Portugal", + "Portugal": "Portugal", + "P.R.China": "China", + "People’s Republic of China": "China", + "Republic of Belarus": "Belarus", + "Republic of Benin": "Benin", + "Republic of Korea": "South Korea", + "Republic of San Marino": "San Marino", + "Republic of South Africa": "South Africa", + "Romania": "Romania", + "Russia": "Russia", + "Russian Federation": "Russia", + "Saudi Arabia": "Saudi Arabia", + "Kingdom of Saudi Arabia": "Saudi Arabia", + "Arabia": "Saudi Arabia", + "Serbia": "Serbia", + "Singapore": "Singapore", + "Slovak Republic": "Slovakia", + "Slovak": "Slovakia", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "South Africa": "South Africa", + "Africa": "South 
Africa", + "España": "Spain", + "Spain": "Spain", + "Sudan": "Sudan", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Taiwan": "Taiwan", + "Thailand": "Thailand", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Ukraine": "Ukraine", + "United Kingdom": "UK", + "Kingdom": "UK", + "United Kingdom of Great Britain and Northern Ireland": "UK", + "UK": "UK", + "England": "UK", + "Scotland": "UK", + "Wales": "UK", + "New South Wales": "Australia", + "U.K": "UK", + "United States of America": "USA", + "United States": "USA", + "USA": "USA", + "U.S.A": "USA", + "America": "USA", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Venezuela": "Venezuela", + "Vietnam": "Vietnam", + "Viet Nam": "Vietnam", + "Yemen": "Yemen", + "Peru": "Peru", + "Kuwait": "Kuwait", + "Sri Lanka": "Sri Lanka", + "Lanka": "Sri Lanka", + "Kazakhstan": "Kazakhstan", + "Mongolia": "Mongolia", + "United Arab Emirates": "United Arab Emirates", + "Emirates": "United Arab Emirates", + "Malaysia": "Malaysia", + "Qatar": "Qatar", + "Kyrgyz Republic": "Kyrgyz Republic", + "Jordan": "Jordan", + "Belgrade": "Serbia", + "Istanbul": "Turkey", + "Ankara": "Turkey", + "Rome": "Italy", + "Georgia": "Georgia", +} From aa9f46d85d488be4dd48ad6d4a1e76623e034078 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Fri, 19 Jan 2024 14:51:22 +0100 Subject: [PATCH 2/7] example for parsing --- dags/aps/parser.py | 3 +- dags/common/countries_mapping.py | 1 + dags/common/exceptions.py | 5 + dags/common/parsing/generic_parsing.py | 2 - dags/common/utils.py | 20 +- dags/elsevier/parser.py | 3 +- requirements.txt | 1 + tests/units/aps/test_aps_parser.py | 6 +- tests/units/elsevier/test_elsevier_parser.py | 300 +++++++++---------- 9 files changed, 183 insertions(+), 158 deletions(-) diff --git a/dags/aps/parser.py b/dags/aps/parser.py index 43b263a4..86ed7ab1 100644 --- a/dags/aps/parser.py +++ b/dags/aps/parser.py @@ -3,6 +3,7 @@ from common.parsing.json_extractors import CustomExtractor, 
NestedValueExtractor from common.parsing.parser import IParser from common.utils import construct_license +from common.utils import parse_country_from_value from inspire_utils.record import get_value from structlog import get_logger @@ -101,7 +102,7 @@ def _get_affiliations(self, article, affiliationIds): { "value": affiliation["name"], "organization": (",").join(affiliation["name"].split(",")[:-1]), - "country": affiliation["name"].split(", ")[-1:][0], + "country": parse_country_from_value(affiliation["name"]) } for affiliation in article["affiliations"] if affiliation["id"] in affiliationIds diff --git a/dags/common/countries_mapping.py b/dags/common/countries_mapping.py index 61004047..37a149af 100644 --- a/dags/common/countries_mapping.py +++ b/dags/common/countries_mapping.py @@ -1,6 +1,7 @@ COUNTRIES_DEFAULT_MAPPING = { "INFN": "Italy", "Democratic People's Republic of Korea": "North Korea", + "Korea, Democratic People's Republic of": "North Korea", "DPR Korea": "North Korea", "DPR. 
Korea": "North Korea", "CERN": "CERN", diff --git a/dags/common/exceptions.py b/dags/common/exceptions.py index 62007141..7f265e90 100644 --- a/dags/common/exceptions.py +++ b/dags/common/exceptions.py @@ -29,3 +29,8 @@ def __init__(self, license): class EmptyOutputFromPreviousTask(Exception): def __init__(self, taks_name): super().__init__(f"The output from previous task is empty: {taks_name}") + +class FoundMoreThanOneMatchOrNone(Exception): + def __init__(self, country_value): + super().__init__(f"Found more than one or zero match for a country: {country_value}") + diff --git a/dags/common/parsing/generic_parsing.py b/dags/common/parsing/generic_parsing.py index 377f71a8..6efb7e99 100644 --- a/dags/common/parsing/generic_parsing.py +++ b/dags/common/parsing/generic_parsing.py @@ -1,7 +1,5 @@ import re from datetime import date - - def take_first(arr): try: return next(filter(None, arr)) diff --git a/dags/common/utils.py b/dags/common/utils.py index de04535c..39a5cb62 100644 --- a/dags/common/utils.py +++ b/dags/common/utils.py @@ -3,11 +3,13 @@ import os import re import tarfile +import pycountry import xml.etree.ElementTree as ET import zipfile from ftplib import error_perm from io import StringIO from stat import S_ISDIR, S_ISREG +from common.countries_mapping import COUNTRIES_DEFAULT_MAPPING import backoff import requests @@ -19,7 +21,8 @@ CREATIVE_COMMONS_PATTERN, LICENSE_PATTERN, ) -from common.exceptions import UnknownFileExtension, UnknownLicense +from common.exceptions import UnknownFileExtension, UnknownLicense, FoundMoreThanOneMatchOrNone +from common.constants import COUNTRY_PARSING_PATTERN from structlog import get_logger logger = get_logger() @@ -255,3 +258,18 @@ def create_or_update_article(data): ) response.raise_for_status() return response.json() + +def parse_country_from_value(affiliation_value): + country = COUNTRY_PARSING_PATTERN.search(affiliation_value).group(0) + try: + mapped_countries = pycountry.countries.search_fuzzy(country) + if 
len(mapped_countries) > 1 or len(mapped_countries) == 0: + raise FoundMoreThanOneMatchOrNone(affiliation_value) + return COUNTRIES_DEFAULT_MAPPING[mapped_countries[0].name] + except: + return find_country_match_from_mapping(affiliation_value) + +def find_country_match_from_mapping(affiliation_value): + for key in COUNTRIES_DEFAULT_MAPPING: + if re.search(r'\b%s\b' % key, affiliation_value, flags=re.IGNORECASE): + return COUNTRIES_DEFAULT_MAPPING[key] diff --git a/dags/elsevier/parser.py b/dags/elsevier/parser.py index 07a3e2df..865f0fd1 100644 --- a/dags/elsevier/parser.py +++ b/dags/elsevier/parser.py @@ -6,7 +6,7 @@ CustomExtractor, TextExtractor, ) -from common.utils import extract_text +from common.utils import extract_text, parse_country_from_value from structlog import get_logger @@ -185,6 +185,7 @@ def _get_affiliation(self, article, ref_id="", affiliations=[]): field_name="country", dois=self.dois, ) + country = country and parse_country_from_value(country) if affiliation_value and organization and country: affiliations.append( { diff --git a/requirements.txt b/requirements.txt index 8ace8342..e4d768d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ busypie==0.4.5 pydantic==1.10.7 jsonschema==4.17.3 plyvel==1.5.0 +pycountry==23.12.11 diff --git a/tests/units/aps/test_aps_parser.py b/tests/units/aps/test_aps_parser.py index dc8565f3..ac8d9c59 100644 --- a/tests/units/aps/test_aps_parser.py +++ b/tests/units/aps/test_aps_parser.py @@ -101,7 +101,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7", "organization": "Department of Physics, University of Toronto, Toronto, Ontario", - "country": "Canada M5S1A7", + "country": "Canada", } ], }, @@ -113,7 +113,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7", "organization": "Department of Physics, University of Toronto, 
Toronto, Ontario", - "country": "Canada M5S1A7", + "country": "Canada", } ], }, @@ -125,7 +125,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7", "organization": "Department of Physics, University of Toronto, Toronto, Ontario", - "country": "Canada M5S1A7", + "country": "Canada", } ], }, diff --git a/tests/units/elsevier/test_elsevier_parser.py b/tests/units/elsevier/test_elsevier_parser.py index fd044e41..cf63e533 100644 --- a/tests/units/elsevier/test_elsevier_parser.py +++ b/tests/units/elsevier/test_elsevier_parser.py @@ -231,7 +231,7 @@ def parsed_articles(parser, articles): { "value": "Korea Institute of Science and Technology Information, Daejeon, Republic of Korea", "organization": "Korea Institute of Science and Technology Information", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -242,7 +242,7 @@ def parsed_articles(parser, articles): { "value": "Faculty of Science, P.J. 
Šafárik University, Košice, Slovak Republic", "organization": "Faculty of Science", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -348,7 +348,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -489,7 +489,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -555,7 +555,7 @@ def parsed_articles(parser, articles): { "value": "Yale University, New Haven, CT, United States", "organization": "Yale University", - "country": "United States", + "country": "USA", } ], }, @@ -621,7 +621,7 @@ def parsed_articles(parser, articles): { "value": "Gangneung-Wonju National University, Gangneung, Republic of Korea", "organization": "Gangneung-Wonju National University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -775,7 +775,7 @@ def parsed_articles(parser, articles): { "value": "Nuclear Physics Group, STFC Daresbury Laboratory, Daresbury, United Kingdom", "organization": "Nuclear Physics Group", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -808,7 +808,7 @@ def parsed_articles(parser, articles): { "value": "University of Liverpool, Liverpool, United Kingdom", "organization": "University of Liverpool", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -938,7 +938,7 @@ def parsed_articles(parser, articles): { "value": "Yale University, New Haven, CT, United States", "organization": "Yale University", - "country": "United States", + "country": "USA", } ], }, @@ -1004,7 +1004,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": 
"USA", } ], }, @@ -1267,7 +1267,7 @@ def parsed_articles(parser, articles): { "value": "The University of Texas at Austin, Austin, TX, United States", "organization": "The University of Texas at Austin", - "country": "United States", + "country": "USA", } ], }, @@ -1336,7 +1336,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -1378,7 +1378,7 @@ def parsed_articles(parser, articles): { "value": "Inha University, Incheon, Republic of Korea", "organization": "Inha University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -1409,7 +1409,7 @@ def parsed_articles(parser, articles): { "value": "Faculty of Science, P.J. Šafárik University, Košice, Slovak Republic", "organization": "Faculty of Science", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -1467,7 +1467,7 @@ def parsed_articles(parser, articles): { "value": "Yale University, New Haven, CT, United States", "organization": "Yale University", - "country": "United States", + "country": "USA", } ], }, @@ -1549,7 +1549,7 @@ def parsed_articles(parser, articles): { "value": "University of Liverpool, Liverpool, United Kingdom", "organization": "University of Liverpool", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -1640,7 +1640,7 @@ def parsed_articles(parser, articles): { "value": "University of Kansas, Lawrence, KS, United States", "organization": "University of Kansas", - "country": "United States", + "country": "USA", } ], }, @@ -1678,7 +1678,7 @@ def parsed_articles(parser, articles): { "value": "Yale University, New Haven, CT, United States", "organization": "Yale University", - "country": "United States", + "country": "USA", } ], }, @@ -1824,7 +1824,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", 
"organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -1868,7 +1868,7 @@ def parsed_articles(parser, articles): { "value": "University of Liverpool, Liverpool, United Kingdom", "organization": "University of Liverpool", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -1978,7 +1978,7 @@ def parsed_articles(parser, articles): { "value": "Inha University, Incheon, Republic of Korea", "organization": "Inha University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -1989,7 +1989,7 @@ def parsed_articles(parser, articles): { "value": "Inha University, Incheon, Republic of Korea", "organization": "Inha University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -2132,7 +2132,7 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -2159,7 +2159,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -2247,7 +2247,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -2323,7 +2323,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -2356,7 +2356,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", 
- "country": "United States", + "country": "USA", } ], }, @@ -2620,7 +2620,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, University of California, Berkeley, CA, United States", "organization": "Department of Physics", - "country": "United States", + "country": "USA", } ], }, @@ -2705,7 +2705,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, University of California, Berkeley, CA, United States", "organization": "Department of Physics", - "country": "United States", + "country": "USA", } ], }, @@ -2813,7 +2813,7 @@ def parsed_articles(parser, articles): { "value": "Technical University of Košice, Košice, Slovak Republic", "organization": "Technical University of Košice", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -2824,7 +2824,7 @@ def parsed_articles(parser, articles): { "value": "Comenius University Bratislava, Faculty of Mathematics, Physics and Informatics, Bratislava, Slovak Republic", "organization": "Comenius University Bratislava", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -2846,7 +2846,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -2956,7 +2956,7 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -3020,7 +3020,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -3042,7 +3042,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United 
States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -3192,7 +3192,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -3203,7 +3203,7 @@ def parsed_articles(parser, articles): { "value": "The University of Texas at Austin, Austin, TX, United States", "organization": "The University of Texas at Austin", - "country": "United States", + "country": "USA", } ], }, @@ -3397,7 +3397,7 @@ def parsed_articles(parser, articles): { "value": "Chicago State University, Chicago, IL, United States", "organization": "Chicago State University", - "country": "United States", + "country": "USA", } ], }, @@ -3452,7 +3452,7 @@ def parsed_articles(parser, articles): { "value": "The University of Texas at Austin, Austin, TX, United States", "organization": "The University of Texas at Austin", - "country": "United States", + "country": "USA", } ], }, @@ -3463,7 +3463,7 @@ def parsed_articles(parser, articles): { "value": "University of Kansas, Lawrence, KS, United States", "organization": "University of Kansas", - "country": "United States", + "country": "USA", } ], }, @@ -3578,7 +3578,7 @@ def parsed_articles(parser, articles): { "value": "University of Tennessee, Knoxville, TN, United States", "organization": "University of Tennessee", - "country": "United States", + "country": "USA", } ], }, @@ -3600,7 +3600,7 @@ def parsed_articles(parser, articles): { "value": "Wayne State University, Detroit, MI, United States", "organization": "Wayne State University", - "country": "United States", + "country": "USA", } ], }, @@ -3699,7 +3699,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, 
@@ -3953,7 +3953,7 @@ def parsed_articles(parser, articles): { "value": "Yonsei University, Seoul, Republic of Korea", "organization": "Yonsei University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -3964,7 +3964,7 @@ def parsed_articles(parser, articles): { "value": "The University of Texas at Austin, Austin, TX, United States", "organization": "The University of Texas at Austin", - "country": "United States", + "country": "USA", } ], }, @@ -3997,7 +3997,7 @@ def parsed_articles(parser, articles): { "value": "Yale University, New Haven, CT, United States", "organization": "Yale University", - "country": "United States", + "country": "USA", } ], }, @@ -4008,7 +4008,7 @@ def parsed_articles(parser, articles): { "value": "Chicago State University, Chicago, IL, United States", "organization": "Chicago State University", - "country": "United States", + "country": "USA", } ], }, @@ -4030,7 +4030,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -4063,7 +4063,7 @@ def parsed_articles(parser, articles): { "value": "Yale University, New Haven, CT, United States", "organization": "Yale University", - "country": "United States", + "country": "USA", } ], }, @@ -4173,7 +4173,7 @@ def parsed_articles(parser, articles): { "value": "University of Liverpool, Liverpool, United Kingdom", "organization": "University of Liverpool", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -4228,7 +4228,7 @@ def parsed_articles(parser, articles): { "value": "Yonsei University, Seoul, Republic of Korea", "organization": "Yonsei University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -4261,7 +4261,7 @@ def parsed_articles(parser, articles): { "value": "Creighton University, Omaha, NE, United States", "organization": "Creighton University", - 
"country": "United States", + "country": "USA", } ], }, @@ -4294,7 +4294,7 @@ def parsed_articles(parser, articles): { "value": "University of Tennessee, Knoxville, TN, United States", "organization": "University of Tennessee", - "country": "United States", + "country": "USA", } ], }, @@ -4338,7 +4338,7 @@ def parsed_articles(parser, articles): { "value": "Ohio State University, Columbus, OH, United States", "organization": "Ohio State University", - "country": "United States", + "country": "USA", } ], }, @@ -4360,7 +4360,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -4382,7 +4382,7 @@ def parsed_articles(parser, articles): { "value": "University of Liverpool, Liverpool, United Kingdom", "organization": "University of Liverpool", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -4455,7 +4455,7 @@ def parsed_articles(parser, articles): { "value": "University of Kansas, Lawrence, KS, United States", "organization": "University of Kansas", - "country": "United States", + "country": "USA", } ], }, @@ -4517,7 +4517,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -4539,7 +4539,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -4550,7 +4550,7 @@ def parsed_articles(parser, articles): { "value": "Technical University of Košice, Košice, Slovak Republic", "organization": "Technical University of Košice", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -4561,7 +4561,7 @@ def parsed_articles(parser, 
articles): { "value": "Technical University of Košice, Košice, Slovak Republic", "organization": "Technical University of Košice", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -4627,7 +4627,7 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -4649,7 +4649,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", }, { "value": "Westfälische Wilhelms-Universität Münster, Institut für Kernphysik, Münster, Germany", @@ -4665,7 +4665,7 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -4725,7 +4725,7 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -4763,7 +4763,7 @@ def parsed_articles(parser, articles): { "value": "Institute of Experimental Physics, Slovak Academy of Sciences, Košice, Slovak Republic", "organization": "Institute of Experimental Physics", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -5013,7 +5013,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, Pusan National University, Pusan, Republic of Korea", "organization": "Department of Physics", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5024,7 +5024,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, Pusan National 
University, Pusan, Republic of Korea", "organization": "Department of Physics", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5046,7 +5046,7 @@ def parsed_articles(parser, articles): { "value": "Jeonbuk National University, Jeonju, Republic of Korea", "organization": "Jeonbuk National University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5057,7 +5057,7 @@ def parsed_articles(parser, articles): { "value": "Yonsei University, Seoul, Republic of Korea", "organization": "Yonsei University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5068,7 +5068,7 @@ def parsed_articles(parser, articles): { "value": "Gangneung-Wonju National University, Gangneung, Republic of Korea", "organization": "Gangneung-Wonju National University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5090,7 +5090,7 @@ def parsed_articles(parser, articles): { "value": "Jeonbuk National University, Jeonju, Republic of Korea", "organization": "Jeonbuk National University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5112,7 +5112,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, Sejong University, Seoul, Republic of Korea", "organization": "Department of Physics", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5123,7 +5123,7 @@ def parsed_articles(parser, articles): { "value": "Yonsei University, Seoul, Republic of Korea", "organization": "Yonsei University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5187,7 +5187,7 @@ def parsed_articles(parser, articles): { "value": "California Polytechnic State University, San Luis Obispo, CA, United States", "organization": "California Polytechnic State University", - "country": "United States", + "country": "USA", } ], }, @@ -5209,7 +5209,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, 
Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -5264,7 +5264,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -5421,7 +5421,7 @@ def parsed_articles(parser, articles): { "value": "Institute of Experimental Physics, Slovak Academy of Sciences, Košice, Slovak Republic", "organization": "Institute of Experimental Physics", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -5432,7 +5432,7 @@ def parsed_articles(parser, articles): { "value": "Faculty of Science, P.J. Šafárik University, Košice, Slovak Republic", "organization": "Faculty of Science", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -5454,12 +5454,12 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", }, { "value": "Institute of Experimental Physics, Slovak Academy of Sciences, Košice, Slovak Republic", "organization": "Institute of Experimental Physics", - "country": "Slovak Republic", + "country": "Slovakia", }, ], }, @@ -5673,7 +5673,7 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -5684,7 +5684,7 @@ def parsed_articles(parser, articles): { "value": "Inha University, Incheon, Republic of Korea", "organization": "Inha University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5695,7 +5695,7 @@ def parsed_articles(parser, articles): { "value": "Inha University, Incheon, Republic of Korea", 
"organization": "Inha University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5706,7 +5706,7 @@ def parsed_articles(parser, articles): { "value": "Yonsei University, Seoul, Republic of Korea", "organization": "Yonsei University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5739,7 +5739,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -5868,7 +5868,7 @@ def parsed_articles(parser, articles): { "value": "Nuclear Physics Group, STFC Daresbury Laboratory, Daresbury, United Kingdom", "organization": "Nuclear Physics Group", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -5901,7 +5901,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, University of California, Berkeley, CA, United States", "organization": "Department of Physics", - "country": "United States", + "country": "USA", } ], }, @@ -5967,7 +5967,7 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -5978,7 +5978,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, Pusan National University, Pusan, Republic of Korea", "organization": "Department of Physics", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -5989,7 +5989,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, Pusan National University, Pusan, Republic of Korea", "organization": "Department of Physics", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -6033,7 +6033,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, University of 
California, Berkeley, CA, United States", "organization": "Department of Physics", - "country": "United States", + "country": "USA", } ], }, @@ -6055,7 +6055,7 @@ def parsed_articles(parser, articles): { "value": "University of Liverpool, Liverpool, United Kingdom", "organization": "University of Liverpool", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -6086,7 +6086,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -6416,7 +6416,7 @@ def parsed_articles(parser, articles): { "value": "The University of Texas at Austin, Austin, TX, United States", "organization": "The University of Texas at Austin", - "country": "United States", + "country": "USA", } ], }, @@ -6460,7 +6460,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -6699,7 +6699,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -6710,7 +6710,7 @@ def parsed_articles(parser, articles): { "value": "Comenius University Bratislava, Faculty of Mathematics, Physics and Informatics, Bratislava, Slovak Republic", "organization": "Comenius University Bratislava", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -6967,7 +6967,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -7044,7 +7044,7 @@ def parsed_articles(parser, articles): { "value": "Institute of Experimental Physics, Slovak Academy of Sciences, 
Košice, Slovak Republic", "organization": "Institute of Experimental Physics", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -7143,7 +7143,7 @@ def parsed_articles(parser, articles): { "value": "University of Tennessee, Knoxville, TN, United States", "organization": "University of Tennessee", - "country": "United States", + "country": "USA", } ], }, @@ -7278,7 +7278,7 @@ def parsed_articles(parser, articles): { "value": "Chungbuk National University, Cheongju, Republic of Korea", "organization": "Chungbuk National University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -7298,7 +7298,7 @@ def parsed_articles(parser, articles): { "value": "University of Liverpool, Liverpool, United Kingdom", "organization": "University of Liverpool", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -7393,7 +7393,7 @@ def parsed_articles(parser, articles): { "value": "University of Tennessee, Knoxville, TN, United States", "organization": "University of Tennessee", - "country": "United States", + "country": "USA", } ], }, @@ -7404,7 +7404,7 @@ def parsed_articles(parser, articles): { "value": "Yale University, New Haven, CT, United States", "organization": "Yale University", - "country": "United States", + "country": "USA", } ], }, @@ -7563,7 +7563,7 @@ def parsed_articles(parser, articles): { "value": "Inha University, Incheon, Republic of Korea", "organization": "Inha University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -7590,7 +7590,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -7776,7 +7776,7 @@ def parsed_articles(parser, articles): { "value": "Comenius University Bratislava, Faculty of Mathematics, Physics and Informatics, Bratislava, Slovak Republic", "organization": "Comenius University Bratislava", - "country": "Slovak 
Republic", + "country": "Slovakia", } ], }, @@ -7814,7 +7814,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -7852,7 +7852,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -7885,7 +7885,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -7940,7 +7940,7 @@ def parsed_articles(parser, articles): { "value": "Lawrence Berkeley National Laboratory, Berkeley, CA, United States", "organization": "Lawrence Berkeley National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -8004,7 +8004,7 @@ def parsed_articles(parser, articles): { "value": "Wayne State University, Detroit, MI, United States", "organization": "Wayne State University", - "country": "United States", + "country": "USA", } ], }, @@ -8057,7 +8057,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -8068,7 +8068,7 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -8194,12 +8194,12 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", }, { "value": 
"University of Tennessee, Knoxville, TN, United States", "organization": "University of Tennessee", - "country": "United States", + "country": "USA", }, ], }, @@ -8276,7 +8276,7 @@ def parsed_articles(parser, articles): { "value": "Faculty of Science, P.J. Šafárik University, Košice, Slovak Republic", "organization": "Faculty of Science", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -8859,7 +8859,7 @@ def parsed_articles(parser, articles): { "value": "Faculty of Science, P.J. Šafárik University, Košice, Slovak Republic", "organization": "Faculty of Science", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -8914,7 +8914,7 @@ def parsed_articles(parser, articles): { "value": "Wayne State University, Detroit, MI, United States", "organization": "Wayne State University", - "country": "United States", + "country": "USA", } ], }, @@ -8969,7 +8969,7 @@ def parsed_articles(parser, articles): { "value": "Yale University, New Haven, CT, United States", "organization": "Yale University", - "country": "United States", + "country": "USA", } ], }, @@ -8980,7 +8980,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", } ], }, @@ -9079,7 +9079,7 @@ def parsed_articles(parser, articles): { "value": "Oak Ridge National Laboratory, Oak Ridge, TN, United States", "organization": "Oak Ridge National Laboratory", - "country": "United States", + "country": "USA", }, { "value": "Institut für Kernphysik, Johann Wolfgang Goethe-Universität Frankfurt, Frankfurt, Germany", @@ -9095,7 +9095,7 @@ def parsed_articles(parser, articles): { "value": "University of Tennessee, Knoxville, TN, United States", "organization": "University of Tennessee", - "country": "United States", + "country": "USA", } ], }, @@ -9172,7 +9172,7 @@ def parsed_articles(parser, articles): { "value": "Creighton 
University, Omaha, NE, United States", "organization": "Creighton University", - "country": "United States", + "country": "USA", } ], }, @@ -9230,7 +9230,7 @@ def parsed_articles(parser, articles): { "value": "Inha University, Incheon, Republic of Korea", "organization": "Inha University", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -9422,7 +9422,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -9649,7 +9649,7 @@ def parsed_articles(parser, articles): { "value": "Comenius University Bratislava, Faculty of Mathematics, Physics and Informatics, Bratislava, Slovak Republic", "organization": "Comenius University Bratislava", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -9709,7 +9709,7 @@ def parsed_articles(parser, articles): { "value": "Yale University, New Haven, CT, United States", "organization": "Yale University", - "country": "United States", + "country": "USA", } ], }, @@ -9753,7 +9753,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -9786,7 +9786,7 @@ def parsed_articles(parser, articles): { "value": "University of Tennessee, Knoxville, TN, United States", "organization": "University of Tennessee", - "country": "United States", + "country": "USA", } ], }, @@ -9852,7 +9852,7 @@ def parsed_articles(parser, articles): { "value": "University of Tennessee, Knoxville, TN, United States", "organization": "University of Tennessee", - "country": "United States", + "country": "USA", } ], }, @@ -10015,7 +10015,7 @@ def parsed_articles(parser, articles): { "value": "Comenius University Bratislava, Faculty of Mathematics, Physics and Informatics, Bratislava, Slovak Republic", "organization": "Comenius 
University Bratislava", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -10026,7 +10026,7 @@ def parsed_articles(parser, articles): { "value": "Comenius University Bratislava, Faculty of Mathematics, Physics and Informatics, Bratislava, Slovak Republic", "organization": "Comenius University Bratislava", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -10124,7 +10124,7 @@ def parsed_articles(parser, articles): { "value": "University of Kansas, Lawrence, KS, United States", "organization": "University of Kansas", - "country": "United States", + "country": "USA", } ], }, @@ -10201,7 +10201,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -10234,7 +10234,7 @@ def parsed_articles(parser, articles): { "value": "The University of Texas at Austin, Austin, TX, United States", "organization": "The University of Texas at Austin", - "country": "United States", + "country": "USA", } ], }, @@ -10265,7 +10265,7 @@ def parsed_articles(parser, articles): { "value": "University of Houston, Houston, TX, United States", "organization": "University of Houston", - "country": "United States", + "country": "USA", } ], }, @@ -10276,7 +10276,7 @@ def parsed_articles(parser, articles): { "value": "Technical University of Košice, Košice, Slovak Republic", "organization": "Technical University of Košice", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -10287,7 +10287,7 @@ def parsed_articles(parser, articles): { "value": "Technical University of Košice, Košice, Slovak Republic", "organization": "Technical University of Košice", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -10329,7 +10329,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, University of California, Berkeley, CA, United States", "organization": "Department of 
Physics", - "country": "United States", + "country": "USA", } ], }, @@ -10542,7 +10542,7 @@ def parsed_articles(parser, articles): { "value": "Faculty of Science, P.J. Šafárik University, Košice, Slovak Republic", "organization": "Faculty of Science", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -10780,7 +10780,7 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", } ], }, @@ -10873,7 +10873,7 @@ def parsed_articles(parser, articles): { "value": "Wayne State University, Detroit, MI, United States", "organization": "Wayne State University", - "country": "United States", + "country": "USA", } ], }, @@ -10926,7 +10926,7 @@ def parsed_articles(parser, articles): { "value": "Faculty of Science, P.J. Šafárik University, Košice, Slovak Republic", "organization": "Faculty of Science", - "country": "Slovak Republic", + "country": "Slovakia", } ], }, @@ -11025,7 +11025,7 @@ def parsed_articles(parser, articles): { "value": "Yale University, New Haven, CT, United States", "organization": "Yale University", - "country": "United States", + "country": "USA", } ], }, @@ -11113,7 +11113,7 @@ def parsed_articles(parser, articles): { "value": "The University of Texas at Austin, Austin, TX, United States", "organization": "The University of Texas at Austin", - "country": "United States", + "country": "USA", } ], }, @@ -11234,7 +11234,7 @@ def parsed_articles(parser, articles): { "value": "Department of Physics, Pusan National University, Pusan, Republic of Korea", "organization": "Department of Physics", - "country": "Republic of Korea", + "country": "South Korea", } ], }, @@ -11245,7 +11245,7 @@ def parsed_articles(parser, articles): { "value": "Inha University, Incheon, Republic of Korea", "organization": "Inha University", - "country": "Republic of Korea", + "country": 
"South Korea", } ], }, @@ -11327,7 +11327,7 @@ def parsed_articles(parser, articles): { "value": "School of Physics and Astronomy, University of Birmingham, Birmingham, United Kingdom", "organization": "School of Physics and Astronomy", - "country": "United Kingdom", + "country": "UK", }, ], }, From 73947e8bf5b75db3666af2a1ac34b9d3a2a827ca Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Fri, 19 Jan 2024 16:56:56 +0100 Subject: [PATCH 3/7] rest of Elsvier country prase --- dags/aps/parser.py | 5 +- dags/common/exceptions.py | 6 +- dags/common/utils.py | 18 +- dags/elsevier/parser.py | 1 + .../elsevier/test_elsevier_dag_pull_sftp.py | 2 +- tests/units/elsevier/test_elsevier_parser.py | 276 ++++++++++++------ 6 files changed, 204 insertions(+), 104 deletions(-) diff --git a/dags/aps/parser.py b/dags/aps/parser.py index 86ed7ab1..a1cb4076 100644 --- a/dags/aps/parser.py +++ b/dags/aps/parser.py @@ -2,8 +2,7 @@ from common.parsing.json_extractors import CustomExtractor, NestedValueExtractor from common.parsing.parser import IParser -from common.utils import construct_license -from common.utils import parse_country_from_value +from common.utils import construct_license, parse_country_from_value from inspire_utils.record import get_value from structlog import get_logger @@ -102,7 +101,7 @@ def _get_affiliations(self, article, affiliationIds): { "value": affiliation["name"], "organization": (",").join(affiliation["name"].split(",")[:-1]), - "country": parse_country_from_value(affiliation["name"]) + "country": parse_country_from_value(affiliation["name"]), } for affiliation in article["affiliations"] if affiliation["id"] in affiliationIds diff --git a/dags/common/exceptions.py b/dags/common/exceptions.py index 7f265e90..be9a8393 100644 --- a/dags/common/exceptions.py +++ b/dags/common/exceptions.py @@ -30,7 +30,9 @@ class EmptyOutputFromPreviousTask(Exception): def __init__(self, taks_name): super().__init__(f"The output from previous task is empty: {taks_name}") + class 
FoundMoreThanOneMatchOrNone(Exception): def __init__(self, country_value): - super().__init__(f"Found more than one or zero match for a country: {country_value}") - + super().__init__( + f"Found more than one or zero match for a country: {country_value}" + ) diff --git a/dags/common/utils.py b/dags/common/utils.py index 39a5cb62..6cffa274 100644 --- a/dags/common/utils.py +++ b/dags/common/utils.py @@ -3,26 +3,30 @@ import os import re import tarfile -import pycountry import xml.etree.ElementTree as ET import zipfile from ftplib import error_perm from io import StringIO from stat import S_ISDIR, S_ISREG -from common.countries_mapping import COUNTRIES_DEFAULT_MAPPING import backoff +import pycountry import requests from airflow.models.dagrun import DagRun from airflow.utils.state import DagRunState from common.constants import ( BY_PATTERN, CDATA_PATTERN, + COUNTRY_PARSING_PATTERN, CREATIVE_COMMONS_PATTERN, LICENSE_PATTERN, ) -from common.exceptions import UnknownFileExtension, UnknownLicense, FoundMoreThanOneMatchOrNone -from common.constants import COUNTRY_PARSING_PATTERN +from common.countries_mapping import COUNTRIES_DEFAULT_MAPPING +from common.exceptions import ( + FoundMoreThanOneMatchOrNone, + UnknownFileExtension, + UnknownLicense, +) from structlog import get_logger logger = get_logger() @@ -259,17 +263,19 @@ def create_or_update_article(data): response.raise_for_status() return response.json() + def parse_country_from_value(affiliation_value): country = COUNTRY_PARSING_PATTERN.search(affiliation_value).group(0) try: mapped_countries = pycountry.countries.search_fuzzy(country) if len(mapped_countries) > 1 or len(mapped_countries) == 0: raise FoundMoreThanOneMatchOrNone(affiliation_value) - return COUNTRIES_DEFAULT_MAPPING[mapped_countries[0].name] + return mapped_countries[0].name except: return find_country_match_from_mapping(affiliation_value) + def find_country_match_from_mapping(affiliation_value): for key in COUNTRIES_DEFAULT_MAPPING: - if 
re.search(r'\b%s\b' % key, affiliation_value, flags=re.IGNORECASE): + if re.search(r"\b%s\b" % key, affiliation_value, flags=re.IGNORECASE): return COUNTRIES_DEFAULT_MAPPING[key] diff --git a/dags/elsevier/parser.py b/dags/elsevier/parser.py index 865f0fd1..8903690b 100644 --- a/dags/elsevier/parser.py +++ b/dags/elsevier/parser.py @@ -205,6 +205,7 @@ def _get_affiliation(self, article, ref_id="", affiliations=[]): affiliations.append( { "value": affiliation_value, + "country": parse_country_from_value(affiliation_value), } ) diff --git a/tests/integration/elsevier/test_elsevier_dag_pull_sftp.py b/tests/integration/elsevier/test_elsevier_dag_pull_sftp.py index 331fa15c..e9c64652 100644 --- a/tests/integration/elsevier/test_elsevier_dag_pull_sftp.py +++ b/tests/integration/elsevier/test_elsevier_dag_pull_sftp.py @@ -5,8 +5,8 @@ from common.repository import IRepository from elsevier.repository import ElsevierRepository from elsevier.sftp_service import ElsevierSFTPService -from structlog import get_logger from pytest import fixture +from structlog import get_logger DAG_NAME = "elsevier_pull_sftp" diff --git a/tests/units/elsevier/test_elsevier_parser.py b/tests/units/elsevier/test_elsevier_parser.py index cf63e533..0eab6547 100644 --- a/tests/units/elsevier/test_elsevier_parser.py +++ b/tests/units/elsevier/test_elsevier_parser.py @@ -251,7 +251,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -271,7 +272,8 @@ def parsed_articles(parser, articles): "given_names": "D.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -401,7 +403,8 @@ def 
parsed_articles(parser, articles): "given_names": "I.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -894,7 +897,8 @@ def parsed_articles(parser, articles): "given_names": "B.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" } ], }, @@ -1013,7 +1017,8 @@ def parsed_articles(parser, articles): "given_names": "S.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -1022,7 +1027,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -1069,7 +1075,8 @@ def parsed_articles(parser, articles): "given_names": "Y.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -1276,7 +1283,8 @@ def parsed_articles(parser, articles): "given_names": "D.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -1356,7 +1364,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a 
cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -1398,7 +1407,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -1456,7 +1466,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -1558,7 +1569,8 @@ def parsed_articles(parser, articles): "given_names": "D.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -1813,7 +1825,8 @@ def parsed_articles(parser, articles): "given_names": "C.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" } ], }, @@ -2651,7 +2664,8 @@ def parsed_articles(parser, articles): "given_names": "R.A.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" }, { "value": "Centro de Aplicaciones Tecnológicas y Desarrollo Nuclear (CEADEN), Havana, Cuba", @@ -2725,7 +2739,8 @@ def parsed_articles(parser, articles): "given_names": "U.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation 
agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -2965,7 +2980,8 @@ def parsed_articles(parser, articles): "given_names": "S.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -3073,7 +3089,8 @@ def parsed_articles(parser, articles): "given_names": "G.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -3159,7 +3176,8 @@ def parsed_articles(parser, articles): "given_names": "D.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -3234,7 +3252,8 @@ def parsed_articles(parser, articles): "given_names": "S.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -3298,7 +3317,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -3730,7 +3750,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -3739,7 +3760,8 @@ def 
parsed_articles(parser, articles): "given_names": "S.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" }, { "value": "A.I. Alikhanyan National Science Laboratory (Yerevan Physics Institute) Foundation, Yerevan, Armenia", @@ -4391,7 +4413,8 @@ def parsed_articles(parser, articles): "given_names": "R.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -4433,7 +4456,8 @@ def parsed_articles(parser, articles): "given_names": "M.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -4486,7 +4510,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -4495,7 +4520,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -4794,7 +4820,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -4825,7 +4852,8 @@ def parsed_articles(parser, 
articles): "given_names": "O.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -4834,7 +4862,8 @@ def parsed_articles(parser, articles): "given_names": "T.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -4854,7 +4883,8 @@ def parsed_articles(parser, articles): "given_names": "E.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -4874,7 +4904,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -4960,7 +4991,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -4969,7 +5001,8 @@ def parsed_articles(parser, articles): "given_names": "Y.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -5154,7 +5187,8 @@ def parsed_articles(parser, articles): "given_names": "S.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated 
with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -5295,7 +5329,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" } ], }, @@ -5304,7 +5339,8 @@ def parsed_articles(parser, articles): "given_names": "N.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -5313,7 +5349,8 @@ def parsed_articles(parser, articles): "given_names": "E.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -5399,7 +5436,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -5523,7 +5561,8 @@ def parsed_articles(parser, articles): "given_names": "E.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -5642,7 +5681,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -5651,7 +5691,8 @@ def 
parsed_articles(parser, articles): "given_names": "A.B.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -5830,7 +5871,8 @@ def parsed_articles(parser, articles): "given_names": "T.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -6075,7 +6117,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -6199,7 +6242,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -6241,7 +6285,8 @@ def parsed_articles(parser, articles): "given_names": "M.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -6283,7 +6328,8 @@ def parsed_articles(parser, articles): "given_names": "L.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" } ], }, @@ -6292,7 +6338,8 @@ def parsed_articles(parser, articles): "given_names": "D.", "affiliations": [ { - "value": "Affiliated with an institute covered by a 
cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -6339,7 +6386,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -6661,7 +6709,8 @@ def parsed_articles(parser, articles): "given_names": "Y.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -6779,10 +6828,12 @@ def parsed_articles(parser, articles): "given_names": "K.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" }, { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", }, ], }, @@ -6901,7 +6952,8 @@ def parsed_articles(parser, articles): "given_names": "I.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -7207,7 +7259,8 @@ def parsed_articles(parser, articles): "given_names": "D.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -7238,7 +7291,8 @@ def parsed_articles(parser, articles): "given_names": "S.", 
"affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -7247,7 +7301,8 @@ def parsed_articles(parser, articles): "given_names": "S.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -7256,7 +7311,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -7287,7 +7343,8 @@ def parsed_articles(parser, articles): "given_names": "P.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" } ], }, @@ -7329,7 +7386,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -7371,7 +7429,8 @@ def parsed_articles(parser, articles): "given_names": "V.A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -7676,7 +7735,8 @@ def parsed_articles(parser, articles): "given_names": "D.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated 
with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -7707,7 +7767,8 @@ def parsed_articles(parser, articles): "given_names": "Y.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -7727,7 +7788,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -7949,7 +8011,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" } ], }, @@ -8013,7 +8076,8 @@ def parsed_articles(parser, articles): "given_names": "I.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -8296,7 +8360,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -8305,7 +8370,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -8402,7 +8468,8 @@ def 
parsed_articles(parser, articles): "given_names": "R.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -8411,7 +8478,8 @@ def parsed_articles(parser, articles): "given_names": "E.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" } ], }, @@ -8612,7 +8680,8 @@ def parsed_articles(parser, articles): "given_names": "B.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" } ], }, @@ -8654,7 +8723,8 @@ def parsed_articles(parser, articles): "given_names": "E.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -8663,7 +8733,8 @@ def parsed_articles(parser, articles): "given_names": "Y.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -8727,7 +8798,8 @@ def parsed_articles(parser, articles): "given_names": "S.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -9208,7 +9280,8 @@ def parsed_articles(parser, articles): "country": "Germany", }, { - "value": "Affiliated with an institute 
covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", }, ], }, @@ -9239,7 +9312,8 @@ def parsed_articles(parser, articles): "given_names": "D.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -9281,7 +9355,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -9323,7 +9398,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -9453,7 +9529,8 @@ def parsed_articles(parser, articles): "given_names": "S.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -9473,7 +9550,8 @@ def parsed_articles(parser, articles): "given_names": "Y.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -9960,7 +10038,8 @@ def parsed_articles(parser, articles): "given_names": "M.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], 
}, @@ -10254,7 +10333,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -10307,7 +10387,8 @@ def parsed_articles(parser, articles): "given_names": "N.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -10672,7 +10753,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -10692,7 +10774,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -10800,7 +10883,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -10831,7 +10915,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN" + "value": "Affiliated with an international laboratory covered by a cooperation agreement with CERN", + "country": "CERN" } ], }, @@ -10862,7 +10947,8 @@ def parsed_articles(parser, articles): "given_names": "K.", "affiliations": [ { - "value": 
"Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -10915,7 +11001,8 @@ def parsed_articles(parser, articles): "given_names": "N.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -11336,7 +11423,8 @@ def parsed_articles(parser, articles): "given_names": "A.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -11356,7 +11444,8 @@ def parsed_articles(parser, articles): "given_names": "N.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -11365,7 +11454,8 @@ def parsed_articles(parser, articles): "given_names": "M.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -11429,7 +11519,8 @@ def parsed_articles(parser, articles): "given_names": "V.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement with CERN", + "country": "CERN", } ], }, @@ -11449,7 +11540,8 @@ def parsed_articles(parser, articles): "given_names": "N.", "affiliations": [ { - "value": "Affiliated with an institute covered by a cooperation agreement with CERN" + "value": "Affiliated with an institute covered by a cooperation agreement 
with CERN", + "country": "CERN", } ], }, From 26e65da5114120165fb76792391e0d4fb3869338 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Mon, 22 Jan 2024 11:52:58 +0100 Subject: [PATCH 4/7] pycountry==22.3.5 (because of constraints) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e4d768d8..146222ec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ busypie==0.4.5 pydantic==1.10.7 jsonschema==4.17.3 plyvel==1.5.0 -pycountry==23.12.11 +pycountry==22.3.5 From 7270ee2f5e5abb4d83b30a7a7c1862bfe7560c00 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Mon, 22 Jan 2024 15:03:00 +0100 Subject: [PATCH 5/7] IOP, OUP, Springer, Hindawi countries mapping --- dags/hindawi/parser.py | 5 +++-- dags/iop/parser.py | 6 +++++- dags/springer/parser.py | 8 ++++---- .../springer/test_springer_dag_process_file.py | 4 ++-- tests/units/springer/test_parser.py | 4 ++-- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/dags/hindawi/parser.py b/dags/hindawi/parser.py index d6500031..c62b8f1c 100644 --- a/dags/hindawi/parser.py +++ b/dags/hindawi/parser.py @@ -1,9 +1,10 @@ import re import xml.etree.ElementTree as ET -from common.constants import COUNTRY_PARSING_PATTERN, ORGANIZATION_PARSING_PATTERN +from common.constants import ORGANIZATION_PARSING_PATTERN from common.parsing.parser import IParser from common.parsing.xml_extractors import ConstantExtractor, CustomExtractor +from common.utils import parse_country_from_value from hindawi.xml_extractors import HindawiTextExtractor as TextExtractor from structlog import get_logger @@ -121,7 +122,7 @@ def _get_affiliations(self, author): { "value": affiliation.text, "organization": ORGANIZATION_PARSING_PATTERN.sub("", affiliation.text), - "country": COUNTRY_PARSING_PATTERN.search(affiliation.text).group(0), + "country": parse_country_from_value(affiliation.text), } for affiliation in affiliations ] diff --git a/dags/iop/parser.py
b/dags/iop/parser.py index a27b8057..41a14239 100644 --- a/dags/iop/parser.py +++ b/dags/iop/parser.py @@ -18,6 +18,7 @@ get_license_type, get_license_type_and_version_from_url, parse_to_int, + parse_country_from_value ) from idutils import is_arxiv from inspire_utils.date import PartialDate @@ -303,12 +304,15 @@ def _get_institution(self, article, id): ) def _get_country(self, article, id): - return extract_text( + country = extract_text( article=article, path=f"front/article-meta/contrib-group/aff[@id='{id}']/country", field_name="country", dois=self.dois, ) + if not country: + return + return parse_country_from_value(country) def _extract_copyright_year(self, article): return extract_text( diff --git a/dags/springer/parser.py b/dags/springer/parser.py index 90bfc50d..53d781d5 100644 --- a/dags/springer/parser.py +++ b/dags/springer/parser.py @@ -9,7 +9,7 @@ CustomExtractor, TextExtractor, ) -from common.utils import construct_license +from common.utils import construct_license, parse_country_from_value from structlog import get_logger @@ -171,12 +171,12 @@ def _clean_aff(self, article: ET.Element): city_node, state_node, postcode_node, - country_node, ] if node is not None ] - - return ", ".join(result), org_name_node.text, country_node.text + country = parse_country_from_value(country_node.text) + result.append(country) + return ", ".join(result), org_name_node.text, country def _get_published_date(self, article: ET.Element): year = article.find( diff --git a/tests/integration/springer/test_springer_dag_process_file.py b/tests/integration/springer/test_springer_dag_process_file.py index 06969cb8..1fe85eff 100644 --- a/tests/integration/springer/test_springer_dag_process_file.py +++ b/tests/integration/springer/test_springer_dag_process_file.py @@ -236,9 +236,9 @@ def test_dag_validate_file_pass(article): "email": "nosaka@yukawa.kyoto-u.ac.jp", "affiliations": [ { - "value": "School of Physics, Korea Institute for Advanced Study, Dongdaemun-gu, Seoul, 02455, 
Korea", + "value": "School of Physics, Korea Institute for Advanced Study, Dongdaemun-gu, Seoul, 02455, South Korea", "organization": "School of Physics, Korea Institute for Advanced Study", - "country": "Korea", + "country": "South Korea", } ], "full_name": "Nosaka, Tomoki", diff --git a/tests/units/springer/test_parser.py b/tests/units/springer/test_parser.py index f8165480..75deea36 100644 --- a/tests/units/springer/test_parser.py +++ b/tests/units/springer/test_parser.py @@ -72,8 +72,8 @@ def test_authors(parsed_articles): "affiliations": [ { "organization": "School of Physics, Korea Institute for Advanced Study", - "value": "School of Physics, Korea Institute for Advanced Study, Dongdaemun-gu, Seoul, 02455, Korea", - "country": "Korea", + "value": "School of Physics, Korea Institute for Advanced Study, Dongdaemun-gu, Seoul, 02455, South Korea", + "country": "South Korea", } ], "surname": "Nosaka", From 8da4d917495728565defa014bc41ab9ec94a4b84 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Wed, 21 Feb 2024 11:38:24 +0100 Subject: [PATCH 6/7] Added one more value --- dags/common/countries_mapping.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dags/common/countries_mapping.py b/dags/common/countries_mapping.py index 37a149af..92920183 100644 --- a/dags/common/countries_mapping.py +++ b/dags/common/countries_mapping.py @@ -6,6 +6,7 @@ "DPR. 
Korea": "North Korea", "CERN": "CERN", "European Organization for Nuclear Research": "CERN", + "Conseil Européen pour la Recherche Nucléaire": "CERN", "KEK": "Japan", "DESY": "Germany", "FERMILAB": "USA", From 1373ce6a750195f6dd5bd65f87426cc84920c213 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Wed, 21 Feb 2024 13:39:04 +0100 Subject: [PATCH 7/7] added South Korea mapping --- dags/common/countries_mapping.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dags/common/countries_mapping.py b/dags/common/countries_mapping.py index 92920183..8c54f02f 100644 --- a/dags/common/countries_mapping.py +++ b/dags/common/countries_mapping.py @@ -2,6 +2,7 @@ "INFN": "Italy", "Democratic People's Republic of Korea": "North Korea", "Korea, Democratic People's Republic of": "North Korea", + "Korea, Republic of": "South Korea", "DPR Korea": "North Korea", "DPR. Korea": "North Korea", "CERN": "CERN",