From 976b3bc05aebc32fd6757b72b0afc58b34475629 Mon Sep 17 00:00:00 2001 From: Harris Tzovanakis Date: Thu, 13 Jun 2024 08:41:48 +0200 Subject: [PATCH] assign: fix author `can_claim` check * ref: cern-sis/issues-inspire#467 --- backend/inspirehep/assign/utils.py | 51 +++++++++++-------- backend/inspirehep/assign/views.py | 2 +- .../tests/unit/assign/test_assign_utils.py | 50 ++++++++++++++++++ 3 files changed, 80 insertions(+), 23 deletions(-) create mode 100644 backend/tests/unit/assign/test_assign_utils.py diff --git a/backend/inspirehep/assign/utils.py b/backend/inspirehep/assign/utils.py index 0ecb5e7533..e9ad190da2 100644 --- a/backend/inspirehep/assign/utils.py +++ b/backend/inspirehep/assign/utils.py @@ -4,6 +4,7 @@ # # inspirehep is free software; you can redistribute it and/or modify it under # the terms of the MIT License; see LICENSE file for more details. + import structlog from flask import request from inspire_dojson.utils import get_recid_from_ref @@ -14,6 +15,7 @@ from invenio_pidstore.models import PersistentIdentifier from invenio_records.models import RecordMetadata from sqlalchemy.orm.exc import NoResultFound +from unidecode import unidecode from inspirehep.accounts.api import get_current_user_orcid from inspirehep.records.api import AuthorsRecord @@ -108,30 +110,35 @@ def can_claim(data, author_profile_recid): if not lit_record: return False - author_parsed_name = ParsedName.loads(current_author_profile["name"]["value"]) - author_names = { - current_author_profile["name"]["value"], - author_parsed_name.last, - str(author_parsed_name), # removes ',' and puts it in normal order - } - author_names.update( - [ - author_name.split(",")[0] - for author_name in get_value( - current_author_profile, "name.name_variants", [] - ) - ] - ) + def get_last_names(name): + parsed_name = ParsedName.loads(name) + # corner case for single name (ie. 
"Smith") + if len(parsed_name) == 1: + return {unidecode(parsed_name.first)} + # corner case for full names without comma, + # we are treating them as last names (ie. "Smith Davis") + if "," not in name: + names = name.split() + else: + names = parsed_name.last_list + + last_names = set() + for last_name in names: + last_name = unidecode(last_name) + last_names.add(last_name) + return last_names + + author_last_names = set() + author_last_names.update(get_last_names(current_author_profile["name"]["value"])) + for variant in get_value(current_author_profile, "name.name_variants", []): + author_last_names.update(get_last_names(variant)) lit_author = get_author_by_recid(lit_record, int(author_profile_recid)) - lit_author_parsed_name = ParsedName.loads(lit_author.get("full_name", "")) - lit_author_names = { - lit_author.get("full_name", ""), - lit_author_parsed_name.last, - str(lit_author_parsed_name), - } + lit_author_last_names = set() + if lit_author: + lit_author_last_names.update(get_last_names(lit_author.get("full_name", ""))) - return lit_author_names & author_names + return bool(author_last_names & lit_author_last_names) def _check_names_compability(lit_record, author_parsed_name, last_names_only=False): @@ -141,7 +148,7 @@ def _check_names_compability(lit_record, author_parsed_name, last_names_only=Fal author_name_to_compare = ( author_parsed_name.last if last_names_only - else f"{ author_parsed_name.last}, {author_parsed_name.first}".strip(", ") + else f"{author_parsed_name.last}, {author_parsed_name.first}".strip(", ") ) matched_authors_recids = [ recid diff --git a/backend/inspirehep/assign/views.py b/backend/inspirehep/assign/views.py index 119b36c1be..130afa879c 100644 --- a/backend/inspirehep/assign/views.py +++ b/backend/inspirehep/assign/views.py @@ -166,7 +166,7 @@ def assign_different_profile(args): for literature_id in literature_ids: record = LiteratureRecord.get_record_by_pid_value(literature_id) - if record.get("curated") and not 
is_from_author_stub: + if record.get("curated_relation") and not is_from_author_stub: literature_ids_already_claimed.append(literature_id) if not can_claim(record, from_author_recid): literature_ids_not_compatible_name.append(literature_id) diff --git a/backend/tests/unit/assign/test_assign_utils.py b/backend/tests/unit/assign/test_assign_utils.py new file mode 100644 index 0000000000..1b8075146a --- /dev/null +++ b/backend/tests/unit/assign/test_assign_utils.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2019 CERN. +# +# inspirehep is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. + +import pytest +from mock import patch + +from inspirehep.assign.utils import can_claim + + +@pytest.mark.parametrize( + "authors_full_name, profile_name, name_variants, expected", + [ + ("Smith, John", "Smith, J.", [], True), + ("Smith, J.", "Smith, John", [], True), + ("Smith Davis, J.", "Smith, Robert", [], True), + ("Davis, J.", "Smith Davis, P.", [], True), + ("Smith, J.", "Davis, Smith", [], False), + ("Smïth, J.", "Smith, J.", [], True), + ("Smith Davis", "Smith, J.", [], True), + ("Smith, J.", "Smith", [], True), + ], +) +@patch("inspirehep.assign.utils._get_current_user_author_profile") +@patch("inspirehep.assign.utils._get_lit_record_from_db") +@patch("inspirehep.assign.utils.get_author_by_recid") +def test_can_claim( + mock_get_author_by_recid, + mock_get_lit_record_from_db, + mock_get_current_user_author_profile, + authors_full_name, + profile_name, + name_variants, + expected, +): + mock_get_current_user_author_profile.return_value = { + "name": {"value": profile_name}, + "name_variants": name_variants, + } + mock_get_lit_record_from_db.return_value = {"control_number": 123} + mock_get_author_by_recid.return_value = {"full_name": authors_full_name} + + data = {"control_number": 123} + author_profile_recid = 1 + + result = can_claim(data, author_profile_recid) + assert 
result == expected