From b6d602f3401a1764e800ccbbbc3dd98485b27a23 Mon Sep 17 00:00:00 2001
From: MJedr
Date: Wed, 6 Jul 2022 13:34:11 +0200
Subject: [PATCH] move `normalize_affiliations` from next

ref: cern-sis/issues-inspire#48
---
Review note: the added lines below were revised after the commit was created
(highlight regexes match Elasticsearch's default <em>...</em> tags again,
docstrings added), so the blob hashes on the "index" lines predate this
revision.

 inspire_utils/record.py | 150 +++++++++++++++++++++++++++++++++++++++-
 setup.py                |   2 +
 2 files changed, 149 insertions(+), 3 deletions(-)

diff --git a/inspire_utils/record.py b/inspire_utils/record.py
index 21ec1e2..c6b2ba6 100644
--- a/inspire_utils/record.py
+++ b/inspire_utils/record.py
@@ -22,12 +22,14 @@
 
 from __future__ import absolute_import, division, print_function
 
-from six import string_types
-
 import re
 
+from elasticsearch_dsl import Q, Search
+from six import string_types
 
-SPLIT_KEY_PATTERN = re.compile(r'\.|\[')
+from .dedupers import dedupe_list
+
+SPLIT_KEY_PATTERN = re.compile(r"\.|\[")
 
 
 def get_value(record, key, default=None):
@@ -512,3 +514,145 @@ def replace_undesirable_characters(line):
         line = line.replace(bad_char, replacement)
 
     return line
+
+
+def _match_lit_author_affiliation(raw_aff, current_search):
+    """Search curated Literature records for authors with this raw affiliation.
+
+    Only authors that also carry a normalized ``affiliations.value`` are
+    matched. Returns the hits of the executed search (at most 20), with
+    ``_source`` disabled, nested ``authors`` inner hits and highlighting on
+    ``authors.raw_affiliations.value`` enabled.
+    """
+    query = Q(
+        "nested",
+        path="authors",
+        query=(
+            Q("match", authors__raw_affiliations__value=raw_aff)
+            & Q("exists", field="authors.affiliations.value")
+        ),
+        inner_hits={},
+    )
+    query_filters = Q("term", _collections="Literature") & Q("term", curated=True)
+    result = (
+        Search(index="records-hep", using=current_search)
+        .query(query)
+        .filter(query_filters)
+        .highlight("authors.raw_affiliations.value", fragment_size=len(raw_aff))
+        .source(False)
+        .params(size=20)
+        .execute()
+        .hits
+    )
+    return result
+
+
+def _clean_up_affiliation_data(affiliations):
+    """Return copies of ``affiliations`` keeping only ``value`` and ``record``."""
+    allowed_keys = ("value", "record")
+    return [
+        {key: val for key, val in aff.items() if key in allowed_keys}
+        for aff in affiliations
+    ]
+
+
+def _find_unambiguous_affiliation(result):
+    """Return the cleaned affiliations of the first unambiguous hit, or None.
+
+    A hit is usable when the matched author has a single raw affiliation
+    (all of the author's affiliations are taken), or when raw and normalized
+    affiliations have equal length, in which case they are paired by position
+    and the one whose raw text is best highlighted is picked.
+    """
+    for matched_author in result:
+        matched_author_data = matched_author.meta.inner_hits.authors.hits[0].to_dict()
+        matched_author_raw_affs = matched_author_data["raw_affiliations"]
+        matched_author_affs = matched_author_data["affiliations"]
+        matched_aff = []
+        if len(matched_author_raw_affs) == 1:
+            matched_aff = matched_author_affs
+        elif len(matched_author_raw_affs) == len(matched_author_affs):
+            matched_aff = _extract_matched_aff_from_highlight(
+                matched_author.meta.highlight["authors.raw_affiliations.value"],
+                matched_author_raw_affs,
+                matched_author_affs,
+            )
+        if matched_aff:
+            return _clean_up_affiliation_data(matched_aff)
+    return None
+
+
+def _raw_aff_highlight_len(highlighted_raw_aff):
+    """Return the total number of characters wrapped in ``<em>`` tags."""
+    # Elasticsearch wraps highlighted terms in <em>...</em> by default.
+    matches = re.findall(r"<em>(.*?)</em>", highlighted_raw_aff)
+    return sum(len(match) for match in matches)
+
+
+def _extract_matched_aff_from_highlight(
+    highlighted_raw_affs, author_raw_affs, author_affs
+):
+    """Return ``[aff]`` for the raw affiliation with the longest highlight.
+
+    The best highlighted fragment is stripped of its ``<em>`` tags and
+    compared with each raw affiliation value; the affiliation at the same
+    position as the equal raw affiliation is returned. Returns None when
+    there is no highlight or no raw affiliation equals the fragment.
+    """
+    if not highlighted_raw_affs:
+        return None
+    best_highlight = max(highlighted_raw_affs, key=_raw_aff_highlight_len)
+    extracted_raw_aff = re.sub("<em>|</em>", "", best_highlight)
+    for raw_aff, aff in zip(author_raw_affs, author_affs):
+        if raw_aff["value"] == extracted_raw_aff:
+            return [aff]
+    return None
+
+
+def normalize_affiliations(data, current_seearch):
+    """Normalize author raw affiliations in a literature record.
+
+    Authors that already have ``affiliations`` are left untouched. For the
+    others, every raw affiliation is looked up in curated Literature records
+    and successful lookups are reused for later authors.
+
+    Params:
+        data (dict): data containing list of authors with affiliations to
+            normalize
+        current_seearch (LocalProxy): Elasticsearch client (the misspelled
+            name is kept so that keyword callers do not break)
+
+    Returns:
+        normalized_affiliations: list containing normalized affiliations for
+            each author
+        ambiguous_affiliations: not matched (not normalized) affiliations
+    """
+    matched_affiliations = {}
+    normalized_affiliations = []
+    ambiguous_affiliations = []
+    for author in data.get("authors", []):
+        author_affiliations = author.get("affiliations", [])
+        if author_affiliations:
+            normalized_affiliations.append(author_affiliations)
+            continue
+        raw_affs = get_value(author, "raw_affiliations.value", [])
+        for raw_aff in raw_affs:
+            if raw_aff in matched_affiliations:
+                author_affiliations.extend(matched_affiliations[raw_aff])
+                continue
+            matched_author_affiliations_hits = _match_lit_author_affiliation(
+                raw_aff, current_seearch
+            )
+            matched_author_affiliations = _find_unambiguous_affiliation(
+                matched_author_affiliations_hits
+            )
+            if matched_author_affiliations:
+                matched_affiliations[raw_aff] = matched_author_affiliations
+                author_affiliations.extend(matched_author_affiliations)
+            else:
+                ambiguous_affiliations.append(raw_aff)
+        normalized_affiliations.append(dedupe_list(author_affiliations))
+    return (
+        normalized_affiliations,
+        ambiguous_affiliations,
+    )
diff --git a/setup.py b/setup.py
index 994677e..dbc2a73 100644
--- a/setup.py
+++ b/setup.py
@@ -42,6 +42,8 @@
     'nameparser~=0.0,>=0.5.3',
     'python-dateutil~=2.0,>=2.6.1',
     'six~=1.0,>=1.10.0',
+    'elasticsearch==7.1.0',
+    'elasticsearch-dsl~=7.1',
 ]
 
 docs_require = []