From b6d602f3401a1764e800ccbbbc3dd98485b27a23 Mon Sep 17 00:00:00 2001
From: MJedr <jedrych.marcysia@gmail.com>
Date: Wed, 6 Jul 2022 13:34:11 +0200
Subject: [PATCH] move `normalize_affiliations` from next ref:
 cern-sis/issues-inspire#48

---
 inspire_utils/record.py | 121 +++++++++++++++++++++++++++++++++++++++-
 setup.py                |   2 +
 2 files changed, 120 insertions(+), 3 deletions(-)
diff --git a/inspire_utils/record.py b/inspire_utils/record.py
index 21ec1e2..c6b2ba6 100644
--- a/inspire_utils/record.py
+++ b/inspire_utils/record.py
@@ -22,12 +22,14 @@
 
 from __future__ import absolute_import, division, print_function
 
-from six import string_types
-
 import re
 
+from elasticsearch_dsl import Q, Search
+from six import string_types
 
-SPLIT_KEY_PATTERN = re.compile(r'\.|\[')
+from .dedupers import dedupe_list
+
+SPLIT_KEY_PATTERN = re.compile(r"\.|\[")
 
 
 def get_value(record, key, default=None):
@@ -512,3 +514,116 @@ def replace_undesirable_characters(line):
         line = line.replace(bad_char, replacement)
 
     return line
+
+
+def _match_lit_author_affiliation(raw_aff, current_search):
+    """Search curated Literature records for authors matching ``raw_aff``.
+
+    Returns the hits of a nested ``authors`` query (at most 20), with inner
+    hits and a highlight on ``authors.raw_affiliations.value``.
+    """
+    raw_aff_matches = Q("match", authors__raw_affiliations__value=raw_aff)
+    has_affiliations = Q("exists", field="authors.affiliations.value")
+    nested_query = Q(
+        "nested",
+        path="authors",
+        query=(raw_aff_matches & has_affiliations),
+        inner_hits={},
+    )
+    curated_literature = Q("term", _collections="Literature") & Q("term", curated=True)
+    search = Search(index="records-hep", using=current_search)
+    search = search.query(nested_query).filter(curated_literature)
+    search = search.highlight(
+        "authors.raw_affiliations.value", fragment_size=len(raw_aff)
+    )
+    return search.source(False).params(size=20).execute().hits
+
+
+def _clean_up_affiliation_data(affiliations):
+    """Return copies of ``affiliations`` keeping only ``value`` and ``record``."""
+    allowed_keys = ["value", "record"]
+    return [
+        {name: content for name, content in entry.items() if name in allowed_keys}
+        for entry in affiliations
+    ]
+
+
+def _find_unambiguous_affiliation(result):
+    """Return the cleaned affiliations of the first usable hit, else ``None``.
+
+    Usable: one raw affiliation, or equal raw/normalized counts plus a highlight match.
+    """
+    for hit in result:
+        author = hit.meta.inner_hits.authors.hits[0].to_dict()
+        raw_affs, affs = author["raw_affiliations"], author["affiliations"]
+        candidate = []
+        if len(raw_affs) == 1:
+            candidate = affs
+        elif len(raw_affs) == len(affs):
+            highlights = hit.meta.highlight["authors.raw_affiliations.value"]
+            candidate = _extract_matched_aff_from_highlight(highlights, raw_affs, affs)
+        if candidate:
+            return _clean_up_affiliation_data(candidate)
+
+
+def _raw_aff_highlight_len(highlighted_raw_aff):
+    """Return the total number of characters wrapped in ``<em>`` tags."""
+    return sum(map(len, re.findall(r"<em>(.*?)</em>", highlighted_raw_aff)))
+
+
+def _extract_matched_aff_from_highlight(
+    highlighted_raw_affs, author_raw_affs, author_affs
+):
+    """Return ``[aff]`` for the raw affiliation with the longest highlight.
+
+    Returns ``None`` if nothing was highlighted or no raw affiliation matches.
+    """
+    if not highlighted_raw_affs:
+        return None  # ``max`` would raise ValueError on an empty sequence
+    best_highlight = max(highlighted_raw_affs, key=_raw_aff_highlight_len)
+    extracted_raw_aff = re.sub(r"<em>|</em>", "", best_highlight)
+    for raw_aff, aff in zip(author_raw_affs, author_affs):
+        if raw_aff["value"] == extracted_raw_aff:
+            return [aff]
+    return None
+
+
+def normalize_affiliations(data, current_seearch):
+    """Normalize authors' raw affiliations in a literature record.
+
+    Authors that already have affiliations keep them untouched. Every other
+    raw affiliation is looked up in Elasticsearch; matches are reused for
+    later authors that share the same raw affiliation string.
+
+    Args:
+        data (dict): record data with an optional ``authors`` list.
+        current_seearch: Elasticsearch client handed to the search (the
+            misspelt name is kept so keyword callers keep working).
+
+    Returns:
+        tuple: ``(normalized_affiliations, ambiguous_affiliations)`` - one
+        list of affiliations per author, and the raw affiliation strings
+        for which no unambiguous match was found.
+    """
+    matched_affiliations = {}
+    normalized_affiliations = []
+    ambiguous_affiliations = []
+    for author in data.get("authors", []):
+        existing_affiliations = author.get("affiliations")
+        if existing_affiliations:
+            normalized_affiliations.append(existing_affiliations)
+            continue
+        # Always start from a fresh list: extending the list stored on the
+        # author (e.g. ``"affiliations": []``) would mutate the input record.
+        author_affiliations = []
+        for raw_aff in get_value(author, "raw_affiliations.value", []):
+            if raw_aff not in matched_affiliations:
+                hits = _match_lit_author_affiliation(raw_aff, current_seearch)
+                found = _find_unambiguous_affiliation(hits)
+                if not found:
+                    ambiguous_affiliations.append(raw_aff)
+                    continue
+                matched_affiliations[raw_aff] = found
+            author_affiliations.extend(matched_affiliations[raw_aff])
+        normalized_affiliations.append(dedupe_list(author_affiliations))
+    return normalized_affiliations, ambiguous_affiliations
diff --git a/setup.py b/setup.py
index 994677e..dbc2a73 100644
--- a/setup.py
+++ b/setup.py
@@ -42,6 +42,8 @@
     'nameparser~=0.0,>=0.5.3',
     'python-dateutil~=2.0,>=2.6.1',
     'six~=1.0,>=1.10.0',
+    'elasticsearch==7.1.0',
+    'elasticsearch-dsl~=7.1'
 ]
 
 docs_require = []