Skip to content

Commit

Permalink
Merge pull request #77 from MJedr/add-normalization
Browse files Browse the repository at this point in the history
move `normalize_affiliations` from next
  • Loading branch information
MJedr authored Jul 7, 2022
2 parents b239570 + b6d602f commit 512714f
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 3 deletions.
121 changes: 118 additions & 3 deletions inspire_utils/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@

from __future__ import absolute_import, division, print_function

from six import string_types

import re

from elasticsearch_dsl import Q, Search
from six import string_types

SPLIT_KEY_PATTERN = re.compile(r'\.|\[')
from .dedupers import dedupe_list

SPLIT_KEY_PATTERN = re.compile(r"\.|\[")


def get_value(record, key, default=None):
Expand Down Expand Up @@ -512,3 +514,116 @@ def replace_undesirable_characters(line):
line = line.replace(bad_char, replacement)

return line


def _match_lit_author_affiliation(raw_aff, current_search):
    """
    Queries the ``records-hep`` index for authors whose raw affiliation
    matches ``raw_aff`` and who already have an ``affiliations.value``.
    Params:
        raw_aff (str): raw affiliation text to match
        current_search: Elasticsearch client handed to ``Search(using=...)``
    Returns:
        the ``hits`` of the executed search
    """
    # Both conditions must hold on the same nested author, hence a single
    # nested query; inner_hits exposes which author actually matched.
    raw_affiliation_matches = Q("match", authors__raw_affiliations__value=raw_aff)
    affiliation_exists = Q("exists", field="authors.affiliations.value")
    nested_author_query = Q(
        "nested",
        path="authors",
        query=(raw_affiliation_matches & affiliation_exists),
        inner_hits={},
    )
    curated_literature = Q("term", _collections="Literature") & Q(
        "term", curated=True
    )

    search = Search(index="records-hep", using=current_search)
    search = search.query(nested_author_query)
    search = search.filter(curated_literature)
    # The highlight fragment size is set to the length of the searched text.
    search = search.highlight(
        "authors.raw_affiliations.value", fragment_size=len(raw_aff)
    )
    search = search.source(False)
    search = search.params(size=20)
    return search.execute().hits


def _clean_up_affiliation_data(affiliations):
cleaned_affiliations = []
for aff in affiliations:
cleaned_affiliations.append(
{key: val for key, val in aff.items() if key in ["value", "record"]}
)
return cleaned_affiliations


def _find_unambiguous_affiliation(result):
for matched_author in result:
matched_author_data = matched_author.meta.inner_hits.authors.hits[0].to_dict()
matched_author_raw_affs = matched_author_data["raw_affiliations"]
matched_author_affs = matched_author_data["affiliations"]
matched_aff = []
if len(matched_author_raw_affs) == 1:
matched_aff = matched_author_affs
elif len(matched_author_raw_affs) == len(matched_author_affs):
matched_aff = _extract_matched_aff_from_highlight(
matched_author.meta.highlight["authors.raw_affiliations.value"],
matched_author_raw_affs,
matched_author_affs,
)
if matched_aff:
return _clean_up_affiliation_data(matched_aff)


def _raw_aff_highlight_len(highlighted_raw_aff):
matches = re.findall(r"<em>(.*?)</em>", highlighted_raw_aff)
return sum(len(match) for match in matches)


def _extract_matched_aff_from_highlight(
highlighted_raw_affs, author_raw_affs, author_affs
):
raw_aff_highlight_lenghts = [
_raw_aff_highlight_len(raw_aff) for raw_aff in highlighted_raw_affs
]
longest_highlight_idx = raw_aff_highlight_lenghts.index(
max(raw_aff_highlight_lenghts)
)
extracted_raw_aff = re.sub(
"<em>|</em>", "", highlighted_raw_affs[longest_highlight_idx]
)
for raw_aff, aff in zip(author_raw_affs, author_affs):
if raw_aff["value"] == extracted_raw_aff:
return [aff]


def normalize_affiliations(data, current_seearch):
    """
    Normalizes author raw affiliations in literature record.
    Params:
        data (dict): data containing list of authors with affiliations to normalize
        current_seearch (LocalProxy): Elasticsearch client (the parameter name
            keeps its historical spelling so keyword callers do not break)
    Returns:
        normalized_affiliations: list containing normalized affiliations for each author
        ambiguous_affiliations: not matched (not normalized) affiliations
    """
    # Successful matches are remembered so each distinct raw affiliation is
    # looked up only once per call.
    matched_affiliations = {}
    normalized_affiliations = []
    ambiguous_affiliations = []
    for author in data.get("authors", []):
        existing_affiliations = author.get("affiliations", [])
        if existing_affiliations:
            # Author already has affiliations: pass them through untouched.
            normalized_affiliations.append(existing_affiliations)
            continue
        # Accumulate into a fresh list: extending the list returned by
        # author.get() would modify the caller's record whenever the author
        # carries an empty "affiliations" list.
        author_affiliations = []
        raw_affs = get_value(author, "raw_affiliations.value", [])
        for raw_aff in raw_affs:
            if raw_aff in matched_affiliations:
                author_affiliations.extend(matched_affiliations[raw_aff])
                continue
            matched_author_affiliations_hits = _match_lit_author_affiliation(
                raw_aff, current_seearch
            )
            matched_author_affiliations = _find_unambiguous_affiliation(
                matched_author_affiliations_hits
            )
            if matched_author_affiliations:
                matched_affiliations[raw_aff] = matched_author_affiliations
                author_affiliations.extend(matched_author_affiliations)
            else:
                ambiguous_affiliations.append(raw_aff)
        normalized_affiliations.append(dedupe_list(author_affiliations))
    return (
        normalized_affiliations,
        ambiguous_affiliations,
    )
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
'nameparser~=0.0,>=0.5.3',
'python-dateutil~=2.0,>=2.6.1',
'six~=1.0,>=1.10.0',
'elasticsearch==7.1.0',
'elasticsearch-dsl~=7.1'
]

docs_require = []
Expand Down

0 comments on commit 512714f

Please sign in to comment.