Skip to content

Commit

Permalink
Reduce edition match score if one edition is missing a date (#10462)
Browse files Browse the repository at this point in the history
* reduce edition match score if one edition is missing a date
closes #10461
* use type alias for threshold results
* increase penalty for date mismatch on editions
* tidy existing match tests, and add 2 more
* add a full import test for the pre-ISBN / nodate ISBN scenario
  • Loading branch information
hornc authored Feb 25, 2025
1 parent 0670b15 commit 51ee743
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 75 deletions.
29 changes: 14 additions & 15 deletions openlibrary/catalog/add_book/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def __str__(self):
subject_fields = ['subjects', 'subject_places', 'subject_times', 'subject_people']


def normalize(s):
def normalize(s: str) -> str:
"""Strip non-alphanums and truncate at 25 chars."""
norm = strip_accents(s).lower()
norm = norm.replace(' and ', ' ')
Expand All @@ -160,7 +160,7 @@ def is_redirect(thing):
return thing.type.key == '/type/redirect'


def split_subtitle(full_title):
def split_subtitle(full_title: str):
"""
Splits a title into (title, subtitle),
strips parenthetical tags. Used for bookseller
Expand Down Expand Up @@ -220,7 +220,6 @@ def build_author_reply(authors_in, edits, source):
:rtype: tuple
:return: (list, list) authors [{"key": "/author/OL..A"}, ...], author_reply
"""

authors = []
author_reply = []
for a in authors_in:
Expand Down Expand Up @@ -386,7 +385,7 @@ def update_ia_metadata_for_ol_edition(edition_id):
return data


def normalize_record_bibids(rec):
def normalize_record_bibids(rec: dict):
"""
Returns the Edition import record with all ISBN fields and LCCNs cleaned.
Expand All @@ -397,16 +396,18 @@ def normalize_record_bibids(rec):
for field in ('isbn_13', 'isbn_10', 'isbn'):
if rec.get(field):
rec[field] = [
normalize_isbn(isbn) for isbn in rec.get(field) if normalize_isbn(isbn)
normalize_isbn(isbn)
for isbn in rec.get(field, '')
if normalize_isbn(isbn)
]
if rec.get('lccn'):
rec['lccn'] = [
normalize_lccn(lccn) for lccn in rec.get('lccn') if normalize_lccn(lccn)
normalize_lccn(lccn) for lccn in rec.get('lccn', '') if normalize_lccn(lccn)
]
return rec


def isbns_from_record(rec):
def isbns_from_record(rec: dict) -> list[str]:
"""
Returns a list of all isbns from the various possible isbn fields.
Expand All @@ -417,7 +418,7 @@ def isbns_from_record(rec):
return isbns


def build_pool(rec):
def build_pool(rec: dict) -> dict[str, list[str]]:
"""
Searches for existing edition matches on title and bibliographic keys.
Expand All @@ -440,7 +441,6 @@ def build_pool(rec):
# Find records with matching ISBNs
if isbns := isbns_from_record(rec):
pool['isbn'] = set(editions_matched(rec, 'isbn_', isbns))

return {k: list(v) for k, v in pool.items() if v}


Expand All @@ -451,7 +451,6 @@ def find_quick_match(rec: dict) -> str | None:
:param dict rec: Edition record
:return: First key matched of format "/books/OL..M" or None if no match found.
"""

if 'openlibrary' in rec:
return '/books/' + rec['openlibrary']

Expand Down Expand Up @@ -480,14 +479,14 @@ def find_quick_match(rec: dict) -> str | None:
return None


def editions_matched(rec, key, value=None):
def editions_matched(rec: dict, key: str, value=None) -> list[str]:
"""
Search OL for editions matching record's 'key' value.
:param dict rec: Edition import record
:param str key: Key to search on, e.g. 'isbn_'
:param list|str value: Value or Values to use, overriding record values
:rtpye: list
:rtype: list
:return: List of edition keys ["/books/OL..M",]
"""
if value is None and key not in rec:
Expand All @@ -500,11 +499,11 @@ def editions_matched(rec, key, value=None):
return ekeys


def find_threshold_match(rec: dict, edition_pool: dict) -> str | None:
def find_threshold_match(rec: dict, edition_pool: dict[str, list[str]]) -> str | None:
"""
Find the best match for rec in edition_pool and return its key.
:param dict rec: the new edition we are trying to match.
:param list edition_pool: list of possible edition key matches, output of build_pool(import record)
:param dict edition_pool: edition key matches, output of build_pool(import record)
:return: None or the edition key '/books/OL...M' of the best edition match for enriched_rec in edition_pool
"""
seen = set()
Expand Down Expand Up @@ -730,7 +729,7 @@ def normalize_import_record(rec: dict) -> None:

# Split subtitle if required and not already present
if ':' in rec.get('title', '') and not rec.get('subtitle'):
title, subtitle = split_subtitle(rec.get('title'))
title, subtitle = split_subtitle(rec.get('title', ''))
if subtitle:
rec['title'] = title
rec['subtitle'] = subtitle
Expand Down
54 changes: 26 additions & 28 deletions openlibrary/catalog/add_book/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
re_brackets = re.compile(r'^(.+)\[.*?\]$')
re_whitespace_and_punct = re.compile(r'[-\s,;:.]+')

type ThresholdResult = tuple[str, str, int] # (field/category, result, score)

DATE_MISMATCH = -800
ISBN_MATCH = 85
THRESHOLD = 875

Expand Down Expand Up @@ -60,7 +63,6 @@ def normalize(s: str) -> str:
by lowercasing, unicode -> NFC,
stripping extra whitespace and punctuation, and replacing ampersands.
"""

s = unicodedata.normalize('NFC', s)
s = s.replace(' & ', ' and ')
s = re_whitespace_and_punct.sub(' ', s.lower()).strip()
Expand Down Expand Up @@ -153,7 +155,7 @@ def expand_record(rec: dict) -> dict[str, str | list[str]]:
return expanded_rec


def build_titles(title: str):
def build_titles(title: str) -> dict:
"""
Uses a full title to create normalized and short title versions.
Used for expanding a set of title variants for matching,
Expand Down Expand Up @@ -181,11 +183,11 @@ def build_titles(title: str):
}


def within(a, b, distance):
def within(a, b, distance) -> bool:
return abs(a - b) <= distance


def compare_country(e1: dict, e2: dict):
def compare_country(e1: dict, e2: dict) -> ThresholdResult:
field = 'publish_country'
if field not in e1 or field not in e2:
return (field, 'value missing', 0)
Expand All @@ -197,7 +199,7 @@ def compare_country(e1: dict, e2: dict):
return (field, 'mismatch', -205)


def compare_lccn(e1: dict, e2: dict):
def compare_lccn(e1: dict, e2: dict) -> ThresholdResult:
field = 'lccn'
if field not in e1 or field not in e2:
return (field, 'value missing', 0)
Expand All @@ -206,23 +208,24 @@ def compare_lccn(e1: dict, e2: dict):
return (field, 'mismatch', -320)


def compare_date(e1: dict, e2: dict):
if 'publish_date' not in e1 or 'publish_date' not in e2:
return ('date', 'value missing', 0)
def compare_date(e1: dict, e2: dict) -> ThresholdResult:
if ('publish_date' not in e1) and ('publish_date' not in e2):
return ('date', 'values missing', 0)
if ('publish_date' in e1) ^ ('publish_date' in e2):
return ('date', 'value missing', DATE_MISMATCH)
if e1['publish_date'] == e2['publish_date']:
return ('date', 'exact match', 200)
try:
e1_pub = int(e1['publish_date'])
e2_pub = int(e2['publish_date'])
if within(e1_pub, e2_pub, 2):
return ('date', '+/-2 years', -25)
else:
return ('date', 'mismatch', -250)
except ValueError as TypeError:
return ('date', 'mismatch', -250)
except (ValueError, TypeError):
pass
return ('date', 'mismatch', DATE_MISMATCH)


def compare_isbn(e1: dict, e2: dict):
def compare_isbn(e1: dict, e2: dict) -> ThresholdResult:
if len(e1['isbn']) == 0 or len(e2['isbn']) == 0:
return ('ISBN', 'missing', 0)
for i in e1['isbn']:
Expand All @@ -232,10 +235,7 @@ def compare_isbn(e1: dict, e2: dict):
return ('ISBN', 'mismatch', -225)


# 450 + 200 + 85 + 200


def level1_match(e1: dict, e2: dict):
def level1_match(e1: dict, e2: dict) -> list[ThresholdResult]:
"""
:param dict e1: Expanded Edition, output of expand_record()
:param dict e2: Expanded Edition, output of expand_record()
Expand All @@ -254,7 +254,7 @@ def level1_match(e1: dict, e2: dict):
return score


def level2_match(e1: dict, e2: dict):
def level2_match(e1: dict, e2: dict) -> list[ThresholdResult]:
"""
:param dict e1: Expanded Edition, output of expand_record()
:param dict e2: Expanded Edition, output of expand_record()
Expand All @@ -274,7 +274,7 @@ def level2_match(e1: dict, e2: dict):
return score


def compare_author_fields(e1_authors, e2_authors):
def compare_author_fields(e1_authors, e2_authors) -> bool:
for i in e1_authors:
for j in e2_authors:
if normalize(i['db_name']) == normalize(j['db_name']):
Expand All @@ -284,7 +284,7 @@ def compare_author_fields(e1_authors, e2_authors):
return False


def compare_author_keywords(e1_authors, e2_authors):
def compare_author_keywords(e1_authors, e2_authors) -> ThresholdResult:
max_score = 0
for i in e1_authors:
for j in e2_authors:
Expand All @@ -300,15 +300,15 @@ def compare_author_keywords(e1_authors, e2_authors):
return ('authors', 'mismatch', -200)


def compare_authors(e1: dict, e2: dict):
def compare_authors(e1: dict, e2: dict) -> ThresholdResult:
"""
Compares the authors of two edition representations and
returns a evaluation and score.
:param dict e1: Expanded Edition, output of expand_record()
:param dict e2: Expanded Edition, output of expand_record()
:rtype: tuple
:return: str?, message, score
:return: field/category, message, score
"""
if 'authors' in e1 and 'authors' in e2: # noqa: SIM102
if compare_author_fields(e1['authors'], e2['authors']):
Expand Down Expand Up @@ -340,7 +340,7 @@ def title_replace_amp(amazon):
return normalize(amazon['full-title'].replace(" & ", " and ")).lower()


def substr_match(a: str, b: str):
def substr_match(a: str, b: str) -> bool:
return a.find(b) != -1 or b.find(a) != -1


Expand Down Expand Up @@ -409,15 +409,15 @@ def compare_number_of_pages(amazon, marc):
return ('pagination', 'non-match (by more than 10)', -225)


def short_part_publisher_match(p1, p2):
def short_part_publisher_match(p1: str, p2: str) -> bool:
pub1 = p1.split()
pub2 = p2.split()
if len(pub1) == 1 or len(pub2) == 1:
return False
return all(substr_match(i, j) for i, j in zip(pub1, pub2))


def compare_publisher(e1: dict, e2: dict):
def compare_publisher(e1: dict, e2: dict) -> ThresholdResult:
if 'publishers' in e1 and 'publishers' in e2:
for e1_pub in e1['publishers']:
e1_norm = normalize(e1_pub)
Expand All @@ -432,9 +432,7 @@ def compare_publisher(e1: dict, e2: dict):
elif short_part_publisher_match(e1_norm, e2_norm):
return ('publisher', 'match', 100)
return ('publisher', 'mismatch', -51)

if 'publishers' not in e1 or 'publishers' not in e2:
return ('publisher', 'either missing', 0)
return ('publisher', 'either missing', 0)


def threshold_match(
Expand Down
49 changes: 48 additions & 1 deletion openlibrary/catalog/add_book/tests/test_add_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -1143,6 +1143,52 @@ def test_find_match_is_used_when_looking_for_edition_matches(mock_site) -> None:
assert e['key'] == '/books/OL17M'


def test_preisbn_import_does_not_match_existing_undated_isbn_record(mock_site) -> None:
author = {
'type': {'key': '/type/author'},
'name': 'Jane Austen',
'key': '/authors/OL21594A',
}
existing_work = {
'authors': [
{
'author': {'key': '/authors/OL21594A'},
'type': {'key': '/type/author_role'},
}
],
'key': '/works/OL16W',
'title': 'Sense and Sensibility',
'type': {'key': '/type/work'},
}
existing_edition = {
'key': '/books/OL18M',
#'publish_date': '2025', NO DATE
'publishers': ['Penguin'],
'title': 'Sense and Sensibility',
'isbn_13': ['9780241734872'],
'source_records': ['non-marc:ja_light_record'],
'works': [{'key': '/works/OL16W'}],
}
mock_site.save(author)
mock_site.save(existing_work)
mock_site.save(existing_edition)
rec = {
'authors': [{'name': 'Jane Austen'}],
'publish_date': '1913',
'publishers': ['Penguin'],
'title': 'Sense and Sensibility',
'source_records': ['marc:somelibrary/some_marc.mrc'],
}
reply = load(rec)
assert reply['authors'][0]['status'] == 'matched'
assert reply['authors'][0]['key'] == '/authors/OL21594A'
assert reply['work']['status'] == 'matched'
assert reply['work']['key'] == '/works/OL16W'
assert (
reply['edition']['status'] == 'created'
) # New edition is created, not matched


def test_covers_are_added_to_edition(mock_site, monkeypatch) -> None:
"""Ensures a cover from rec is added to a matched edition."""
author = {
Expand Down Expand Up @@ -1871,7 +1917,8 @@ def test_year_1900_removed_from_amz_and_bwb_promise_items(self, rec, expected):


def test_find_match_title_only_promiseitem_against_noisbn_marc(mock_site):
# An existing light title + ISBN only record
# An existing light title + ISBN only record should not match an
# incoming pre-ISBN record.
existing_edition = {
'key': '/books/OL113M',
# NO author
Expand Down
Loading

0 comments on commit 51ee743

Please sign in to comment.