Reduce edition match score if one edition is missing a date (#10462)

* reduce edition match score if one edition is missing a date closes #10461 * use type alias for threshold results * increase penalty for date mismatch on editions * tidy existing match tests, and add 2 more * add a full import test for the pre-ISBN / nodate ISBN scenario
internetarchive · Feb 25, 2025 · 51ee743 · 51ee743
1 parent 0670b15
commit 51ee743
Show file tree

Hide file tree

Showing 4 changed files with 129 additions and 75 deletions.
diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py
@@ -137,7 +137,7 @@ def __str__(self):
 subject_fields = ['subjects', 'subject_places', 'subject_times', 'subject_people']
 
 
-def normalize(s):
+def normalize(s: str) -> str:
     """Strip non-alphanums and truncate at 25 chars."""
     norm = strip_accents(s).lower()
     norm = norm.replace(' and ', ' ')
@@ -160,7 +160,7 @@ def is_redirect(thing):
     return thing.type.key == '/type/redirect'
 
 
-def split_subtitle(full_title):
+def split_subtitle(full_title: str):
     """
     Splits a title into (title, subtitle),
     strips parenthetical tags. Used for bookseller
@@ -220,7 +220,6 @@ def build_author_reply(authors_in, edits, source):
     :rtype: tuple
     :return: (list, list) authors [{"key": "/author/OL..A"}, ...], author_reply
     """
-
     authors = []
     author_reply = []
     for a in authors_in:
@@ -386,7 +385,7 @@ def update_ia_metadata_for_ol_edition(edition_id):
     return data
 
 
-def normalize_record_bibids(rec):
+def normalize_record_bibids(rec: dict):
     """
     Returns the Edition import record with all ISBN fields and LCCNs cleaned.
 
@@ -397,16 +396,18 @@ def normalize_record_bibids(rec):
     for field in ('isbn_13', 'isbn_10', 'isbn'):
         if rec.get(field):
             rec[field] = [
-                normalize_isbn(isbn) for isbn in rec.get(field) if normalize_isbn(isbn)
+                normalize_isbn(isbn)
+                for isbn in rec.get(field, '')
+                if normalize_isbn(isbn)
             ]
     if rec.get('lccn'):
         rec['lccn'] = [
-            normalize_lccn(lccn) for lccn in rec.get('lccn') if normalize_lccn(lccn)
+            normalize_lccn(lccn) for lccn in rec.get('lccn', '') if normalize_lccn(lccn)
         ]
     return rec
 
 
-def isbns_from_record(rec):
+def isbns_from_record(rec: dict) -> list[str]:
     """
     Returns a list of all isbns from the various possible isbn fields.
 
@@ -417,7 +418,7 @@ def isbns_from_record(rec):
     return isbns
 
 
-def build_pool(rec):
+def build_pool(rec: dict) -> dict[str, list[str]]:
     """
     Searches for existing edition matches on title and bibliographic keys.
 
@@ -440,7 +441,6 @@ def build_pool(rec):
     # Find records with matching ISBNs
     if isbns := isbns_from_record(rec):
         pool['isbn'] = set(editions_matched(rec, 'isbn_', isbns))
-
     return {k: list(v) for k, v in pool.items() if v}
 
 
@@ -451,7 +451,6 @@ def find_quick_match(rec: dict) -> str | None:
     :param dict rec: Edition record
     :return: First key matched of format "/books/OL..M" or None if no match found.
     """
-
     if 'openlibrary' in rec:
         return '/books/' + rec['openlibrary']
 
@@ -480,14 +479,14 @@ def find_quick_match(rec: dict) -> str | None:
     return None
 
 
-def editions_matched(rec, key, value=None):
+def editions_matched(rec: dict, key: str, value=None) -> list[str]:
     """
     Search OL for editions matching record's 'key' value.
 
     :param dict rec: Edition import record
     :param str key: Key to search on, e.g. 'isbn_'
     :param list|str value: Value or Values to use, overriding record values
-    :rtpye: list
+    :rtype: list
     :return: List of edition keys ["/books/OL..M",]
     """
     if value is None and key not in rec:
@@ -500,11 +499,11 @@ def editions_matched(rec, key, value=None):
     return ekeys
 
 
-def find_threshold_match(rec: dict, edition_pool: dict) -> str | None:
+def find_threshold_match(rec: dict, edition_pool: dict[str, list[str]]) -> str | None:
     """
     Find the best match for rec in edition_pool and return its key.
     :param dict rec: the new edition we are trying to match.
-    :param list edition_pool: list of possible edition key matches, output of build_pool(import record)
+    :param dict edition_pool: edition key matches, output of build_pool(import record)
     :return: None or the edition key '/books/OL...M' of the best edition match for enriched_rec in edition_pool
     """
     seen = set()
@@ -730,7 +729,7 @@ def normalize_import_record(rec: dict) -> None:
 
     # Split subtitle if required and not already present
     if ':' in rec.get('title', '') and not rec.get('subtitle'):
-        title, subtitle = split_subtitle(rec.get('title'))
+        title, subtitle = split_subtitle(rec.get('title', ''))
         if subtitle:
             rec['title'] = title
             rec['subtitle'] = subtitle

diff --git a/openlibrary/catalog/add_book/match.py b/openlibrary/catalog/add_book/match.py
@@ -8,6 +8,9 @@
 re_brackets = re.compile(r'^(.+)\[.*?\]$')
 re_whitespace_and_punct = re.compile(r'[-\s,;:.]+')
 
+type ThresholdResult = tuple[str, str, int]  # (field/category, result, score)
+
+DATE_MISMATCH = -800
 ISBN_MATCH = 85
 THRESHOLD = 875
 
@@ -60,7 +63,6 @@ def normalize(s: str) -> str:
     by lowercasing, unicode -> NFC,
     stripping extra whitespace and punctuation, and replacing ampersands.
     """
-
     s = unicodedata.normalize('NFC', s)
     s = s.replace(' & ', ' and ')
     s = re_whitespace_and_punct.sub(' ', s.lower()).strip()
@@ -153,7 +155,7 @@ def expand_record(rec: dict) -> dict[str, str | list[str]]:
     return expanded_rec
 
 
-def build_titles(title: str):
+def build_titles(title: str) -> dict:
     """
     Uses a full title to create normalized and short title versions.
     Used for expanding a set of title variants for matching,
@@ -181,11 +183,11 @@ def build_titles(title: str):
     }
 
 
-def within(a, b, distance):
+def within(a, b, distance) -> bool:
     return abs(a - b) <= distance
 
 
-def compare_country(e1: dict, e2: dict):
+def compare_country(e1: dict, e2: dict) -> ThresholdResult:
     field = 'publish_country'
     if field not in e1 or field not in e2:
         return (field, 'value missing', 0)
@@ -197,7 +199,7 @@ def compare_country(e1: dict, e2: dict):
     return (field, 'mismatch', -205)
 
 
-def compare_lccn(e1: dict, e2: dict):
+def compare_lccn(e1: dict, e2: dict) -> ThresholdResult:
     field = 'lccn'
     if field not in e1 or field not in e2:
         return (field, 'value missing', 0)
@@ -206,23 +208,24 @@ def compare_lccn(e1: dict, e2: dict):
     return (field, 'mismatch', -320)
 
 
-def compare_date(e1: dict, e2: dict):
-    if 'publish_date' not in e1 or 'publish_date' not in e2:
-        return ('date', 'value missing', 0)
+def compare_date(e1: dict, e2: dict) -> ThresholdResult:
+    if ('publish_date' not in e1) and ('publish_date' not in e2):
+        return ('date', 'values missing', 0)
+    if ('publish_date' in e1) ^ ('publish_date' in e2):
+        return ('date', 'value missing', DATE_MISMATCH)
     if e1['publish_date'] == e2['publish_date']:
         return ('date', 'exact match', 200)
     try:
         e1_pub = int(e1['publish_date'])
         e2_pub = int(e2['publish_date'])
         if within(e1_pub, e2_pub, 2):
             return ('date', '+/-2 years', -25)
-        else:
-            return ('date', 'mismatch', -250)
-    except ValueError as TypeError:
-        return ('date', 'mismatch', -250)
+    except (ValueError, TypeError):
+        pass
+    return ('date', 'mismatch', DATE_MISMATCH)
 
 
-def compare_isbn(e1: dict, e2: dict):
+def compare_isbn(e1: dict, e2: dict) -> ThresholdResult:
     if len(e1['isbn']) == 0 or len(e2['isbn']) == 0:
         return ('ISBN', 'missing', 0)
     for i in e1['isbn']:
@@ -232,10 +235,7 @@ def compare_isbn(e1: dict, e2: dict):
     return ('ISBN', 'mismatch', -225)
 
 
-# 450 + 200 + 85 + 200
-
-
-def level1_match(e1: dict, e2: dict):
+def level1_match(e1: dict, e2: dict) -> list[ThresholdResult]:
     """
     :param dict e1: Expanded Edition, output of expand_record()
     :param dict e2: Expanded Edition, output of expand_record()
@@ -254,7 +254,7 @@ def level1_match(e1: dict, e2: dict):
     return score
 
 
-def level2_match(e1: dict, e2: dict):
+def level2_match(e1: dict, e2: dict) -> list[ThresholdResult]:
     """
     :param dict e1: Expanded Edition, output of expand_record()
     :param dict e2: Expanded Edition, output of expand_record()
@@ -274,7 +274,7 @@ def level2_match(e1: dict, e2: dict):
     return score
 
 
-def compare_author_fields(e1_authors, e2_authors):
+def compare_author_fields(e1_authors, e2_authors) -> bool:
     for i in e1_authors:
         for j in e2_authors:
             if normalize(i['db_name']) == normalize(j['db_name']):
@@ -284,7 +284,7 @@ def compare_author_fields(e1_authors, e2_authors):
     return False
 
 
-def compare_author_keywords(e1_authors, e2_authors):
+def compare_author_keywords(e1_authors, e2_authors) -> ThresholdResult:
     max_score = 0
     for i in e1_authors:
         for j in e2_authors:
@@ -300,15 +300,15 @@ def compare_author_keywords(e1_authors, e2_authors):
         return ('authors', 'mismatch', -200)
 
 
-def compare_authors(e1: dict, e2: dict):
+def compare_authors(e1: dict, e2: dict) -> ThresholdResult:
     """
     Compares the authors of two edition representations and
     returns a evaluation and score.
 
     :param dict e1: Expanded Edition, output of expand_record()
     :param dict e2: Expanded Edition, output of expand_record()
     :rtype: tuple
-    :return: str?, message, score
+    :return: field/category, message, score
     """
     if 'authors' in e1 and 'authors' in e2:  # noqa: SIM102
         if compare_author_fields(e1['authors'], e2['authors']):
@@ -340,7 +340,7 @@ def title_replace_amp(amazon):
     return normalize(amazon['full-title'].replace(" & ", " and ")).lower()
 
 
-def substr_match(a: str, b: str):
+def substr_match(a: str, b: str) -> bool:
     return a.find(b) != -1 or b.find(a) != -1
 
 
@@ -409,15 +409,15 @@ def compare_number_of_pages(amazon, marc):
         return ('pagination', 'non-match (by more than 10)', -225)
 
 
-def short_part_publisher_match(p1, p2):
+def short_part_publisher_match(p1: str, p2: str) -> bool:
     pub1 = p1.split()
     pub2 = p2.split()
     if len(pub1) == 1 or len(pub2) == 1:
         return False
     return all(substr_match(i, j) for i, j in zip(pub1, pub2))
 
 
-def compare_publisher(e1: dict, e2: dict):
+def compare_publisher(e1: dict, e2: dict) -> ThresholdResult:
     if 'publishers' in e1 and 'publishers' in e2:
         for e1_pub in e1['publishers']:
             e1_norm = normalize(e1_pub)
@@ -432,9 +432,7 @@ def compare_publisher(e1: dict, e2: dict):
                 elif short_part_publisher_match(e1_norm, e2_norm):
                     return ('publisher', 'match', 100)
         return ('publisher', 'mismatch', -51)
-
-    if 'publishers' not in e1 or 'publishers' not in e2:
-        return ('publisher', 'either missing', 0)
+    return ('publisher', 'either missing', 0)
 
 
 def threshold_match(

diff --git a/openlibrary/catalog/add_book/tests/test_add_book.py b/openlibrary/catalog/add_book/tests/test_add_book.py
@@ -1143,6 +1143,52 @@ def test_find_match_is_used_when_looking_for_edition_matches(mock_site) -> None:
     assert e['key'] == '/books/OL17M'
 
 
+def test_preisbn_import_does_not_match_existing_undated_isbn_record(mock_site) -> None:
+    author = {
+        'type': {'key': '/type/author'},
+        'name': 'Jane Austen',
+        'key': '/authors/OL21594A',
+    }
+    existing_work = {
+        'authors': [
+            {
+                'author': {'key': '/authors/OL21594A'},
+                'type': {'key': '/type/author_role'},
+            }
+        ],
+        'key': '/works/OL16W',
+        'title': 'Sense and Sensibility',
+        'type': {'key': '/type/work'},
+    }
+    existing_edition = {
+        'key': '/books/OL18M',
+        #'publish_date': '2025', NO DATE
+        'publishers': ['Penguin'],
+        'title': 'Sense and Sensibility',
+        'isbn_13': ['9780241734872'],
+        'source_records': ['non-marc:ja_light_record'],
+        'works': [{'key': '/works/OL16W'}],
+    }
+    mock_site.save(author)
+    mock_site.save(existing_work)
+    mock_site.save(existing_edition)
+    rec = {
+        'authors': [{'name': 'Jane Austen'}],
+        'publish_date': '1913',
+        'publishers': ['Penguin'],
+        'title': 'Sense and Sensibility',
+        'source_records': ['marc:somelibrary/some_marc.mrc'],
+    }
+    reply = load(rec)
+    assert reply['authors'][0]['status'] == 'matched'
+    assert reply['authors'][0]['key'] == '/authors/OL21594A'
+    assert reply['work']['status'] == 'matched'
+    assert reply['work']['key'] == '/works/OL16W'
+    assert (
+        reply['edition']['status'] == 'created'
+    )  # New edition is created, not matched
+
+
 def test_covers_are_added_to_edition(mock_site, monkeypatch) -> None:
     """Ensures a cover from rec is added to a matched edition."""
     author = {
@@ -1871,7 +1917,8 @@ def test_year_1900_removed_from_amz_and_bwb_promise_items(self, rec, expected):
 
 
 def test_find_match_title_only_promiseitem_against_noisbn_marc(mock_site):
-    # An existing light title + ISBN only record
+    # An existing light title + ISBN only record should not match an
+    # incoming pre-ISBN record.
     existing_edition = {
         'key': '/books/OL113M',
         # NO author