make TermNormIsSubStringMappingStrategy handle multi-word substrings

Currently, this will only prefer a term if the term norm is a single word contained (as a word) in the ent_match_norm. Change to also prefer it if the term norm is a sequence of words that are a substring of the ent_match_norm.
AstraZeneca · Jun 4, 2024 · 119f242 · 119f242
1 parent 3eb331f
commit 119f242
Showing 1 changed file with 10 additions and 4 deletions.
diff --git a/kazu/steps/linking/post_processing/mapping_strategies/strategies.py b/kazu/steps/linking/post_processing/mapping_strategies/strategies.py
@@ -2,6 +2,7 @@
 import itertools
 from abc import ABC
 from collections.abc import Iterable
+import re
 from typing import Optional
 
 from kazu.data import (
@@ -336,8 +337,8 @@ def filter_candidates(
 
 
 class SynNormIsSubStringMappingStrategy(MappingStrategy):
-    """For a :data:`~.CandidatesToMetrics`, see if any of their .synonym_norm are string
-    matches of the match_norm tokens based on whitespace tokenisation.
+    """For a :data:`~.CandidatesToMetrics`, see if any of their .synonym_norm are a
+    substring of the match_norm tokens, with a word boundary immediately before and after.
 
     If exactly one element of :data:`~.CandidatesToMetrics` matches, prefer it.
 
@@ -373,7 +374,6 @@ def filter_candidates(
         candidates: CandidatesToMetrics,
         parser_name: str,
     ) -> CandidatesToMetrics:
-        norm_tokens = set(ent_match_norm.split(" "))
 
         filtered_candidates_and_len = [
             (
@@ -384,7 +384,7 @@ def filter_candidates(
                 len(candidate.synonym_norm),
             )
             for candidate, metrics in candidates.items()
-            if candidate.synonym_norm in norm_tokens
+            if self._regex_check_substring_words(candidate.synonym_norm, ent_match_norm)
             and len(candidate.synonym_norm) >= self.min_syn_norm_len_to_consider
         ]
         filtered_candidates_and_len.sort(key=lambda x: x[1], reverse=True)
@@ -398,6 +398,12 @@ def filter_candidates(
                 return {candidate: metrics}
         return {}
 
+    @staticmethod
+    def _regex_check_substring_words(possible_substring: str, full_string: str) -> bool:
+        # match the substring literally (metacharacters escaped), with a word boundary on either side
+        regexp = r"\b" + re.escape(possible_substring) + r"\b"
+        return bool(re.search(regexp, full_string))
+
 
 class StrongMatchMappingStrategy(MappingStrategy):
     """