From 119f242feace16275d1a5a9777abad1f655226b1 Mon Sep 17 00:00:00 2001
From: Elliot Ford <elliot.ford@astrazeneca.com>
Date: Mon, 17 Apr 2023 13:37:50 +0100
Subject: [PATCH] make TermNormIsSubStringMappingStrategy handle multi-word
 substrings

Currently, this will only prefer a term if the term norm is a single
word contained (as a word) in the ent_match_norm. Change to also prefer
it if the term norm is a sequence of words that is a substring of the
ent_match_norm, with a word boundary on either side.
---
 .../mapping_strategies/strategies.py               | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/kazu/steps/linking/post_processing/mapping_strategies/strategies.py b/kazu/steps/linking/post_processing/mapping_strategies/strategies.py
index fdc9e9bb..2b5df891 100644
--- a/kazu/steps/linking/post_processing/mapping_strategies/strategies.py
+++ b/kazu/steps/linking/post_processing/mapping_strategies/strategies.py
@@ -2,6 +2,7 @@
 import itertools
 from abc import ABC
 from collections.abc import Iterable
+import re
 from typing import Optional
 
 from kazu.data import (
@@ -336,8 +337,8 @@ def filter_candidates(
 
 
 class SynNormIsSubStringMappingStrategy(MappingStrategy):
-    """For a :data:`~.CandidatesToMetrics`, see if any of their .synonym_norm are string
-    matches of the match_norm tokens based on whitespace tokenisation.
+    """For a :data:`~.CandidatesToMetrics`, see if any of their .synonym_norm are a
+    substring of the match_norm, with a word boundary immediately before and after.
 
     If exactly one element of :data:`~.CandidatesToMetrics` matches, prefer it.
 
@@ -373,7 +374,6 @@ def filter_candidates(
         candidates: CandidatesToMetrics,
         parser_name: str,
     ) -> CandidatesToMetrics:
-        norm_tokens = set(ent_match_norm.split(" "))
 
         filtered_candidates_and_len = [
             (
@@ -384,7 +384,7 @@ def filter_candidates(
                 len(candidate.synonym_norm),
             )
             for candidate, metrics in candidates.items()
-            if candidate.synonym_norm in norm_tokens
+            if self._regex_check_substring_words(candidate.synonym_norm, ent_match_norm)
             and len(candidate.synonym_norm) >= self.min_syn_norm_len_to_consider
         ]
         filtered_candidates_and_len.sort(key=lambda x: x[1], reverse=True)
@@ -398,6 +398,12 @@ def filter_candidates(
                 return {candidate: metrics}
         return {}
 
+    @staticmethod
+    def _regex_check_substring_words(possible_substring: str, full_string: str) -> bool:
+        """Whether possible_substring occurs literally in full_string between word boundaries."""
+        regexp = r"\b" + re.escape(possible_substring) + r"\b"
+        return bool(re.search(regexp, full_string))
+
 
 class StrongMatchMappingStrategy(MappingStrategy):
     """