From 119f242feace16275d1a5a9777abad1f655226b1 Mon Sep 17 00:00:00 2001 From: Elliot Ford Date: Mon, 17 Apr 2023 13:37:50 +0100 Subject: [PATCH] make TermNormIsSubStringMappingStrategy handle multi-word substrings Currently, this will only prefer a term if the term norm is a single word contained (as a word) in the ent_match_norm. Change to also prefer it if the term norm is a sequence of words that are a substring of the ent_match_norm. --- .../mapping_strategies/strategies.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kazu/steps/linking/post_processing/mapping_strategies/strategies.py b/kazu/steps/linking/post_processing/mapping_strategies/strategies.py index fdc9e9bb..2b5df891 100644 --- a/kazu/steps/linking/post_processing/mapping_strategies/strategies.py +++ b/kazu/steps/linking/post_processing/mapping_strategies/strategies.py @@ -2,6 +2,7 @@ import itertools from abc import ABC from collections.abc import Iterable +import re from typing import Optional from kazu.data import ( @@ -336,8 +337,8 @@ def filter_candidates( class SynNormIsSubStringMappingStrategy(MappingStrategy): - """For a :data:`~.CandidatesToMetrics`, see if any of their .synonym_norm are string - matches of the match_norm tokens based on whitespace tokenisation. + """For a :data:`~.CandidatesToMetrics`, see if any of their .synonym_norm are a + substring of the match_norm tokens, with a word boundary immediately before and after. If exactly one element of :data:`~.CandidatesToMetrics` matches, prefer it. 
@@ -373,7 +374,6 @@ def filter_candidates( candidates: CandidatesToMetrics, parser_name: str, ) -> CandidatesToMetrics: - norm_tokens = set(ent_match_norm.split(" ")) filtered_candidates_and_len = [ ( @@ -384,7 +384,7 @@ def filter_candidates( len(candidate.synonym_norm), ) for candidate, metrics in candidates.items() - if candidate.synonym_norm in norm_tokens + if self._regex_check_substring_words(candidate.synonym_norm, ent_match_norm) and len(candidate.synonym_norm) >= self.min_syn_norm_len_to_consider ] filtered_candidates_and_len.sort(key=lambda x: x[1], reverse=True) @@ -398,6 +398,12 @@ def filter_candidates( return {candidate: metrics} return {} + @staticmethod + def _regex_check_substring_words(possible_substring: str, full_string: str) -> bool: + # escape the synonym so regex metacharacters match literally; a word boundary is required on either side + regexp = r"\b" + re.escape(possible_substring) + r"\b" + return bool(re.search(regexp, full_string)) + class StrongMatchMappingStrategy(MappingStrategy): """