Skip to content

Commit

Permalink
make TermNormIsSubStringMappingStrategy handle multi-word substrings
Browse files Browse the repository at this point in the history
Currently, this will only prefer a term if the term norm is a single
word contained (as a word) in the ent_match_norm. Change to also prefer
it if the term norm is a sequence of words that are a substring of the
ent_match_norm.
  • Loading branch information
EFord36 committed Jun 4, 2024
1 parent 3eb331f commit 119f242
Showing 1 changed file with 10 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import itertools
from abc import ABC
from collections.abc import Iterable
import re
from typing import Optional

from kazu.data import (
Expand Down Expand Up @@ -336,8 +337,8 @@ def filter_candidates(


class SynNormIsSubStringMappingStrategy(MappingStrategy):
"""For a :data:`~.CandidatesToMetrics`, see if any of their .synonym_norm are string
matches of the match_norm tokens based on whitespace tokenisation.
"""For a :data:`~.CandidatesToMetrics`, see if any of their .synonym_norm are a
substring of the match_norm tokens, with a word boundary immediately before and after.
If exactly one element of :data:`~.CandidatesToMetrics` matches, prefer it.
Expand Down Expand Up @@ -373,7 +374,6 @@ def filter_candidates(
candidates: CandidatesToMetrics,
parser_name: str,
) -> CandidatesToMetrics:
norm_tokens = set(ent_match_norm.split(" "))

filtered_candidates_and_len = [
(
Expand All @@ -384,7 +384,7 @@ def filter_candidates(
len(candidate.synonym_norm),
)
for candidate, metrics in candidates.items()
if candidate.synonym_norm in norm_tokens
if self._regex_check_substring_words(candidate.synonym_norm, ent_match_norm)
and len(candidate.synonym_norm) >= self.min_syn_norm_len_to_consider
]
filtered_candidates_and_len.sort(key=lambda x: x[1], reverse=True)
Expand All @@ -398,6 +398,12 @@ def filter_candidates(
return {candidate: metrics}
return {}

@staticmethod
def _regex_check_substring_words(possible_substring: str, full_string: str) -> bool:
    """Check whether ``possible_substring`` occurs in ``full_string`` as whole words.

    The substring is matched literally, and must have a regex word boundary
    (``\\b``) immediately before and immediately after it, so ``"cancer"`` is
    found in ``"breast cancer cells"`` but not in ``"cancerous cells"``.

    :param possible_substring: the literal text to look for; may span several words.
    :param full_string: the string to search in.
    :return: ``True`` if a whole-word occurrence exists, otherwise ``False``.

    .. note::
       ``\\b`` is only satisfied between a word character and a non-word
       character (or the string edge), so a substring that itself starts or
       ends with a non-word character only matches when a word character sits
       directly next to that edge.
    """
    # Escape the candidate so regex metacharacters in it (e.g. '.', '(', '+')
    # are treated as literal text rather than pattern syntax - unescaped they
    # can produce false matches or raise re.error on malformed patterns.
    regexp = r"\b" + re.escape(possible_substring) + r"\b"
    return re.search(regexp, full_string) is not None


class StrongMatchMappingStrategy(MappingStrategy):
"""
Expand Down

0 comments on commit 119f242

Please sign in to comment.