From 989f7722b64a20eaf224d13799dfc5506e839260 Mon Sep 17 00:00:00 2001 From: Szymon Palucha Date: Wed, 16 Oct 2024 15:27:56 +0100 Subject: [PATCH] fix: CombinatorialSynonymGenerator should only check for original strings --- .../synonym_generation.py | 6 +-- kazu/tests/test_synonym_generators.py | 51 +++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/kazu/ontology_preprocessing/synonym_generation.py b/kazu/ontology_preprocessing/synonym_generation.py index 1f56dd1f..b6f52d31 100644 --- a/kazu/ontology_preprocessing/synonym_generation.py +++ b/kazu/ontology_preprocessing/synonym_generation.py @@ -63,7 +63,7 @@ def __call__( ) final_results: defaultdict[OntologyStringResource, set[Synonym]] = defaultdict(set) original_strings = { - syn.text for resource in ontology_resources for syn in resource.active_ner_synonyms() + syn.text for resource in ontology_resources for syn in resource.original_synonyms } for i, permutation_list in enumerate(synonym_gen_permutations): logger.info( @@ -80,9 +80,7 @@ def __call__( desc=f"generating synonyms for {generator.__class__.__name__}", ): - for syn in list( - generated_results.get(resource, resource.active_ner_synonyms()) - ): + for syn in list(generated_results.get(resource, resource.original_synonyms)): new_strings = generator(syn.text) for new_syn_text in new_strings: if new_syn_text in original_strings: diff --git a/kazu/tests/test_synonym_generators.py b/kazu/tests/test_synonym_generators.py index 148f9278..37a653f7 100644 --- a/kazu/tests/test_synonym_generators.py +++ b/kazu/tests/test_synonym_generators.py @@ -249,6 +249,57 @@ def test_greek_substitution_dict_uncode_variants(): StringReplacement(replacement_dict={"-": [" ", "_"]}, include_greek=False), ], ), + ( + OntologyStringResource( + original_synonyms=frozenset( + [ + Synonym( + text="estimate", + mention_confidence=MentionConfidence.PROBABLE, + case_sensitive=False, + ), + Synonym( + text="estimate of", 
mention_confidence=MentionConfidence.PROBABLE, + case_sensitive=False, + ), + ] + ), + behaviour=OntologyStringBehaviour.ADD_FOR_NER_AND_LINKING, + ), + set(), + [ + StopWordRemover(), # normally stopword remover would remove 'of' but since "estimate" is an original synonym + # CombinatorialSynonymGenerator will skip the alternative synonym generation + ], + ), + ( + OntologyStringResource( + original_synonyms=frozenset( + [ + Synonym( + text="estimate of", + mention_confidence=MentionConfidence.PROBABLE, + case_sensitive=False, + ), + ] + ), + behaviour=OntologyStringBehaviour.ADD_FOR_NER_AND_LINKING, + alternative_synonyms=frozenset( + [ + Synonym( + text="estimate", + mention_confidence=MentionConfidence.PROBABLE, + case_sensitive=False, + ) + ] + ), + ), + {"estimate"}, + [ + StopWordRemover(), + ], + ), ), ) def test_CombinatorialSynonymGenerator(resource, expected_syns, generators):