From 0590d5dd4ecfd642ebab550bf58ae34d64b6436c Mon Sep 17 00:00:00 2001 From: Szymon Palucha Date: Thu, 3 Oct 2024 15:57:41 +0100 Subject: [PATCH] refactor: reduce nesting in OntologyParser --- kazu/ontology_preprocessing/base.py | 104 +++++++++++++--------------- 1 file changed, 50 insertions(+), 54 deletions(-) diff --git a/kazu/ontology_preprocessing/base.py b/kazu/ontology_preprocessing/base.py index d0712293..9277bbee 100644 --- a/kazu/ontology_preprocessing/base.py +++ b/kazu/ontology_preprocessing/base.py @@ -164,7 +164,7 @@ def _resolve_candidates(self, candidates_df: pd.DataFrame) -> set[LinkingCandida StringNormalizer.normalize, entity_class=self.entity_class ) - for i, row in ( + for _, row in ( candidates_df[["syn_norm", SYN, IDX, MAPPING_TYPE]] .groupby(["syn_norm"]) .agg(set) @@ -253,64 +253,60 @@ def score_and_group_ids( ), EquivalentIdAggregationStrategy.NO_STRATEGY, ) - else: - if len(ids_and_source) == 1: - return ( - frozenset((EquivalentIdSet(ids_and_source=frozenset(ids_and_source)),)), - EquivalentIdAggregationStrategy.UNAMBIGUOUS, - ) + if len(ids_and_source) == 1: + return ( + frozenset((EquivalentIdSet(ids_and_source=frozenset(ids_and_source)),)), + EquivalentIdAggregationStrategy.UNAMBIGUOUS, + ) + + if not is_symbolic: + return ( + frozenset((EquivalentIdSet(ids_and_source=frozenset(ids_and_source)),)), + EquivalentIdAggregationStrategy.MERGED_AS_NON_SYMBOLIC, + ) - if not is_symbolic: - return ( - frozenset((EquivalentIdSet(ids_and_source=frozenset(ids_and_source)),)), - EquivalentIdAggregationStrategy.MERGED_AS_NON_SYMBOLIC, + # use similarity to group ids into EquivalentIdSets + DefaultLabels = set[str] + id_list: list[tuple[IdsAndSource, DefaultLabels]] = [] + for id_and_source_tuple in ids_and_source: + default_label = cast( + str, + self.metadata_db.get_by_idx(self.name, id_and_source_tuple[0])[DEFAULT_LABEL], + ) + most_similar_id_set = None + best_score = 0.0 + for id_and_default_label_set in id_list: + sim = max( + self.string_scorer(default_label, other_label) + for other_label in id_and_default_label_set[1] ) - else: - # use similarity to group ids into EquivalentIdSets - - DefaultLabels = set[str] - id_list: list[tuple[IdsAndSource, DefaultLabels]] = [] - for id_and_source_tuple in ids_and_source: - default_label = cast( - str, - self.metadata_db.get_by_idx(self.name, id_and_source_tuple[0])[ - DEFAULT_LABEL - ], + if sim > self.synonym_merge_threshold and sim > best_score: + most_similar_id_set = id_and_default_label_set + best_score = sim + + # for the first label, the above for loop is a no-op as id_sets is empty + # and the below if statement will be true. + # After that, it will be True if the id under consideration should not + # merge with any existing group and should get its own EquivalentIdSet + if not most_similar_id_set: + id_list.append( + ( + {id_and_source_tuple}, + {default_label}, ) - most_similar_id_set = None - best_score = 0.0 - for id_and_default_label_set in id_list: - sim = max( - self.string_scorer(default_label, other_label) - for other_label in id_and_default_label_set[1] - ) - if sim > self.synonym_merge_threshold and sim > best_score: - most_similar_id_set = id_and_default_label_set - best_score = sim - - # for the first label, the above for loop is a no-op as id_sets is empty - # and the below if statement will be true. - # After that, it will be True if the id under consideration should not - # merge with any existing group and should get its own EquivalentIdSet - if not most_similar_id_set: - id_list.append( - ( - {id_and_source_tuple}, - {default_label}, - ) - ) - else: - most_similar_id_set[0].add(id_and_source_tuple) - most_similar_id_set[1].add(default_label) - - return ( - frozenset( - EquivalentIdSet(ids_and_source=frozenset(ids_and_source)) - for ids_and_source, _ in id_list - ), - EquivalentIdAggregationStrategy.RESOLVED_BY_SIMILARITY, ) + else: + most_similar_id_set[0].add(id_and_source_tuple) + most_similar_id_set[1].add(default_label) + + return ( + frozenset( + EquivalentIdSet(ids_and_source=frozenset(ids_and_source)) + for ids_and_source, _ in id_list + ), + EquivalentIdAggregationStrategy.RESOLVED_BY_SIMILARITY, + ) def _parse_df_if_not_already_parsed(self) -> None: if self.parsed_dataframe is None: