From e0dfcb34282a92be37a1ad2b3402e5687946b39c Mon Sep 17 00:00:00 2001 From: Harshad Date: Wed, 26 Jun 2024 16:52:40 -0500 Subject: [PATCH] Addressed situation when assign_default_confidence() returns only dataframe with all NaN confidence values (#548) Ok, so here was the problem: When the dataframe whose redundant rows had to be filtered out had all `NaN` values for confidence, the line https://github.com/mapping-commons/sssom-py/blob/550206721911f711ee678eb1a8da50591649bd04/src/sssom/util.py#L441 returned `df` = an empty dataframe and the entire source data frame = `nan_df`. Due to this, the following line: https://github.com/mapping-commons/sssom-py/blob/550206721911f711ee678eb1a8da50591649bd04/src/sssom/util.py#L447 resulted in `dfmax = {}`, which is of type `pandas.Series`. Hence the confusion. The correct way to handle this is simply adding an `if` statement: https://github.com/mapping-commons/sssom-py/blob/ffa2109616020f994196cbb827d71bca17192014/src/sssom/util.py#L447-L469 I've added an explicit test and it passes. 
Fixes #546 --- src/sssom/util.py | 45 ++++++++++++++++++++--------------------- tests/test_reconcile.py | 7 +++++++ 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/sssom/util.py b/src/sssom/util.py index 6be13679..fcd2912e 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -444,33 +444,32 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate: bool = False) -> p else: key = [SUBJECT_ID, OBJECT_ID, PREDICATE_ID] dfmax: pd.DataFrame - dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates() - max_conf: Dict[Tuple[str, ...], float] = {} - for _, row in dfmax.iterrows(): + if not df.empty: + dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates() + max_conf: Dict[Tuple[str, ...], float] = {} + for _, row in dfmax.iterrows(): + if ignore_predicate: + max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE] + else: + max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE] if ignore_predicate: - max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE] + df = df[ + df.apply( + lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])], + axis=1, + ) + ] else: - max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE] - if ignore_predicate: - df = df[ - df.apply( - lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])], - axis=1, - ) - ] - else: - df = df[ - df.apply( - lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])], - axis=1, - ) - ] + df = df[ + df.apply( + lambda x: x[CONFIDENCE] + >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])], + axis=1, + ) + ] # We are preserving confidence = NaN rows without making assumptions. # This means that there are potential duplicate mappings - # FutureWarning: The frame.append method is deprecated and - # will be removed from pandas in a future version. - # Use pandas.concat instead. 
- # return_df = df.append(nan_df).drop_duplicates() + confidence_reconciled_df = pd.concat([df, nan_df]).drop_duplicates() # Reconciling dataframe rows based on the predicates with equal confidence. diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py index eaf26ea5..9c322704 100644 --- a/tests/test_reconcile.py +++ b/tests/test_reconcile.py @@ -22,6 +22,13 @@ def test_filter(self): df2 = filter_redundant_rows(self.msdf2.df) self.assertEqual(18, len(df2.index)) + # Create a new dataframe with the confidence column having NaN values + import numpy as np + + self.msdf1.df["confidence"] = np.NAN + df3 = filter_redundant_rows(self.msdf1.df) + self.assertEqual(11, len(df3.index)) + def test_deal_with_negation(self): """Test handling negating returns the right number of rows.""" df1 = deal_with_negation(self.msdf1.df)