From e0dfcb34282a92be37a1ad2b3402e5687946b39c Mon Sep 17 00:00:00 2001 From: Harshad Date: Wed, 26 Jun 2024 16:52:40 -0500 Subject: [PATCH] Addressed situation when assign_default_confidence() returns only dataframe with all NaN confidence values (#548) Ok, so here was the problem: When the dataframe whose redundant rows had to be filtered out had all `NaN` values for confidence, the line https://github.com/mapping-commons/sssom-py/blob/550206721911f711ee678eb1a8da50591649bd04/src/sssom/util.py#L441 returned `df` = an empty dataframe and the entire source data frame = `nan_df`. Due to this, the following line: https://github.com/mapping-commons/sssom-py/blob/550206721911f711ee678eb1a8da50591649bd04/src/sssom/util.py#L447 resulted in `dfmax = {}`, which is of type `pandas.Series`. Hence the confusion. The correct way to handle this is simply adding an `if` statement: https://github.com/mapping-commons/sssom-py/blob/ffa2109616020f994196cbb827d71bca17192014/src/sssom/util.py#L447-L469 I've added an explicit test and it passes. 
Fixes #546 --- src/sssom/util.py | 45 ++++++++++++++++++++--------------------- tests/test_reconcile.py | 7 +++++++ 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/sssom/util.py b/src/sssom/util.py index 6be13679..fcd2912e 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -444,33 +444,32 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate: bool = False) -> p else: key = [SUBJECT_ID, OBJECT_ID, PREDICATE_ID] dfmax: pd.DataFrame - dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates() - max_conf: Dict[Tuple[str, ...], float] = {} - for _, row in dfmax.iterrows(): + if not df.empty: + dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates() + max_conf: Dict[Tuple[str, ...], float] = {} + for _, row in dfmax.iterrows(): + if ignore_predicate: + max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE] + else: + max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE] if ignore_predicate: - max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE] + df = df[ + df.apply( + lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])], + axis=1, + ) + ] else: - max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE] - if ignore_predicate: - df = df[ - df.apply( - lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])], - axis=1, - ) - ] - else: - df = df[ - df.apply( - lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])], - axis=1, - ) - ] + df = df[ + df.apply( + lambda x: x[CONFIDENCE] + >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])], + axis=1, + ) + ] # We are preserving confidence = NaN rows without making assumptions. # This means that there are potential duplicate mappings - # FutureWarning: The frame.append method is deprecated and - # will be removed from pandas in a future version. - # Use pandas.concat instead. 
- # return_df = df.append(nan_df).drop_duplicates() + confidence_reconciled_df = pd.concat([df, nan_df]).drop_duplicates() # Reconciling dataframe rows based on the predicates with equal confidence. diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py index eaf26ea5..9c322704 100644 --- a/tests/test_reconcile.py +++ b/tests/test_reconcile.py @@ -22,6 +22,13 @@ def test_filter(self): df2 = filter_redundant_rows(self.msdf2.df) self.assertEqual(18, len(df2.index)) + # Create a new dataframe with the confidence column having NaN values + import numpy as np + + self.msdf1.df["confidence"] = np.NAN + df3 = filter_redundant_rows(self.msdf1.df) + self.assertEqual(11, len(df3.index)) + def test_deal_with_negation(self): """Test handling negating returns the right number of rows.""" df1 = deal_with_negation(self.msdf1.df)