From 2d36d2ce29a2105972d41c37c6da8ba73cb85d1b Mon Sep 17 00:00:00 2001
From: Frances Hartwell
Date: Tue, 24 Oct 2023 08:51:55 -0400
Subject: [PATCH] fix boundary_adherence nan handling

---
 .../statistical/boundary_adherence.py         |  8 ++-
 .../statistical/test_boundary_adherence.py    | 61 +++++++++++++++++++
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/sdmetrics/single_column/statistical/boundary_adherence.py b/sdmetrics/single_column/statistical/boundary_adherence.py
index 2646b24e..eb6c710e 100644
--- a/sdmetrics/single_column/statistical/boundary_adherence.py
+++ b/sdmetrics/single_column/statistical/boundary_adherence.py
@@ -43,8 +43,12 @@ def compute(cls, real_data, synthetic_data):
             float:
                 The boundary adherence of the two columns.
         """
-        real_data = pd.Series(real_data).dropna()
-        synthetic_data = pd.Series(synthetic_data).dropna()
+        if any(pd.isna(real_data)):
+            real_data = pd.Series(real_data).dropna()
+            synthetic_data = pd.Series(synthetic_data).dropna()
+        else:
+            real_data = pd.Series(real_data)
+            synthetic_data = pd.Series(synthetic_data)
 
         if is_datetime(real_data):
             real_data = pd.to_numeric(real_data)
diff --git a/tests/unit/single_column/statistical/test_boundary_adherence.py b/tests/unit/single_column/statistical/test_boundary_adherence.py
index 4ef240bf..828a7730 100644
--- a/tests/unit/single_column/statistical/test_boundary_adherence.py
+++ b/tests/unit/single_column/statistical/test_boundary_adherence.py
@@ -1,5 +1,7 @@
+from datetime import datetime
 from unittest.mock import patch
 
+import numpy as np
 import pandas as pd
 
 from sdmetrics.single_column.statistical import BoundaryAdherence
@@ -31,6 +33,65 @@ def test_compute(self):
         # Assert
         assert result == 0.75
 
+    def test_compute_nans(self):
+        """Test the ``compute`` method with NaN values.
+
+        Expect NaN values in the synthetic data to count as out of bounds unless the
+        real data also contains NaN values, in which case NaNs are dropped from both.
+        """
+        # Setup
+        real_data = pd.Series([1.0, 2.4, 2.6, 0.8])  # bounds: 0.8 -> 2.6
+        real_data_nans = pd.Series([1.0, 2.4, 2.6, 0.8, np.nan])
+        synthetic_data = pd.Series([0.9, 1.8, 2.1, 5.0, np.nan])
+
+        metric = BoundaryAdherence()
+
+        # Run
+        result = metric.compute(real_data, synthetic_data)
+        result_ignore_nans = metric.compute(real_data_nans, synthetic_data)
+
+        # Assert
+        assert result == 0.6  # 0.9, 1.8, 2.1 in bounds; 5.0 and NaN out -> 3/5
+        assert result_ignore_nans == 0.75  # NaNs dropped; only 5.0 is out -> 3/4
+
+    def test_compute_datetime_nans(self):
+        """Test the ``compute`` method with NaT values in datetime data.
+
+        Expect NaT values in the synthetic data to count as out of bounds unless the
+        real data also contains NaT values, in which case NaTs are dropped from both.
+        """
+        # Setup
+        real_data = pd.Series([
+            datetime(2020, 10, 1),
+            datetime(2021, 1, 2),
+            datetime(2021, 9, 12),
+            datetime(2022, 10, 1),
+
+        ], dtype='datetime64[ns]')  # bounds: 2020-10-01 -> 2022-10-01
+        real_data_nans = pd.Series([
+            datetime(2020, 10, 1),
+            datetime(2021, 1, 2),
+            datetime(2021, 9, 12),
+            datetime(2022, 10, 1),
+            pd.NaT
+        ], dtype='datetime64[ns]')
+        synthetic_data = pd.Series([
+            datetime(2020, 11, 1),
+            datetime(2021, 1, 2),
+            datetime(2021, 2, 9),
+            pd.NaT,
+        ], dtype='datetime64[ns]')
+
+        metric = BoundaryAdherence()
+
+        # Run
+        result = metric.compute(real_data, synthetic_data)
+        result_ignore_nans = metric.compute(real_data_nans, synthetic_data)
+
+        # Assert
+        assert result == 0.75  # three dates in bounds; NaT counts as out -> 3/4
+        assert result_ignore_nans == 1  # NaT dropped; all 3 remaining dates in bounds
+
     @patch('sdmetrics.single_column.statistical.boundary_adherence.SingleColumnMetric.normalize')
     def test_normalize(self, normalize_mock):
         """Test the ``normalize`` method.