Skip to content

Commit

Permalink
fix nans
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo committed Nov 6, 2023
1 parent b429723 commit 158473c
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 0 deletions.
6 changes: 6 additions & 0 deletions sdmetrics/column_pairs/statistical/referential_integrity.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Referential Integrity Metric."""
import logging

import pandas as pd

from sdmetrics.column_pairs.base import ColumnPairsMetric
from sdmetrics.goal import Goal

Expand Down Expand Up @@ -43,6 +45,10 @@ def compute_breakdown(cls, real_data, synthetic_data):
dict:
The score breakdown of the referential integrity metric.
"""
if pd.isna(real_data[1]).any():
synthetic_data = list(synthetic_data)
synthetic_data[1] = synthetic_data[1].dropna()

missing_parents = not real_data[1].isin(real_data[0]).all()
if missing_parents:
LOGGER.info(
Expand Down
34 changes: 34 additions & 0 deletions tests/unit/column_pairs/statistical/test_referential_integrity.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from unittest.mock import patch

import numpy as np
import pandas as pd

from sdmetrics.column_pairs.statistical import ReferentialIntegrity
Expand Down Expand Up @@ -70,3 +71,36 @@ def test_compute(self, compute_breakdown_mock):
# Assert
compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data)
assert result == 0.6

def test_compute_with_nan_foreign_keys_real_data(self):
    """Check ``compute`` when the real foreign keys contain a NaN.

    The same parent/foreign key columns are used as both real and
    synthetic data, and the metric is expected to return a perfect score.
    """
    # Setup
    primary = pd.Series(['a', 'b', 'c'])
    foreign_with_nan = pd.Series(['a', 'a', 'b', 'c', np.nan])
    real_pair = (primary, foreign_with_nan)
    synthetic_pair = (primary, foreign_with_nan)

    # Run
    score = ReferentialIntegrity().compute(
        real_data=real_pair,
        synthetic_data=synthetic_pair,
    )

    # Assert
    assert score == 1.0

def test_compute_with_nan_foreign_keys_only_synthetic_data(self):
    """Check ``compute`` when only the synthetic foreign keys contain a NaN.

    The real foreign keys are fully populated. The asserted score of 0.8
    is consistent with four of the five synthetic foreign keys matching a
    parent key.
    """
    # Setup
    primary = pd.Series(['a', 'b', 'c'])
    real_pair = (primary, pd.Series(['a', 'a', 'b', 'c', 'a']))
    synthetic_pair = (primary, pd.Series(['a', 'a', 'b', 'c', np.nan]))

    # Run
    score = ReferentialIntegrity().compute(
        real_data=real_pair,
        synthetic_data=synthetic_pair,
    )

    # Assert
    assert score == 0.8

0 comments on commit 158473c

Please sign in to comment.