Skip to content

Commit

Permalink
Fix KSComplement instability for constant float values (#654)
Browse files Browse the repository at this point in the history
  • Loading branch information
fealho authored Nov 27, 2024
1 parent 8b450df commit ac19c4d
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 10 deletions.
7 changes: 7 additions & 0 deletions sdmetrics/single_column/statistical/kscomplement.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Kolmogorov-Smirnov test based Metric."""

import sys

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
Expand All @@ -8,6 +10,8 @@
from sdmetrics.single_column.base import SingleColumnMetric
from sdmetrics.utils import is_datetime

# Number of decimal places the real and synthetic columns are rounded to before
# the KS test is run: one fewer than ``sys.float_info.dig``, the count of decimal
# digits a Python float can represent faithfully.
MAX_DECIMALS = sys.float_info.dig - 1


class KSComplement(SingleColumnMetric):
"""Kolmogorov-Smirnov statistic based metric.
Expand Down Expand Up @@ -57,6 +61,9 @@ def compute(real_data, synthetic_data):
real_data = pd.to_numeric(real_data)
synthetic_data = pd.to_numeric(synthetic_data)

real_data = real_data.round(MAX_DECIMALS)
synthetic_data = synthetic_data.round(MAX_DECIMALS)

try:
statistic, _ = ks_2samp(real_data, synthetic_data)
except ValueError as e:
Expand Down
16 changes: 13 additions & 3 deletions tests/integration/reports/multi_table/test_quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
from packaging import version

from sdmetrics.demos import load_demo
from sdmetrics.reports.multi_table.quality_report import QualityReport
Expand Down Expand Up @@ -299,6 +300,15 @@ def test_quality_report_with_errors():
'Property': ['Column Shapes', 'Column Pair Trends', 'Cardinality', 'Intertable Trends'],
'Score': [0.8165079365079364, 0.55, 0.95, 0.5833333333333334],
})

pandas_version = version.parse(pd.__version__)
if pandas_version >= version.parse('2.2.0'):
err1 = "TypeError: '<' not supported between instances of 'int' and 'str'"
err2 = "TypeError: '<' not supported between instances of 'Timestamp' and 'str'"
err3 = "TypeError: '<' not supported between instances of 'float' and 'str'"
else:
err1 = err2 = err3 = "TypeError: can't multiply sequence by non-int of type 'float'"

expected_details = pd.DataFrame({
'Table': [
'users',
Expand Down Expand Up @@ -334,11 +344,11 @@ def test_quality_report_with_errors():
'Error': [
None,
None,
"TypeError: '<' not supported between instances of 'int' and 'str'",
err1,
np.nan,
np.nan,
"TypeError: '<' not supported between instances of 'Timestamp' and 'str'",
"TypeError: '<' not supported between instances of 'float' and 'str'",
err2,
err3,
None,
],
})
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
from packaging import version

from sdmetrics.demos import load_demo
from sdmetrics.reports.single_table._properties import ColumnShapes
Expand Down Expand Up @@ -84,15 +85,22 @@ def test_get_score_errors(self):

# Run
column_shape_property = ColumnShapes()

expected_message_1 = (
"TypeError: '<' not supported between instances of 'Timestamp' and 'int'"
)
expected_message_2 = "TypeError: '<' not supported between instances of 'str' and 'float'"

score = column_shape_property.get_score(real_data, synthetic_data, metadata)

# Assert
pandas_version = version.parse(pd.__version__)
if pandas_version >= version.parse('2.2.0'):
expected_message_1 = (
"TypeError: '<' not supported between instances of 'Timestamp' and 'int'"
)
expected_message_2 = (
"TypeError: '<' not supported between instances of 'str' and 'float'"
)
else:
expected_message_1 = (
"TypeError: unsupported operand type(s) for *: 'Timestamp' and 'float'"
)
expected_message_2 = "TypeError: can't multiply sequence by non-int of type 'float'"

details = column_shape_property.details
details_nan = details.loc[pd.isna(details['Score'])]
Expand Down
13 changes: 13 additions & 0 deletions tests/integration/single_column/statistical/test_kscomplement.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,16 @@ def test_bad(array_like):

assert 0.0 <= output < 0.5
assert 0.0 <= normalized < 0.5


def test_one_float_value():
    """Test KSComplement.compute on single-value float columns (GH#652).

    Each column holds one float produced by a different subtraction
    (``0.3 - 0.2`` and ``0.2 - 0.1``); the metric is expected to return 1.
    """
    # Setup
    real_series = pd.Series([0.3 - 0.2])
    synthetic_series = pd.Series([0.2 - 0.1])

    # Run
    score = KSComplement.compute(real_series, synthetic_series)

    # Assert
    assert score == 1
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import numpy as np
import pandas as pd
from packaging import version

from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes

Expand Down Expand Up @@ -108,7 +109,12 @@ def test__generate_details_error(self):
result = column_shape_property._generate_details(real_data, synthetic_data, metadata)

# Assert
expected_message = "TypeError: '<' not supported between instances of 'str' and 'int'"
pandas_version = version.parse(pd.__version__)
if pandas_version >= version.parse('2.2.0'):
expected_message = "TypeError: '<' not supported between instances of 'str' and 'int'"
else:
expected_message = "TypeError: can't multiply sequence by non-int of type 'float'"

result_nan = result.loc[pd.isna(result['Score'])]
column_names_nan = result_nan['Column'].tolist()
error_message = result_nan['Error'].tolist()
Expand Down

0 comments on commit ac19c4d

Please sign in to comment.