Skip to content

Commit

Permalink
update computation
Browse files (browse the repository at this point in the history)
  • Loading branch information
R-Palazzo committed Nov 15, 2023
1 parent a2b637f commit 86bd6f0
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 43 deletions.
20 changes: 5 additions & 15 deletions sdmetrics/single_table/table_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,12 @@ def compute_breakdown(cls, real_data, synthetic_data):
synthetic_data (pandas.DataFrame):
The synthetic data.
"""
missing_columns_in_synthetic = set(real_data.columns) - set(synthetic_data.columns)
invalid_names = []
invalid_sdtypes = []
for column in synthetic_data.columns:
if column not in real_data.columns:
invalid_names.append(column)
continue
synthetic_columns = set(synthetic_data.columns)
real_columns = set(real_data.columns)
intersection_columns = real_columns & synthetic_columns
union_columns = real_columns | synthetic_columns
score = len(intersection_columns)/len(union_columns)

if synthetic_data[column].dtype != real_data[column].dtype:
invalid_sdtypes.append(column)

proportion_correct_columns = 1 - len(missing_columns_in_synthetic) / len(real_data.columns)
proportion_valid_names = 1 - len(invalid_names) / len(synthetic_data.columns)
proportion_valid_sdtypes = 1 - len(invalid_sdtypes) / len(synthetic_data.columns)

score = proportion_correct_columns * proportion_valid_names * proportion_valid_sdtypes
return {'score': score}

@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def test_end_to_end_with_metrics_failing(self):
# Assert
expected_properties = pd.DataFrame({
'Property': ['Data Validity', 'Data Structure', 'Relationship Validity'],
'Score': [1.0, 0.7833333333333333, 1.0]
'Score': [1.0, 1.0, 1.0]
})
expected_details = pd.DataFrame({
'Table': [
Expand Down Expand Up @@ -91,7 +91,7 @@ def test_end_to_end_with_metrics_failing(self):
"TypeError: '<=' not supported between instances of 'str' and 'float'", None
]
})
assert results == 0.9277777777777777
assert results == 1.0
pd.testing.assert_frame_equal(
report.get_properties(), expected_properties, check_exact=False, atol=2e-2
)
Expand Down
27 changes: 1 addition & 26 deletions tests/unit/single_table/test_table_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,36 +87,11 @@ def test_compute_breakdown_with_invalid_names(self, real_data):
expected_result = {'score': 0.8333333333333334}
assert result == expected_result

def test_compute_breakdown_with_invalid_dtypes(self, real_data):
"""Test the ``compute_breakdown`` method with invalid dtypes."""
# Setup
synthetic_data = pd.DataFrame({
'col_1': [3.0, 2.0, 1.0, 4.0, 5.0],
'col_2': ['A', 'B', 'C', 'D', 'E'],
'col_3': [True, False, True, False, True],
'col_4': [
'2020-01-11', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05'
],
'col_5': [4.0, 2.0, 3.0, 4.0, 5.0],
})

metric = TableStructure()

# Run
result = metric.compute_breakdown(real_data, synthetic_data)

# Assert
expected_result = {'score': 0.6}
assert result == expected_result

def test_compute_breakdown_multiple_error(self, real_data):
"""Test the ``compute_breakdown`` method with the different failure modes."""
synthetic_data = pd.DataFrame({
'col_1': [1, 2, 1, 4, 5],
'col_3': [True, False, True, False, True],
'col_4': [
'2020-01-11', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05'
],
'col_5': [4.0, 2.0, 3.0, 4.0, 5.0],
'col_6': [4.0, 2.0, 3.0, 4.0, 5.0],
})
Expand All @@ -127,7 +102,7 @@ def test_compute_breakdown_multiple_error(self, real_data):
result = metric.compute_breakdown(real_data, synthetic_data)

# Assert
expected_result = {'score': 0.5120000000000001}
expected_result = {'score': 0.5}
assert result == expected_result

@patch('sdmetrics.single_table.table_structure.TableStructure.compute_breakdown')
Expand Down

0 comments on commit 86bd6f0

Please sign in to comment.