Skip to content

Commit

Permalink
Update TableFormat metric to TableStructure + fix its computation (
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo authored Nov 16, 2023
1 parent fbe3a65 commit a8a8b44
Show file tree
Hide file tree
Showing 12 changed files with 92 additions and 173 deletions.
2 changes: 1 addition & 1 deletion sdmetrics/reports/multi_table/_properties/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def get_visualization(self, table_name=None):
category_orders={'group': list(self.details['Table'])},
color='Metric',
color_discrete_map={
'TableFormat': PlotConfig.DATACEBO_DARK,
'TableStructure': PlotConfig.DATACEBO_DARK,
},
pattern_shape='Metric',
pattern_shape_sequence=[''],
Expand Down
20 changes: 4 additions & 16 deletions sdmetrics/reports/single_table/_properties/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from sdmetrics.errors import VisualizationUnavailableError
from sdmetrics.reports.single_table._properties import BaseSingleTableProperty
from sdmetrics.single_table import TableFormat
from sdmetrics.single_table import TableStructure


class Structure(BaseSingleTableProperty):
Expand Down Expand Up @@ -31,21 +31,9 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
Returns:
pandas.DataFrame
"""
column_to_ignore_dtype = []
non_pii_sdtype = [
'numerical', 'datetime', 'categorical', 'boolean'
]
for column_name in metadata['columns']:
sdtype = metadata['columns'][column_name]['sdtype']
if sdtype in non_pii_sdtype:
continue

column_to_ignore_dtype.append(column_name)

try:
score = TableFormat.compute(
real_data, synthetic_data,
ignore_dtype_columns=column_to_ignore_dtype
score = TableStructure.compute(
real_data, synthetic_data
)
error_message = None

Expand All @@ -58,7 +46,7 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
progress_bar.update()

result = pd.DataFrame({
'Metric': 'TableFormat',
'Metric': 'TableStructure',
'Score': score,
'Error': error_message,
}, index=[0])
Expand Down
4 changes: 2 additions & 2 deletions sdmetrics/single_table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from sdmetrics.single_table.privacy.numerical_sklearn import (
NumericalLR, NumericalMLP, NumericalSVR)
from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
from sdmetrics.single_table.table_format import TableFormat
from sdmetrics.single_table.table_structure import TableStructure

__all__ = [
'bayesian_network',
Expand Down Expand Up @@ -91,5 +91,5 @@
'TVComplement',
'RangeCoverage',
'NewRowSynthesis',
'TableFormat',
'TableStructure',
]
80 changes: 0 additions & 80 deletions sdmetrics/single_table/table_format.py

This file was deleted.

60 changes: 60 additions & 0 deletions sdmetrics/single_table/table_structure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Table Structure metric."""
from sdmetrics.goal import Goal
from sdmetrics.single_table.base import SingleTableMetric


class TableStructure(SingleTableMetric):
    """TableStructure Single Table metric.

    This metric computes how closely the column names of the synthetic data
    match the column names of the real data. Only the column names are
    compared; the data types of the columns are not inspected.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
    """

    name = 'TableStructure'
    goal = Goal.MAXIMIZE
    min_value = 0
    max_value = 1

    @classmethod
    def compute_breakdown(cls, real_data, synthetic_data):
        """Compute the score breakdown of the table structure metric.

        The score is the number of column names present in both tables divided
        by the number of distinct column names found across the two tables.

        Args:
            real_data (pandas.DataFrame):
                The real data.
            synthetic_data (pandas.DataFrame):
                The synthetic data.

        Returns:
            dict:
                A dictionary with a single ``'score'`` key holding the metric
                score.
        """
        synthetic_columns = set(synthetic_data.columns)
        real_columns = set(real_data.columns)
        union_columns = real_columns | synthetic_columns
        if not union_columns:
            # Neither table has any column, so the two (empty) column sets are
            # identical. Return the maximum score instead of dividing by zero.
            return {'score': 1.0}

        intersection_columns = real_columns & synthetic_columns
        score = len(intersection_columns) / len(union_columns)

        return {'score': score}

    @classmethod
    def compute(cls, real_data, synthetic_data):
        """Compute the table structure metric score.

        Args:
            real_data (pandas.DataFrame):
                The real data.
            synthetic_data (pandas.DataFrame):
                The synthetic data.

        Returns:
            float:
                The metric score.
        """
        return cls.compute_breakdown(real_data, synthetic_data)['score']
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_end_to_end(self):

expected_details = pd.DataFrame({
'Table': ['users', 'sessions', 'transactions'],
'Metric': ['TableFormat', 'TableFormat', 'TableFormat'],
'Metric': ['TableStructure', 'TableStructure', 'TableStructure'],
'Score': [1.0, 1.0, 1.0],
})
pd.testing.assert_frame_equal(structure.details, expected_details)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def test_end_to_end_with_metrics_failing(self):
# Assert
expected_properties = pd.DataFrame({
'Property': ['Data Validity', 'Data Structure', 'Relationship Validity'],
'Score': [1.0, 0.7833333333333333, 1.0]
'Score': [1.0, 1.0, 1.0]
})
expected_details = pd.DataFrame({
'Table': [
Expand Down Expand Up @@ -91,7 +91,7 @@ def test_end_to_end_with_metrics_failing(self):
"TypeError: '<=' not supported between instances of 'str' and 'float'", None
]
})
assert results == 0.9277777777777777
assert results == 1.0
pd.testing.assert_frame_equal(
report.get_properties(), expected_properties, check_exact=False, atol=2e-2
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_get_score(self):
assert score == 1.0

expected_details = pd.DataFrame({
'Metric': 'TableFormat',
'Metric': 'TableStructure',
'Score': 1.0,
}, index=[0])

Expand All @@ -41,7 +41,7 @@ def test_get_score_error(self):
assert pd.isna(score)

expected_details = pd.DataFrame({
'Metric': 'TableFormat',
'Metric': 'TableStructure',
'Score': np.nan,
'Error': "AttributeError: 'list' object has no attribute 'columns'"
}, index=[0])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def test_end_to_end(self):
})

expected_details_data_structure = pd.DataFrame({
'Metric': ['TableFormat'],
'Metric': ['TableStructure'],
'Score': [1.0]
})

Expand Down Expand Up @@ -134,7 +134,7 @@ def test_generate_with_object_datetimes(self):
})

expected_details_data_structure = pd.DataFrame({
'Metric': ['TableFormat'],
'Metric': ['TableStructure'],
'Score': [1.0]
})

Expand Down
4 changes: 2 additions & 2 deletions tests/unit/reports/multi_table/_properties/test_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_get_visualization(mock_px):
mock_df = pd.DataFrame({
'Table': ['Table1', 'Table2'],
'Score': [0.7, 0.3],
'Metric': ['TableFormat', 'TableFormat']
'Metric': ['TableStructure', 'TableStructure']
})
structure_property.details = mock_df

Expand Down Expand Up @@ -57,7 +57,7 @@ def test_get_visualization(mock_px):
'category_orders': {'group': mock_df['Table'].tolist()},
'color': 'Metric',
'color_discrete_map': {
'TableFormat': '#000036',
'TableStructure': '#000036',
},
'pattern_shape': 'Metric',
'pattern_shape_sequence': [''],
Expand Down
12 changes: 6 additions & 6 deletions tests/unit/reports/single_table/_properties/test_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
class TestStructure:

@patch('sdmetrics.reports.single_table._properties.structure.'
'TableFormat.compute')
'TableStructure.compute')
def test__generate_details(self, table_format_mock):
"""Test the ``_generate_details`` method."""
# Setup
Expand All @@ -38,17 +38,17 @@ def test__generate_details(self, table_format_mock):

# Assert
table_format_mock.assert_called_once_with(
real_data, synthetic_data, ignore_dtype_columns=[]
real_data, synthetic_data,
)

expected_details = pd.DataFrame({
'Metric': 'TableFormat',
'Metric': 'TableStructure',
'Score': 0.75,
}, index=[0])
pd.testing.assert_frame_equal(result, expected_details)

@patch('sdmetrics.reports.single_table._properties.structure.'
'TableFormat.compute')
'TableStructure.compute')
def test__generate_details_with_id_column(self, table_format_mock):
"""Test the ``_generate_details`` method."""
# Setup
Expand Down Expand Up @@ -78,11 +78,11 @@ def test__generate_details_with_id_column(self, table_format_mock):

# Assert
table_format_mock.assert_called_once_with(
real_data, synthetic_data, ignore_dtype_columns=['id']
real_data, synthetic_data
)

expected_details = pd.DataFrame({
'Metric': 'TableFormat',
'Metric': 'TableStructure',
'Score': 0.75,
}, index=[0])
pd.testing.assert_frame_equal(result, expected_details)
Expand Down
Loading

0 comments on commit a8a8b44

Please sign in to comment.