From a8a8b4471e0b99683596c1df789ae5fa750fbcde Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Thu, 16 Nov 2023 10:55:25 -0600 Subject: [PATCH] Update `TableFormat` metric to `TableStructure` + fix its computation (#519) --- .../multi_table/_properties/structure.py | 2 +- .../single_table/_properties/structure.py | 20 +---- sdmetrics/single_table/__init__.py | 4 +- sdmetrics/single_table/table_format.py | 80 ------------------- sdmetrics/single_table/table_structure.py | 60 ++++++++++++++ .../multi_table/_properties/test_structure.py | 2 +- .../multi_table/test_diagnostic_report.py | 4 +- .../_properties/test_structure.py | 4 +- .../single_table/test_diagnostic_report.py | 4 +- .../multi_table/_properties/test_structure.py | 4 +- .../_properties/test_structure.py | 12 +-- ...able_format.py => test_table_structure.py} | 69 +++------------- 12 files changed, 92 insertions(+), 173 deletions(-) delete mode 100644 sdmetrics/single_table/table_format.py create mode 100644 sdmetrics/single_table/table_structure.py rename tests/unit/single_table/{test_table_format.py => test_table_structure.py} (65%) diff --git a/sdmetrics/reports/multi_table/_properties/structure.py b/sdmetrics/reports/multi_table/_properties/structure.py index affabd07..094462d2 100644 --- a/sdmetrics/reports/multi_table/_properties/structure.py +++ b/sdmetrics/reports/multi_table/_properties/structure.py @@ -43,7 +43,7 @@ def get_visualization(self, table_name=None): category_orders={'group': list(self.details['Table'])}, color='Metric', color_discrete_map={ - 'TableFormat': PlotConfig.DATACEBO_DARK, + 'TableStructure': PlotConfig.DATACEBO_DARK, }, pattern_shape='Metric', pattern_shape_sequence=[''], diff --git a/sdmetrics/reports/single_table/_properties/structure.py b/sdmetrics/reports/single_table/_properties/structure.py index 7d539863..fcc821af 100644 --- a/sdmetrics/reports/single_table/_properties/structure.py +++ 
b/sdmetrics/reports/single_table/_properties/structure.py @@ -3,7 +3,7 @@ from sdmetrics.errors import VisualizationUnavailableError from sdmetrics.reports.single_table._properties import BaseSingleTableProperty -from sdmetrics.single_table import TableFormat +from sdmetrics.single_table import TableStructure class Structure(BaseSingleTableProperty): @@ -31,21 +31,9 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No Returns: pandas.DataFrame """ - column_to_ignore_dtype = [] - non_pii_sdtype = [ - 'numerical', 'datetime', 'categorical', 'boolean' - ] - for column_name in metadata['columns']: - sdtype = metadata['columns'][column_name]['sdtype'] - if sdtype in non_pii_sdtype: - continue - - column_to_ignore_dtype.append(column_name) - try: - score = TableFormat.compute( - real_data, synthetic_data, - ignore_dtype_columns=column_to_ignore_dtype + score = TableStructure.compute( + real_data, synthetic_data ) error_message = None @@ -58,7 +46,7 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No progress_bar.update() result = pd.DataFrame({ - 'Metric': 'TableFormat', + 'Metric': 'TableStructure', 'Score': score, 'Error': error_message, }, index=[0]) diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py index d6a49ac2..6515c275 100644 --- a/sdmetrics/single_table/__init__.py +++ b/sdmetrics/single_table/__init__.py @@ -32,7 +32,7 @@ from sdmetrics.single_table.privacy.numerical_sklearn import ( NumericalLR, NumericalMLP, NumericalSVR) from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor -from sdmetrics.single_table.table_format import TableFormat +from sdmetrics.single_table.table_structure import TableStructure __all__ = [ 'bayesian_network', @@ -91,5 +91,5 @@ 'TVComplement', 'RangeCoverage', 'NewRowSynthesis', - 'TableFormat', + 'TableStructure', ] diff --git a/sdmetrics/single_table/table_format.py 
b/sdmetrics/single_table/table_format.py deleted file mode 100644 index 428aa7ca..00000000 --- a/sdmetrics/single_table/table_format.py +++ /dev/null @@ -1,80 +0,0 @@ -"""Table Format metric.""" -from sdmetrics.goal import Goal -from sdmetrics.single_table.base import SingleTableMetric - - -class TableFormat(SingleTableMetric): - """TableFormat Single Table metric. - - This metric computes whether the names and data types of each column are - the same in the real and synthetic data. - - Attributes: - name (str): - Name to use when reports about this metric are printed. - goal (sdmetrics.goal.Goal): - The goal of this metric. - min_value (Union[float, tuple[float]]): - Minimum value or values that this metric can take. - max_value (Union[float, tuple[float]]): - Maximum value or values that this metric can take. - """ - - name = 'TableFormat' - goal = Goal.MAXIMIZE - min_value = 0 - max_value = 1 - - @classmethod - def compute_breakdown(cls, real_data, synthetic_data, ignore_dtype_columns=None): - """Compute the score breakdown of the table format metric. - - Args: - real_data (pandas.DataFrame): - The real data. - synthetic_data (pandas.DataFrame): - The synthetic data. - ignore_dtype_columns (list[str]): - List of column names to ignore when comparing data types. - Defaults to ``None``. 
- """ - ignore_dtype_columns = ignore_dtype_columns or [] - missing_columns_in_synthetic = set(real_data.columns) - set(synthetic_data.columns) - invalid_names = [] - invalid_sdtypes = [] - for column in synthetic_data.columns: - if column not in real_data.columns: - invalid_names.append(column) - continue - - if column in ignore_dtype_columns: - continue - - if synthetic_data[column].dtype != real_data[column].dtype: - invalid_sdtypes.append(column) - - proportion_correct_columns = 1 - len(missing_columns_in_synthetic) / len(real_data.columns) - proportion_valid_names = 1 - len(invalid_names) / len(synthetic_data.columns) - proportion_valid_sdtypes = 1 - len(invalid_sdtypes) / len(synthetic_data.columns) - - score = proportion_correct_columns * proportion_valid_names * proportion_valid_sdtypes - return {'score': score} - - @classmethod - def compute(cls, real_data, synthetic_data, ignore_dtype_columns=None): - """Compute the table format metric score. - - Args: - real_data (pandas.DataFrame): - The real data. - synthetic_data (pandas.DataFrame): - The synthetic data. - ignore_dtype_columns (list[str]): - List of column names to ignore when comparing data types. - Defaults to ``None``. - - Returns: - float: - The metric score. - """ - return cls.compute_breakdown(real_data, synthetic_data, ignore_dtype_columns)['score'] diff --git a/sdmetrics/single_table/table_structure.py b/sdmetrics/single_table/table_structure.py new file mode 100644 index 00000000..f7c17bf4 --- /dev/null +++ b/sdmetrics/single_table/table_structure.py @@ -0,0 +1,60 @@ +"""Table Structure metric.""" +from sdmetrics.goal import Goal +from sdmetrics.single_table.base import SingleTableMetric + + +class TableStructure(SingleTableMetric): + """TableStructure Single Table metric. + + This metric computes whether the column names are the same in the real and + synthetic data; column data types are not compared. + + Attributes: + name (str): + Name to use when reports about this metric are printed. 
+ goal (sdmetrics.goal.Goal): + The goal of this metric. + min_value (Union[float, tuple[float]]): + Minimum value or values that this metric can take. + max_value (Union[float, tuple[float]]): + Maximum value or values that this metric can take. + """ + + name = 'TableStructure' + goal = Goal.MAXIMIZE + min_value = 0 + max_value = 1 + + @classmethod + def compute_breakdown(cls, real_data, synthetic_data): + """Compute the score breakdown of the table structure metric. + + Args: + real_data (pandas.DataFrame): + The real data. + synthetic_data (pandas.DataFrame): + The synthetic data. + """ + synthetic_columns = set(synthetic_data.columns) + real_columns = set(real_data.columns) + intersection_columns = real_columns & synthetic_columns + union_columns = real_columns | synthetic_columns + score = len(intersection_columns)/len(union_columns) + + return {'score': score} + + @classmethod + def compute(cls, real_data, synthetic_data): + """Compute the table structure metric score. + + Args: + real_data (pandas.DataFrame): + The real data. + synthetic_data (pandas.DataFrame): + The synthetic data. + + Returns: + float: + The metric score. 
+ """ + return cls.compute_breakdown(real_data, synthetic_data)['score'] diff --git a/tests/integration/reports/multi_table/_properties/test_structure.py b/tests/integration/reports/multi_table/_properties/test_structure.py index 68091605..dadf1785 100644 --- a/tests/integration/reports/multi_table/_properties/test_structure.py +++ b/tests/integration/reports/multi_table/_properties/test_structure.py @@ -23,7 +23,7 @@ def test_end_to_end(self): expected_details = pd.DataFrame({ 'Table': ['users', 'sessions', 'transactions'], - 'Metric': ['TableFormat', 'TableFormat', 'TableFormat'], + 'Metric': ['TableStructure', 'TableStructure', 'TableStructure'], 'Score': [1.0, 1.0, 1.0], }) pd.testing.assert_frame_equal(structure.details, expected_details) diff --git a/tests/integration/reports/multi_table/test_diagnostic_report.py b/tests/integration/reports/multi_table/test_diagnostic_report.py index cf905a0b..0e0dcb76 100644 --- a/tests/integration/reports/multi_table/test_diagnostic_report.py +++ b/tests/integration/reports/multi_table/test_diagnostic_report.py @@ -62,7 +62,7 @@ def test_end_to_end_with_metrics_failing(self): # Assert expected_properties = pd.DataFrame({ 'Property': ['Data Validity', 'Data Structure', 'Relationship Validity'], - 'Score': [1.0, 0.7833333333333333, 1.0] + 'Score': [1.0, 1.0, 1.0] }) expected_details = pd.DataFrame({ 'Table': [ @@ -91,7 +91,7 @@ def test_end_to_end_with_metrics_failing(self): "TypeError: '<=' not supported between instances of 'str' and 'float'", None ] }) - assert results == 0.9277777777777777 + assert results == 1.0 pd.testing.assert_frame_equal( report.get_properties(), expected_properties, check_exact=False, atol=2e-2 ) diff --git a/tests/integration/reports/single_table/_properties/test_structure.py b/tests/integration/reports/single_table/_properties/test_structure.py index 7ed8b345..3ad901a8 100644 --- a/tests/integration/reports/single_table/_properties/test_structure.py +++ 
b/tests/integration/reports/single_table/_properties/test_structure.py @@ -20,7 +20,7 @@ def test_get_score(self): assert score == 1.0 expected_details = pd.DataFrame({ - 'Metric': 'TableFormat', + 'Metric': 'TableStructure', 'Score': 1.0, }, index=[0]) @@ -41,7 +41,7 @@ def test_get_score_error(self): assert pd.isna(score) expected_details = pd.DataFrame({ - 'Metric': 'TableFormat', + 'Metric': 'TableStructure', 'Score': np.nan, 'Error': "AttributeError: 'list' object has no attribute 'columns'" }, index=[0]) diff --git a/tests/integration/reports/single_table/test_diagnostic_report.py b/tests/integration/reports/single_table/test_diagnostic_report.py index 0fb7b470..1ff5ec35 100644 --- a/tests/integration/reports/single_table/test_diagnostic_report.py +++ b/tests/integration/reports/single_table/test_diagnostic_report.py @@ -84,7 +84,7 @@ def test_end_to_end(self): }) expected_details_data_structure = pd.DataFrame({ - 'Metric': ['TableFormat'], + 'Metric': ['TableStructure'], 'Score': [1.0] }) @@ -134,7 +134,7 @@ def test_generate_with_object_datetimes(self): }) expected_details_data_structure = pd.DataFrame({ - 'Metric': ['TableFormat'], + 'Metric': ['TableStructure'], 'Score': [1.0] }) diff --git a/tests/unit/reports/multi_table/_properties/test_structure.py b/tests/unit/reports/multi_table/_properties/test_structure.py index ddaf9175..4f99ec7b 100644 --- a/tests/unit/reports/multi_table/_properties/test_structure.py +++ b/tests/unit/reports/multi_table/_properties/test_structure.py @@ -29,7 +29,7 @@ def test_get_visualization(mock_px): mock_df = pd.DataFrame({ 'Table': ['Table1', 'Table2'], 'Score': [0.7, 0.3], - 'Metric': ['TableFormat', 'TableFormat'] + 'Metric': ['TableStructure', 'TableStructure'] }) structure_property.details = mock_df @@ -57,7 +57,7 @@ def test_get_visualization(mock_px): 'category_orders': {'group': mock_df['Table'].tolist()}, 'color': 'Metric', 'color_discrete_map': { - 'TableFormat': '#000036', + 'TableStructure': '#000036', }, 
'pattern_shape': 'Metric', 'pattern_shape_sequence': [''], diff --git a/tests/unit/reports/single_table/_properties/test_structure.py b/tests/unit/reports/single_table/_properties/test_structure.py index 154a35a6..f112d5d8 100644 --- a/tests/unit/reports/single_table/_properties/test_structure.py +++ b/tests/unit/reports/single_table/_properties/test_structure.py @@ -11,7 +11,7 @@ class TestStructure: @patch('sdmetrics.reports.single_table._properties.structure.' - 'TableFormat.compute') + 'TableStructure.compute') def test__generate_details(self, table_format_mock): """Test the ``_generate_details`` method.""" # Setup @@ -38,17 +38,17 @@ def test__generate_details(self, table_format_mock): # Assert table_format_mock.assert_called_once_with( - real_data, synthetic_data, ignore_dtype_columns=[] + real_data, synthetic_data, ) expected_details = pd.DataFrame({ - 'Metric': 'TableFormat', + 'Metric': 'TableStructure', 'Score': 0.75, }, index=[0]) pd.testing.assert_frame_equal(result, expected_details) @patch('sdmetrics.reports.single_table._properties.structure.' 
- 'TableFormat.compute') + 'TableStructure.compute') def test__generate_details_with_id_column(self, table_format_mock): """Test the ``_generate_details`` method.""" # Setup @@ -78,11 +78,11 @@ def test__generate_details_with_id_column(self, table_format_mock): # Assert table_format_mock.assert_called_once_with( - real_data, synthetic_data, ignore_dtype_columns=['id'] + real_data, synthetic_data ) expected_details = pd.DataFrame({ - 'Metric': 'TableFormat', + 'Metric': 'TableStructure', 'Score': 0.75, }, index=[0]) pd.testing.assert_frame_equal(result, expected_details) diff --git a/tests/unit/single_table/test_table_format.py b/tests/unit/single_table/test_table_structure.py similarity index 65% rename from tests/unit/single_table/test_table_format.py rename to tests/unit/single_table/test_table_structure.py index 2195f1a4..329bd722 100644 --- a/tests/unit/single_table/test_table_format.py +++ b/tests/unit/single_table/test_table_structure.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from sdmetrics.single_table import TableFormat +from sdmetrics.single_table import TableStructure @pytest.fixture() @@ -19,7 +19,7 @@ def real_data(): }) -class TestTableFormat: +class TestTableStructure: def test_compute_breakdown(self, real_data): """Test the ``compute_breakdown`` method.""" @@ -34,7 +34,7 @@ def test_compute_breakdown(self, real_data): 'col_5': [4.0, 2.0, 3.0, 4.0, 5.0] }) - metric = TableFormat() + metric = TableStructure() # Run result = metric.compute_breakdown(real_data, synthetic_data) @@ -55,7 +55,7 @@ def test_compute_breakdown_with_missing_columns(self, real_data): ]), }) - metric = TableFormat() + metric = TableStructure() # Run result = metric.compute_breakdown(real_data, synthetic_data) @@ -78,7 +78,7 @@ def test_compute_breakdown_with_invalid_names(self, real_data): 'col_6': [4.0, 2.0, 3.0, 4.0, 5.0], }) - metric = TableFormat() + metric = TableStructure() # Run result = metric.compute_breakdown(real_data, synthetic_data) @@ -87,74 +87,25 @@ 
def test_compute_breakdown_with_invalid_names(self, real_data): expected_result = {'score': 0.8333333333333334} assert result == expected_result - def test_compute_breakdown_with_invalid_dtypes(self, real_data): - """Test the ``compute_breakdown`` method with invalid dtypes.""" - # Setup - synthetic_data = pd.DataFrame({ - 'col_1': [3.0, 2.0, 1.0, 4.0, 5.0], - 'col_2': ['A', 'B', 'C', 'D', 'E'], - 'col_3': [True, False, True, False, True], - 'col_4': [ - '2020-01-11', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05' - ], - 'col_5': [4.0, 2.0, 3.0, 4.0, 5.0], - }) - - metric = TableFormat() - - # Run - result = metric.compute_breakdown(real_data, synthetic_data) - - # Assert - expected_result = {'score': 0.6} - assert result == expected_result - - def test_compute_breakdown_ignore_dtype_columns(self, real_data): - """Test the ``compute_breakdown`` method when ignore_dtype_columns is set.""" - # Setup - synthetic_data = pd.DataFrame({ - 'col_1': [3.0, 2.0, 1.0, 4.0, 5.0], - 'col_2': ['A', 'B', 'C', 'D', 'E'], - 'col_3': [True, False, True, False, True], - 'col_4': [ - '2020-01-11', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05' - ], - 'col_5': [4.0, 2.0, 3.0, 4.0, 5.0], - }) - - metric = TableFormat() - - # Run - result = metric.compute_breakdown( - real_data, synthetic_data, ignore_dtype_columns=['col_4'] - ) - - # Assert - expected_result = {'score': 0.8} - assert result == expected_result - def test_compute_breakdown_multiple_error(self, real_data): """Test the ``compute_breakdown`` method with the different failure modes.""" synthetic_data = pd.DataFrame({ 'col_1': [1, 2, 1, 4, 5], 'col_3': [True, False, True, False, True], - 'col_4': [ - '2020-01-11', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05' - ], 'col_5': [4.0, 2.0, 3.0, 4.0, 5.0], 'col_6': [4.0, 2.0, 3.0, 4.0, 5.0], }) - metric = TableFormat() + metric = TableStructure() # Run result = metric.compute_breakdown(real_data, synthetic_data) # Assert - expected_result = {'score': 
0.5120000000000001} + expected_result = {'score': 0.5} assert result == expected_result - @patch('sdmetrics.single_table.table_format.TableFormat.compute_breakdown') + @patch('sdmetrics.single_table.table_structure.TableStructure.compute_breakdown') def test_compute(self, compute_breakdown_mock, real_data): """Test the ``compute`` method.""" # Setup @@ -167,12 +118,12 @@ def test_compute(self, compute_breakdown_mock, real_data): ]), 'col_5': [4.0, 2.0, 3.0, 4.0, 5.0] }) - metric = TableFormat() + metric = TableStructure() compute_breakdown_mock.return_value = {'score': 0.6} # Run result = metric.compute(real_data, synthetic_data) # Assert - compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data, None) + compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data) assert result == 0.6