Update TableStructure score computation to intersection-over-union of column names

sdv-dev · Nov 15, 2023 · 86bd6f0
1 parent a2b637f
commit 86bd6f0
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 43 deletions.
diff --git a/sdmetrics/single_table/table_structure.py b/sdmetrics/single_table/table_structure.py
@@ -35,22 +35,12 @@ def compute_breakdown(cls, real_data, synthetic_data):
         synthetic_data (pandas.DataFrame):
             The synthetic data.
         """
-        missing_columns_in_synthetic = set(real_data.columns) - set(synthetic_data.columns)
-        invalid_names = []
-        invalid_sdtypes = []
-        for column in synthetic_data.columns:
-            if column not in real_data.columns:
-                invalid_names.append(column)
-                continue
+        synthetic_columns = set(synthetic_data.columns)
+        real_columns = set(real_data.columns)
+        intersection_columns = real_columns & synthetic_columns
+        union_columns = real_columns | synthetic_columns
+        score = len(intersection_columns)/len(union_columns)
 
-            if synthetic_data[column].dtype != real_data[column].dtype:
-                invalid_sdtypes.append(column)
-
-        proportion_correct_columns = 1 - len(missing_columns_in_synthetic) / len(real_data.columns)
-        proportion_valid_names = 1 - len(invalid_names) / len(synthetic_data.columns)
-        proportion_valid_sdtypes = 1 - len(invalid_sdtypes) / len(synthetic_data.columns)
-
-        score = proportion_correct_columns * proportion_valid_names * proportion_valid_sdtypes
         return {'score': score}
 
     @classmethod

diff --git a/tests/integration/reports/multi_table/test_diagnostic_report.py b/tests/integration/reports/multi_table/test_diagnostic_report.py
@@ -62,7 +62,7 @@ def test_end_to_end_with_metrics_failing(self):
         # Assert
         expected_properties = pd.DataFrame({
             'Property': ['Data Validity', 'Data Structure', 'Relationship Validity'],
-            'Score': [1.0, 0.7833333333333333, 1.0]
+            'Score': [1.0, 1.0, 1.0]
         })
         expected_details = pd.DataFrame({
             'Table': [
@@ -91,7 +91,7 @@ def test_end_to_end_with_metrics_failing(self):
                 "TypeError: '<=' not supported between instances of 'str' and 'float'", None
             ]
         })
-        assert results == 0.9277777777777777
+        assert results == 1.0
         pd.testing.assert_frame_equal(
             report.get_properties(), expected_properties, check_exact=False, atol=2e-2
         )

diff --git a/tests/unit/single_table/test_table_structure.py b/tests/unit/single_table/test_table_structure.py
@@ -87,36 +87,11 @@ def test_compute_breakdown_with_invalid_names(self, real_data):
         expected_result = {'score': 0.8333333333333334}
         assert result == expected_result
 
-    def test_compute_breakdown_with_invalid_dtypes(self, real_data):
-        """Test the ``compute_breakdown`` method with invalid dtypes."""
-        # Setup
-        synthetic_data = pd.DataFrame({
-            'col_1': [3.0, 2.0, 1.0, 4.0, 5.0],
-            'col_2': ['A', 'B', 'C', 'D', 'E'],
-            'col_3': [True, False, True, False, True],
-            'col_4': [
-                '2020-01-11', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05'
-            ],
-            'col_5': [4.0, 2.0, 3.0, 4.0, 5.0],
-        })
-
-        metric = TableStructure()
-
-        # Run
-        result = metric.compute_breakdown(real_data, synthetic_data)
-
-        # Assert
-        expected_result = {'score': 0.6}
-        assert result == expected_result
-
     def test_compute_breakdown_multiple_error(self, real_data):
         """Test the ``compute_breakdown`` method with the different failure modes."""
         synthetic_data = pd.DataFrame({
             'col_1': [1, 2, 1, 4, 5],
             'col_3': [True, False, True, False, True],
-            'col_4': [
-                '2020-01-11', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05'
-            ],
             'col_5': [4.0, 2.0, 3.0, 4.0, 5.0],
             'col_6': [4.0, 2.0, 3.0, 4.0, 5.0],
         })
@@ -127,7 +102,7 @@ def test_compute_breakdown_multiple_error(self, real_data):
         result = metric.compute_breakdown(real_data, synthetic_data)
 
         # Assert
-        expected_result = {'score': 0.5120000000000001}
+        expected_result = {'score': 0.5}
         assert result == expected_result
 
     @patch('sdmetrics.single_table.table_structure.TableStructure.compute_breakdown')