Update the synthetic data that's available for the single-table demo (#520)
sdv-dev · Nov 16, 2023 · fbe3a65 · fbe3a65
1 parent 26ec13c
commit fbe3a65
Show file tree

Hide file tree

Showing 8 changed files with 271 additions and 280 deletions.
diff --git a/sdmetrics/demos/single_table/synthetic.csv b/sdmetrics/demos/single_table/synthetic.csv
diff --git a/tests/integration/reports/single_table/_properties/test_boundary.py b/tests/integration/reports/single_table/_properties/test_boundary.py
@@ -17,19 +17,14 @@ def test_get_score(self):
         score = boundary_property.get_score(real_data, synthetic_data, metadata)
 
         # Assert
-        assert score == 0.9172655676537751
-
+        assert score == 1.0
         expected_details = pd.DataFrame({
             'Column': [
                 'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'second_perc',
                 'degree_perc', 'experience_years', 'employability_perc', 'mba_perc'
             ],
             'Metric': ['BoundaryAdherence'] * 10,
-            'Score': [
-                0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0,
-                0.8651162790697674, 0.9255813953488372, 0.9441860465116279, 1.0,
-                0.8883720930232558, 0.8930232558139535
-            ]
+            'Score': [1.0] * 10
         })
 
         pd.testing.assert_frame_equal(boundary_property.details, expected_details)
@@ -64,4 +59,4 @@ def test_get_score_error(self):
         assert error_messages[0] == expected_message_1
         assert error_messages[1] == expected_message_2
         assert error_messages[2] == expected_message_3
-        assert score == 0.9270636340403783
+        assert score == 1.0
diff --git a/tests/integration/reports/single_table/_properties/test_column_pair_trends.py b/tests/integration/reports/single_table/_properties/test_column_pair_trends.py
@@ -42,19 +42,19 @@ def test_get_score(self):
                 'ContingencySimilarity', 'ContingencySimilarity', 'ContingencySimilarity'
             ],
             'Score': [
-                0.9854510263003199, 0.586046511627907, 0.6232558139534884, 0.7348837209302326,
-                0.6976744186046512, 0.8976744186046511
+                0.9187918131436303, 0.6744186046511629, 0.7162790697674419, 0.813953488372093,
+                0.772093023255814, 0.9348837209302325
             ],
             'Real Correlation': [
                 0.04735340044317632, np.nan, np.nan, np.nan, np.nan, np.nan
             ],
             'Synthetic Correlation': [
-                0.07645134784253645, np.nan, np.nan, np.nan, np.nan, np.nan
+                -0.11506297326956302, np.nan, np.nan, np.nan, np.nan, np.nan
             ]
         }
         expected_details = pd.DataFrame(expected_details_dict)
         pd.testing.assert_frame_equal(column_shape_property.details, expected_details)
-        assert score == 0.754164318336875
+        assert score == 0.8050699533533958
 
     def test_get_score_warnings(self, recwarn):
         """Test the ``get_score`` method when the metrics are raising erros for some columns."""
@@ -90,7 +90,7 @@ def test_get_score_warnings(self, recwarn):
         # Assert
         details = column_shape_property.details
         pd.testing.assert_series_equal(details['Error'], exp_error_serie, check_names=False)
-        assert score == 0.7023255813953488
+        assert score == 0.7751937984496124
 
     def test_only_categorical_columns(self):
         """Test the ``get_score`` method when there are only categorical columns."""
@@ -119,12 +119,12 @@ def test_only_categorical_columns(self):
             ],
             'Metric': ['ContingencySimilarity'] * 6,
             'Score': [
-                0.8883720930232558, 0.9023255813953488, 0.7767441860465116, 0.9348837209302325,
-                0.8883720930232558, 0.8976744186046511
+                0.9209302325581395, 0.9627906976744186, 0.6837209302325581, 0.9302325581395349,
+                0.9255813953488372, 0.9348837209302325
             ],
             'Real Correlation': [np.nan] * 6,
             'Synthetic Correlation': [np.nan] * 6
         }
         expected_details = pd.DataFrame(expected_details_dict)
         pd.testing.assert_frame_equal(column_shape_property.details, expected_details)
-        assert score == 0.8813953488372093
+        assert score == 0.8930232558139535
diff --git a/tests/integration/reports/single_table/_properties/test_column_shapes.py b/tests/integration/reports/single_table/_properties/test_column_shapes.py
@@ -28,13 +28,15 @@ def test_get_score(self):
                 'TVComplement'
             ],
             'Score': [
-                0.701107, 0.768919, 0.869155, 0.826051, 0.553488, 0.902326, 0.995349, 0.627907,
-                0.939535, 0.627907, 0.916279, 0.800000, 0.781395, 0.841860, 0.972093, 0.925581
+                0.6621621621621622, 0.849290780141844, 0.8531399046104928, 0.43918918918918914,
+                0.8976744186046511, 0.9860465116279069, 0.986046511627907, 0.8976744186046511,
+                1.0, 0.9162790697674419, 0.9906976744186047, 0.3441860465116279,
+                0.9348837209302325, 0.9255813953488372, 0.9953488372093023, 0.9395348837209302
             ]
         }
         expected_details = pd.DataFrame(expected_details_dict)
         pd.testing.assert_frame_equal(column_shape_property.details, expected_details)
-        assert score == 0.8155594899871002
+        assert score == 0.8511084702797364
 
     def test_get_score_errors(self):
         """Test the ``get_score`` method when the metrics are raising errors for some columns."""
@@ -65,4 +67,4 @@ def test_get_score_errors(self):
         assert column_names_nan == ['start_date', 'employability_perc']
         assert error_messages[0] == expected_message_1
         assert error_messages[1] == expected_message_2
-        assert score == 0.8261749908947813
+        assert score == 0.858620688670242
diff --git a/tests/integration/reports/single_table/_properties/test_coverage.py b/tests/integration/reports/single_table/_properties/test_coverage.py
@@ -16,7 +16,7 @@ def test_get_score(self):
         score = coverage_property.get_score(real_data, synthetic_data, metadata)
 
         # Assert
-        assert score == 0.9419212095491987
+        assert score == 0.896792056025647
 
         expected_details = pd.DataFrame({
             'Column': [
@@ -31,8 +31,10 @@ def test_get_score(self):
                 'RangeCoverage', 'RangeCoverage', 'CategoryCoverage', 'CategoryCoverage'
             ],
             'Score': [
-                1.0, 1.0, 0.42333783783783785, 1.0, 0.9807348482826732, 1.0, 1.0, 1.0, 1.0,
-                1.0, 1.0, 0.6666666666666667, 1.0, 1.0, 1.0, 1.0
+                0.9952153110047847, 0.9554140127388535, 0.45462162162162156,
+                0.7777777777777778, 0.928171334431631, 1.0, 1.0, 0.9659863945578232,
+                1.0, 1.0, 1.0, 0.33333333333333337, 0.9943749999999998, 0.943778110944528,
+                1.0, 1.0
             ]
         })
 
@@ -64,4 +66,4 @@ def test_get_score_error(self):
         assert column_names_nan == ['start_date', 'employability_perc']
         assert error_messages[0] == expected_message_1
         assert error_messages[1] == expected_message_2
-        assert score == 0.9336242394847984
+        assert score == 0.8827916132432548
diff --git a/tests/integration/reports/single_table/_properties/test_data_validity.py b/tests/integration/reports/single_table/_properties/test_data_validity.py
@@ -30,16 +30,11 @@ def test_get_score(self):
                 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'CategoryAdherence',
                 'CategoryAdherence'
             ],
-            'Score': [
-                0.8503937007874016, 0.8615384615384616, 0.9444444444444444,
-                1.0, 1.0, 0.8651162790697674, 1.0, 1.0, 0.9255813953488372,
-                1.0, 0.9441860465116279, 1.0, 1.0, 0.8883720930232558,
-                0.8930232558139535, 1.0, 1.0
-            ]
+            'Score': [1.0] * 17
         }
         expected_details = pd.DataFrame(expected_details_dict)
         pd.testing.assert_frame_equal(data_validity_property.details, expected_details)
-        assert score == 0.9513326868551618
+        assert score == 1.0
 
     def test_get_score_errors(self):
         """Test the ``get_score`` method when the metrics are raising errors for some columns."""
@@ -70,4 +65,4 @@ def test_get_score_errors(self):
         assert column_names_nan == ['start_date', 'employability_perc']
         assert error_messages[0] == expected_message_1
         assert error_messages[1] == expected_message_2
-        assert score == 0.9622593255151395
+        assert score == 1.0
diff --git a/tests/integration/reports/single_table/test_diagnostic_report.py b/tests/integration/reports/single_table/test_diagnostic_report.py
@@ -21,7 +21,7 @@ def test_get_properties(self):
         expected_frame = pd.DataFrame(
             {
                 'Property': ['Data Validity', 'Data Structure'],
-                'Score': [0.951333, 1.0]
+                'Score': [1.0, 1.0]
             }
         )
         pd.testing.assert_frame_equal(properties_frame, expected_frame)
@@ -38,7 +38,7 @@ def test_get_score(self):
 
         # Assert
 
-        assert result == 0.975666343427581
+        assert result == 1.0
 
     def test_get_score_with_no_verbose(self):
         """Test the ``get_score`` method works when verbose=False."""
@@ -51,7 +51,7 @@ def test_get_score_with_no_verbose(self):
         result_dict = report.get_score()
 
         # Assert
-        assert result_dict == 0.975666343427581
+        assert result_dict == 1.0
 
     def test_end_to_end(self):
         """Test the end-to-end functionality of the diagnostic report."""
@@ -78,9 +78,8 @@ def test_end_to_end(self):
                 'CategoryAdherence'
             ],
             'Score': [
-                0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, 1.0,
-                0.8651162790697674, 1.0, 1.0, 0.9255813953488372, 1.0, 0.9441860465116279, 1.0,
-                1.0, 0.8883720930232558, 0.8930232558139535, 1.0, 1.0
+                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                1.0
             ]
         })
 
@@ -129,9 +128,8 @@ def test_generate_with_object_datetimes(self):
                 'CategoryAdherence'
             ],
             'Score': [
-                0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, 1.0,
-                0.8651162790697674, 1.0, 1.0, 0.9255813953488372, 1.0, 0.9441860465116279,
-                1.0, 1.0, 0.8883720930232558, 0.8930232558139535, 1.0, 1.0
+                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                1.0
             ]
         })
 
@@ -160,9 +158,9 @@ def test_generate_multiple_times(self):
         report = DiagnosticReport()
         report.generate(real_data, synthetic_data, metadata, verbose=False)
 
-        assert report.get_score() == 0.975666343427581
+        assert report.get_score() == 1.0
         report.generate(real_data, synthetic_data, metadata)
-        assert report.get_score() == 0.975666343427581
+        assert report.get_score() == 1.0
 
     def test_get_details_with_errors(self):
         """Test the ``get_details`` function of the diagnostic report when there are errors."""
@@ -190,9 +188,8 @@ def test_get_details_with_errors(self):
                 'CategoryAdherence'
             ],
             'Score': [
-                0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, 1.0,
-                0.8651162790697674, 1.0, 1.0, np.nan, 1.0, 0.9441860465116279, 1.0, 1.0,
-                0.8883720930232558, 0.8930232558139535, 1.0, 1.0
+                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, np.nan, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                1.0
             ],
             'Error': [
                 None, None, None, None, None, None, None, None,

diff --git a/tests/integration/reports/single_table/test_quality_report.py b/tests/integration/reports/single_table/test_quality_report.py
@@ -87,7 +87,7 @@ def test_report_end_to_end(self):
             'Column': ['start_date', 'second_perc', 'work_experience', 'degree_type'],
             'Metric': ['KSComplement', 'KSComplement', 'TVComplement', 'TVComplement'],
             'Score': [
-                0.7011066184294531, 0.627906976744186, 0.9720930232558139, 0.9255813953488372
+                0.6621621621621622, 0.8976744186046511, 0.9953488372093023, 0.9395348837209302
             ],
         }
 
@@ -105,14 +105,14 @@ def test_report_end_to_end(self):
                 'ContingencySimilarity', 'ContingencySimilarity', 'ContingencySimilarity'
             ],
             'Score': [
-                0.9854510263003199, 0.586046511627907, 0.6232558139534884, 0.7348837209302326,
-                0.6976744186046512, 0.8976744186046511
+                0.9187918131436303, 0.6744186046511629, 0.7162790697674419, 0.813953488372093,
+                0.772093023255814, 0.9348837209302325
             ],
             'Real Correlation': [
                 0.04735340044317632, np.nan, np.nan, np.nan, np.nan, np.nan
             ],
             'Synthetic Correlation': [
-                0.07645134784253645, np.nan, np.nan, np.nan, np.nan, np.nan
+                -0.11506297326956302, np.nan, np.nan, np.nan, np.nan, np.nan
             ]
         }
         expected_details_column_shapes = pd.DataFrame(expected_details_column_shapes_dict)
@@ -124,7 +124,7 @@ def test_report_end_to_end(self):
         pd.testing.assert_frame_equal(
             report.get_details('Column Pair Trends'), expected_details_cpt
         )
-        assert report.get_score() == 0.7804181608907237
+        assert report.get_score() == 0.8393750143888287
 
         report_info = report.get_info()
         assert report_info == report.report_info
@@ -167,7 +167,7 @@ def test_quality_report_with_object_datetimes(self):
             'Column': ['start_date', 'second_perc', 'work_experience', 'degree_type'],
             'Metric': ['KSComplement', 'KSComplement', 'TVComplement', 'TVComplement'],
             'Score': [
-                0.7011066184294531, 0.627906976744186, 0.9720930232558139, 0.9255813953488372
+                0.6621621621621622, 0.8976744186046511, 0.9953488372093023, 0.9395348837209302
             ],
         }
 
@@ -185,14 +185,14 @@ def test_quality_report_with_object_datetimes(self):
                 'ContingencySimilarity', 'ContingencySimilarity', 'ContingencySimilarity'
             ],
             'Score': [
-                0.9854510263003199, 0.586046511627907, 0.6232558139534884, 0.7348837209302326,
-                0.6976744186046512, 0.8976744186046511
+                0.9187918131436303, 0.6744186046511629, 0.7162790697674419, 0.813953488372093,
+                0.772093023255814, 0.9348837209302325
             ],
             'Real Correlation': [
                 0.04735340044317632, np.nan, np.nan, np.nan, np.nan, np.nan
             ],
             'Synthetic Correlation': [
-                0.07645134784253645, np.nan, np.nan, np.nan, np.nan, np.nan
+                -0.11506297326956302, np.nan, np.nan, np.nan, np.nan, np.nan
             ]
         }
         expected_details_column_shapes = pd.DataFrame(expected_details_column_shapes_dict)
@@ -204,7 +204,7 @@ def test_quality_report_with_object_datetimes(self):
         pd.testing.assert_frame_equal(
             report.get_details('Column Pair Trends'), expected_details_cpt
         )
-        assert report.get_score() == 0.7804181608907237
+        assert report.get_score() == 0.8393750143888287
 
     def test_report_end_to_end_with_errors(self):
         """Test the quality report end to end with errors in the properties computation."""
@@ -229,7 +229,7 @@ def test_report_end_to_end_with_errors(self):
         expected_details_column_shapes_dict = {
             'Column': ['start_date', 'second_perc', 'work_experience', 'degree_type'],
             'Metric': ['KSComplement', 'KSComplement', 'TVComplement', 'TVComplement'],
-            'Score': [0.7011066184294531, np.nan, 0.9720930232558139, 0.9255813953488372],
+            'Score': [0.6621621621621622, np.nan, 0.9953488372093023, 0.9395348837209302],
             'Error': [
                 None,
                 "TypeError: '<' not supported between instances of 'str' and 'float'",
@@ -252,7 +252,7 @@ def test_report_end_to_end_with_errors(self):
                 'ContingencySimilarity', 'ContingencySimilarity', 'ContingencySimilarity'
             ],
             'Score': [
-                np.nan, 0.586046511627907, 0.6232558139534884, np.nan, np.nan, 0.8976744186046511
+                np.nan, 0.6744186046511629, 0.7162790697674419, np.nan, np.nan, 0.9348837209302325
             ],
             'Real Correlation': [np.nan] * 6,
             'Synthetic Correlation': [np.nan] * 6,
@@ -274,7 +274,7 @@ def test_report_end_to_end_with_errors(self):
         pd.testing.assert_frame_equal(
             report.get_details('Column Pair Trends'), expected_details_cpt
         )
-        assert report.get_score() == 0.7842929635366918
+        assert report.get_score() == 0.8204378797402054
 
     def test_report_with_column_nan(self):
         """Test the report with column full of NaNs."""
@@ -307,7 +307,7 @@ def test_report_with_column_nan(self):
                 'KSComplement', 'KSComplement', 'TVComplement', 'TVComplement', 'KSComplement'
             ],
             'Score': [
-                0.7011066184294531, 0.627906976744186, 0.9720930232558139, 0.9255813953488372,
+                0.6621621621621622, 0.8976744186046511, 0.9953488372093023, 0.9395348837209302,
                 np.nan
             ],
             'Error': [
@@ -334,16 +334,16 @@ def test_report_with_column_nan(self):
                 'ContingencySimilarity'
             ],
             'Score': [
-                0.9854510263003199, 0.586046511627907, 0.6232558139534884, np.nan,
-                0.7348837209302326, 0.6976744186046512, np.nan, 0.8976744186046511,
-                0.9720930232558139, 0.9255813953488372
+                0.9187918131436303, 0.6744186046511629, 0.7162790697674419, np.nan,
+                0.813953488372093, 0.772093023255814, np.nan, 0.9348837209302325,
+                0.9953488372093023, 0.9395348837209302
             ],
             'Real Correlation': [
                 0.04735340044317632, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
                 np.nan, np.nan
             ],
             'Synthetic Correlation': [
-                0.07645134784253645, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
+                -0.11506297326956302, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
                 np.nan, np.nan
             ],
             'Error': [