Skip to content

Commit

Permalink
Update the synthetic data that's available for the single-table demo (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo authored Nov 16, 2023
1 parent 26ec13c commit fbe3a65
Show file tree
Hide file tree
Showing 8 changed files with 271 additions and 280 deletions.
432 changes: 216 additions & 216 deletions sdmetrics/demos/single_table/synthetic.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,14 @@ def test_get_score(self):
score = boundary_property.get_score(real_data, synthetic_data, metadata)

# Assert
assert score == 0.9172655676537751

assert score == 1.0
expected_details = pd.DataFrame({
'Column': [
'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'second_perc',
'degree_perc', 'experience_years', 'employability_perc', 'mba_perc'
],
'Metric': ['BoundaryAdherence'] * 10,
'Score': [
0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0,
0.8651162790697674, 0.9255813953488372, 0.9441860465116279, 1.0,
0.8883720930232558, 0.8930232558139535
]
'Score': [1.0] * 10
})

pd.testing.assert_frame_equal(boundary_property.details, expected_details)
Expand Down Expand Up @@ -64,4 +59,4 @@ def test_get_score_error(self):
assert error_messages[0] == expected_message_1
assert error_messages[1] == expected_message_2
assert error_messages[2] == expected_message_3
assert score == 0.9270636340403783
assert score == 1.0
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,19 @@ def test_get_score(self):
'ContingencySimilarity', 'ContingencySimilarity', 'ContingencySimilarity'
],
'Score': [
0.9854510263003199, 0.586046511627907, 0.6232558139534884, 0.7348837209302326,
0.6976744186046512, 0.8976744186046511
0.9187918131436303, 0.6744186046511629, 0.7162790697674419, 0.813953488372093,
0.772093023255814, 0.9348837209302325
],
'Real Correlation': [
0.04735340044317632, np.nan, np.nan, np.nan, np.nan, np.nan
],
'Synthetic Correlation': [
0.07645134784253645, np.nan, np.nan, np.nan, np.nan, np.nan
-0.11506297326956302, np.nan, np.nan, np.nan, np.nan, np.nan
]
}
expected_details = pd.DataFrame(expected_details_dict)
pd.testing.assert_frame_equal(column_shape_property.details, expected_details)
assert score == 0.754164318336875
assert score == 0.8050699533533958

def test_get_score_warnings(self, recwarn):
"""Test the ``get_score`` method when the metrics are raising errors for some columns."""
Expand Down Expand Up @@ -90,7 +90,7 @@ def test_get_score_warnings(self, recwarn):
# Assert
details = column_shape_property.details
pd.testing.assert_series_equal(details['Error'], exp_error_serie, check_names=False)
assert score == 0.7023255813953488
assert score == 0.7751937984496124

def test_only_categorical_columns(self):
"""Test the ``get_score`` method when there are only categorical columns."""
Expand Down Expand Up @@ -119,12 +119,12 @@ def test_only_categorical_columns(self):
],
'Metric': ['ContingencySimilarity'] * 6,
'Score': [
0.8883720930232558, 0.9023255813953488, 0.7767441860465116, 0.9348837209302325,
0.8883720930232558, 0.8976744186046511
0.9209302325581395, 0.9627906976744186, 0.6837209302325581, 0.9302325581395349,
0.9255813953488372, 0.9348837209302325
],
'Real Correlation': [np.nan] * 6,
'Synthetic Correlation': [np.nan] * 6
}
expected_details = pd.DataFrame(expected_details_dict)
pd.testing.assert_frame_equal(column_shape_property.details, expected_details)
assert score == 0.8813953488372093
assert score == 0.8930232558139535
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,15 @@ def test_get_score(self):
'TVComplement'
],
'Score': [
0.701107, 0.768919, 0.869155, 0.826051, 0.553488, 0.902326, 0.995349, 0.627907,
0.939535, 0.627907, 0.916279, 0.800000, 0.781395, 0.841860, 0.972093, 0.925581
0.6621621621621622, 0.849290780141844, 0.8531399046104928, 0.43918918918918914,
0.8976744186046511, 0.9860465116279069, 0.986046511627907, 0.8976744186046511,
1.0, 0.9162790697674419, 0.9906976744186047, 0.3441860465116279,
0.9348837209302325, 0.9255813953488372, 0.9953488372093023, 0.9395348837209302
]
}
expected_details = pd.DataFrame(expected_details_dict)
pd.testing.assert_frame_equal(column_shape_property.details, expected_details)
assert score == 0.8155594899871002
assert score == 0.8511084702797364

def test_get_score_errors(self):
"""Test the ``get_score`` method when the metrics are raising errors for some columns."""
Expand Down Expand Up @@ -65,4 +67,4 @@ def test_get_score_errors(self):
assert column_names_nan == ['start_date', 'employability_perc']
assert error_messages[0] == expected_message_1
assert error_messages[1] == expected_message_2
assert score == 0.8261749908947813
assert score == 0.858620688670242
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_get_score(self):
score = coverage_property.get_score(real_data, synthetic_data, metadata)

# Assert
assert score == 0.9419212095491987
assert score == 0.896792056025647

expected_details = pd.DataFrame({
'Column': [
Expand All @@ -31,8 +31,10 @@ def test_get_score(self):
'RangeCoverage', 'RangeCoverage', 'CategoryCoverage', 'CategoryCoverage'
],
'Score': [
1.0, 1.0, 0.42333783783783785, 1.0, 0.9807348482826732, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 0.6666666666666667, 1.0, 1.0, 1.0, 1.0
0.9952153110047847, 0.9554140127388535, 0.45462162162162156,
0.7777777777777778, 0.928171334431631, 1.0, 1.0, 0.9659863945578232,
1.0, 1.0, 1.0, 0.33333333333333337, 0.9943749999999998, 0.943778110944528,
1.0, 1.0
]
})

Expand Down Expand Up @@ -64,4 +66,4 @@ def test_get_score_error(self):
assert column_names_nan == ['start_date', 'employability_perc']
assert error_messages[0] == expected_message_1
assert error_messages[1] == expected_message_2
assert score == 0.9336242394847984
assert score == 0.8827916132432548
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,11 @@ def test_get_score(self):
'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'CategoryAdherence',
'CategoryAdherence'
],
'Score': [
0.8503937007874016, 0.8615384615384616, 0.9444444444444444,
1.0, 1.0, 0.8651162790697674, 1.0, 1.0, 0.9255813953488372,
1.0, 0.9441860465116279, 1.0, 1.0, 0.8883720930232558,
0.8930232558139535, 1.0, 1.0
]
'Score': [1.0] * 17
}
expected_details = pd.DataFrame(expected_details_dict)
pd.testing.assert_frame_equal(data_validity_property.details, expected_details)
assert score == 0.9513326868551618
assert score == 1.0

def test_get_score_errors(self):
"""Test the ``get_score`` method when the metrics are raising errors for some columns."""
Expand Down Expand Up @@ -70,4 +65,4 @@ def test_get_score_errors(self):
assert column_names_nan == ['start_date', 'employability_perc']
assert error_messages[0] == expected_message_1
assert error_messages[1] == expected_message_2
assert score == 0.9622593255151395
assert score == 1.0
25 changes: 11 additions & 14 deletions tests/integration/reports/single_table/test_diagnostic_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_get_properties(self):
expected_frame = pd.DataFrame(
{
'Property': ['Data Validity', 'Data Structure'],
'Score': [0.951333, 1.0]
'Score': [1.0, 1.0]
}
)
pd.testing.assert_frame_equal(properties_frame, expected_frame)
Expand All @@ -38,7 +38,7 @@ def test_get_score(self):

# Assert

assert result == 0.975666343427581
assert result == 1.0

def test_get_score_with_no_verbose(self):
"""Test the ``get_score`` method works when verbose=False."""
Expand All @@ -51,7 +51,7 @@ def test_get_score_with_no_verbose(self):
result_dict = report.get_score()

# Assert
assert result_dict == 0.975666343427581
assert result_dict == 1.0

def test_end_to_end(self):
"""Test the end-to-end functionality of the diagnostic report."""
Expand All @@ -78,9 +78,8 @@ def test_end_to_end(self):
'CategoryAdherence'
],
'Score': [
0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, 1.0,
0.8651162790697674, 1.0, 1.0, 0.9255813953488372, 1.0, 0.9441860465116279, 1.0,
1.0, 0.8883720930232558, 0.8930232558139535, 1.0, 1.0
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0
]
})

Expand Down Expand Up @@ -129,9 +128,8 @@ def test_generate_with_object_datetimes(self):
'CategoryAdherence'
],
'Score': [
0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, 1.0,
0.8651162790697674, 1.0, 1.0, 0.9255813953488372, 1.0, 0.9441860465116279,
1.0, 1.0, 0.8883720930232558, 0.8930232558139535, 1.0, 1.0
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0
]
})

Expand Down Expand Up @@ -160,9 +158,9 @@ def test_generate_multiple_times(self):
report = DiagnosticReport()
report.generate(real_data, synthetic_data, metadata, verbose=False)

assert report.get_score() == 0.975666343427581
assert report.get_score() == 1.0
report.generate(real_data, synthetic_data, metadata)
assert report.get_score() == 0.975666343427581
assert report.get_score() == 1.0

def test_get_details_with_errors(self):
"""Test the ``get_details`` function of the diagnostic report when there are errors."""
Expand Down Expand Up @@ -190,9 +188,8 @@ def test_get_details_with_errors(self):
'CategoryAdherence'
],
'Score': [
0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, 1.0,
0.8651162790697674, 1.0, 1.0, np.nan, 1.0, 0.9441860465116279, 1.0, 1.0,
0.8883720930232558, 0.8930232558139535, 1.0, 1.0
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, np.nan, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0
],
'Error': [
None, None, None, None, None, None, None, None,
Expand Down
36 changes: 18 additions & 18 deletions tests/integration/reports/single_table/test_quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_report_end_to_end(self):
'Column': ['start_date', 'second_perc', 'work_experience', 'degree_type'],
'Metric': ['KSComplement', 'KSComplement', 'TVComplement', 'TVComplement'],
'Score': [
0.7011066184294531, 0.627906976744186, 0.9720930232558139, 0.9255813953488372
0.6621621621621622, 0.8976744186046511, 0.9953488372093023, 0.9395348837209302
],
}

Expand All @@ -105,14 +105,14 @@ def test_report_end_to_end(self):
'ContingencySimilarity', 'ContingencySimilarity', 'ContingencySimilarity'
],
'Score': [
0.9854510263003199, 0.586046511627907, 0.6232558139534884, 0.7348837209302326,
0.6976744186046512, 0.8976744186046511
0.9187918131436303, 0.6744186046511629, 0.7162790697674419, 0.813953488372093,
0.772093023255814, 0.9348837209302325
],
'Real Correlation': [
0.04735340044317632, np.nan, np.nan, np.nan, np.nan, np.nan
],
'Synthetic Correlation': [
0.07645134784253645, np.nan, np.nan, np.nan, np.nan, np.nan
-0.11506297326956302, np.nan, np.nan, np.nan, np.nan, np.nan
]
}
expected_details_column_shapes = pd.DataFrame(expected_details_column_shapes_dict)
Expand All @@ -124,7 +124,7 @@ def test_report_end_to_end(self):
pd.testing.assert_frame_equal(
report.get_details('Column Pair Trends'), expected_details_cpt
)
assert report.get_score() == 0.7804181608907237
assert report.get_score() == 0.8393750143888287

report_info = report.get_info()
assert report_info == report.report_info
Expand Down Expand Up @@ -167,7 +167,7 @@ def test_quality_report_with_object_datetimes(self):
'Column': ['start_date', 'second_perc', 'work_experience', 'degree_type'],
'Metric': ['KSComplement', 'KSComplement', 'TVComplement', 'TVComplement'],
'Score': [
0.7011066184294531, 0.627906976744186, 0.9720930232558139, 0.9255813953488372
0.6621621621621622, 0.8976744186046511, 0.9953488372093023, 0.9395348837209302
],
}

Expand All @@ -185,14 +185,14 @@ def test_quality_report_with_object_datetimes(self):
'ContingencySimilarity', 'ContingencySimilarity', 'ContingencySimilarity'
],
'Score': [
0.9854510263003199, 0.586046511627907, 0.6232558139534884, 0.7348837209302326,
0.6976744186046512, 0.8976744186046511
0.9187918131436303, 0.6744186046511629, 0.7162790697674419, 0.813953488372093,
0.772093023255814, 0.9348837209302325
],
'Real Correlation': [
0.04735340044317632, np.nan, np.nan, np.nan, np.nan, np.nan
],
'Synthetic Correlation': [
0.07645134784253645, np.nan, np.nan, np.nan, np.nan, np.nan
-0.11506297326956302, np.nan, np.nan, np.nan, np.nan, np.nan
]
}
expected_details_column_shapes = pd.DataFrame(expected_details_column_shapes_dict)
Expand All @@ -204,7 +204,7 @@ def test_quality_report_with_object_datetimes(self):
pd.testing.assert_frame_equal(
report.get_details('Column Pair Trends'), expected_details_cpt
)
assert report.get_score() == 0.7804181608907237
assert report.get_score() == 0.8393750143888287

def test_report_end_to_end_with_errors(self):
"""Test the quality report end to end with errors in the properties computation."""
Expand All @@ -229,7 +229,7 @@ def test_report_end_to_end_with_errors(self):
expected_details_column_shapes_dict = {
'Column': ['start_date', 'second_perc', 'work_experience', 'degree_type'],
'Metric': ['KSComplement', 'KSComplement', 'TVComplement', 'TVComplement'],
'Score': [0.7011066184294531, np.nan, 0.9720930232558139, 0.9255813953488372],
'Score': [0.6621621621621622, np.nan, 0.9953488372093023, 0.9395348837209302],
'Error': [
None,
"TypeError: '<' not supported between instances of 'str' and 'float'",
Expand All @@ -252,7 +252,7 @@ def test_report_end_to_end_with_errors(self):
'ContingencySimilarity', 'ContingencySimilarity', 'ContingencySimilarity'
],
'Score': [
np.nan, 0.586046511627907, 0.6232558139534884, np.nan, np.nan, 0.8976744186046511
np.nan, 0.6744186046511629, 0.7162790697674419, np.nan, np.nan, 0.9348837209302325
],
'Real Correlation': [np.nan] * 6,
'Synthetic Correlation': [np.nan] * 6,
Expand All @@ -274,7 +274,7 @@ def test_report_end_to_end_with_errors(self):
pd.testing.assert_frame_equal(
report.get_details('Column Pair Trends'), expected_details_cpt
)
assert report.get_score() == 0.7842929635366918
assert report.get_score() == 0.8204378797402054

def test_report_with_column_nan(self):
"""Test the report with column full of NaNs."""
Expand Down Expand Up @@ -307,7 +307,7 @@ def test_report_with_column_nan(self):
'KSComplement', 'KSComplement', 'TVComplement', 'TVComplement', 'KSComplement'
],
'Score': [
0.7011066184294531, 0.627906976744186, 0.9720930232558139, 0.9255813953488372,
0.6621621621621622, 0.8976744186046511, 0.9953488372093023, 0.9395348837209302,
np.nan
],
'Error': [
Expand All @@ -334,16 +334,16 @@ def test_report_with_column_nan(self):
'ContingencySimilarity'
],
'Score': [
0.9854510263003199, 0.586046511627907, 0.6232558139534884, np.nan,
0.7348837209302326, 0.6976744186046512, np.nan, 0.8976744186046511,
0.9720930232558139, 0.9255813953488372
0.9187918131436303, 0.6744186046511629, 0.7162790697674419, np.nan,
0.813953488372093, 0.772093023255814, np.nan, 0.9348837209302325,
0.9953488372093023, 0.9395348837209302
],
'Real Correlation': [
0.04735340044317632, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
np.nan, np.nan
],
'Synthetic Correlation': [
0.07645134784253645, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
-0.11506297326956302, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
np.nan, np.nan
],
'Error': [
Expand Down

0 comments on commit fbe3a65

Please sign in to comment.