From 340b8feb09186b13670d930701319d166e568d5e Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Mon, 6 Nov 2023 16:27:47 -0600 Subject: [PATCH] Update `DiagnosticReport` to calculate base correctness of synthetic data (#496) --- sdmetrics/reports/_results_handler.py | 151 ----------- sdmetrics/reports/base_report.py | 32 ++- .../reports/multi_table/diagnostic_report.py | 27 +- .../reports/multi_table/quality_report.py | 15 -- .../reports/single_table/diagnostic_report.py | 29 +- .../reports/single_table/quality_report.py | 15 -- .../multi_table/test_diagnostic_report.py | 123 ++++----- .../single_table/test_diagnostic_report.py | 255 ++++++------------ .../single_table/test_quality_report.py | 13 +- .../multi_table/test_diagnostic_report.py | 60 +---- .../multi_table/test_quality_report.py | 55 ---- .../single_table/test_diagnostic_report.py | 58 +--- .../single_table/test_quality_report.py | 37 --- tests/unit/reports/test__results_handler.py | 152 ----------- tests/unit/reports/test_base_report.py | 84 ++++-- 15 files changed, 252 insertions(+), 854 deletions(-) delete mode 100644 sdmetrics/reports/_results_handler.py delete mode 100644 tests/unit/reports/test__results_handler.py diff --git a/sdmetrics/reports/_results_handler.py b/sdmetrics/reports/_results_handler.py deleted file mode 100644 index 4ca26297..00000000 --- a/sdmetrics/reports/_results_handler.py +++ /dev/null @@ -1,151 +0,0 @@ -"""Class for handling the storage and displaying of results for reports.""" - -import sys - -import pandas as pd - - -class BaseResultsHandler(): - """Base class for handling results for reports.""" - - def print_results(self, report, verbose): - """Print the results of a report. - - Args: - report (sdmetrics.reports.BaseReport): - Report class to print results for. - verbose (bool): - Whether or not to print results to std.out. - """ - raise NotImplementedError - - -class QualityReportResultsHandler(BaseResultsHandler): - """Results handler for quality reports.""" - - def print_results(self, properties, score, verbose): - """Print the results of a QualityReport. - - Args: - properties (dict): - Dictionary mapping property names to an instance of the Property class. - score (float): - Overall score of the report. - verbose (bool): - Whether or not to print results to std.out. 
- """ - if verbose: - sys.stdout.write( - f'\nOverall Quality Score: {round(score * 100, 2)}%\n\n' - ) - sys.stdout.write('Properties:\n') - - for property_name in properties: - property_score = round(properties[property_name]._compute_average() * 100, 2) - sys.stdout.write( - f'- {property_name}: {property_score}%\n' - ) - - -class DiagnosticReportResultsHandler(BaseResultsHandler): - """Results handler for diagnostic reports.""" - - DIAGNOSTIC_REPORT_RESULT_DETAILS = { - 'BoundaryAdherence': { - 'SUCCESS': ( - 'The synthetic data follows over 90% of the min/max boundaries set by the real ' - 'data' - ), - 'WARNING': ( - 'More than 10% the synthetic data does not follow the min/max boundaries set by ' - 'the real data' - ), - 'DANGER': ( - 'More than 50% the synthetic data does not follow the min/max boundaries set by ' - 'the real data' - ), - }, - 'CategoryCoverage': { - 'SUCCESS': ( - 'The synthetic data covers over 90% of the categories present in the real data' - ), - 'WARNING': ( - 'The synthetic data is missing more than 10% of the categories present in the ' - 'real data' - ), - 'DANGER': ( - 'The synthetic data is missing more than 50% of the categories present in the ' - 'real data' - ), - }, - 'NewRowSynthesis': { - 'SUCCESS': 'Over 90% of the synthetic rows are not copies of the real data', - 'WARNING': 'More than 10% of the synthetic rows are copies of the real data', - 'DANGER': 'More than 50% of the synthetic rows are copies of the real data', - }, - 'RangeCoverage': { - 'SUCCESS': ( - 'The synthetic data covers over 90% of the numerical ranges present in the real ' - 'data' - ), - 'WARNING': ( - 'The synthetic data is missing more than 10% of the numerical ranges present in ' - 'the real data' - ), - 'DANGER': ( - 'The synthetic data is missing more than 50% of the numerical ranges present in ' - 'the real data' - ), - } - } - - def __init__(self): - self.results = {} - - def _print_results_for_level(self, level): - """Print the result for a given level. - - Args: - level (string): - The level to print results for. - """ - level_marks = {'SUCCESS': '✓', 'WARNING': '!', 'DANGER': 'x'} - - if len(self.results[level]) > 0: - sys.stdout.write(f'\n{level}:\n') - for result in self.results[level]: - sys.stdout.write(f'{level_marks[level]} {result}\n') - - def print_results(self, properties, verbose): - """Print the results of a DiagnosticReport. - - Args: - properties (dict): - Dictionary mapping property names to an instance of the Property class. - verbose (bool): - Whether or not to print results to std.out. 
- """ - self.results['SUCCESS'] = [] - self.results['WARNING'] = [] - self.results['DANGER'] = [] - for property_name in properties: - details = properties[property_name].details - average_score_metric = details.groupby('Metric')['Score'].mean() - for metric, score in average_score_metric.items(): - if pd.isna(score): - continue - if score >= 0.9: - self.results['SUCCESS'].append( - self.DIAGNOSTIC_REPORT_RESULT_DETAILS[metric]['SUCCESS']) - elif score >= 0.5: - self.results['WARNING'].append( - self.DIAGNOSTIC_REPORT_RESULT_DETAILS[metric]['WARNING']) - else: - self.results['DANGER'].append( - self.DIAGNOSTIC_REPORT_RESULT_DETAILS[metric]['DANGER']) - - if verbose: - sys.stdout.write('\nDiagnostic Results:\n') - self._print_results_for_level('SUCCESS') - self._print_results_for_level('WARNING') - self._print_results_for_level('DANGER') diff --git a/sdmetrics/reports/base_report.py b/sdmetrics/reports/base_report.py index dbf600a9..3032e5e3 100644 --- a/sdmetrics/reports/base_report.py +++ b/sdmetrics/reports/base_report.py @@ -25,7 +25,6 @@ def __init__(self): self._overall_score = None self.is_generated = False self._properties = {} - self._results_handler = None self.report_info = { 'report_type': self.__class__.__name__, 'generated_date': None, @@ -91,6 +90,25 @@ def convert_datetimes(real_data, synthetic_data, metadata): except Exception: continue + def _print_results(self, verbose): + """Print the results. + + Args: + verbose (bool): + Whether or not to print results to std.out. + """ + if verbose: + sys.stdout.write( + f'\nOverall Score: {round(self._overall_score * 100, 2)}%\n\n' + ) + sys.stdout.write('Properties:\n') + + for property_name, property_instance in self._properties.items(): + property_score = round(property_instance._compute_average() * 100, 2) + sys.stdout.write( + f'- {property_name}: {property_score}%\n' + ) + def generate(self, real_data, synthetic_data, metadata, verbose=True): """Generate report. @@ -152,7 +170,7 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True): end_time = time.time() self.report_info['generation_time'] = end_time - start_time - self._handle_results(verbose) + self._print_results(verbose) def _check_property_name(self, property_name): """Check that the given property name is valid. @@ -168,6 +186,16 @@ def _check_property_name(self, property_name): f" Valid property names are '{valid_property_names}'." ) + def get_score(self): + """Return the overall score. + + Returns: + float + The overall score. + """ + self._check_report_generated() + return self._overall_score + def get_info(self): """Get the information about the report.""" return deepcopy(self.report_info) diff --git a/sdmetrics/reports/multi_table/diagnostic_report.py b/sdmetrics/reports/multi_table/diagnostic_report.py index f18edb21..6ccaa9bf 100644 --- a/sdmetrics/reports/multi_table/diagnostic_report.py +++ b/sdmetrics/reports/multi_table/diagnostic_report.py @@ -1,8 +1,5 @@ """Multi table diagnostic report.""" -from copy import deepcopy - -from sdmetrics.reports._results_handler import DiagnosticReportResultsHandler -from sdmetrics.reports.multi_table._properties import Boundary, Coverage, Synthesis +from sdmetrics.reports.multi_table._properties import DataValidity, RelationshipValidity, Structure from sdmetrics.reports.multi_table.base_multi_table_report import BaseMultiTableReport @@ -10,27 +7,13 @@ class DiagnosticReport(BaseMultiTableReport): """Multi table diagnostic report. This class creates a diagnostic report for multi-table data. 
It calculates the diagnostic - score along three properties - Synthesis, Coverage, and Boundary. + score along three properties - Relationship Validity, Data Structure, and Data Validity. """ def __init__(self): super().__init__() self._properties = { - 'Coverage': Coverage(), - 'Boundary': Boundary(), - 'Synthesis': Synthesis() + 'Data Validity': DataValidity(), + 'Data Structure': Structure(), + 'Relationship Validity': RelationshipValidity() } - self._results_handler = DiagnosticReportResultsHandler() - - def _handle_results(self, verbose): - self._results_handler.print_results(self._properties, verbose) - - def get_results(self): - """Return the diagnostic results. - - Returns: - dict - The diagnostic results. - """ - self._check_report_generated() - return deepcopy(self._results_handler.results) diff --git a/sdmetrics/reports/multi_table/quality_report.py b/sdmetrics/reports/multi_table/quality_report.py index 31b74f41..0a246dcd 100644 --- a/sdmetrics/reports/multi_table/quality_report.py +++ b/sdmetrics/reports/multi_table/quality_report.py @@ -1,5 +1,4 @@ """Multi table quality report.""" -from sdmetrics.reports._results_handler import QualityReportResultsHandler from sdmetrics.reports.multi_table._properties import ( Cardinality, ColumnPairTrends, ColumnShapes, InterTableTrends) from sdmetrics.reports.multi_table.base_multi_table_report import BaseMultiTableReport @@ -20,17 +19,3 @@ def __init__(self): 'Cardinality': Cardinality(), 'Intertable Trends': InterTableTrends() } - self._results_handler = QualityReportResultsHandler() - - def _handle_results(self, verbose): - self._results_handler.print_results(self._properties, self._overall_score, verbose) - - def get_score(self): - """Return the overall quality score. - - Returns: - float - The overall quality score. - """ - self._check_report_generated() - return self._overall_score diff --git a/sdmetrics/reports/single_table/diagnostic_report.py b/sdmetrics/reports/single_table/diagnostic_report.py index 4a446716..4e36815e 100644 --- a/sdmetrics/reports/single_table/diagnostic_report.py +++ b/sdmetrics/reports/single_table/diagnostic_report.py @@ -1,39 +1,18 @@ """Single table diagnostic report.""" -import logging -from copy import deepcopy - -from sdmetrics.reports._results_handler import DiagnosticReportResultsHandler from sdmetrics.reports.base_report import BaseReport -from sdmetrics.reports.single_table._properties import Boundary, Coverage, Synthesis - -LOGGER = logging.getLogger(__name__) +from sdmetrics.reports.single_table._properties import DataValidity, Structure class DiagnosticReport(BaseReport): """Single table diagnostic report. This class creates a diagnostic report for single-table data. It calculates the diagnostic - score along three properties - Synthesis, Coverage, and Boundary. + score along two properties - Data Structure and Data Validity. """ def __init__(self): super().__init__() self._properties = { - 'Coverage': Coverage(), - 'Boundary': Boundary(), - 'Synthesis': Synthesis() + 'Data Validity': DataValidity(), + 'Data Structure': Structure(), } - self._results_handler = DiagnosticReportResultsHandler() - - def _handle_results(self, verbose): - self._results_handler.print_results(self._properties, verbose) - - def get_results(self): - """Return the diagnostic results. - - Returns: - dict - The diagnostic results. 
- """ - self._check_report_generated() - return deepcopy(self._results_handler.results) diff --git a/sdmetrics/reports/single_table/quality_report.py b/sdmetrics/reports/single_table/quality_report.py index ed97a577..04d2898a 100644 --- a/sdmetrics/reports/single_table/quality_report.py +++ b/sdmetrics/reports/single_table/quality_report.py @@ -1,5 +1,4 @@ """Single table quality report.""" -from sdmetrics.reports._results_handler import QualityReportResultsHandler from sdmetrics.reports.base_report import BaseReport from sdmetrics.reports.single_table._properties import ColumnPairTrends, ColumnShapes @@ -17,17 +16,3 @@ def __init__(self): 'Column Shapes': ColumnShapes(), 'Column Pair Trends': ColumnPairTrends() } - self._results_handler = QualityReportResultsHandler() - - def _handle_results(self, verbose): - self._results_handler.print_results(self._properties, self._overall_score, verbose) - - def get_score(self): - """Return the overall quality score. - - Returns: - float - The overall quality score. - """ - self._check_report_generated() - return self._overall_score diff --git a/tests/integration/reports/multi_table/test_diagnostic_report.py b/tests/integration/reports/multi_table/test_diagnostic_report.py index dd12a484..7e2b8923 100644 --- a/tests/integration/reports/multi_table/test_diagnostic_report.py +++ b/tests/integration/reports/multi_table/test_diagnostic_report.py @@ -14,23 +14,10 @@ def test_end_to_end(self): # Run report.generate(real_data, synthetic_data, metadata, verbose=False) - results = report.get_results() + results = report.get_score() # Assert - expected_results = { - 'SUCCESS': [ - 'The synthetic data covers over 90% of the categories present in the real data', - 'The synthetic data covers over 90% of the numerical ranges present' - ' in the real data' - ], - 'WARNING': [ - 'More than 10% the synthetic data does not follow the min/max boundaries' - ' set by the real data', - 'More than 10% of the synthetic rows are copies of the real data' - ], - 'DANGER': [] - } - assert results == expected_results + assert results == 0.9814814814814815 def test_end_to_end_with_object_datetimes(self): """Test the ``DiagnosticReport`` report with object datetimes.""" @@ -45,28 +32,15 @@ def test_end_to_end_with_object_datetimes(self): # Run report.generate(real_data, synthetic_data, metadata, verbose=False) - results = report.get_results() + results = report.get_score() properties = report.get_properties() # Assert expected_dataframe = pd.DataFrame({ - 'Property': ['Coverage', 'Boundary', 'Synthesis'], - 'Score': [0.9573447196980541, 0.8666666666666667, 0.6333333333333333] + 'Property': ['Data Validity', 'Data Structure', 'Relationship Validity'], + 'Score': [0.9444444444444445, 1.0, 1.0] }) - expected_results = { - 'SUCCESS': [ - 'The synthetic data covers over 90% of the categories present in the real data', - 'The synthetic data covers over 90% of the numerical ranges present' - ' in the real data' - ], - 'WARNING': [ - 'More than 10% the synthetic data does not follow the min/max boundaries' - ' set by the real data', - 'More than 10% of the synthetic rows are copies of the real data' - ], - 'DANGER': [] - } - assert results == expected_results + assert results == 0.9814814814814815 pd.testing.assert_frame_equal(properties, expected_dataframe) def test_end_to_end_with_metrics_failing(self): @@ -83,36 +57,46 @@ def test_end_to_end_with_metrics_failing(self): # Run report.generate(real_data, synthetic_data, metadata, verbose=False) - results = report.get_results() + results = 
report.get_score() # Assert - expected_results = { - 'SUCCESS': [ - 'The synthetic data covers over 90% of the categories present in the real data' - ], - 'WARNING': [], - 'DANGER': ['More than 50% of the synthetic rows are copies of the real data'] - } expected_properties = pd.DataFrame({ - 'Property': ['Coverage', 'Boundary', 'Synthesis'], - 'Score': [0.9666666666666668, np.nan, 0.0] + 'Property': ['Data Validity', 'Data Structure', 'Relationship Validity'], + 'Score': [0.9677777777777777, 0.7833333333333333, 1.0] }) expected_details = pd.DataFrame({ - 'Table': ['users', 'transactions', 'transactions'], - 'Column': ['age', 'timestamp', 'amount'], - 'Metric': ['BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence'], - 'Score': [np.nan, np.nan, np.nan], + 'Table': [ + 'users', 'users', 'users', 'users', 'sessions', 'sessions', 'sessions', + 'sessions', 'transactions', 'transactions', 'transactions', 'transactions', + 'transactions' + ], + 'Column': [ + 'user_id', 'country', 'gender', 'age', 'session_id', 'user_id', 'device', + 'os', 'transaction_id', 'session_id', 'timestamp', 'amount', 'approved' + ], + 'Metric': [ + 'KeyUniqueness', 'CategoryAdherence', 'CategoryAdherence', 'BoundaryAdherence', + 'KeyUniqueness', 'KeyUniqueness', 'CategoryAdherence', 'CategoryAdherence', + 'KeyUniqueness', 'KeyUniqueness', 'BoundaryAdherence', 'BoundaryAdherence', + 'CategoryAdherence' + ], + 'Score': [ + 1.0, 1.0, 1.0, np.nan, 1.0, 0.7777777777777778, 1.0, 1.0, 1.0, 0.9, + np.nan, np.nan, 1.0 + ], 'Error': [ + None, None, None, "TypeError: '<=' not supported between instances of 'str' and 'int'", + None, None, None, None, None, None, "TypeError: '<=' not supported between instances of 'str' and 'Timestamp'", - "TypeError: '<=' not supported between instances of 'str' and 'float'" + "TypeError: '<=' not supported between instances of 'str' and 'float'", None ] }) - assert results == expected_results + assert results == 0.9170370370370371 pd.testing.assert_frame_equal( report.get_properties(), expected_properties, check_exact=False, atol=2e-2 ) - pd.testing.assert_frame_equal(report.get_details('Boundary'), expected_details) + pd.testing.assert_frame_equal(report.get_details('Data Validity'), expected_details) def test_get_properties(self): """Test the ``get_properties`` method.""" @@ -126,60 +110,65 @@ def test_get_properties(self): # Assert expected_dataframe = pd.DataFrame({ - 'Property': ['Coverage', 'Boundary', 'Synthesis'], - 'Score': [0.9573447196980541, 0.8666666666666667, 0.6333333333333333] + 'Property': ['Data Validity', 'Data Structure', 'Relationship Validity'], + 'Score': [0.9444444444444445, 1.0, 1.0] }) pd.testing.assert_frame_equal(properties, expected_dataframe) def test_get_details(self): - """Test the ``get_properties`` method.""" + """Test the ``get_details`` method.""" # Setup real_data, synthetic_data, metadata = load_demo(modality='multi_table') report = DiagnosticReport() # Run report.generate(real_data, synthetic_data, metadata, verbose=False) - details = report.get_details('Coverage') + details = report.get_details('Data Validity') # Assert expected_dataframe = pd.DataFrame({ 'Table': [ - 'users', 'users', 'users', 'sessions', 'sessions', 'transactions', - 'transactions', 'transactions' + 'users', 'users', 'users', 'users', 'sessions', 'sessions', 'sessions', + 'sessions', 'transactions', 'transactions', 'transactions', 'transactions', + 'transactions' ], 'Column': [ - 'country', 'gender', 'age', 'device', 'os', 'timestamp', - 'amount', 'approved' + 'user_id', 'country', 
'gender', 'age', 'session_id', 'user_id', 'device', + 'os', 'transaction_id', 'session_id', 'timestamp', 'amount', 'approved' ], 'Metric': [ - 'CategoryCoverage', 'CategoryCoverage', 'RangeCoverage', 'CategoryCoverage', - 'CategoryCoverage', 'RangeCoverage', 'RangeCoverage', 'CategoryCoverage' + 'KeyUniqueness', 'CategoryAdherence', 'CategoryAdherence', 'BoundaryAdherence', + 'KeyUniqueness', 'KeyUniqueness', 'CategoryAdherence', 'CategoryAdherence', + 'KeyUniqueness', 'KeyUniqueness', 'BoundaryAdherence', 'BoundaryAdherence', + 'CategoryAdherence' ], 'Score': [ - 0.8333333333333334, 1.0, 1.0, 1.0, 1.0, 0.9955955390408375, - 0.829828885210262, 1.0 + 1.0, 1.0, 1.0, 0.9, 1.0, 0.7777777777777778, 1.0, 1.0, 1.0, 0.9, 0.9, + 0.8, 1.0 ] }) pd.testing.assert_frame_equal(details, expected_dataframe) def test_get_details_with_table_name(self): - """Test the ``get_properties`` method with a table_name parameter.""" + """Test the ``get_details`` method with a table_name parameter.""" # Setup real_data, synthetic_data, metadata = load_demo(modality='multi_table') report = DiagnosticReport() # Run report.generate(real_data, synthetic_data, metadata, verbose=False) - details = report.get_details('Coverage', 'users') + details = report.get_details('Data Validity', 'users') # Assert expected_dataframe = pd.DataFrame({ - 'Table': ['users', 'users', 'users'], - 'Column': ['country', 'gender', 'age'], - 'Metric': ['CategoryCoverage', 'CategoryCoverage', 'RangeCoverage'], - 'Score': [0.8333333333333334, 1.0, 1.0] + 'Table': ['users', 'users', 'users', 'users'], + 'Column': ['user_id', 'country', 'gender', 'age'], + 'Metric': [ + 'KeyUniqueness', 'CategoryAdherence', 'CategoryAdherence', 'BoundaryAdherence' + ], + 'Score': [1.0, 1.0, 1.0, 0.9] }) pd.testing.assert_frame_equal(details, expected_dataframe) diff --git a/tests/integration/reports/single_table/test_diagnostic_report.py b/tests/integration/reports/single_table/test_diagnostic_report.py index 939a75cc..3e33e99f 100644 --- a/tests/integration/reports/single_table/test_diagnostic_report.py +++ b/tests/integration/reports/single_table/test_diagnostic_report.py @@ -20,63 +20,38 @@ def test_get_properties(self): # Assert expected_frame = pd.DataFrame( { - 'Property': ['Coverage', 'Boundary', 'Synthesis'], - 'Score': [0.9419212095491987, 0.9172655676537751, 1.0] + 'Property': ['Data Validity', 'Data Structure'], + 'Score': [0.951333, 1.0] } ) pd.testing.assert_frame_equal(properties_frame, expected_frame) - def test_get_results(self): - """Test the ``get_results`` method.""" + def test_get_score(self): + """Test the ``get_score`` method.""" # Setup real_data, synthetic_data, metadata = load_demo(modality='single_table') report = DiagnosticReport() # Run report.generate(real_data, synthetic_data, metadata) - result_dict = report.get_results() + result = report.get_score() # Assert - expected_results = { - 'SUCCESS': [ - 'The synthetic data covers over 90% of the categories present in the real data', - 'The synthetic data covers over 90% of the numerical ranges present in the ' - 'real data', - 'The synthetic data follows over 90% of the min/max boundaries set by the ' - 'real data', - 'Over 90% of the synthetic rows are not copies of the real data' - ], - 'WARNING': [], - 'DANGER': [] - } - assert result_dict == expected_results + assert result == 0.975666343427581 - def test_get_results_with_no_verbose(self): - """Test the ``get_results`` method works when verbose=False.""" + def test_get_score_with_no_verbose(self): + """Test the ``get_score`` method 
works when verbose=False.""" # Setup real_data, synthetic_data, metadata = load_demo(modality='single_table') report = DiagnosticReport() # Run report.generate(real_data, synthetic_data, metadata, verbose=False) - result_dict = report.get_results() + result_dict = report.get_score() # Assert - expected_results = { - 'SUCCESS': [ - 'The synthetic data covers over 90% of the categories present in the real data', - 'The synthetic data covers over 90% of the numerical ranges present in the ' - 'real data', - 'The synthetic data follows over 90% of the min/max boundaries set by the ' - 'real data', - 'Over 90% of the synthetic rows are not copies of the real data' - ], - 'WARNING': [], - 'DANGER': [] - } - - assert result_dict == expected_results + assert result_dict == 0.975666343427581 def test_end_to_end(self): """Test the end-to-end functionality of the diagnostic report.""" @@ -88,59 +63,40 @@ def test_end_to_end(self): report.generate(real_data, synthetic_data, metadata) # Assert - expected_details_synthetis = pd.DataFrame( - { - 'Metric': 'NewRowSynthesis', - 'Score': 1.0, - 'Num Matched Rows': 0, - 'Num New Rows': 215 - }, index=[0] - ) - - expected_details_coverage = pd.DataFrame({ + expected_details_data_validity = pd.DataFrame({ 'Column': [ - 'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'high_spec', - 'mba_spec', 'second_perc', 'gender', 'degree_perc', 'placed', 'experience_years', - 'employability_perc', 'mba_perc', 'work_experience', 'degree_type' + 'start_date', 'end_date', 'salary', 'duration', 'student_id', 'high_perc', + 'high_spec', 'mba_spec', 'second_perc', 'gender', 'degree_perc', 'placed', + 'experience_years', 'employability_perc', 'mba_perc', 'work_experience', + 'degree_type' ], 'Metric': [ - 'RangeCoverage', 'RangeCoverage', 'RangeCoverage', 'RangeCoverage', - 'RangeCoverage', 'CategoryCoverage', 'CategoryCoverage', 'RangeCoverage', - 'CategoryCoverage', 'RangeCoverage', 'CategoryCoverage', 'RangeCoverage', - 'RangeCoverage', 'RangeCoverage', 'CategoryCoverage', 'CategoryCoverage' + 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', + 'KeyUniqueness', 'BoundaryAdherence', 'CategoryAdherence', 'CategoryAdherence', + 'BoundaryAdherence', 'CategoryAdherence', 'BoundaryAdherence', 'CategoryAdherence', + 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'CategoryAdherence', + 'CategoryAdherence' ], 'Score': [ - 1.0, 1.0, 0.42333783783783785, 1.0, 0.9807348482826732, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 0.6666666666666667, 1.0, 1.0, 1.0, 1.0 + 0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, 1.0, + 0.8651162790697674, 1.0, 1.0, 0.9255813953488372, 1.0, 0.9441860465116279, 1.0, + 1.0, 0.8883720930232558, 0.8930232558139535, 1.0, 1.0 ] }) - expected_details_boundary = pd.DataFrame({ - 'Column': [ - 'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'second_perc', - 'degree_perc', 'experience_years', 'employability_perc', 'mba_perc' - ], - 'Metric': ['BoundaryAdherence'] * 10, - 'Score': [ - 0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, - 0.8651162790697674, 0.9255813953488372, 0.9441860465116279, 1.0, - 0.8883720930232558, 0.8930232558139535 - ] + expected_details_data_structure = pd.DataFrame({ + 'Metric': ['TableFormat'], + 'Score': [1.0] }) pd.testing.assert_frame_equal( - report.get_details('Synthesis'), - expected_details_synthetis - ) - - pd.testing.assert_frame_equal( - report.get_details('Coverage'), - expected_details_coverage + report.get_details('Data Validity'), + 
expected_details_data_validity ) pd.testing.assert_frame_equal( - report.get_details('Boundary'), - expected_details_boundary + report.get_details('Data Structure'), + expected_details_data_structure ) def test_generate_with_object_datetimes(self): @@ -158,59 +114,40 @@ def test_generate_with_object_datetimes(self): report.generate(real_data, synthetic_data, metadata) # Assert - expected_details_synthetis = pd.DataFrame( - { - 'Metric': 'NewRowSynthesis', - 'Score': 1.0, - 'Num Matched Rows': 0, - 'Num New Rows': 215 - }, index=[0] - ) - - expected_details_coverage = pd.DataFrame({ + expected_details_data_validity = pd.DataFrame({ 'Column': [ - 'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'high_spec', - 'mba_spec', 'second_perc', 'gender', 'degree_perc', 'placed', 'experience_years', - 'employability_perc', 'mba_perc', 'work_experience', 'degree_type' + 'start_date', 'end_date', 'salary', 'duration', 'student_id', 'high_perc', + 'high_spec', 'mba_spec', 'second_perc', 'gender', 'degree_perc', 'placed', + 'experience_years', 'employability_perc', 'mba_perc', 'work_experience', + 'degree_type' ], 'Metric': [ - 'RangeCoverage', 'RangeCoverage', 'RangeCoverage', 'RangeCoverage', - 'RangeCoverage', 'CategoryCoverage', 'CategoryCoverage', 'RangeCoverage', - 'CategoryCoverage', 'RangeCoverage', 'CategoryCoverage', 'RangeCoverage', - 'RangeCoverage', 'RangeCoverage', 'CategoryCoverage', 'CategoryCoverage' + 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', + 'KeyUniqueness', 'BoundaryAdherence', 'CategoryAdherence', 'CategoryAdherence', + 'BoundaryAdherence', 'CategoryAdherence', 'BoundaryAdherence', 'CategoryAdherence', + 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'CategoryAdherence', + 'CategoryAdherence' ], 'Score': [ - 1.0, 1.0, 0.42333783783783785, 1.0, 0.9807348482826732, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 0.6666666666666667, 1.0, 1.0, 1.0, 1.0 + 0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, 1.0, + 0.8651162790697674, 1.0, 1.0, 0.9255813953488372, 1.0, 0.9441860465116279, + 1.0, 1.0, 0.8883720930232558, 0.8930232558139535, 1.0, 1.0 ] }) - expected_details_boundary = pd.DataFrame({ - 'Column': [ - 'start_date', 'end_date', 'salary', 'duration', 'high_perc', 'second_perc', - 'degree_perc', 'experience_years', 'employability_perc', 'mba_perc' - ], - 'Metric': ['BoundaryAdherence'] * 10, - 'Score': [ - 0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, - 0.8651162790697674, 0.9255813953488372, 0.9441860465116279, 1.0, - 0.8883720930232558, 0.8930232558139535 - ] + expected_details_data_structure = pd.DataFrame({ + 'Metric': ['TableFormat'], + 'Score': [1.0] }) pd.testing.assert_frame_equal( - report.get_details('Synthesis'), - expected_details_synthetis - ) - - pd.testing.assert_frame_equal( - report.get_details('Coverage'), - expected_details_coverage + report.get_details('Data Validity'), + expected_details_data_validity ) pd.testing.assert_frame_equal( - report.get_details('Boundary'), - expected_details_boundary + report.get_details('Data Structure'), + expected_details_data_structure ) def test_generate_multiple_times(self): @@ -222,84 +159,48 @@ def test_generate_multiple_times(self): # Run and assert report = DiagnosticReport() report.generate(real_data, synthetic_data, metadata, verbose=False) - expected_results = { - 'DANGER': [], - 'SUCCESS': [ - 'The synthetic data covers over 90% of the categories present in the real data', - 'The synthetic data covers over 90% of the numerical ranges 
present in the real ' - 'data', - 'The synthetic data follows over 90% of the min/max boundaries set by the real ' - 'data', - 'Over 90% of the synthetic rows are not copies of the real data' - ], - 'WARNING': [] - } - assert report.get_results() == expected_results + + assert report.get_score() == 0.975666343427581 report.generate(real_data, synthetic_data, metadata) - assert report.get_results() == expected_results + assert report.get_score() == 0.975666343427581 def test_get_details_with_errors(self): """Test the ``get_details`` function of the diagnostic report when there are errors.""" # Setup real_data, synthetic_data, metadata = load_demo(modality='single_table') report = DiagnosticReport() - real_data['second_perc'] = np.nan + real_data['second_perc'] = 'A' # Run report.generate(real_data, synthetic_data, metadata) # Assert expected_details = pd.DataFrame({ - 'Column': { - 0: 'start_date', - 1: 'end_date', - 2: 'salary', - 3: 'duration', - 4: 'high_perc', - 5: 'second_perc', - 6: 'degree_perc', - 7: 'experience_years', - 8: 'employability_perc', - 9: 'mba_perc' - }, - 'Metric': { - 0: 'BoundaryAdherence', - 1: 'BoundaryAdherence', - 2: 'BoundaryAdherence', - 3: 'BoundaryAdherence', - 4: 'BoundaryAdherence', - 5: 'BoundaryAdherence', - 6: 'BoundaryAdherence', - 7: 'BoundaryAdherence', - 8: 'BoundaryAdherence', - 9: 'BoundaryAdherence' - }, - 'Score': { - 0: 0.8503937007874016, - 1: 0.8615384615384616, - 2: 0.9444444444444444, - 3: 1.0, - 4: 0.8651162790697674, - 5: np.nan, - 6: 0.9441860465116279, - 7: 1.0, - 8: 0.8883720930232558, - 9: 0.8930232558139535 - }, - 'Error': { - 0: None, - 1: None, - 2: None, - 3: None, - 4: None, - 5: 'InvalidDataError: All NaN values in real data.', - 6: None, - 7: None, - 8: None, - 9: None - } + 'Column': [ + 'start_date', 'end_date', 'salary', 'duration', 'student_id', 'high_perc', + 'high_spec', 'mba_spec', 'second_perc', 'gender', 'degree_perc', 'placed', + 'experience_years', 'employability_perc', 'mba_perc', 'work_experience', + 'degree_type' + ], + 'Metric': [ + 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', + 'KeyUniqueness', 'BoundaryAdherence', 'CategoryAdherence', 'CategoryAdherence', + 'BoundaryAdherence', 'CategoryAdherence', 'BoundaryAdherence', 'CategoryAdherence', + 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'CategoryAdherence', + 'CategoryAdherence' + ], + 'Score': [ + 0.8503937007874016, 0.8615384615384616, 0.9444444444444444, 1.0, 1.0, + 0.8651162790697674, 1.0, 1.0, np.nan, 1.0, 0.9441860465116279, 1.0, 1.0, + 0.8883720930232558, 0.8930232558139535, 1.0, 1.0 + ], + 'Error': [ + None, None, None, None, None, None, None, None, + 'TypeError: Invalid comparison between dtype=float64 and str', + None, None, None, None, None, None, None, None + ] }) pd.testing.assert_frame_equal( - report.get_details('Boundary'), + report.get_details('Data Validity'), expected_details ) diff --git a/tests/integration/reports/single_table/test_quality_report.py b/tests/integration/reports/single_table/test_quality_report.py index ede5ac5f..b21e8df5 100644 --- a/tests/integration/reports/single_table/test_quality_report.py +++ b/tests/integration/reports/single_table/test_quality_report.py @@ -1,6 +1,3 @@ -import contextlib -import io -import re import time from datetime import date, datetime @@ -365,7 +362,7 @@ def test_report_with_column_nan(self): report.get_details('Column Pair Trends'), expected_details_cpt ) - def test_report_with_verbose(self): + def test_report_with_verbose(self, capsys): 
"""Test the report with verbose. Check that the report prints the correct information. @@ -390,13 +387,13 @@ def test_report_with_verbose(self): report = QualityReport() # Run - with contextlib.redirect_stdout(io.StringIO()) as my_stdout: - report.generate(real_data, synthetic_data, metadata) + report.generate(real_data, synthetic_data, metadata) + captured = capsys.readouterr() + output = captured.out # Assert for pattern in key_phrases: - match = re.search(pattern, my_stdout.getvalue()) - assert match is not None + pattern in output def test_correlation_similarity_constant_real_data(self): """Error out when CorrelationSimilarity is used with a constant pair of columns.""" diff --git a/tests/unit/reports/multi_table/test_diagnostic_report.py b/tests/unit/reports/multi_table/test_diagnostic_report.py index 2e22faf8..e41f975c 100644 --- a/tests/unit/reports/multi_table/test_diagnostic_report.py +++ b/tests/unit/reports/multi_table/test_diagnostic_report.py @@ -1,8 +1,5 @@ -from unittest.mock import Mock, patch - -from sdmetrics.reports._results_handler import DiagnosticReportResultsHandler from sdmetrics.reports.multi_table import DiagnosticReport -from sdmetrics.reports.multi_table._properties import Boundary, Coverage, Synthesis +from sdmetrics.reports.multi_table._properties import DataValidity, RelationshipValidity, Structure class TestDiagnosticReport: @@ -16,55 +13,6 @@ def test___init__(self): assert report._overall_score is None assert report.is_generated is False assert report.table_names == [] - assert isinstance(report._properties['Coverage'], Coverage) - assert isinstance(report._properties['Boundary'], Boundary) - assert isinstance(report._properties['Synthesis'], Synthesis) - assert isinstance(report._results_handler, DiagnosticReportResultsHandler) - - def test__handle_results(self): - """Test that the proper values are passed to the handler.""" - # Setup - report = DiagnosticReport() - report._properties = Mock() - report._results_handler = Mock() - - # Run - report._handle_results(True) - - # Assert - report._results_handler.print_results.assert_called_once_with( - report._properties, True) - - @patch('sdmetrics.reports.base_report.BaseReport.generate') - def test_generate_without_verbose(self, mock_super_generate): - """Test the ``generate`` method without verbose.""" - # Setup - real_data = Mock() - synthetic_data = Mock() - metadata = Mock() - report = DiagnosticReport() - - # Run - report.generate(real_data, synthetic_data, metadata, verbose=False) - - # Assert - mock_super_generate.assert_called_once_with( - real_data, synthetic_data, metadata, verbose=False) - - def test_get_results(self): - """Test the ``get_results`` method.""" - # Setup - report = DiagnosticReport() - mock_check_report_generated = Mock() - report._check_report_generated = mock_check_report_generated - mock_results_handler = Mock() - report._results_handler = mock_results_handler - mock_results_handler.results = {'SUCCESS': ['Test']} - report.is_generated = True - - # Run - results = report.get_results() - - # Assert - mock_check_report_generated.assert_called_once_with() - assert results == {'SUCCESS': ['Test']} + assert isinstance(report._properties['Data Validity'], DataValidity) + assert isinstance(report._properties['Data Structure'], Structure) + assert isinstance(report._properties['Relationship Validity'], RelationshipValidity) diff --git a/tests/unit/reports/multi_table/test_quality_report.py b/tests/unit/reports/multi_table/test_quality_report.py index ac0aeec3..24be6e4f 100644 --- 
a/tests/unit/reports/multi_table/test_quality_report.py +++ b/tests/unit/reports/multi_table/test_quality_report.py @@ -1,6 +1,3 @@ -from unittest.mock import Mock, patch - -from sdmetrics.reports._results_handler import QualityReportResultsHandler from sdmetrics.reports.multi_table import QualityReport from sdmetrics.reports.multi_table._properties import ( Cardinality, ColumnPairTrends, ColumnShapes, InterTableTrends) @@ -21,55 +18,3 @@ def test___init__(self): assert isinstance(report._properties['Column Pair Trends'], ColumnPairTrends) assert isinstance(report._properties['Cardinality'], Cardinality) assert isinstance(report._properties['Intertable Trends'], InterTableTrends) - assert isinstance(report._results_handler, QualityReportResultsHandler) - - def test__handle_results(self): - """Test that the proper values are passed to the handler.""" - # Setup - report = QualityReport() - report._overall_score = 0.5 - report._properties = { - 'Column Shapes': Mock(_compute_average=Mock(return_value=0.6)), - 'Column Pair Trends': Mock(_compute_average=Mock(return_value=0.4)) - } - report._results_handler = Mock() - - # Run - report._handle_results(True) - - # Assert - report._results_handler.print_results.assert_called_once_with( - report._properties, report._overall_score, True) - - @patch('sdmetrics.reports.base_report.BaseReport.generate') - def test_generate_without_verbose(self, mock_super_generate): - """Test the ``generate`` method without verbose.""" - # Setup - real_data = Mock() - synthetic_data = Mock() - metadata = Mock() - report = QualityReport() - - # Run - report.generate(real_data, synthetic_data, metadata, verbose=False) - - # Assert - mock_super_generate.assert_called_once_with( - real_data, synthetic_data, metadata, verbose=False - ) - - def test_get_score(self): - """Test the ``get_score`` method.""" - # Setup - report = QualityReport() - mock_check_report_generated = Mock() - report._check_report_generated = mock_check_report_generated - report._overall_score = 0.5 - report.is_generated = True - - # Run - score = report.get_score() - - # Assert - assert score == 0.5 - mock_check_report_generated.assert_called_once_with() diff --git a/tests/unit/reports/single_table/test_diagnostic_report.py b/tests/unit/reports/single_table/test_diagnostic_report.py index 7200c409..b744ce98 100644 --- a/tests/unit/reports/single_table/test_diagnostic_report.py +++ b/tests/unit/reports/single_table/test_diagnostic_report.py @@ -1,8 +1,5 @@ -from unittest.mock import Mock, patch - -from sdmetrics.reports._results_handler import DiagnosticReportResultsHandler from sdmetrics.reports.single_table import DiagnosticReport -from sdmetrics.reports.single_table._properties import Boundary, Coverage, Synthesis +from sdmetrics.reports.single_table._properties import DataValidity, Structure class TestDiagnosticReport: @@ -15,54 +12,5 @@ def test___init__(self): # Assert assert report._overall_score is None assert report.is_generated is False - assert isinstance(report._properties['Coverage'], Coverage) - assert isinstance(report._properties['Boundary'], Boundary) - assert isinstance(report._properties['Synthesis'], Synthesis) - assert isinstance(report._results_handler, DiagnosticReportResultsHandler) - - def test__handle_results(self): - """Test that the proper values are passed to the handler.""" - # Setup - report = DiagnosticReport() - report._properties = Mock() - report._results_handler = Mock() - - # Run - report._handle_results(True) - - # Assert - 
report._results_handler.print_results.assert_called_once_with(report._properties, True) - - @patch('sdmetrics.reports.base_report.BaseReport.generate') - def test_generate_without_verbose(self, mock_super_generate): - """Test the ``generate`` method without verbose.""" - # Setup - real_data = Mock() - synthetic_data = Mock() - metadata = Mock() - report = DiagnosticReport() - - # Run - report.generate(real_data, synthetic_data, metadata, verbose=False) - - # Assert - mock_super_generate.assert_called_once_with( - real_data, synthetic_data, metadata, verbose=False) - - def test_get_results(self): - """Test the ``get_results`` method.""" - # Setup - report = DiagnosticReport() - mock_check_report_generated = Mock() - report._check_report_generated = mock_check_report_generated - mock_results_handler = Mock() - report._results_handler = mock_results_handler - mock_results_handler.results = {'SUCCESS': ['Test']} - report.is_generated = True - - # Run - results = report.get_results() - - # Assert - mock_check_report_generated.assert_called_once_with() - assert results == {'SUCCESS': ['Test']} + assert isinstance(report._properties['Data Validity'], DataValidity) + assert isinstance(report._properties['Data Structure'], Structure) diff --git a/tests/unit/reports/single_table/test_quality_report.py b/tests/unit/reports/single_table/test_quality_report.py index 11cd4453..a3d58eff 100644 --- a/tests/unit/reports/single_table/test_quality_report.py +++ b/tests/unit/reports/single_table/test_quality_report.py @@ -1,6 +1,3 @@ -from unittest.mock import Mock, patch - -from sdmetrics.reports._results_handler import QualityReportResultsHandler from sdmetrics.reports.single_table import QualityReport from sdmetrics.reports.single_table._properties import ColumnPairTrends, ColumnShapes @@ -17,37 +14,3 @@ def test___init__(self): assert not report.is_generated assert isinstance(report._properties['Column Shapes'], ColumnShapes) assert isinstance(report._properties['Column Pair Trends'], ColumnPairTrends) - assert isinstance(report._results_handler, QualityReportResultsHandler) - - @patch('sys.stdout.write') - def test__handle_results(self, mock_write): - """Test that the proper values are passed to the handler.""" - # Setup - quality_report = QualityReport() - quality_report._overall_score = 0.5 - quality_report._properties = { - 'Column Shapes': Mock(_compute_average=Mock(return_value=0.6)), - 'Column Pair Trends': Mock(_compute_average=Mock(return_value=0.4)) - } - quality_report._results_handler = Mock() - - # Run - quality_report._handle_results(True) - - # Assert - quality_report._results_handler.print_results.assert_called_once_with( - quality_report._properties, quality_report._overall_score, True) - - def test_get_score(self): - """Test the ``get_score`` method.""" - # Setup - quality_report = QualityReport() - quality_report.is_generated = True - mock_score = Mock() - quality_report._overall_score = mock_score - - # Run - score = quality_report.get_score() - - # Assert - assert score == mock_score diff --git a/tests/unit/reports/test__results_handler.py b/tests/unit/reports/test__results_handler.py deleted file mode 100644 index b82b3dd8..00000000 --- a/tests/unit/reports/test__results_handler.py +++ /dev/null @@ -1,152 +0,0 @@ -from unittest.mock import Mock, call, patch - -import pandas as pd -import pytest - -from sdmetrics.reports._results_handler import ( - BaseResultsHandler, DiagnosticReportResultsHandler, QualityReportResultsHandler) - - -class TestBaseResultsHandler(): - def 
test_print_results(self): - """Test that base print results raises an error.""" - # Setup - handler = BaseResultsHandler() - - # Run and Assert - with pytest.raises(NotImplementedError): - handler.print_results(Mock(), Mock()) - - -class TestDiagnosticReportResultsHandler(): - @patch('sys.stdout.write') - def test_print_results_verbose_true(self, mock_write): - """Test that print results prints to std.out when verbose is True.""" - # Setup - properties = { - 'Coverage': Mock(), - 'Boundary': Mock(), - 'Synthesis': Mock() - } - properties['Coverage'].details = pd.DataFrame({ - 'Metric': ['CategoryCoverage', 'RangeCoverage', 'CategoryCoverage'], - 'Score': [0.1, 0.2, 0.3] - }) - properties['Boundary'].details = pd.DataFrame({ - 'Metric': ['BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence'], - 'Score': [0.5, 0.6, 0.7] - }) - properties['Synthesis'].details = pd.DataFrame({ - 'Metric': ['NewRowSynthesis'], - 'Score': [1.0] - }) - handler = DiagnosticReportResultsHandler() - - # Run - handler.print_results(properties, True) - - # Assert - calls = [ - call('\nDiagnostic Results:\n'), - call('\nSUCCESS:\n'), - call('✓ Over 90% of the synthetic rows are not copies of the real data\n'), - call('\nWARNING:\n'), - call( - '! More than 10% the synthetic data does not follow the min/max ' - 'boundaries set by the real data\n' - ), - call('\nDANGER:\n'), - call( - 'x The synthetic data is missing more than 50% of the categories' - ' present in the real data\n' - ), - call( - 'x The synthetic data is missing more than 50% of the numerical' - ' ranges present in the real data\n' - ) - ] - - mock_write.assert_has_calls(calls, any_order=True) - - @patch('sys.stdout.write') - def test_print_results_verbose_false(self, mock_write): - """Test that print results just stortes results when verbose is False.""" - # Setup - properties = { - 'Coverage': Mock(), - 'Boundary': Mock(), - 'Synthesis': Mock() - } - properties['Coverage'].details = pd.DataFrame({ - 'Metric': ['CategoryCoverage', 'RangeCoverage', 'CategoryCoverage'], - 'Score': [0.1, 0.2, 0.3] - }) - properties['Boundary'].details = pd.DataFrame({ - 'Metric': ['BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence'], - 'Score': [0.5, 0.6, 0.7] - }) - properties['Synthesis'].details = pd.DataFrame({ - 'Metric': ['NewRowSynthesis'], - 'Score': [1.0] - }) - handler = DiagnosticReportResultsHandler() - - # Run - handler.print_results(properties, False) - - # Assert - expected_results = { - 'SUCCESS': ['Over 90% of the synthetic rows are not copies of the real data'], - 'WARNING': [ - 'More than 10% the synthetic data does not follow the min/max ' - 'boundaries set by the real data' - ], - 'DANGER': [ - 'The synthetic data is missing more than 50% of the categories' - ' present in the real data', - 'The synthetic data is missing more than 50% of the numerical' - ' ranges present in the real data' - ] - } - mock_write.assert_not_called() - assert handler.results == expected_results - - -class TestQualityReportResultsHandler(): - @patch('sys.stdout.write') - def test_print_results_verbose_true(self, mock_write): - """Test the results are printed if verbose is True.""" - # Setup - score = 0.5 - properties = { - 'Column Shapes': Mock(_compute_average=Mock(return_value=0.6)), - 'Column Pair Trends': Mock(_compute_average=Mock(return_value=0.4)) - } - handler = QualityReportResultsHandler() - # Run - handler.print_results(properties, score, True) - - # Assert - calls = [ - call('\nOverall Quality Score: 50.0%\n\n'), - call('Properties:\n'), - call('- 
Column Shapes: 60.0%\n'), - call('- Column Pair Trends: 40.0%\n'), - ] - mock_write.assert_has_calls(calls, any_order=True) - - @patch('sys.stdout.write') - def test_print_results_verbose_false(self, mock_write): - """Test the results are not printed if verbose is False.""" - # Setup - score = 0.5 - properties = { - 'Column Shapes': Mock(_compute_average=Mock(return_value=0.6)), - 'Column Pair Trends': Mock(_compute_average=Mock(return_value=0.4)) - } - handler = QualityReportResultsHandler() - # Run - handler.print_results(properties, score, False) - - # Assert - mock_write.assert_not_called() diff --git a/tests/unit/reports/test_base_report.py b/tests/unit/reports/test_base_report.py index 40c25657..5076e86b 100644 --- a/tests/unit/reports/test_base_report.py +++ b/tests/unit/reports/test_base_report.py @@ -173,6 +173,50 @@ def test_generate_metadata_not_dict(self): with pytest.raises(TypeError, match=expected_message): base_report.generate(real_data, synthetic_data, metadata, verbose=False) + @patch('sys.stdout.write') + def test_print_results_verbose_true(self, mock_write): + """Test the results are printed if verbose is True.""" + # Setup + report = BaseReport() + report._overall_score = 0.5 + mock_column_shapes = Mock() + mock_column_shapes._compute_average.return_value = 0.6 + mock_column_pair_trends = Mock() + mock_column_pair_trends._compute_average.return_value = 0.4 + report._properties = { + 'Column Shapes': mock_column_shapes, + 'Column Pair Trends': mock_column_pair_trends + } + + # Run + report._print_results(True) + + # Assert + calls = [ + call('\nOverall Score: 50.0%\n\n'), + call('Properties:\n'), + call('- Column Shapes: 60.0%\n'), + call('- Column Pair Trends: 40.0%\n'), + ] + mock_write.assert_has_calls(calls, any_order=True) + + @patch('sys.stdout.write') + def test_print_results_verbose_false(self, mock_write): + """Test the results are not printed if verbose is False.""" + # Setup + report = BaseReport() + report._overall_score = 0.5 + report._properties = { + 'Column Shapes': Mock(_compute_average=Mock(return_value=0.6)), + 'Column Pair Trends': Mock(_compute_average=Mock(return_value=0.4)) + } + + # Run + report._print_results(False) + + # Assert + mock_write.assert_not_called() + @patch('sdmetrics.reports.base_report.datetime') @patch('sdmetrics.reports.base_report.time') @patch('sdmetrics.reports.base_report.version') @@ -190,8 +234,8 @@ def test_generate(self, version_mock, time_mock, datetime_mock): base_report = BaseReport() mock_validate = Mock() - mock_handle_results = Mock() - base_report._handle_results = mock_handle_results + mock__print_results = Mock() + base_report._print_results = mock__print_results base_report.validate = mock_validate base_report._properties['Property 1'] = Mock() base_report._properties['Property 1'].get_score.return_value = 1.0 @@ -218,7 +262,7 @@ def test_generate(self, version_mock, time_mock, datetime_mock): # Assert mock_validate.assert_called_once_with(real_data, synthetic_data, metadata) - mock_handle_results.assert_called_once_with(False) + mock__print_results.assert_called_once_with(False) base_report._properties['Property 1'].get_score.assert_called_with( real_data, synthetic_data, metadata, progress_bar=None ) @@ -250,7 +294,7 @@ def test_generate_multi_table_details(self, version_mock, time_mock, datetime_mo version_mock.return_value = 'version' base_report = BaseReport() - base_report._handle_results = Mock() + base_report._print_results = Mock() base_report.validate = Mock() base_report.convert_datetimes = Mock() 
base_report._properties['Property 1'] = Mock() @@ -322,21 +366,12 @@ def test_generate_multi_table_details(self, version_mock, time_mock, datetime_mo } assert base_report.report_info == expected_info - def test__handle_results(self): - """Test the ``_handle_results`` method.""" - # Setup - base_report = BaseReport() - - # Run and Assert - with pytest.raises(NotImplementedError): - base_report._handle_results(True) - @patch('tqdm.tqdm') def test_generate_verbose(self, mock_tqdm): """Test the ``generate`` method with verbose=True.""" # Setup base_report = BaseReport() - base_report._handle_results = Mock() + base_report._print_results = Mock() mock_validate = Mock() base_report.validate = mock_validate base_report._print_results = Mock() @@ -376,7 +411,7 @@ def test_generate_verbose(self, mock_tqdm): # Assert calls = [call(total=4, file=sys.stdout), call(total=6, file=sys.stdout)] mock_tqdm.assert_has_calls(calls, any_order=True) - base_report._handle_results.assert_called_once_with(True) + base_report._print_results.assert_called_once_with(True) def test__check_report_generated(self): """Test the ``check_report_generated`` method.""" @@ -417,6 +452,21 @@ def test__validate_property_generated(self): with pytest.raises(ValueError, match=expected_message_2): base_report._validate_property_generated(wrong_property_name) + def test_get_score(self): + """Test the ``get_score`` method.""" + # Setup + report = BaseReport() + report._check_report_generated = Mock() + report._overall_score = 0.7 + report.is_generated = True + + # Run + results = report.get_score() + + # Assert + report._check_report_generated.assert_called_once_with() + assert results == 0.7 + def test_get_properties(self): """Test the ``get_details`` method.""" # Setup @@ -453,8 +503,8 @@ def test_get_info(self, version_mock, time_mock, datetime_mock): base_report = BaseReport() mock_validate = Mock() - mock_handle_results = Mock() - base_report._handle_results = mock_handle_results + mock__print_results = Mock() + base_report._print_results = mock__print_results base_report.validate = mock_validate base_report._properties['Property 1'] = Mock() base_report._properties['Property 1'].get_score.return_value = 1.0
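
A minimal usage sketch of the reworked API, mirroring the single-table integration tests above. It assumes ``load_demo`` is importable from ``sdmetrics.demos``, as in the rest of this test suite; the score shown is the value those tests assert against.

    from sdmetrics.demos import load_demo
    from sdmetrics.reports.single_table import DiagnosticReport

    # Load the bundled demo tables used throughout the integration tests.
    real_data, synthetic_data, metadata = load_demo(modality='single_table')

    report = DiagnosticReport()
    report.generate(real_data, synthetic_data, metadata, verbose=False)

    # get_score() replaces the removed get_results(): one overall number,
    # averaged over the 'Data Validity' and 'Data Structure' properties.
    print(report.get_score())                    # e.g. 0.975666343427581

    # Per-property and per-column breakdowns keep the same shape as before.
    print(report.get_properties())               # DataFrame of property scores
    print(report.get_details('Data Validity'))   # per-column metric scores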