From c4550a2364100d2b5c80369b2a6dde12b7475085 Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Thu, 2 Nov 2023 09:23:32 -0600 Subject: [PATCH] Add `DataValidity` property (#486) --- .../multi_table/_properties/__init__.py | 2 + .../multi_table/_properties/data_validity.py | 18 ++ .../single_table/_properties/__init__.py | 2 + .../single_table/_properties/column_shapes.py | 4 +- .../single_table/_properties/data_validity.py | 126 ++++++++++++++ .../_properties/test_data_validity.py | 39 +++++ .../_properties/test_data_validity.py | 73 ++++++++ .../multi_table/_properties/test_validity.py | 14 ++ .../_properties/test_column_shapes.py | 124 +++++++------- .../_properties/test_data_validity.py | 157 ++++++++++++++++++ 10 files changed, 495 insertions(+), 64 deletions(-) create mode 100644 sdmetrics/reports/multi_table/_properties/data_validity.py create mode 100644 sdmetrics/reports/single_table/_properties/data_validity.py create mode 100644 tests/integration/reports/multi_table/_properties/test_data_validity.py create mode 100644 tests/integration/reports/single_table/_properties/test_data_validity.py create mode 100644 tests/unit/reports/multi_table/_properties/test_validity.py create mode 100644 tests/unit/reports/single_table/_properties/test_data_validity.py diff --git a/sdmetrics/reports/multi_table/_properties/__init__.py b/sdmetrics/reports/multi_table/_properties/__init__.py index 119f8ddc..d43faaf1 100644 --- a/sdmetrics/reports/multi_table/_properties/__init__.py +++ b/sdmetrics/reports/multi_table/_properties/__init__.py @@ -6,6 +6,7 @@ from sdmetrics.reports.multi_table._properties.column_pair_trends import ColumnPairTrends from sdmetrics.reports.multi_table._properties.column_shapes import ColumnShapes from sdmetrics.reports.multi_table._properties.coverage import Coverage +from sdmetrics.reports.multi_table._properties.data_validity import DataValidity from 
sdmetrics.reports.multi_table._properties.inter_table_trends import InterTableTrends from sdmetrics.reports.multi_table._properties.structure import Structure from sdmetrics.reports.multi_table._properties.synthesis import Synthesis @@ -20,4 +21,5 @@ 'InterTableTrends', 'Synthesis', 'Structure', + 'DataValidity' ] diff --git a/sdmetrics/reports/multi_table/_properties/data_validity.py b/sdmetrics/reports/multi_table/_properties/data_validity.py new file mode 100644 index 00000000..db55c011 --- /dev/null +++ b/sdmetrics/reports/multi_table/_properties/data_validity.py @@ -0,0 +1,18 @@ +"""Data validity property for multi-table.""" +from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty +from sdmetrics.reports.single_table._properties import DataValidity as SingleTableDataValidity + + +class DataValidity(BaseMultiTableProperty): + """Data Validity property class for multi-table. + + This property computes, at base, whether each column contains valid data. + The metric is based on the data type in each column. + A metric score is computed column-wise and the final score is the average over all columns. + The BoundaryAdherence metric is used for numerical and datetime columns, the CategoryAdherence + is used for categorical and boolean columns and the KeyUniqueness for primary and + alternate keys. The other column types are ignored by this property. 
+ """ + + _single_table_property = SingleTableDataValidity + _num_iteration_case = 'column' diff --git a/sdmetrics/reports/single_table/_properties/__init__.py b/sdmetrics/reports/single_table/_properties/__init__.py index a9205ae4..3ce8377c 100644 --- a/sdmetrics/reports/single_table/_properties/__init__.py +++ b/sdmetrics/reports/single_table/_properties/__init__.py @@ -5,6 +5,7 @@ from sdmetrics.reports.single_table._properties.column_pair_trends import ColumnPairTrends from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes from sdmetrics.reports.single_table._properties.coverage import Coverage +from sdmetrics.reports.single_table._properties.data_validity import DataValidity from sdmetrics.reports.single_table._properties.structure import Structure from sdmetrics.reports.single_table._properties.synthesis import Synthesis @@ -16,4 +17,5 @@ 'Boundary', 'Synthesis', 'Structure', + 'DataValidity' ] diff --git a/sdmetrics/reports/single_table/_properties/column_shapes.py b/sdmetrics/reports/single_table/_properties/column_shapes.py index b6a66f9f..1c300231 100644 --- a/sdmetrics/reports/single_table/_properties/column_shapes.py +++ b/sdmetrics/reports/single_table/_properties/column_shapes.py @@ -85,11 +85,11 @@ def get_visualization(self): average_score = round(self._compute_average(), 2) fig = px.bar( - self.details, + data_frame=self.details, x='Column', y='Score', title=f'Data Quality: Column Shapes (Average Score={average_score})', - category_orders={'group': self.details['Column']}, + category_orders={'group': list(self.details['Column'])}, color='Metric', color_discrete_map={ 'KSComplement': PlotConfig.DATACEBO_DARK, diff --git a/sdmetrics/reports/single_table/_properties/data_validity.py b/sdmetrics/reports/single_table/_properties/data_validity.py new file mode 100644 index 00000000..3cd05e2f --- /dev/null +++ b/sdmetrics/reports/single_table/_properties/data_validity.py @@ -0,0 +1,126 @@ +import numpy as np +import pandas as pd 
+import plotly.express as px + +from sdmetrics.reports.single_table._properties import BaseSingleTableProperty +from sdmetrics.reports.utils import PlotConfig +from sdmetrics.single_column import BoundaryAdherence, CategoryAdherence, KeyUniqueness + + +class DataValidity(BaseSingleTableProperty): + """Data Validity property class for single table. + + This property computes, at base, whether each column contains valid data. + The metric is based on the data type in each column. + The BoundaryAdherence metric is used for numerical and datetime columns, the CategoryAdherence + is used for categorical and boolean columns and the KeyUniqueness for primary + and alternate keys. The other column types are ignored by this property. + """ + + _num_iteration_case = 'column' + _sdtype_to_metric = { + 'numerical': BoundaryAdherence, + 'datetime': BoundaryAdherence, + 'categorical': CategoryAdherence, + 'boolean': CategoryAdherence, + 'id': KeyUniqueness, + } + + def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None): + """Generate the _details dataframe for the data validity property. + + Args: + real_data (pandas.DataFrame): + The real data + synthetic_data (pandas.DataFrame): + The synthetic data + metadata (dict): + The metadata of the table + progress_bar (tqdm.tqdm or None): + The progress bar to use. Defaults to None. 
+ """ + column_names, metric_names, scores = [], [], [] + error_messages = [] + primary_key = metadata.get('primary_key') + alternate_keys = metadata.get('alternate_keys', []) + for column_name in metadata['columns']: + sdtype = metadata['columns'][column_name]['sdtype'] + primary_key_match = column_name == primary_key + alternate_key_match = column_name in alternate_keys + is_unique = primary_key_match or alternate_key_match + + try: + if sdtype not in self._sdtype_to_metric and not is_unique: + continue + + metric = self._sdtype_to_metric.get(sdtype, KeyUniqueness) + column_score = metric.compute( + real_data[column_name], synthetic_data[column_name] + ) + error_message = None + + except Exception as e: + column_score = np.nan + error_message = f'{type(e).__name__}: {e}' + finally: + if progress_bar: + progress_bar.update() + + column_names.append(column_name) + metric_names.append(metric.__name__) + scores.append(column_score) + error_messages.append(error_message) + + result = pd.DataFrame({ + 'Column': column_names, + 'Metric': metric_names, + 'Score': scores, + 'Error': error_messages, + }) + + if result['Error'].isna().all(): + result = result.drop('Error', axis=1) + + return result + + def get_visualization(self): + """Create a plot to show the data validity scores. 
+ + Returns: + plotly.graph_objects._figure.Figure + """ + average_score = round(self._compute_average(), 2) + + fig = px.bar( + data_frame=self.details, + x='Column', + y='Score', + title=f'Data Diagnostic: Data Validity (Average Score={average_score})', + category_orders={'group': list(self.details['Column'])}, + color='Metric', + color_discrete_map={ + 'BoundaryAdherence': PlotConfig.DATACEBO_DARK, + 'CategoryAdherence': PlotConfig.DATACEBO_BLUE, + 'KeyUniqueness': PlotConfig.DATACEBO_GREEN + + }, + pattern_shape='Metric', + pattern_shape_sequence=['', '/', '.'], + hover_name='Column', + hover_data={ + 'Column': False, + 'Metric': True, + 'Score': True, + }, + ) + + fig.update_yaxes(range=[0, 1]) + + fig.update_layout( + xaxis_categoryorder='total ascending', + plot_bgcolor=PlotConfig.BACKGROUND_COLOR, + margin={'t': 150}, + font={'size': PlotConfig.FONT_SIZE}, + ) + + return fig diff --git a/tests/integration/reports/multi_table/_properties/test_data_validity.py b/tests/integration/reports/multi_table/_properties/test_data_validity.py new file mode 100644 index 00000000..00074ec9 --- /dev/null +++ b/tests/integration/reports/multi_table/_properties/test_data_validity.py @@ -0,0 +1,39 @@ +from unittest.mock import Mock + +from tqdm import tqdm + +from sdmetrics.demos import load_demo +from sdmetrics.reports.multi_table._properties import DataValidity + + +class TestDataValidity: + + def test_end_to_end(self): + """Test the ``DataValidity`` multi-table property end to end.""" + # Setup + real_data, synthetic_data, metadata = load_demo(modality='multi_table') + column_shapes = DataValidity() + + # Run + result = column_shapes.get_score(real_data, synthetic_data, metadata) + + # Assert + assert result == 0.9444444444444445 + + def test_with_progress_bar(self): + """Test that the progress bar is correctly updated.""" + # Setup + real_data, synthetic_data, metadata = load_demo(modality='multi_table') + column_shapes = DataValidity() + num_columns = 
sum(len(table['columns']) for table in metadata['tables'].values()) + + progress_bar = tqdm(total=num_columns) + mock_update = Mock() + progress_bar.update = mock_update + + # Run + result = column_shapes.get_score(real_data, synthetic_data, metadata, progress_bar) + + # Assert + assert result == 0.9444444444444445 + assert mock_update.call_count == num_columns diff --git a/tests/integration/reports/single_table/_properties/test_data_validity.py b/tests/integration/reports/single_table/_properties/test_data_validity.py new file mode 100644 index 00000000..b92f69b7 --- /dev/null +++ b/tests/integration/reports/single_table/_properties/test_data_validity.py @@ -0,0 +1,73 @@ +import pandas as pd + +from sdmetrics.demos import load_demo +from sdmetrics.reports.single_table._properties import DataValidity + + +class TestDataValidity: + + def test_get_score(self): + """Test the ``get_score`` method""" + # Setup + real_data, synthetic_data, metadata = load_demo('single_table') + + # Run + data_validity_property = DataValidity() + score = data_validity_property.get_score(real_data, synthetic_data, metadata) + + # Assert + expected_details_dict = { + 'Column': [ + 'start_date', 'end_date', 'salary', 'duration', 'student_id', + 'high_perc', 'high_spec', 'mba_spec', 'second_perc', 'gender', + 'degree_perc', 'placed', 'experience_years', 'employability_perc', + 'mba_perc', 'work_experience', 'degree_type' + ], + 'Metric': [ + 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', + 'KeyUniqueness', 'BoundaryAdherence', 'CategoryAdherence', 'CategoryAdherence', + 'BoundaryAdherence', 'CategoryAdherence', 'BoundaryAdherence', 'CategoryAdherence', + 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'CategoryAdherence', + 'CategoryAdherence' + ], + 'Score': [ + 0.8503937007874016, 0.8615384615384616, 0.9444444444444444, + 1.0, 1.0, 0.8651162790697674, 1.0, 1.0, 0.9255813953488372, + 1.0, 0.9441860465116279, 1.0, 1.0, 0.8883720930232558, + 
0.8930232558139535, 1.0, 1.0 + ] + } + expected_details = pd.DataFrame(expected_details_dict) + pd.testing.assert_frame_equal(data_validity_property.details, expected_details) + assert score == 0.9513326868551618 + + def test_get_score_errors(self): + """Test the ``get_score`` method when the metrics are raising errors for some columns.""" + # Setup + real_data, synthetic_data, metadata = load_demo('single_table') + + real_data['start_date'].iloc[0] = 0 + real_data['employability_perc'].iloc[2] = 'a' + + # Run + data_validity_property = DataValidity() + + expected_message_1 = ( + "TypeError: '<=' not supported between instances of 'int' and 'Timestamp'" + ) + expected_message_2 = ( + "TypeError: '<=' not supported between instances of 'float' and 'str'" + ) + + score = data_validity_property.get_score(real_data, synthetic_data, metadata) + + # Assert + + details = data_validity_property.details + details_nan = details.loc[pd.isna(details['Score'])] + column_names_nan = details_nan['Column'].tolist() + error_messages = details_nan['Error'].tolist() + assert column_names_nan == ['start_date', 'employability_perc'] + assert error_messages[0] == expected_message_1 + assert error_messages[1] == expected_message_2 + assert score == 0.9622593255151395 diff --git a/tests/unit/reports/multi_table/_properties/test_validity.py b/tests/unit/reports/multi_table/_properties/test_validity.py new file mode 100644 index 00000000..9e4a97b5 --- /dev/null +++ b/tests/unit/reports/multi_table/_properties/test_validity.py @@ -0,0 +1,14 @@ +"""Test Data Validity multi-table class.""" +from sdmetrics.reports.multi_table._properties import DataValidity +from sdmetrics.reports.single_table._properties import DataValidity as SingleTableDataValidity + + +def test__init__(): + """Test the ``__init__`` method.""" + # Setup + column_shapes = DataValidity() + + # Assert + assert column_shapes._properties == {} + assert column_shapes._single_table_property == SingleTableDataValidity + assert 
column_shapes._num_iteration_case == 'column' diff --git a/tests/unit/reports/single_table/_properties/test_column_shapes.py b/tests/unit/reports/single_table/_properties/test_column_shapes.py index b2b43469..7f018b5a 100644 --- a/tests/unit/reports/single_table/_properties/test_column_shapes.py +++ b/tests/unit/reports/single_table/_properties/test_column_shapes.py @@ -119,69 +119,69 @@ def test__generate_details_error(self): assert column_names_nan == ['col1'] assert error_message == [expected_message] - @patch('sdmetrics.reports.single_table._properties.column_shapes.px') - def test_get_visualization(self, mock_px): - """Test the ``get_visualization`` method.""" - # Setup - column_shape_property = ColumnShapes() - - mock_df = pd.DataFrame({ - 'Column': ['Column1', 'Column2'], - 'Score': [0.7, 0.3], - 'Metric': ['KSComplement', 'TVComplement'] - }) - column_shape_property.details = mock_df - - mock__compute_average = Mock(return_value=0.5) - column_shape_property._compute_average = mock__compute_average - - mock_bar = Mock() - mock_px.bar.return_value = mock_bar - - # Run - column_shape_property.get_visualization() - - # Assert - mock__compute_average.assert_called_once() - - # Expected call - expected_kwargs = { - 'data_frame': mock_df, - 'x': 'Column', - 'y': 'Score', - 'title': ( - 'Data Quality: Column Shapes (Average' - f'Score={mock__compute_average.return_value})' - ), - 'category_orders': {'group': mock_df['Column'].tolist()}, - 'color': 'Metric', - 'color_discrete_map': { - 'KSComplement': '#000036', - 'TVComplement': '#03AFF1', - }, - 'pattern_shape': 'Metric', - 'pattern_shape_sequence': ['', '/'], - 'hover_name': 'Column', - 'hover_data': { - 'Column': False, - 'Metric': True, - 'Score': True, - }, - } + @patch('sdmetrics.reports.single_table._properties.column_shapes.px') + def test_get_visualization(self, mock_px): + """Test the ``get_visualization`` method.""" + # Setup + column_shape_property = ColumnShapes() - # Check call_args of mock_px.bar - 
_, kwargs = mock_px.bar.call_args + mock_df = pd.DataFrame({ + 'Column': ['Column1', 'Column2'], + 'Score': [0.7, 0.3], + 'Metric': ['KSComplement', 'TVComplement'] + }) + column_shape_property.details = mock_df - # Check DataFrame separately - assert kwargs.pop('data_frame').equals(expected_kwargs.pop('data_frame')) + mock__compute_average = Mock(return_value=0.5) + column_shape_property._compute_average = mock__compute_average - # Check other arguments - assert kwargs == expected_kwargs + mock_bar = Mock() + mock_px.bar.return_value = mock_bar - mock_bar.update_yaxes.assert_called_once_with(range=[0, 1]) - mock_bar.update_layout.assert_called_once_with( - xaxis_categoryorder='total ascending', - plot_bgcolor='#F5F5F8', - margin={'t': 150}, - font={'size': 18} - ) + # Run + column_shape_property.get_visualization() + + # Assert + mock__compute_average.assert_called_once() + + # Expected call + expected_kwargs = { + 'data_frame': mock_df, + 'x': 'Column', + 'y': 'Score', + 'title': ( + 'Data Quality: Column Shapes (Average ' + f'Score={mock__compute_average.return_value})' + ), + 'category_orders': {'group': mock_df['Column'].tolist()}, + 'color': 'Metric', + 'color_discrete_map': { + 'KSComplement': '#000036', + 'TVComplement': '#03AFF1', + }, + 'pattern_shape': 'Metric', + 'pattern_shape_sequence': ['', '/'], + 'hover_name': 'Column', + 'hover_data': { + 'Column': False, + 'Metric': True, + 'Score': True, + }, + } + + # Check call_args of mock_px.bar + _, kwargs = mock_px.bar.call_args + + # Check DataFrame separately + assert kwargs.pop('data_frame').equals(expected_kwargs.pop('data_frame')) + + # Check other arguments + assert kwargs == expected_kwargs + + mock_bar.update_yaxes.assert_called_once_with(range=[0, 1]) + mock_bar.update_layout.assert_called_once_with( + xaxis_categoryorder='total ascending', + plot_bgcolor='#F5F5F8', + margin={'t': 150}, + font={'size': 18} + ) diff --git a/tests/unit/reports/single_table/_properties/test_data_validity.py 
b/tests/unit/reports/single_table/_properties/test_data_validity.py new file mode 100644 index 00000000..1ba18518 --- /dev/null +++ b/tests/unit/reports/single_table/_properties/test_data_validity.py @@ -0,0 +1,157 @@ +from unittest.mock import Mock, call, patch + +import pandas as pd + +from sdmetrics.reports.single_table._properties.data_validity import DataValidity + + +class TestDataValidity: + + @patch('sdmetrics.reports.single_table._properties.data_validity.BoundaryAdherence.compute') + @patch('sdmetrics.reports.single_table._properties.data_validity.CategoryAdherence.compute') + @patch('sdmetrics.reports.single_table._properties.data_validity.KeyUniqueness.compute') + def test__generate_details( + self, key_uniqueness_mock, category_a_compute_mock, boundary_a_compute_mock + ): + """Test the ``_generate_details`` method.""" + # Setup + real_data = pd.DataFrame({ + 'col1': [1, 2, 3], + 'col2': [False, True, True], + 'col3': ['a', 'b', 'c'], + 'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']), + 'col5': ['ID_1', 'ID_2', 'ID_3'], + 'col6': ['A', 'B', 'C'] + }) + synthetic_data = pd.DataFrame({ + 'col1': [1, 2, 3], + 'col2': [False, True, True], + 'col3': ['a', 'b', 'c'], + 'col4': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']), + 'col5': ['ID_4', 'ID_5', 'ID_6'], + 'col6': ['D', 'E', 'F'] + }) + metadata = { + 'primary_key': 'col5', + 'alternate_keys': ['col6'], + 'columns': { + 'col1': {'sdtype': 'numerical'}, + 'col2': {'sdtype': 'boolean'}, + 'col3': {'sdtype': 'categorical'}, + 'col4': {'sdtype': 'datetime'}, + 'col5': {'sdtype': 'id'}, + 'col6': {'sdtype': 'other'} + } + } + + # Run + data_validity_property = DataValidity() + data_validity_property._generate_details(real_data, synthetic_data, metadata) + + # Assert + expected_calls_ba = [ + call(real_data['col1'], synthetic_data['col1']), + call(real_data['col4'], synthetic_data['col4']), + ] + expected_calls_ca = [ + call(real_data['col2'], synthetic_data['col2']), + 
call(real_data['col3'], synthetic_data['col3']), + ] + expected_calls_key = [ + call(real_data['col5'], synthetic_data['col5']), + call(real_data['col6'], synthetic_data['col6']) + ] + boundary_a_compute_mock.assert_has_calls(expected_calls_ba) + category_a_compute_mock.assert_has_calls(expected_calls_ca) + key_uniqueness_mock.assert_has_calls(expected_calls_key) + + def test__generate_details_error(self): + """Test the ``_generate_details`` method with the error column.""" + # Setup + real_data = pd.DataFrame({'col1': [1, '2', 3]}) + synthetic_data = pd.DataFrame({'col1': [4, 5, 6]}) + metadata = {'columns': {'col1': {'sdtype': 'numerical'}}} + + data_validity_property = DataValidity() + + # Run + result = data_validity_property._generate_details(real_data, synthetic_data, metadata) + + # Assert + expected_message = ( + "TypeError: '<=' not supported between instances of 'int' and 'str'" + ) + result_nan = result.loc[pd.isna(result['Score'])] + column_names_nan = result_nan['Column'].tolist() + error_message = result_nan['Error'].tolist() + + assert column_names_nan == ['col1'] + assert error_message == [expected_message] + + @patch('sdmetrics.reports.single_table._properties.data_validity.px') + def test_get_visualization(self, mock_px): + """Test the ``get_visualization`` method.""" + # Setup + data_validity_property = DataValidity() + + mock_df = pd.DataFrame({ + 'Column': ['Column1', 'Column2', 'Column3'], + 'Score': [0.7, 0.3, 0.5], + 'Metric': ['BoundaryAdherence', 'CategoryAdherence', 'KeyUniqueness'] + }) + data_validity_property.details = mock_df + + mock__compute_average = Mock(return_value=0.5) + data_validity_property._compute_average = mock__compute_average + + mock_bar = Mock() + mock_px.bar.return_value = mock_bar + + # Run + data_validity_property.get_visualization() + + # Assert + mock__compute_average.assert_called_once() + + # Expected call + expected_kwargs = { + 'data_frame': mock_df, + 'x': 'Column', + 'y': 'Score', + 'title': ( + 'Data 
Diagnostic: Data Validity (Average ' + f'Score={mock__compute_average.return_value})' + ), + 'category_orders': {'group': mock_df['Column'].tolist()}, + 'color': 'Metric', + 'color_discrete_map': { + 'BoundaryAdherence': '#000036', + 'CategoryAdherence': '#03AFF1', + 'KeyUniqueness': '#01E0C9', + }, + 'pattern_shape': 'Metric', + 'pattern_shape_sequence': ['', '/', '.'], + 'hover_name': 'Column', + 'hover_data': { + 'Column': False, + 'Metric': True, + 'Score': True, + }, + } + + # Check call_args of mock_px.bar + _, kwargs = mock_px.bar.call_args + + # Check DataFrame separately + assert kwargs.pop('data_frame').equals(expected_kwargs.pop('data_frame')) + + # Check other arguments + assert kwargs == expected_kwargs + + mock_bar.update_yaxes.assert_called_once_with(range=[0, 1]) + mock_bar.update_layout.assert_called_once_with( + xaxis_categoryorder='total ascending', + plot_bgcolor='#F5F5F8', + margin={'t': 150}, + font={'size': 18} + )