Add DataValidity property (#486)
R-Palazzo committed Nov 14, 2023
1 parent 128880e commit c4550a2
Showing 10 changed files with 495 additions and 64 deletions.
2 changes: 2 additions & 0 deletions sdmetrics/reports/multi_table/_properties/__init__.py
@@ -6,6 +6,7 @@
from sdmetrics.reports.multi_table._properties.column_pair_trends import ColumnPairTrends
from sdmetrics.reports.multi_table._properties.column_shapes import ColumnShapes
from sdmetrics.reports.multi_table._properties.coverage import Coverage
from sdmetrics.reports.multi_table._properties.data_validity import DataValidity
from sdmetrics.reports.multi_table._properties.inter_table_trends import InterTableTrends
from sdmetrics.reports.multi_table._properties.structure import Structure
from sdmetrics.reports.multi_table._properties.synthesis import Synthesis
@@ -20,4 +21,5 @@
'InterTableTrends',
'Synthesis',
'Structure',
'DataValidity'
]
18 changes: 18 additions & 0 deletions sdmetrics/reports/multi_table/_properties/data_validity.py
@@ -0,0 +1,18 @@
"""Data validity property for multi-table."""
from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty
from sdmetrics.reports.single_table._properties import DataValidity as SingleTableDataValidity


class DataValidity(BaseMultiTableProperty):
"""Data Validitys property class for multi-table.
This property computes, at base, whether each column contains valid data.
The metric is based on the type data in each column.
A metric score is computed column-wise and the final score is the average over all columns.
The BoundaryAdherence metric is used for numerical and datetime columns, the CategoryAdherence
is used for categorical and boolean columns and the KeyUniqueness for primary and
alternate keys. The other column types are ignored by this property.
"""

_single_table_property = SingleTableDataValidity
_num_iteration_case = 'column'
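
As a minimal usage sketch (mirroring the multi-table integration test below), the property can be scored directly; it reuses the single-table DataValidity per table and averages the per-column scores:

    from sdmetrics.demos import load_demo
    from sdmetrics.reports.multi_table._properties import DataValidity

    real_data, synthetic_data, metadata = load_demo(modality='multi_table')
    data_validity = DataValidity()
    # Single overall score: the average of the per-column scores across all tables
    score = data_validity.get_score(real_data, synthetic_data, metadata)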
2 changes: 2 additions & 0 deletions sdmetrics/reports/single_table/_properties/__init__.py
@@ -5,6 +5,7 @@
from sdmetrics.reports.single_table._properties.column_pair_trends import ColumnPairTrends
from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes
from sdmetrics.reports.single_table._properties.coverage import Coverage
from sdmetrics.reports.single_table._properties.data_validity import DataValidity
from sdmetrics.reports.single_table._properties.structure import Structure
from sdmetrics.reports.single_table._properties.synthesis import Synthesis

@@ -16,4 +17,5 @@
'Boundary',
'Synthesis',
'Structure',
'DataValidity'
]
4 changes: 2 additions & 2 deletions sdmetrics/reports/single_table/_properties/column_shapes.py
@@ -85,11 +85,11 @@ def get_visualization(self):
average_score = round(self._compute_average(), 2)

fig = px.bar(
- self.details,
+ data_frame=self.details,
x='Column',
y='Score',
title=f'Data Quality: Column Shapes (Average Score={average_score})',
- category_orders={'group': self.details['Column']},
+ category_orders={'group': list(self.details['Column'])},
color='Metric',
color_discrete_map={
'KSComplement': PlotConfig.DATACEBO_DARK,
126 changes: 126 additions & 0 deletions sdmetrics/reports/single_table/_properties/data_validity.py
@@ -0,0 +1,126 @@
import numpy as np
import pandas as pd
import plotly.express as px

from sdmetrics.reports.single_table._properties import BaseSingleTableProperty
from sdmetrics.reports.utils import PlotConfig
from sdmetrics.single_column import BoundaryAdherence, CategoryAdherence, KeyUniqueness


class DataValidity(BaseSingleTableProperty):
"""Data Validity property class for single table.
This property checks whether each column contains valid data.
The metric used is based on the data type of each column.
The BoundaryAdherence metric is used for numerical and datetime columns, the CategoryAdherence
is used for categorical and boolean columns and the KeyUniqueness for primary
and alternate keys. The other column types are ignored by this property.
"""

_num_iteration_case = 'column'
_sdtype_to_metric = {
'numerical': BoundaryAdherence,
'datetime': BoundaryAdherence,
'categorical': CategoryAdherence,
'boolean': CategoryAdherence,
'id': KeyUniqueness,
}

def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None):
"""Generate the _details dataframe for the data validity property.
Args:
real_data (pandas.DataFrame):
The real data.
synthetic_data (pandas.DataFrame):
The synthetic data.
metadata (dict):
The metadata of the table.
progress_bar (tqdm.tqdm or None):
The progress bar to use. Defaults to None.
"""
column_names, metric_names, scores = [], [], []
error_messages = []
primary_key = metadata.get('primary_key')
alternate_keys = metadata.get('alternate_keys', [])
for column_name in metadata['columns']:
sdtype = metadata['columns'][column_name]['sdtype']
primary_key_match = column_name == primary_key
alternate_key_match = column_name in alternate_keys
is_unique = primary_key_match or alternate_key_match

try:
if sdtype not in self._sdtype_to_metric and not is_unique:
continue

metric = self._sdtype_to_metric.get(sdtype, KeyUniqueness)
column_score = metric.compute(
real_data[column_name], synthetic_data[column_name]
)
error_message = None

except Exception as e:
column_score = np.nan
error_message = f'{type(e).__name__}: {e}'
finally:
if progress_bar:
progress_bar.update()

column_names.append(column_name)
metric_names.append(metric.__name__)
scores.append(column_score)
error_messages.append(error_message)

result = pd.DataFrame({
'Column': column_names,
'Metric': metric_names,
'Score': scores,
'Error': error_messages,
})

if result['Error'].isna().all():
result = result.drop('Error', axis=1)

return result

def get_visualization(self):
"""Create a plot to show the data validity scores.
Returns:
plotly.graph_objects._figure.Figure
"""
average_score = round(self._compute_average(), 2)

fig = px.bar(
data_frame=self.details,
x='Column',
y='Score',
title=f'Data Diagnostic: Data Validity (Average Score={average_score})',
category_orders={'group': list(self.details['Column'])},
color='Metric',
color_discrete_map={
'BoundaryAdherence': PlotConfig.DATACEBO_DARK,
'CategoryAdherence': PlotConfig.DATACEBO_BLUE,
'KeyUniqueness': PlotConfig.DATACEBO_GREEN
},
pattern_shape='Metric',
pattern_shape_sequence=['', '/', '.'],
hover_name='Column',
hover_data={
'Column': False,
'Metric': True,
'Score': True,
},
)

fig.update_yaxes(range=[0, 1])

fig.update_layout(
xaxis_categoryorder='total ascending',
plot_bgcolor=PlotConfig.BACKGROUND_COLOR,
margin={'t': 150},
font={'size': PlotConfig.FONT_SIZE},
)

return fig
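
To illustrate the per-column dispatch described in the class docstring, here is a minimal sketch of how the single-table property is exercised (based on the integration tests further down): get_score fills details with one row per column, picking BoundaryAdherence, CategoryAdherence or KeyUniqueness according to the sdtype, and get_visualization returns the Plotly bar chart built above.

    from sdmetrics.demos import load_demo
    from sdmetrics.reports.single_table._properties import DataValidity

    real_data, synthetic_data, metadata = load_demo('single_table')
    data_validity = DataValidity()
    score = data_validity.get_score(real_data, synthetic_data, metadata)
    # One row per column with its Metric and Score (plus an Error column if any metric failed)
    print(data_validity.details)
    fig = data_validity.get_visualization()
    # fig.show()  # renders the 'Data Diagnostic: Data Validity' bar chart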
@@ -0,0 +1,39 @@
from unittest.mock import Mock

from tqdm import tqdm

from sdmetrics.demos import load_demo
from sdmetrics.reports.multi_table._properties import DataValidity


class TestDataValidity:

def test_end_to_end(self):
"""Test the ``DataValidity`` multi-table property end to end."""
# Setup
real_data, synthetic_data, metadata = load_demo(modality='multi_table')
data_validity = DataValidity()

# Run
result = data_validity.get_score(real_data, synthetic_data, metadata)

# Assert
assert result == 0.9444444444444445

def test_with_progress_bar(self):
"""Test that the progress bar is correctly updated."""
# Setup
real_data, synthetic_data, metadata = load_demo(modality='multi_table')
data_validity = DataValidity()
num_columns = sum(len(table['columns']) for table in metadata['tables'].values())

progress_bar = tqdm(total=num_columns)
mock_update = Mock()
progress_bar.update = mock_update

# Run
result = data_validity.get_score(real_data, synthetic_data, metadata, progress_bar)

# Assert
assert result == 0.9444444444444445
assert mock_update.call_count == num_columns
@@ -0,0 +1,73 @@
import pandas as pd

from sdmetrics.demos import load_demo
from sdmetrics.reports.single_table._properties import DataValidity


class TestDataValidity:

def test_get_score(self):
"""Test the ``get_score`` method"""
# Setup
real_data, synthetic_data, metadata = load_demo('single_table')

# Run
data_validity_property = DataValidity()
score = data_validity_property.get_score(real_data, synthetic_data, metadata)

# Assert
expected_details_dict = {
'Column': [
'start_date', 'end_date', 'salary', 'duration', 'student_id',
'high_perc', 'high_spec', 'mba_spec', 'second_perc', 'gender',
'degree_perc', 'placed', 'experience_years', 'employability_perc',
'mba_perc', 'work_experience', 'degree_type'
],
'Metric': [
'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence',
'KeyUniqueness', 'BoundaryAdherence', 'CategoryAdherence', 'CategoryAdherence',
'BoundaryAdherence', 'CategoryAdherence', 'BoundaryAdherence', 'CategoryAdherence',
'BoundaryAdherence', 'BoundaryAdherence', 'BoundaryAdherence', 'CategoryAdherence',
'CategoryAdherence'
],
'Score': [
0.8503937007874016, 0.8615384615384616, 0.9444444444444444,
1.0, 1.0, 0.8651162790697674, 1.0, 1.0, 0.9255813953488372,
1.0, 0.9441860465116279, 1.0, 1.0, 0.8883720930232558,
0.8930232558139535, 1.0, 1.0
]
}
expected_details = pd.DataFrame(expected_details_dict)
pd.testing.assert_frame_equal(data_validity_property.details, expected_details)
assert score == 0.9513326868551618

def test_get_score_errors(self):
"""Test the ``get_score`` method when the metrics are raising errors for some columns."""
# Setup
real_data, synthetic_data, metadata = load_demo('single_table')

real_data['start_date'].iloc[0] = 0
real_data['employability_perc'].iloc[2] = 'a'

# Run
data_validity_property = DataValidity()

expected_message_1 = (
"TypeError: '<=' not supported between instances of 'int' and 'Timestamp'"
)
expected_message_2 = (
"TypeError: '<=' not supported between instances of 'float' and 'str'"
)

score = data_validity_property.get_score(real_data, synthetic_data, metadata)

# Assert
details = data_validity_property.details
details_nan = details.loc[pd.isna(details['Score'])]
column_names_nan = details_nan['Column'].tolist()
error_messages = details_nan['Error'].tolist()
assert column_names_nan == ['start_date', 'employability_perc']
assert error_messages[0] == expected_message_1
assert error_messages[1] == expected_message_2
assert score == 0.9622593255151395
14 changes: 14 additions & 0 deletions tests/unit/reports/multi_table/_properties/test_validity.py
@@ -0,0 +1,14 @@
"""Test Data Validity multi-table class."""
from sdmetrics.reports.multi_table._properties import DataValidity
from sdmetrics.reports.single_table._properties import DataValidity as SingleTableDataValidity


def test__init__():
"""Test the ``__init__`` method."""
# Setup
data_validity = DataValidity()

# Assert
assert data_validity._properties == {}
assert data_validity._single_table_property == SingleTableDataValidity
assert data_validity._num_iteration_case == 'column'