Skip to content

Commit

Permalink
remove ignore_dtype_columns
Browse files (browse the repository at this point in the history)
  • Loading branch information
R-Palazzo committed Nov 15, 2023
1 parent 27d728c commit a2b637f
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 53 deletions.
14 changes: 1 addition & 13 deletions sdmetrics/reports/single_table/_properties/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,9 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
Returns:
pandas.DataFrame
"""
column_to_ignore_dtype = []
non_pii_sdtype = [
'numerical', 'datetime', 'categorical', 'boolean'
]
for column_name in metadata['columns']:
sdtype = metadata['columns'][column_name]['sdtype']
if sdtype in non_pii_sdtype:
continue

column_to_ignore_dtype.append(column_name)

try:
score = TableStructure.compute(
real_data, synthetic_data,
ignore_dtype_columns=column_to_ignore_dtype
real_data, synthetic_data
)
error_message = None

Expand Down
16 changes: 3 additions & 13 deletions sdmetrics/single_table/table_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,15 @@ class TableStructure(SingleTableMetric):
max_value = 1

@classmethod
def compute_breakdown(cls, real_data, synthetic_data, ignore_dtype_columns=None):
def compute_breakdown(cls, real_data, synthetic_data):
"""Compute the score breakdown of the table format metric.
Args:
real_data (pandas.DataFrame):
The real data.
synthetic_data (pandas.DataFrame):
The synthetic data.
ignore_dtype_columns (list[str]):
List of column names to ignore when comparing data types.
Defaults to ``None``.
"""
ignore_dtype_columns = ignore_dtype_columns or []
missing_columns_in_synthetic = set(real_data.columns) - set(synthetic_data.columns)
invalid_names = []
invalid_sdtypes = []
Expand All @@ -47,9 +43,6 @@ def compute_breakdown(cls, real_data, synthetic_data, ignore_dtype_columns=None)
invalid_names.append(column)
continue

if column in ignore_dtype_columns:
continue

if synthetic_data[column].dtype != real_data[column].dtype:
invalid_sdtypes.append(column)

Expand All @@ -61,20 +54,17 @@ def compute_breakdown(cls, real_data, synthetic_data, ignore_dtype_columns=None)
return {'score': score}

@classmethod
def compute(cls, real_data, synthetic_data, ignore_dtype_columns=None):
def compute(cls, real_data, synthetic_data):
"""Compute the table format metric score.
Args:
real_data (pandas.DataFrame):
The real data.
synthetic_data (pandas.DataFrame):
The synthetic data.
ignore_dtype_columns (list[str]):
List of column names to ignore when comparing data types.
Defaults to ``None``.
Returns:
float:
The metric score.
"""
return cls.compute_breakdown(real_data, synthetic_data, ignore_dtype_columns)['score']
return cls.compute_breakdown(real_data, synthetic_data)['score']
4 changes: 2 additions & 2 deletions tests/unit/reports/single_table/_properties/test_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test__generate_details(self, table_format_mock):

# Assert
table_format_mock.assert_called_once_with(
real_data, synthetic_data, ignore_dtype_columns=[]
real_data, synthetic_data,
)

expected_details = pd.DataFrame({
Expand Down Expand Up @@ -78,7 +78,7 @@ def test__generate_details_with_id_column(self, table_format_mock):

# Assert
table_format_mock.assert_called_once_with(
real_data, synthetic_data, ignore_dtype_columns=['id']
real_data, synthetic_data
)

expected_details = pd.DataFrame({
Expand Down
26 changes: 1 addition & 25 deletions tests/unit/single_table/test_table_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,30 +109,6 @@ def test_compute_breakdown_with_invalid_dtypes(self, real_data):
expected_result = {'score': 0.6}
assert result == expected_result

def test_compute_breakdown_ignore_dtype_columns(self, real_data):
"""Test the ``compute_breakdown`` method when ignore_dtype_columns is set."""
# Setup
synthetic_data = pd.DataFrame({
'col_1': [3.0, 2.0, 1.0, 4.0, 5.0],
'col_2': ['A', 'B', 'C', 'D', 'E'],
'col_3': [True, False, True, False, True],
'col_4': [
'2020-01-11', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05'
],
'col_5': [4.0, 2.0, 3.0, 4.0, 5.0],
})

metric = TableStructure()

# Run
result = metric.compute_breakdown(
real_data, synthetic_data, ignore_dtype_columns=['col_4']
)

# Assert
expected_result = {'score': 0.8}
assert result == expected_result

def test_compute_breakdown_multiple_error(self, real_data):
"""Test the ``compute_breakdown`` method with the different failure modes."""
synthetic_data = pd.DataFrame({
Expand Down Expand Up @@ -174,5 +150,5 @@ def test_compute(self, compute_breakdown_mock, real_data):
result = metric.compute(real_data, synthetic_data)

# Assert
compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data, None)
compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data)
assert result == 0.6

0 comments on commit a2b637f

Please sign in to comment.