Skip to content

Use raw auc then normalize for detection test #236

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,5 @@ ENV/
# OS Files
.DS_Store

# vscode stuff
.vscode/
4 changes: 3 additions & 1 deletion sdmetrics/single_table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from sdmetrics.single_table.bayesian_network import BNLikelihood, BNLogLikelihood
from sdmetrics.single_table.detection.base import DetectionMetric
from sdmetrics.single_table.detection.sklearn import (
LogisticDetection, ScikitLearnClassifierDetectionMetric, SVCDetection)
GradientBoostingDetection, LogisticDetection, ScikitLearnClassifierDetectionMetric,
SVCDetection)
from sdmetrics.single_table.efficacy.base import MLEfficacyMetric
from sdmetrics.single_table.efficacy.binary import (
BinaryAdaBoostClassifier, BinaryDecisionTreeClassifier, BinaryEfficacyMetric,
Expand Down Expand Up @@ -47,6 +48,7 @@
'DetectionMetric',
'LogisticDetection',
'SVCDetection',
'GradientBoostingDetection',
'ScikitLearnClassifierDetectionMetric',
'MLEfficacyMetric',
'BinaryEfficacyMetric',
Expand Down
4 changes: 3 additions & 1 deletion sdmetrics/single_table/detection/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Machine Learning Detection metrics for single table datasets."""

from sdmetrics.single_table.detection.sklearn import LogisticDetection, SVCDetection
from sdmetrics.single_table.detection.sklearn import (
GradientBoostingDetection, LogisticDetection, SVCDetection)

__all__ = [
'GradientBoostingDetection',
'LogisticDetection',
'SVCDetection'
]
16 changes: 9 additions & 7 deletions sdmetrics/single_table/detection/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class DetectionMetric(SingleTableMetric):
"""

name = 'SingleTable Detection'
goal = Goal.MAXIMIZE
goal = Goal.MINIMIZE
min_value = 0.0
max_value = 1.0

Expand All @@ -50,7 +50,7 @@ def compute(cls, real_data, synthetic_data, metadata=None):
This builds a Machine Learning Classifier that learns to tell the synthetic
data apart from the real data, which later on is evaluated using Cross Validation.

The output of the metric is one minus the average ROC AUC score obtained.
The output of the metric is the average ROC AUC score obtained.

Args:
real_data (Union[numpy.ndarray, pandas.DataFrame]):
Expand Down Expand Up @@ -85,22 +85,24 @@ def compute(cls, real_data, synthetic_data, metadata=None):
y_pred = cls._fit_predict(X[train_index], y[train_index], X[test_index])
roc_auc = roc_auc_score(y[test_index], y_pred)

scores.append(max(0.5, roc_auc) * 2 - 1)
scores.append(max(0.5,roc_auc))

return 1 - np.mean(scores)
return np.mean(scores)
except ValueError as err:
raise IncomputableMetricError(f'DetectionMetric: Unable to be fit with error {err}')

@classmethod
def normalize(cls, raw_score):
    """Return the `raw_score` normalized to be higher-is-better in [0, 1].

    The raw score is an average ROC AUC that `compute` floors at 0.5, so it is
    rescaled with `2 * (1 - raw_score)`: 0.5 maps to 1.0 and 1.0 maps to 0.0.

    Args:
        raw_score (float):
            The value of the metric from `compute`.

    Returns:
        float:
            The result of passing `2 * (1 - raw_score)` to the parent `normalize`.

    Raises:
        ValueError:
            If `raw_score` is below 0.5 (or is not comparable, e.g. NaN).
    """
    # Raise explicitly rather than `assert`: assertions are stripped under `python -O`,
    # which would let out-of-range scores through silently.
    if not raw_score >= 0.5:
        raise ValueError('raw auc score should be in [0.5,1]')

    # NOTE(review): `goal` is MINIMIZE on this class. If the parent `normalize` inverts
    # MINIMIZE metrics, this already-inverted score would be flipped back -- confirm.
    score = 2 * (1 - raw_score)
    return super().normalize(score)
17 changes: 17 additions & 0 deletions sdmetrics/single_table/detection/sklearn.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""scikit-learn based DetectionMetrics for single table datasets."""

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
Expand Down Expand Up @@ -67,3 +68,19 @@ class SVCDetection(ScikitLearnClassifierDetectionMetric):
@staticmethod
def _get_classifier():
    """Return an `SVC` with probability estimates enabled and `gamma='scale'`."""
    svc_kwargs = {'probability': True, 'gamma': 'scale'}
    return SVC(**svc_kwargs)


class GradientBoostingDetection(ScikitLearnClassifierDetectionMetric):
    """ScikitLearnClassifierDetectionMetric based on a GradientBoostingClassifier.

    This metric builds a GradientBoostingClassifier that learns to tell the synthetic
    data apart from the real data, which later on is evaluated using Cross Validation.

    The output of the metric is the average ROC AUC score obtained.
    """

    name = 'GradientBoosting Detection'

    @staticmethod
    def _get_classifier():
        """Return a `GradientBoostingClassifier` built with its default parameters."""
        return GradientBoostingClassifier()
4 changes: 3 additions & 1 deletion tests/integration/single_table/test_single_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from sdmetrics.goal import Goal
from sdmetrics.single_table.base import SingleTableMetric
from sdmetrics.single_table.bayesian_network import BNLikelihood, BNLogLikelihood
from sdmetrics.single_table.detection import LogisticDetection, SVCDetection
from sdmetrics.single_table.detection import (
GradientBoostingDetection, LogisticDetection, SVCDetection)
from sdmetrics.single_table.multi_column_pairs import (
ContingencySimilarity, ContinuousKLDivergence, DiscreteKLDivergence)
from sdmetrics.single_table.multi_single_column import (
Expand All @@ -17,6 +18,7 @@
METRICS = [
CSTest,
KSComplement,
GradientBoostingDetection,
LogisticDetection,
SVCDetection,
ContinuousKLDivergence,
Expand Down