diff --git a/.gitignore b/.gitignore index 8c6107d1..b1ea75c2 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,5 @@ ENV/ # OS Files .DS_Store +# vcode stuff +.vcode/ diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py index 35704626..ab6c76c0 100644 --- a/sdmetrics/single_table/__init__.py +++ b/sdmetrics/single_table/__init__.py @@ -6,7 +6,8 @@ from sdmetrics.single_table.bayesian_network import BNLikelihood, BNLogLikelihood from sdmetrics.single_table.detection.base import DetectionMetric from sdmetrics.single_table.detection.sklearn import ( - LogisticDetection, ScikitLearnClassifierDetectionMetric, SVCDetection) + GradientBoostingDetection, LogisticDetection, ScikitLearnClassifierDetectionMetric, + SVCDetection) from sdmetrics.single_table.efficacy.base import MLEfficacyMetric from sdmetrics.single_table.efficacy.binary import ( BinaryAdaBoostClassifier, BinaryDecisionTreeClassifier, BinaryEfficacyMetric, @@ -47,6 +48,7 @@ 'DetectionMetric', 'LogisticDetection', 'SVCDetection', + 'GradientBoostingDetection', 'ScikitLearnClassifierDetectionMetric', 'MLEfficacyMetric', 'BinaryEfficacyMetric', diff --git a/sdmetrics/single_table/detection/__init__.py b/sdmetrics/single_table/detection/__init__.py index b987a119..8450948b 100644 --- a/sdmetrics/single_table/detection/__init__.py +++ b/sdmetrics/single_table/detection/__init__.py @@ -1,8 +1,10 @@ """Machine Learning Detection metrics for single table datasets.""" -from sdmetrics.single_table.detection.sklearn import LogisticDetection, SVCDetection +from sdmetrics.single_table.detection.sklearn import ( + GradientBoostingDetection, LogisticDetection, SVCDetection) __all__ = [ + 'GradientBoostingDetection', 'LogisticDetection', 'SVCDetection' ] diff --git a/sdmetrics/single_table/detection/base.py b/sdmetrics/single_table/detection/base.py index e3bc1295..c27dc1b9 100644 --- a/sdmetrics/single_table/detection/base.py +++ b/sdmetrics/single_table/detection/base.py @@ -34,7 +34,7 
@@ class DetectionMetric(SingleTableMetric): """ name = 'SingleTable Detection' - goal = Goal.MAXIMIZE + goal = Goal.MINIMIZE min_value = 0.0 max_value = 1.0 @@ -50,7 +50,7 @@ def compute(cls, real_data, synthetic_data, metadata=None): This builds a Machine Learning Classifier that learns to tell the synthetic data apart from the real data, which later on is evaluated using Cross Validation. - The output of the metric is one minus the average ROC AUC score obtained. + The output of the metric is the average ROC AUC score obtained. Args: real_data (Union[numpy.ndarray, pandas.DataFrame]): @@ -85,15 +85,15 @@ def compute(cls, real_data, synthetic_data, metadata=None): y_pred = cls._fit_predict(X[train_index], y[train_index], X[test_index]) roc_auc = roc_auc_score(y[test_index], y_pred) - scores.append(max(0.5, roc_auc) * 2 - 1) + scores.append(max(0.5,roc_auc)) - return 1 - np.mean(scores) + return np.mean(scores) except ValueError as err: raise IncomputableMetricError(f'DetectionMetric: Unable to be fit with error {err}') @classmethod def normalize(cls, raw_score): - """Return the `raw_score` as is, since it is already normalized. + """Return the `raw_score` normalized to be higher-is-better in [0, 1]. Args: raw_score (float): @@ -101,6 +101,8 @@ def normalize(cls, raw_score): Returns: float: - Simply returns `raw_score`. + Returns `2*(1-raw_score)`. 
""" - return super().normalize(raw_score) + assert raw_score >= 0.5, "raw auc score should be in [0.5,1]" + score = 2 * (1 - raw_score) + return super().normalize(score) diff --git a/sdmetrics/single_table/detection/sklearn.py b/sdmetrics/single_table/detection/sklearn.py index a33a33d9..38f1aa7e 100644 --- a/sdmetrics/single_table/detection/sklearn.py +++ b/sdmetrics/single_table/detection/sklearn.py @@ -1,5 +1,6 @@ """scikit-learn based DetectionMetrics for single table datasets.""" +from sklearn.ensemble import GradientBoostingClassifier from sklearn.impute import SimpleImputer from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline @@ -67,3 +68,19 @@ class SVCDetection(ScikitLearnClassifierDetectionMetric): @staticmethod def _get_classifier(): return SVC(probability=True, gamma='scale') + + +class GradientBoostingDetection(ScikitLearnClassifierDetectionMetric): + """ScikitLearnClassifierDetectionMetric based on a GradientBoostingClassifier. + + This metric builds a GradientBoostingClassifier that learns to tell the synthetic + data apart from the real data, which later on is evaluated using Cross Validation. + + The output of the metric is the average ROC AUC score obtained. 
+ """ + + name = 'GradientBoosting Detection' + + @staticmethod + def _get_classifier(): + return GradientBoostingClassifier() diff --git a/tests/integration/single_table/test_single_table.py b/tests/integration/single_table/test_single_table.py index 7ecd45b3..2f880887 100644 --- a/tests/integration/single_table/test_single_table.py +++ b/tests/integration/single_table/test_single_table.py @@ -7,7 +7,8 @@ from sdmetrics.goal import Goal from sdmetrics.single_table.base import SingleTableMetric from sdmetrics.single_table.bayesian_network import BNLikelihood, BNLogLikelihood -from sdmetrics.single_table.detection import LogisticDetection, SVCDetection +from sdmetrics.single_table.detection import ( + GradientBoostingDetection, LogisticDetection, SVCDetection) from sdmetrics.single_table.multi_column_pairs import ( ContingencySimilarity, ContinuousKLDivergence, DiscreteKLDivergence) from sdmetrics.single_table.multi_single_column import ( @@ -17,6 +18,7 @@ METRICS = [ CSTest, KSComplement, + GradientBoostingDetection, LogisticDetection, SVCDetection, ContinuousKLDivergence,