From 0b86b7be757d1321e26ec1b381fd1bb8a7256868 Mon Sep 17 00:00:00 2001
From: Erik-Jan van Kesteren
Date: Fri, 7 Jun 2024 16:18:12 +0200
Subject: [PATCH] Apply dominance rule in constant distributions (#38)

* Apply dominance rule in constant distributions

fixes #36

* Updates for feedback from @qubixes

- also gave categorical new default distributions
- fixed linting and test errors

* Small fix for ruff error

* One last ruff fix

thanks ruff
---
 metasyncontrib/disclosure/base.py        | 15 ++-------------
 metasyncontrib/disclosure/categorical.py |  6 +++++-
 metasyncontrib/disclosure/continuous.py  |  5 +++++
 metasyncontrib/disclosure/datetime.py    | 13 +++++++++++++
 metasyncontrib/disclosure/discrete.py    |  5 +++++
 metasyncontrib/disclosure/string.py      |  5 +++++
 tests/test_constant.py                   | 19 +++++++++----------
 7 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/metasyncontrib/disclosure/base.py b/metasyncontrib/disclosure/base.py
index efdf2bb..74db9cf 100644
--- a/metasyncontrib/disclosure/base.py
+++ b/metasyncontrib/disclosure/base.py
@@ -1,6 +1,5 @@
 """Base class for all disclosure control distributions."""
 
-import polars as pl
 from metasyn.distribution.base import BaseDistribution
 
 
@@ -28,16 +27,6 @@ class DisclosureConstantMixin(BaseDistribution):
     @classmethod
     def fit(cls, series, *args, partition_size: int = 11, **kwargs) -> BaseDistribution:
         """Fit constant distributions with disclosure control rules in place."""
-        pl_series: pl.Series = cls._to_series(series)
-
-        # if unique, just get that value if it occurs at least partition_size times
-        if pl_series.n_unique() == 1 and pl_series.len() >= partition_size:
-            return cls._fit(pl_series, *args, **kwargs)
-
-        if pl_series.n_unique() > 1:
-            # if not unique, ensure most common value occurs at least partition_size times
-            _value, count = pl_series.value_counts(sort=True).row(0)
-            if count >= partition_size:
-                return cls._fit(pl_series, *args, **kwargs)
-
+        # NB: dominance rule ensures that constant distribution is essentially never
+        # allowed under formal disclosure control. Always return default distribution.
         return cls.default_distribution()
diff --git a/metasyncontrib/disclosure/categorical.py b/metasyncontrib/disclosure/categorical.py
index 6ea37a1..4b0f2bc 100644
--- a/metasyncontrib/disclosure/categorical.py
+++ b/metasyncontrib/disclosure/categorical.py
@@ -24,7 +24,11 @@ def _fit(cls, values: pl.Series, partition_size: int = 11):
         probs = dist.probs[dist.probs >= partition_size / len(values)]
         if len(probs) == 0 or probs.max() >= 0.9:
             if MetaVar.get_var_type(values) == "discrete":
-                return cls([1, 2, 3], [0.1, 0.2, 0.7])  # type: ignore
+                return cls([77777, 88888, 99999], [0.1, 0.2, 0.7])  # type: ignore
             return cls.default_distribution()
         probs /= probs.sum()
         return cls(labels, probs)
+
+    @classmethod
+    def default_distribution(cls):  # noqa: D102
+        return cls(["A_REDACTED", "B_REDACTED", "C_REDACTED"], [0.1, 0.3, 0.6])
diff --git a/metasyncontrib/disclosure/continuous.py b/metasyncontrib/disclosure/continuous.py
index fa543d3..4c80caa 100644
--- a/metasyncontrib/disclosure/continuous.py
+++ b/metasyncontrib/disclosure/continuous.py
@@ -37,6 +37,11 @@ class DisclosureTruncatedNormal(DisclosureNumericalMixin, TruncatedNormalDistrib
 class DisclosureExponential(DisclosureNumericalMixin, ExponentialDistribution):
     """Disclosure exponential distribution."""
 
+
 @metadist_disclosure()
 class DisclosureConstant(DisclosureConstantMixin, ConstantDistribution):
     """Disclosure controlled ConstantDistribution."""
+
+    @classmethod
+    def default_distribution(cls):  # noqa: D102
+        return cls(99999.9)
diff --git a/metasyncontrib/disclosure/datetime.py b/metasyncontrib/disclosure/datetime.py
index 93a92d0..e26bb2f 100644
--- a/metasyncontrib/disclosure/datetime.py
+++ b/metasyncontrib/disclosure/datetime.py
@@ -60,16 +60,29 @@ def _fit(cls, values: pl.Series, partition_size: int = 11) -> DisclosureDate:
         sub_series = pl.Series([dt_val.date() for dt_val in dt_sub_series])
         return cls(sub_series.min(), sub_series.max())
 
+
 @metadist_disclosure()
 class DisclosureDateTimeConstant(DisclosureConstantMixin, DateTimeConstantDistribution):
     """Disclosure controlled DateTimeConstantDistribution."""
 
+    @classmethod
+    def default_distribution(cls):  # noqa: D102
+        return cls("1970-01-01T00:00:00")
+
 
 @metadist_disclosure()
 class DisclosureTimeConstant(DisclosureConstantMixin, TimeConstantDistribution):
     """Disclosure controlled TimeConstantDistribution."""
 
+    @classmethod
+    def default_distribution(cls):  # noqa: D102
+        return cls("00:00:00")
+
 
 @metadist_disclosure()
 class DisclosureDateConstant(DisclosureConstantMixin, DateConstantDistribution):
     """Disclosure controlled DateConstantDistribution."""
+
+    @classmethod
+    def default_distribution(cls):  # noqa: D102
+        return cls("1970-01-01")
diff --git a/metasyncontrib/disclosure/discrete.py b/metasyncontrib/disclosure/discrete.py
index 3f82954..5fbb934 100644
--- a/metasyncontrib/disclosure/discrete.py
+++ b/metasyncontrib/disclosure/discrete.py
@@ -51,6 +51,11 @@ def _fit(cls, values: pl.Series, partition_size: int = 11):
         sub_values = micro_aggregate(values, partition_size)
         return super()._fit(sub_values)
 
+
 @metadist_disclosure()
 class DisclosureDiscreteConstant(DisclosureConstantMixin, DiscreteConstantDistribution):
     """Disclosure controlled DiscreteConstantDistribution."""
+
+    @classmethod
+    def default_distribution(cls):  # noqa: D102
+        return cls(99999)
diff --git a/metasyncontrib/disclosure/string.py b/metasyncontrib/disclosure/string.py
index e3ec778..02c5369 100644
--- a/metasyncontrib/disclosure/string.py
+++ b/metasyncontrib/disclosure/string.py
@@ -38,6 +38,11 @@ class DisclosureFreetext(FreeTextDistribution):
     def _fit(cls, values, max_values: int = 50, partition_size: int = 11):  # pylint: disable=unused-argument
         return super()._fit(values, max_values=max_values)
 
+
 @metadist_disclosure()
 class DisclosureStringConstant(DisclosureConstantMixin, StringConstantDistribution):
     """Disclosure controlled StringConstantDistribution."""
+
+    @classmethod
+    def default_distribution(cls):  # noqa: D102
+        return cls("REDACTED")
diff --git a/tests/test_constant.py b/tests/test_constant.py
index 03fd0e3..f658125 100644
--- a/tests/test_constant.py
+++ b/tests/test_constant.py
@@ -19,19 +19,18 @@
 
 
 @mark.parametrize(
-    "dist_builtin, dist_disclosure, value",
+    "dist_builtin, dist_disclosure, value, disclosurevalue",
     [
-        (ConstantDistribution, DisclosureConstant, 8.0),
-        (DiscreteConstantDistribution, DisclosureDiscreteConstant, 8),
-        (StringConstantDistribution, DisclosureStringConstant, "Secretvalue"),
-        (DateTimeConstantDistribution, DisclosureDateTimeConstant, "2024-02-23T12:08:38"),
-        (TimeConstantDistribution, DisclosureTimeConstant, "12:08:38"),
-        (DateConstantDistribution, DisclosureDateConstant, "2024-02-23"),
+        (ConstantDistribution, DisclosureConstant, 8.0, 99999.9),
+        (DiscreteConstantDistribution, DisclosureDiscreteConstant, 8, 99999),
+        (StringConstantDistribution, DisclosureStringConstant, "Secretvalue", "REDACTED"),
+        (DateTimeConstantDistribution, DisclosureDateTimeConstant, "2024-02-23T12:08:38", "1970-01-01T00:00:00"),  # noqa: E501
+        (TimeConstantDistribution, DisclosureTimeConstant, "12:08:38", "00:00:00"),
+        (DateConstantDistribution, DisclosureDateConstant, "2024-02-23", "1970-01-01"),
     ],
 )
-def test_constant(dist_builtin, dist_disclosure, value):
+def test_constant(dist_builtin, dist_disclosure, value, disclosurevalue):  # noqa: D103
     dist = dist_builtin(value)
     data = [dist.draw() for _ in range(21)]
 
-    assert dist_disclosure.fit(data, partition_size=22)._param_dict().get("value") != value
-    assert dist_disclosure.fit(data, partition_size=11)._param_dict().get("value") == value
+    assert dist_disclosure.fit(data, partition_size=11)._param_dict().get("value") == disclosurevalue