Skip to content

Commit

Permalink
Apply dominance rule in constant distributions (#38)
Browse files Browse the repository at this point in the history
* Apply dominance rule in constant distributions

fixes #36

* Updates for feedback from @qubixes

- also gave categorical new default distributions
- fixed linting and test errors

* Small fix for ruff error

* One last ruff fix

thanks ruff
  • Loading branch information
vankesteren authored Jun 7, 2024
1 parent f5496cb commit 0b86b7b
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 24 deletions.
15 changes: 2 additions & 13 deletions metasyncontrib/disclosure/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Base class for all disclosure control distributions."""

import polars as pl
from metasyn.distribution.base import BaseDistribution


Expand Down Expand Up @@ -28,16 +27,6 @@ class DisclosureConstantMixin(BaseDistribution):
@classmethod
def fit(cls, series, *args, partition_size: int = 11, **kwargs) -> BaseDistribution:
    """Fit a constant distribution with disclosure control rules in place.

    The data is deliberately not inspected: whatever is passed in, the
    class's default (placeholder) distribution is returned.

    Parameters
    ----------
    series:
        Values of the column. Ignored.
    *args, **kwargs:
        Accepted so the call signature stays compatible with other ``fit``
        callers. Ignored.
    partition_size:
        Accepted so the call signature stays compatible with other ``fit``
        callers. Ignored.

    Returns
    -------
    BaseDistribution
        The result of ``cls.default_distribution()``.
    """
    # NB: dominance rule ensures that constant distribution is essentially never
    # allowed under formal disclosure control. Always return default distribution.
    return cls.default_distribution()
6 changes: 5 additions & 1 deletion metasyncontrib/disclosure/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ def _fit(cls, values: pl.Series, partition_size: int = 11):
probs = dist.probs[dist.probs >= partition_size / len(values)]
if len(probs) == 0 or probs.max() >= 0.9:
if MetaVar.get_var_type(values) == "discrete":
return cls([1, 2, 3], [0.1, 0.2, 0.7]) # type: ignore
return cls([77777, 88888, 99999], [0.1, 0.2, 0.7]) # type: ignore
return cls.default_distribution()
probs /= probs.sum()
return cls(labels, probs)

@classmethod
def default_distribution(cls):
    """Return a three-category stand-in distribution with redacted labels."""
    labels = ["A_REDACTED", "B_REDACTED", "C_REDACTED"]
    probs = [0.1, 0.3, 0.6]
    return cls(labels, probs)
5 changes: 5 additions & 0 deletions metasyncontrib/disclosure/continuous.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ class DisclosureTruncatedNormal(DisclosureNumericalMixin, TruncatedNormalDistrib
class DisclosureExponential(DisclosureNumericalMixin, ExponentialDistribution):
"""Disclosure exponential distribution."""


@metadist_disclosure()
class DisclosureConstant(DisclosureConstantMixin, ConstantDistribution):
    """Constant distribution with disclosure control applied."""

    @classmethod
    def default_distribution(cls):
        """Return the fixed placeholder distribution used instead of a fitted constant."""
        placeholder = 99999.9
        return cls(placeholder)
13 changes: 13 additions & 0 deletions metasyncontrib/disclosure/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,29 @@ def _fit(cls, values: pl.Series, partition_size: int = 11) -> DisclosureDate:
sub_series = pl.Series([dt_val.date() for dt_val in dt_sub_series])
return cls(sub_series.min(), sub_series.max())


@metadist_disclosure()
class DisclosureDateTimeConstant(DisclosureConstantMixin, DateTimeConstantDistribution):
    """Datetime constant distribution with disclosure control applied."""

    @classmethod
    def default_distribution(cls):
        """Return the fixed placeholder distribution used instead of a fitted datetime."""
        placeholder = "1970-01-01T00:00:00"
        return cls(placeholder)


@metadist_disclosure()
class DisclosureTimeConstant(DisclosureConstantMixin, TimeConstantDistribution):
    """Time constant distribution with disclosure control applied."""

    @classmethod
    def default_distribution(cls):
        """Return the fixed placeholder distribution used instead of a fitted time."""
        placeholder = "00:00:00"
        return cls(placeholder)


@metadist_disclosure()
class DisclosureDateConstant(DisclosureConstantMixin, DateConstantDistribution):
    """Date constant distribution with disclosure control applied."""

    @classmethod
    def default_distribution(cls):
        """Return the fixed placeholder distribution used instead of a fitted date."""
        placeholder = "1970-01-01"
        return cls(placeholder)
5 changes: 5 additions & 0 deletions metasyncontrib/disclosure/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ def _fit(cls, values: pl.Series, partition_size: int = 11):
sub_values = micro_aggregate(values, partition_size)
return super()._fit(sub_values)


@metadist_disclosure()
class DisclosureDiscreteConstant(DisclosureConstantMixin, DiscreteConstantDistribution):
    """Discrete constant distribution with disclosure control applied."""

    @classmethod
    def default_distribution(cls):
        """Return the fixed placeholder distribution used instead of a fitted integer."""
        placeholder = 99999
        return cls(placeholder)
5 changes: 5 additions & 0 deletions metasyncontrib/disclosure/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ class DisclosureFreetext(FreeTextDistribution):
def _fit(cls, values, max_values: int = 50, partition_size: int = 11): # pylint: disable=unused-argument
return super()._fit(values, max_values=max_values)


@metadist_disclosure()
class DisclosureStringConstant(DisclosureConstantMixin, StringConstantDistribution):
    """String constant distribution with disclosure control applied."""

    @classmethod
    def default_distribution(cls):
        """Return the fixed placeholder distribution used instead of a fitted string."""
        placeholder = "REDACTED"
        return cls(placeholder)
19 changes: 9 additions & 10 deletions tests/test_constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,18 @@


@mark.parametrize(
    "dist_builtin, dist_disclosure, value, disclosurevalue",
    [
        (ConstantDistribution, DisclosureConstant, 8.0, 99999.9),
        (DiscreteConstantDistribution, DisclosureDiscreteConstant, 8, 99999),
        (StringConstantDistribution, DisclosureStringConstant, "Secretvalue", "REDACTED"),
        (
            DateTimeConstantDistribution,
            DisclosureDateTimeConstant,
            "2024-02-23T12:08:38",
            "1970-01-01T00:00:00",
        ),
        (TimeConstantDistribution, DisclosureTimeConstant, "12:08:38", "00:00:00"),
        (DateConstantDistribution, DisclosureDateConstant, "2024-02-23", "1970-01-01"),
    ],
)
def test_constant(dist_builtin, dist_disclosure, value, disclosurevalue):
    """Check that disclosure constant distributions return the placeholder, not the data value.

    Data is drawn from the built-in constant distribution, then fitted with the
    disclosure variant; the fitted ``value`` parameter must be the placeholder
    regardless of the partition size.
    """
    dist = dist_builtin(value)
    data = [dist.draw() for _ in range(21)]

    # One partition size above and one below the number of samples (21):
    # neither may expose the real constant.
    for partition_size in (22, 11):
        fitted = dist_disclosure.fit(data, partition_size=partition_size)
        fitted_value = fitted._param_dict().get("value")
        assert fitted_value != value
        assert fitted_value == disclosurevalue

0 comments on commit 0b86b7b

Please sign in to comment.