diff --git a/metasyncontrib/disclosure/base.py b/metasyncontrib/disclosure/base.py
index 84fc762..efdf2bb 100644
--- a/metasyncontrib/disclosure/base.py
+++ b/metasyncontrib/disclosure/base.py
@@ -26,19 +26,18 @@ class DisclosureConstantMixin(BaseDistribution):
     """Mixin class to overload fit method for constant distributions."""
 
     @classmethod
-    def fit(cls, series, *args, n_avg: int = 11, **kwargs) -> BaseDistribution:
+    def fit(cls, series, *args, partition_size: int = 11, **kwargs) -> BaseDistribution:
         """Fit constant distributions with disclosure control rules in place."""
         pl_series: pl.Series = cls._to_series(series)
 
-        # if unique, just get that value if it occurs at least n_avg times
-        if pl_series.n_unique() == 1 and pl_series.len() >= n_avg:
+        # if unique, just get that value if it occurs at least partition_size times
+        if pl_series.n_unique() == 1 and pl_series.len() >= partition_size:
             return cls._fit(pl_series, *args, **kwargs)
 
         if pl_series.n_unique() > 1:
-            # if not unique, ensure most common value occurs at least n_avg times
+            # if not unique, ensure most common value occurs at least partition_size times
             _value, count = pl_series.value_counts(sort=True).row(0)
-            if count >= n_avg:
+            if count >= partition_size:
                 return cls._fit(pl_series, *args, **kwargs)
 
         return cls.default_distribution()
-
diff --git a/metasyncontrib/disclosure/categorical.py b/metasyncontrib/disclosure/categorical.py
index ec41ecb..6ea37a1 100644
--- a/metasyncontrib/disclosure/categorical.py
+++ b/metasyncontrib/disclosure/categorical.py
@@ -13,15 +13,15 @@ class DisclosureMultinoulli(MultinoulliDistribution):
     """Disclosure variant for multinoulli distribution.
 
-    It checks that all labels appear at least n_avg times, and that
+    It checks that all labels appear at least partition_size times, and that
     there is no label with >90% of the counts.
     """
 
     @classmethod
-    def _fit(cls, values: pl.Series, n_avg: int = 11):
+    def _fit(cls, values: pl.Series, partition_size: int = 11):
         dist = super()._fit(values)
-        labels = dist.labels[dist.probs >= n_avg / len(values)]
-        probs = dist.probs[dist.probs >= n_avg / len(values)]
+        labels = dist.labels[dist.probs >= partition_size / len(values)]
+        probs = dist.probs[dist.probs >= partition_size / len(values)]
         if len(probs) == 0 or probs.max() >= 0.9:
             if MetaVar.get_var_type(values) == "discrete":
                 return cls([1, 2, 3], [0.1, 0.2, 0.7])  # type: ignore
diff --git a/metasyncontrib/disclosure/datetime.py b/metasyncontrib/disclosure/datetime.py
index b99dcdb..93a92d0 100644
--- a/metasyncontrib/disclosure/datetime.py
+++ b/metasyncontrib/disclosure/datetime.py
@@ -25,8 +25,8 @@ class DisclosureDateTime(DateTimeUniformDistribution):
     """Disclosure implementation for the datetime distribution."""
 
     @classmethod
-    def _fit(cls, values: pl.Series, n_avg: int = 11) -> DisclosureDateTime:
-        sub_series = micro_aggregate(values, n_avg)
+    def _fit(cls, values: pl.Series, partition_size: int = 11) -> DisclosureDateTime:
+        sub_series = micro_aggregate(values, partition_size)
         return cls(sub_series.min(), sub_series.max(), cls._get_precision(values))
 
 
@@ -35,11 +35,11 @@ class DisclosureTime(TimeUniformDistribution):
     """Disclosure implementation for the time distribution."""
 
     @classmethod
-    def _fit(cls, values: pl.Series, n_avg: int = 11):
+    def _fit(cls, values: pl.Series, partition_size: int = 11):
         # Convert time to a datetime so that the microaggregation works
         today = dt.date(1970, 1, 1)
         dt_series = pl.Series([dt.datetime.combine(today, t) for t in values])
-        dt_sub_series = micro_aggregate(dt_series, n_avg)
+        dt_sub_series = micro_aggregate(dt_series, partition_size)
 
         # Convert back into time
         sub_series = pl.Series([dt_val.time() for dt_val in dt_sub_series])
@@ -51,10 +51,10 @@ class DisclosureDate(DateUniformDistribution):
     """Disclosure implementation for the date distribution."""
 
     @classmethod
-    def _fit(cls, values: pl.Series, n_avg: int = 11) -> DisclosureDate:
+    def _fit(cls, values: pl.Series, partition_size: int = 11) -> DisclosureDate:
         # Convert dates to datetimes
         dt_series = pl.Series([dt.datetime.combine(d, dt.time(hour=12)) for d in values])
-        dt_sub_series = micro_aggregate(dt_series, n_avg)
+        dt_sub_series = micro_aggregate(dt_series, partition_size)
 
         # Convert back into dates
         sub_series = pl.Series([dt_val.date() for dt_val in dt_sub_series])
diff --git a/metasyncontrib/disclosure/discrete.py b/metasyncontrib/disclosure/discrete.py
index 5c198fe..3f82954 100644
--- a/metasyncontrib/disclosure/discrete.py
+++ b/metasyncontrib/disclosure/discrete.py
@@ -44,11 +44,11 @@ class DisclosureUniqueKey(UniqueKeyDistribution):
     """Implementation for unique key distribution."""
 
     @classmethod
-    def _fit(cls, values: pl.Series, n_avg: int = 11):
+    def _fit(cls, values: pl.Series, partition_size: int = 11):
         orig_dist = super()._fit(values)
         if orig_dist.consecutive:
             return cls(0, True)
-        sub_values = micro_aggregate(values, n_avg)
+        sub_values = micro_aggregate(values, partition_size)
         return super()._fit(sub_values)
 
 @metadist_disclosure()
diff --git a/metasyncontrib/disclosure/numerical.py b/metasyncontrib/disclosure/numerical.py
index c90f3d8..cf55fa7 100644
--- a/metasyncontrib/disclosure/numerical.py
+++ b/metasyncontrib/disclosure/numerical.py
@@ -10,8 +10,8 @@ class DisclosureNumericalMixin(BaseDistribution):
     """Mixin class to create numerical distributions of the disclosure kind."""
 
     @classmethod
-    def fit(cls, series, *args, n_avg: int = 11, **kwargs) -> BaseDistribution:
+    def fit(cls, series, *args, partition_size: int = 11, **kwargs) -> BaseDistribution:
         """Fit numeric distributions with disclosure control rules in place."""
         pl_series = cls._to_series(series)
-        sub_series = micro_aggregate(pl_series, n_avg)
+        sub_series = micro_aggregate(pl_series, partition_size)
         return cls._fit(sub_series, *args, **kwargs)
diff --git a/metasyncontrib/disclosure/privacy.py b/metasyncontrib/disclosure/privacy.py
index bc8c15f..1a03ddb 100644
--- a/metasyncontrib/disclosure/privacy.py
+++ b/metasyncontrib/disclosure/privacy.py
@@ -10,7 +10,7 @@ class DisclosurePrivacy(BasePrivacy):
 
     Arguments:
     ---------
-    n_avg:
+    partition_size:
         Number of elements to aggregate into one bin. Higher values mean better
         protected privacy, but lower statistical accuracy.
 
@@ -18,10 +18,10 @@ class DisclosurePrivacy(BasePrivacy):
 
     name = "disclosure"
 
-    def __init__(self, n_avg: int = 11):
+    def __init__(self, partition_size: int = 11):
         """Initialize the disclosure privacy object."""
-        self.n_avg = n_avg
+        self.partition_size = partition_size
 
     def to_dict(self) -> dict:
         """Create a dictionary that gives the privacy type, and parameters."""
-        return {"name": self.name, "parameters": {"n_avg": self.n_avg}}
+        return {"name": self.name, "parameters": {"partition_size": self.partition_size}}
diff --git a/metasyncontrib/disclosure/string.py b/metasyncontrib/disclosure/string.py
index 5309b34..e3ec778 100644
--- a/metasyncontrib/disclosure/string.py
+++ b/metasyncontrib/disclosure/string.py
@@ -15,7 +15,8 @@ class DisclosureFaker(FakerDistribution):
     """Faker distribution for disclosure control."""
 
     @classmethod
-    def _fit(cls, values, faker_type: str = "city", locale: str = "en_US", n_avg: int = 11):  # pylint: disable=unused-argument
+    def _fit(cls, values, faker_type: str = "city", locale: str = "en_US",
+             partition_size: int = 11):  # pylint: disable=unused-argument
         return super()._fit(values, faker_type=faker_type, locale=locale)
 
 
@@ -24,7 +25,8 @@ class DisclosureUniqueFaker(UniqueFakerDistribution):
     """Faker distribution for disclosure control that produces unique values."""
 
     @classmethod
-    def _fit(cls, values, faker_type: str = "city", locale: str = "en_US", n_avg: int = 11):  # pylint: disable=unused-argument
+    def _fit(cls, values, faker_type: str = "city", locale: str = "en_US",
+             partition_size: int = 11):  # pylint: disable=unused-argument
         return super()._fit(values, faker_type=faker_type, locale=locale)
 
 
@@ -33,7 +35,7 @@ class DisclosureFreetext(FreeTextDistribution):
     """Disclosure implementation of freetext distribution."""
 
     @classmethod
-    def _fit(cls, values, max_values: int = 50, n_avg: int = 11):  # pylint: disable=unused-argument
+    def _fit(cls, values, max_values: int = 50, partition_size: int = 11):  # pylint: disable=unused-argument
         return super()._fit(values, max_values=max_values)
 
 @metadist_disclosure()
diff --git a/metasyncontrib/disclosure/utils.py b/metasyncontrib/disclosure/utils.py
index adc6cd3..0f2fc4f 100644
--- a/metasyncontrib/disclosure/utils.py
+++ b/metasyncontrib/disclosure/utils.py
@@ -28,7 +28,7 @@ def _compute_dominance(block_values, reverse=False):
 
 def _create_subsample(  # pylint: disable=too-many-locals
     values,
-    n_avg: int = 11,
+    partition_size: int = 11,
     pre_remove: int = 0,
     post_remove: int = 0,
 ) -> tuple[list, float]:
@@ -36,7 +36,7 @@
     sorted_values = sorted_values[pre_remove : len(values) - post_remove]
     n_values = len(sorted_values)
-    n_blocks = n_values // n_avg
+    n_blocks = n_values // partition_size
     if n_blocks <= 1:
         raise ValueError("Cannot find subsample with current settings.")
     min_block_size = n_values // n_blocks
diff --git a/tests/test_constant.py b/tests/test_constant.py
index b10b5ad..03fd0e3 100644
--- a/tests/test_constant.py
+++ b/tests/test_constant.py
@@ -33,5 +33,5 @@ def test_constant(dist_builtin, dist_disclosure, value):
     dist = dist_builtin(value)
     data = [dist.draw() for _ in range(21)]
 
-    assert dist_disclosure.fit(data, n_avg=22)._param_dict().get("value") != value
-    assert dist_disclosure.fit(data, n_avg=11)._param_dict().get("value") == value
+    assert dist_disclosure.fit(data, partition_size=22)._param_dict().get("value") != value
+    assert dist_disclosure.fit(data, partition_size=11)._param_dict().get("value") == value
diff --git a/tests/test_other_dist.py b/tests/test_other_dist.py
index ad4219b..1da7a0c 100644
--- a/tests/test_other_dist.py
+++ b/tests/test_other_dist.py
@@ -48,6 +48,6 @@ def test_string():
     dist = DisclosureFaker.default_distribution()
     series = pl.Series([dist.draw() for _ in range(100)])
     assert len(series)
-    dist = DisclosureFaker.fit(series, n_avg=11)
+    dist = DisclosureFaker.fit(series, partition_size=11)
     assert isinstance(dist, DisclosureFaker)
     # assert len([dist.draw() for _ in range(100)]) == 100
diff --git a/tests/test_provider.py b/tests/test_provider.py
index ad15543..0f3d3f8 100644
--- a/tests/test_provider.py
+++ b/tests/test_provider.py
@@ -11,7 +11,7 @@ def test_disclosure_provider():
 
 
 @mark.parametrize("distribution", get_distribution_provider("metasyn-disclosure").distributions)
-@mark.parametrize("privacy_kwargs", ({}, {"n_avg": 10}, {"n_avg": 15}))
+@mark.parametrize("privacy_kwargs", ({}, {"partition_size": 10}, {"partition_size": 15}))
 def test_dist_validation(distribution, privacy_kwargs):
     np.random.seed(45)
     privacy = DisclosurePrivacy(**privacy_kwargs)
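Usage note (not part of the diff): a minimal sketch of how callers pass the renamed
keyword after this change, mirroring the updated tests. It assumes DisclosureFaker and
DisclosurePrivacy remain importable from the module paths shown above; adjust imports
to the package's public entry points if they differ.

    import polars as pl
    from metasyncontrib.disclosure.string import DisclosureFaker   # path as in this diff
    from metasyncontrib.disclosure.privacy import DisclosurePrivacy

    # Fit a disclosure-controlled distribution; the keyword was n_avg=11 before this change.
    series = pl.Series(["Utrecht", "Amsterdam"] * 20)
    dist = DisclosureFaker.fit(series, partition_size=11)

    # The privacy object serializes the renamed parameter as shown in privacy.py above.
    privacy = DisclosurePrivacy(partition_size=11)
    print(privacy.to_dict())  # {"name": "disclosure", "parameters": {"partition_size": 11}}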