Skip to content

Commit

Permalink
Rename all n_avg -> partition_size (#39)
Browse files: browse the repository at this point in the history
  • Loading branch information
qubixes authored Jun 7, 2024
1 parent 90d8765 commit f5496cb
Show file tree
Hide file tree
Showing 11 changed files with 34 additions and 33 deletions.
11 changes: 5 additions & 6 deletions metasyncontrib/disclosure/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,18 @@ class DisclosureConstantMixin(BaseDistribution):
"""Mixin class to overload fit method for constant distributions."""

@classmethod
def fit(cls, series, *args, n_avg: int = 11, **kwargs) -> BaseDistribution:
def fit(cls, series, *args, partition_size: int = 11, **kwargs) -> BaseDistribution:
"""Fit constant distributions with disclosure control rules in place."""
pl_series: pl.Series = cls._to_series(series)

# if unique, just get that value if it occurs at least n_avg times
if pl_series.n_unique() == 1 and pl_series.len() >= n_avg:
# if unique, just get that value if it occurs at least partition_size times
if pl_series.n_unique() == 1 and pl_series.len() >= partition_size:
return cls._fit(pl_series, *args, **kwargs)

if pl_series.n_unique() > 1:
# if not unique, ensure most common value occurs at least n_avg times
# if not unique, ensure most common value occurs at least partition_size times
_value, count = pl_series.value_counts(sort=True).row(0)
if count >= n_avg:
if count >= partition_size:
return cls._fit(pl_series, *args, **kwargs)

return cls.default_distribution()

8 changes: 4 additions & 4 deletions metasyncontrib/disclosure/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
class DisclosureMultinoulli(MultinoulliDistribution):
"""Disclosure variant for multinoulli distribution.
It checks that all labels appear at least n_avg times, and that
It checks that all labels appear at least partition_size times, and that
there is no label with 90% or more of the counts.
"""

@classmethod
def _fit(cls, values: pl.Series, n_avg: int = 11):
def _fit(cls, values: pl.Series, partition_size: int = 11):
dist = super()._fit(values)
labels = dist.labels[dist.probs >= n_avg / len(values)]
probs = dist.probs[dist.probs >= n_avg / len(values)]
labels = dist.labels[dist.probs >= partition_size / len(values)]
probs = dist.probs[dist.probs >= partition_size / len(values)]
if len(probs) == 0 or probs.max() >= 0.9:
if MetaVar.get_var_type(values) == "discrete":
return cls([1, 2, 3], [0.1, 0.2, 0.7]) # type: ignore
Expand Down
12 changes: 6 additions & 6 deletions metasyncontrib/disclosure/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ class DisclosureDateTime(DateTimeUniformDistribution):
"""Disclosure implementation for the datetime distribution."""

@classmethod
def _fit(cls, values: pl.Series, n_avg: int = 11) -> DisclosureDateTime:
sub_series = micro_aggregate(values, n_avg)
def _fit(cls, values: pl.Series, partition_size: int = 11) -> DisclosureDateTime:
sub_series = micro_aggregate(values, partition_size)
return cls(sub_series.min(), sub_series.max(), cls._get_precision(values))


Expand All @@ -35,11 +35,11 @@ class DisclosureTime(TimeUniformDistribution):
"""Disclosure implementation for the time distribution."""

@classmethod
def _fit(cls, values: pl.Series, n_avg: int = 11):
def _fit(cls, values: pl.Series, partition_size: int = 11):
# Convert time to a datetime so that the microaggregation works
today = dt.date(1970, 1, 1)
dt_series = pl.Series([dt.datetime.combine(today, t) for t in values])
dt_sub_series = micro_aggregate(dt_series, n_avg)
dt_sub_series = micro_aggregate(dt_series, partition_size)

# Convert back into time
sub_series = pl.Series([dt_val.time() for dt_val in dt_sub_series])
Expand All @@ -51,10 +51,10 @@ class DisclosureDate(DateUniformDistribution):
"""Disclosure implementation for the date distribution."""

@classmethod
def _fit(cls, values: pl.Series, n_avg: int = 11) -> DisclosureDate:
def _fit(cls, values: pl.Series, partition_size: int = 11) -> DisclosureDate:
# Convert dates to datetimes
dt_series = pl.Series([dt.datetime.combine(d, dt.time(hour=12)) for d in values])
dt_sub_series = micro_aggregate(dt_series, n_avg)
dt_sub_series = micro_aggregate(dt_series, partition_size)

# Convert back into dates
sub_series = pl.Series([dt_val.date() for dt_val in dt_sub_series])
Expand Down
4 changes: 2 additions & 2 deletions metasyncontrib/disclosure/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ class DisclosureUniqueKey(UniqueKeyDistribution):
"""Implementation for unique key distribution."""

@classmethod
def _fit(cls, values: pl.Series, n_avg: int = 11):
def _fit(cls, values: pl.Series, partition_size: int = 11):
orig_dist = super()._fit(values)
if orig_dist.consecutive:
return cls(0, True)
sub_values = micro_aggregate(values, n_avg)
sub_values = micro_aggregate(values, partition_size)
return super()._fit(sub_values)

@metadist_disclosure()
Expand Down
4 changes: 2 additions & 2 deletions metasyncontrib/disclosure/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ class DisclosureNumericalMixin(BaseDistribution):
"""Mixin class to create numerical distributions of the disclosure kind."""

@classmethod
def fit(cls, series, *args, n_avg: int = 11, **kwargs) -> BaseDistribution:
def fit(cls, series, *args, partition_size: int = 11, **kwargs) -> BaseDistribution:
"""Fit numeric distributions with disclosure control rules in place."""
pl_series = cls._to_series(series)
sub_series = micro_aggregate(pl_series, n_avg)
sub_series = micro_aggregate(pl_series, partition_size)
return cls._fit(sub_series, *args, **kwargs)
8 changes: 4 additions & 4 deletions metasyncontrib/disclosure/privacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,18 @@ class DisclosurePrivacy(BasePrivacy):
Arguments:
---------
n_avg:
partition_size:
Number of elements to aggregate into one bin. Higher values
mean better protected privacy, but lower statistical accuracy.
"""

name = "disclosure"

def __init__(self, n_avg: int = 11):
def __init__(self, partition_size: int = 11):
"""Initialize the disclosure privacy object."""
self.n_avg = n_avg
self.partition_size = partition_size

def to_dict(self) -> dict:
"""Create a dictionary that gives the privacy type, and parameters."""
return {"name": self.name, "parameters": {"n_avg": self.n_avg}}
return {"name": self.name, "parameters": {"partition_size": self.partition_size}}
8 changes: 5 additions & 3 deletions metasyncontrib/disclosure/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ class DisclosureFaker(FakerDistribution):
"""Faker distribution for disclosure control."""

@classmethod
def _fit(cls, values, faker_type: str = "city", locale: str = "en_US", n_avg: int = 11): # pylint: disable=unused-argument
def _fit(cls, values, faker_type: str = "city", locale: str = "en_US",
partition_size: int = 11): # pylint: disable=unused-argument
return super()._fit(values, faker_type=faker_type, locale=locale)


Expand All @@ -24,7 +25,8 @@ class DisclosureUniqueFaker(UniqueFakerDistribution):
"""Faker distribution for disclosure control that produces unique values."""

@classmethod
def _fit(cls, values, faker_type: str = "city", locale: str = "en_US", n_avg: int = 11): # pylint: disable=unused-argument
def _fit(cls, values, faker_type: str = "city", locale: str = "en_US",
partition_size: int = 11): # pylint: disable=unused-argument
return super()._fit(values, faker_type=faker_type, locale=locale)


Expand All @@ -33,7 +35,7 @@ class DisclosureFreetext(FreeTextDistribution):
"""Disclosure implementation of freetext distribution."""

@classmethod
def _fit(cls, values, max_values: int = 50, n_avg: int = 11): # pylint: disable=unused-argument
def _fit(cls, values, max_values: int = 50, partition_size: int = 11): # pylint: disable=unused-argument
return super()._fit(values, max_values=max_values)

@metadist_disclosure()
Expand Down
4 changes: 2 additions & 2 deletions metasyncontrib/disclosure/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ def _compute_dominance(block_values, reverse=False):

def _create_subsample( # pylint: disable=too-many-locals
values,
n_avg: int = 11,
partition_size: int = 11,
pre_remove: int = 0,
post_remove: int = 0,
) -> tuple[list, float]:
sorted_values = np.sort(values)
sorted_values = sorted_values[pre_remove : len(values) - post_remove]
n_values = len(sorted_values)

n_blocks = n_values // n_avg
n_blocks = n_values // partition_size
if n_blocks <= 1:
raise ValueError("Cannot find subsample with current settings.")
min_block_size = n_values // n_blocks
Expand Down
4 changes: 2 additions & 2 deletions tests/test_constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ def test_constant(dist_builtin, dist_disclosure, value):
dist = dist_builtin(value)
data = [dist.draw() for _ in range(21)]

assert dist_disclosure.fit(data, n_avg=22)._param_dict().get("value") != value
assert dist_disclosure.fit(data, n_avg=11)._param_dict().get("value") == value
assert dist_disclosure.fit(data, partition_size=22)._param_dict().get("value") != value
assert dist_disclosure.fit(data, partition_size=11)._param_dict().get("value") == value
2 changes: 1 addition & 1 deletion tests/test_other_dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,6 @@ def test_string():
dist = DisclosureFaker.default_distribution()
series = pl.Series([dist.draw() for _ in range(100)])
assert len(series)
dist = DisclosureFaker.fit(series, n_avg=11)
dist = DisclosureFaker.fit(series, partition_size=11)
assert isinstance(dist, DisclosureFaker)
# assert len([dist.draw() for _ in range(100)]) == 100
2 changes: 1 addition & 1 deletion tests/test_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_disclosure_provider():


@mark.parametrize("distribution", get_distribution_provider("metasyn-disclosure").distributions)
@mark.parametrize("privacy_kwargs", ({}, {"n_avg": 10}, {"n_avg": 15}))
@mark.parametrize("privacy_kwargs", ({}, {"partition_size": 10}, {"partition_size": 15}))
def test_dist_validation(distribution, privacy_kwargs):
np.random.seed(45)
privacy = DisclosurePrivacy(**privacy_kwargs)
Expand Down

0 comments on commit f5496cb

Please sign in to comment.