Skip to content

Commit

Permalink
Big update to bring disclosure package in line with metasyn 0.7.0 (#24)
Browse files Browse the repository at this point in the history
* Big update to bring disclosure package in line with metasyn 0.7.0

* small fix for na distribution

* small fix for freetext distribution

* small tests fix

* reformat with ruff, lint, mypy, CI fixes

* Remove strange timezone error in tests
  • Loading branch information
vankesteren authored Feb 24, 2024
1 parent 8a8d1a4 commit 03dce16
Show file tree
Hide file tree
Showing 20 changed files with 746 additions and 263 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
pylint metasyncontrib/disclosure
- name: Check docstrings with pydocstyle
run: |
pydocstyle metasyncontrib/disclosure --convention=numpy --add-select=D417 --add-ignore="D102,D105"
pydocstyle metasyncontrib/disclosure --convention=numpy --add-select=D417 --add-ignore="D102,D105,D406"
- name: Check types with MyPy
run: |
mypy metasyncontrib/disclosure
Expand Down
497 changes: 446 additions & 51 deletions examples/tutorial.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions metasyncontrib/disclosure/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ def metadist_disclosure():
cls:
Class with the appropriate class variables.
"""

def _wrap(cls):
cls.provenance = "metasyn-disclosure"
cls.privacy = "disclosure"
return cls

return _wrap
15 changes: 10 additions & 5 deletions metasyncontrib/disclosure/categorical.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
"""Disclosure classes for categorical variables."""

import polars as pl
from __future__ import annotations

import polars as pl
from metasyn.distribution.categorical import MultinoulliDistribution
from metasyn.var import MetaVar

from metasyncontrib.disclosure.base import metadist_disclosure


@metadist_disclosure()
class DisclosureMultinoulli(MultinoulliDistribution):
"""Disclosure variant for multinoulli distribution.
It checks that all labels appear at least x times, and that
It checks that all labels appear at least n_avg times, and that
there is no label with >90% of the counts.
"""

@classmethod
def _fit(cls, values: pl.Series, n_avg: int = 11):
dist = super(DisclosureMultinoulli, cls)._fit(values)
labels = dist.labels[dist.probs >= n_avg/len(values)]
probs = dist.probs[dist.probs >= n_avg/len(values)]
dist = super()._fit(values)
labels = dist.labels[dist.probs >= n_avg / len(values)]
probs = dist.probs[dist.probs >= n_avg / len(values)]
if len(probs) == 0 or probs.max() >= 0.9:
if MetaVar.get_var_type(values) == "discrete":
return cls([1, 2, 3], [0.1, 0.2, 0.7]) # type: ignore
return cls.default_distribution()
probs /= probs.sum()
return cls(labels, probs)
61 changes: 32 additions & 29 deletions metasyncontrib/disclosure/constant.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,66 @@
"""Module for disclosure controlled constant distributions."""
from __future__ import annotations

import polars as pl
from metasyn.distribution.base import BaseDistribution
from metasyn.distribution.constant import (
ConstantDistribution,
DateConstantDistribution,
DateTimeConstantDistribution,
DiscreteConstantDistribution,
StringConstantDistribution,
DateTimeConstantDistribution,
TimeConstantDistribution,
DateConstantDistribution,
)

from metasyncontrib.disclosure.base import metadist_disclosure


def disclosure_constant(cls):
"""Override _fit method for constant distributions using this decorator."""
def _fit(values: pl.Series, n_avg=11):
# if unique, just get that value if it occurs at least n_avg times
if values.n_unique() == 1 & values.len() >= n_avg:
return cls(values.unique()[0])
class DisclosureConstantMixin(BaseDistribution):
"""Mixin class to overload fit method for constant distributions."""

# otherwise get most common value
val_counts = values.value_counts(sort=True)
value = val_counts[0,0]
count = val_counts[0,1]
@classmethod
def fit(cls, series, *args, n_avg: int = 11, **kwargs) -> BaseDistribution:
"""Fit constant distributions with disclosure control rules in place."""
pl_series: pl.Series = cls._to_series(series)

if count >= n_avg:
return cls(value)
# if unique, just get that value if it occurs at least n_avg times
if pl_series.n_unique() == 1 and pl_series.len() >= n_avg:
return cls._fit(pl_series, *args, **kwargs)

return cls.default_distribution()
if pl_series.n_unique() > 1:
# if not unique, ensure most common value occurs at least n_avg times
_value, count = pl_series.value_counts(sort=True).row(0)
if count >= n_avg:
return cls._fit(pl_series, *args, **kwargs)

setattr(cls, "_fit", _fit)
return cls
return cls.default_distribution()


@metadist_disclosure()
@disclosure_constant
class DisclosureConstant(ConstantDistribution):
class DisclosureConstant(DisclosureConstantMixin, ConstantDistribution):
"""Disclosure controlled ConstantDistribution."""


@metadist_disclosure()
@disclosure_constant
class DisclosureDiscreteConstant(DiscreteConstantDistribution):
class DisclosureDiscreteConstant(DisclosureConstantMixin, DiscreteConstantDistribution):
"""Disclosure controlled DiscreteConstantDistribution."""


@metadist_disclosure()
@disclosure_constant
class DisclosureStringConstant(StringConstantDistribution):
class DisclosureStringConstant(DisclosureConstantMixin, StringConstantDistribution):
"""Disclosure controlled StringConstantDistribution."""


@metadist_disclosure()
@disclosure_constant
class DisclosureDateTimeConstant(DateTimeConstantDistribution):
class DisclosureDateTimeConstant(DisclosureConstantMixin, DateTimeConstantDistribution):
"""Disclosure controlled DateTimeConstantDistribution."""


@metadist_disclosure()
@disclosure_constant
class DisclosureTimeConstant(TimeConstantDistribution):
class DisclosureTimeConstant(DisclosureConstantMixin, TimeConstantDistribution):
"""Disclosure controlled TimeConstantDistribution."""


@metadist_disclosure()
@disclosure_constant
class DisclosureDateConstant(DateConstantDistribution):
class DisclosureDateConstant(DisclosureConstantMixin, DateConstantDistribution):
"""Disclosure controlled DateConstantDistribution."""
29 changes: 16 additions & 13 deletions metasyncontrib/disclosure/continuous.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,37 @@
"""Disclosure control implementations for continuous distributions."""

from metasyn.distribution.continuous import UniformDistribution
from metasyn.distribution.continuous import NormalDistribution, LogNormalDistribution
from metasyn.distribution.continuous import ExponentialDistribution
from metasyn.distribution.continuous import TruncatedNormalDistribution
from metasyn.distribution.continuous import (
ExponentialDistribution,
LogNormalDistribution,
NormalDistribution,
TruncatedNormalDistribution,
UniformDistribution,
)

from metasyncontrib.disclosure.numerical import DisclosureNumerical
from metasyncontrib.disclosure.base import metadist_disclosure
from metasyncontrib.disclosure.numerical import DisclosureNumericalMixin


@metadist_disclosure()
class DisclosureUniform(DisclosureNumerical, UniformDistribution):
class DisclosureUniform(DisclosureNumericalMixin, UniformDistribution):
"""Uniform distribution implementation."""


@metadist_disclosure()
class DisclosureTruncatedNormal(DisclosureNumerical, TruncatedNormalDistribution):
"""Truncated normal distribution implementation."""
class DisclosureNormal(DisclosureNumericalMixin, NormalDistribution):
"""Disclosure normal distribution."""


@metadist_disclosure()
class DisclosureNormal(DisclosureNumerical, NormalDistribution):
"""Disclosure normal distribution."""
class DisclosureLogNormal(DisclosureNumericalMixin, LogNormalDistribution):
"""Disclosure log-normal distribution."""


@metadist_disclosure()
class DisclosureLogNormal(DisclosureNumerical, LogNormalDistribution):
"""Disclosure log-normal distribution."""
class DisclosureTruncatedNormal(DisclosureNumericalMixin, TruncatedNormalDistribution):
"""Truncated normal distribution implementation."""


@metadist_disclosure()
class DisclosureExponential(DisclosureNumerical, ExponentialDistribution):
class DisclosureExponential(DisclosureNumericalMixin, ExponentialDistribution):
"""Disclosure exponential distribution."""
12 changes: 8 additions & 4 deletions metasyncontrib/disclosure/datetime.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
"""Disclosure classes for date/time/datetime distributions."""

from __future__ import annotations

import datetime as dt

import polars as pl
from metasyn.distribution.datetime import (
DateTimeUniformDistribution,
DateUniformDistribution,
TimeUniformDistribution,
)

from metasyncontrib.disclosure.base import metadist_disclosure

from metasyn.distribution.datetime import DateTimeUniformDistribution
from metasyn.distribution.datetime import TimeUniformDistribution
from metasyn.distribution.datetime import DateUniformDistribution
# from metasyncontrib.disclosure.base import BaseDisclosureDistribution
from metasyncontrib.disclosure.utils import micro_aggregate
from metasyncontrib.disclosure.base import metadist_disclosure


@metadist_disclosure()
Expand Down
39 changes: 27 additions & 12 deletions metasyncontrib/disclosure/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,41 @@
from __future__ import annotations

import polars as pl
from metasyn.distribution.discrete import (
DiscreteNormalDistribution,
DiscreteTruncatedNormalDistribution,
DiscreteUniformDistribution,
PoissonDistribution,
UniqueKeyDistribution,
)

from metasyn.distribution.discrete import DiscreteUniformDistribution
from metasyn.distribution.discrete import PoissonDistribution
from metasyn.distribution.discrete import UniqueKeyDistribution

from metasyncontrib.disclosure.numerical import DisclosureNumerical
from metasyncontrib.disclosure.utils import micro_aggregate
from metasyncontrib.disclosure.base import metadist_disclosure
from metasyncontrib.disclosure.numerical import DisclosureNumericalMixin
from metasyncontrib.disclosure.utils import micro_aggregate


@metadist_disclosure()
class DisclosureDiscreteUniform(DisclosureNumericalMixin, DiscreteUniformDistribution):
    """Implementation for discrete uniform distribution.

    The class body defines nothing of its own: it combines
    ``DisclosureNumericalMixin`` (listed first in the bases) with
    ``DiscreteUniformDistribution``.
    """


@metadist_disclosure()
class DisclosureDiscreteUniform(DisclosureNumerical, DiscreteUniformDistribution):
class DisclosureDiscreteNormal(DisclosureNumericalMixin, DiscreteNormalDistribution):
    """Implementation for discrete normal distribution."""


@metadist_disclosure()
class DisclosureDiscreteTruncatedNormal(
    DisclosureNumericalMixin, DiscreteTruncatedNormalDistribution
):
    """Implementation for discrete truncated normal distribution."""


@metadist_disclosure()
class DisclosurePoisson(DisclosureNumericalMixin, PoissonDistribution):
    """Disclosure implementation for Poisson distribution.

    The class body defines nothing of its own: it combines
    ``DisclosureNumericalMixin`` (listed first in the bases) with
    ``PoissonDistribution``.
    """


@metadist_disclosure()
class DisclosureUniqueKey(UniqueKeyDistribution):
"""Implementation for unique key distribution."""
Expand All @@ -29,8 +49,3 @@ def _fit(cls, values: pl.Series, n_avg: int = 11):
return cls(0, True)
sub_values = micro_aggregate(values, n_avg)
return super()._fit(sub_values)


@metadist_disclosure()
class DisclosurePoisson(DisclosureNumerical, PoissonDistribution):
"""Disclosure implementation for Poisson distribution."""
36 changes: 36 additions & 0 deletions metasyncontrib/disclosure/faker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Module for disclosure control for string distributions."""

from metasyn.distribution.faker import (
FakerDistribution,
FreeTextDistribution,
UniqueFakerDistribution,
)

from metasyncontrib.disclosure.base import metadist_disclosure


@metadist_disclosure()
class DisclosureFaker(FakerDistribution):
    """Faker distribution for disclosure control."""

    @classmethod
    def _fit(cls, values, faker_type: str = "city", locale: str = "en_US", n_avg: int = 11):  # pylint: disable=unused-argument
        """Fit by delegating to the parent class.

        Only ``faker_type`` and ``locale`` are forwarded to the parent
        ``_fit``; ``n_avg`` is accepted but not used here.
        NOTE(review): presumably ``n_avg`` is in the signature so that all
        disclosure distributions accept the same privacy keyword - confirm.
        """
        return super()._fit(values, faker_type=faker_type, locale=locale)


@metadist_disclosure()
class DisclosureUniqueFaker(UniqueFakerDistribution):
    """Faker distribution for disclosure control that produces unique values."""

    @classmethod
    def _fit(cls, values, faker_type: str = "city", locale: str = "en_US", n_avg: int = 11):  # pylint: disable=unused-argument
        """Fit by delegating to the parent class.

        Only ``faker_type`` and ``locale`` are forwarded to the parent
        ``_fit``; ``n_avg`` is accepted but not used here.
        NOTE(review): presumably ``n_avg`` is in the signature so that all
        disclosure distributions accept the same privacy keyword - confirm.
        """
        return super()._fit(values, faker_type=faker_type, locale=locale)


@metadist_disclosure()
class DisclosureFreetext(FreeTextDistribution):
    """Disclosure implementation of freetext distribution."""

    @classmethod
    def _fit(cls, values, max_values: int = 50, n_avg: int = 11):  # pylint: disable=unused-argument
        """Fit by delegating to the parent class.

        Only ``max_values`` is forwarded to the parent ``_fit``; ``n_avg``
        is accepted but not used here.
        NOTE(review): presumably ``n_avg`` is in the signature so that all
        disclosure distributions accept the same privacy keyword - confirm.
        """
        return super()._fit(values, max_values=max_values)
11 changes: 11 additions & 0 deletions metasyncontrib/disclosure/na.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""Module for disclosure-controlled NA distribution."""
from __future__ import annotations

from metasyn.distribution.na import NADistribution

from metasyncontrib.disclosure.base import metadist_disclosure


@metadist_disclosure()
class DisclosureNA(NADistribution):
    """Disclosure version of NA distribution.

    Defines no methods of its own; everything is inherited from
    ``NADistribution``.
    """
5 changes: 3 additions & 2 deletions metasyncontrib/disclosure/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
from metasyncontrib.disclosure.utils import micro_aggregate


class DisclosureNumerical(BaseDistribution):
"""Class for numerical distributions of the disclosure kind."""
class DisclosureNumericalMixin(BaseDistribution):
"""Mixin class to create numerical distributions of the disclosure kind."""

@classmethod
def fit(cls, series, *args, n_avg: int = 11, **kwargs) -> BaseDistribution:
    """Fit a numeric distribution on micro-aggregated data.

    Parameters
    ----------
    series:
        Input data; converted with ``cls._to_series`` before use.
    *args:
        Extra positional arguments passed on to ``cls._fit``.
    n_avg:
        Second argument handed to ``micro_aggregate``.
    **kwargs:
        Extra keyword arguments passed on to ``cls._fit``.

    Returns
    -------
    BaseDistribution
        Whatever ``cls._fit`` returns for the aggregated data.
    """
    # The fit never sees the raw values, only their aggregated stand-ins.
    aggregated = micro_aggregate(cls._to_series(series), n_avg)
    return cls._fit(aggregated, *args, **kwargs)
9 changes: 4 additions & 5 deletions metasyncontrib/disclosure/privacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class DisclosurePrivacy(BasePrivacy):
"""Disclosure control privacy class that uses micro-aggregation.
Arguments
Arguments:
---------
n_avg:
Number of elements to aggregate into one bin. Higher values
Expand All @@ -18,10 +18,9 @@ class DisclosurePrivacy(BasePrivacy):
name = "disclosure"

def __init__(self, n_avg: int = 11):
"""Initialize the disclosure privacy object."""
self.n_avg = n_avg

def to_dict(self) -> dict:
return {
"type": self.name,
"parameters": {"n_avg": self.n_avg}
}
"""Create a dictionary that gives the privacy type, and parameters."""
return {"name": self.name, "parameters": {"n_avg": self.n_avg}}
Loading

0 comments on commit 03dce16

Please sign in to comment.