-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Big update to bring disclosure package in line with metasyn 0.7.0 (#24)
* Big update to bring disclosure package in line with metasyn 0.7.0
* small fix for na distribution
* small fix for freetext distribution
* small tests fix
* reformat with ruff, lint, mypy, CI fixes
* Remove strange timezone error in tests
- Loading branch information
1 parent
8a8d1a4
commit 03dce16
Showing 20 changed files with 746 additions and 263 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,30 @@ | ||
"""Disclosure classes for categorical variables.""" | ||
|
||
import polars as pl | ||
from __future__ import annotations | ||
|
||
import polars as pl | ||
from metasyn.distribution.categorical import MultinoulliDistribution | ||
from metasyn.var import MetaVar | ||
|
||
from metasyncontrib.disclosure.base import metadist_disclosure | ||
|
||
|
||
@metadist_disclosure() | ||
class DisclosureMultinoulli(MultinoulliDistribution): | ||
"""Disclosure variant for multinoulli distribution. | ||
It checks that all labels appear at least x times, and that | ||
It checks that all labels appear at least n_avg times, and that | ||
there is no label with >90% of the counts. | ||
""" | ||
|
||
@classmethod | ||
def _fit(cls, values: pl.Series, n_avg: int = 11): | ||
dist = super(DisclosureMultinoulli, cls)._fit(values) | ||
labels = dist.labels[dist.probs >= n_avg/len(values)] | ||
probs = dist.probs[dist.probs >= n_avg/len(values)] | ||
dist = super()._fit(values) | ||
labels = dist.labels[dist.probs >= n_avg / len(values)] | ||
probs = dist.probs[dist.probs >= n_avg / len(values)] | ||
if len(probs) == 0 or probs.max() >= 0.9: | ||
if MetaVar.get_var_type(values) == "discrete": | ||
return cls([1, 2, 3], [0.1, 0.2, 0.7]) # type: ignore | ||
return cls.default_distribution() | ||
probs /= probs.sum() | ||
return cls(labels, probs) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,63 +1,66 @@ | ||
"""Module for disclosure controlled constant distributions.""" | ||
from __future__ import annotations | ||
|
||
import polars as pl | ||
from metasyn.distribution.base import BaseDistribution | ||
from metasyn.distribution.constant import ( | ||
ConstantDistribution, | ||
DateConstantDistribution, | ||
DateTimeConstantDistribution, | ||
DiscreteConstantDistribution, | ||
StringConstantDistribution, | ||
DateTimeConstantDistribution, | ||
TimeConstantDistribution, | ||
DateConstantDistribution, | ||
) | ||
|
||
from metasyncontrib.disclosure.base import metadist_disclosure | ||
|
||
|
||
def disclosure_constant(cls): | ||
"""Override _fit method for constant distributions using this decorator.""" | ||
def _fit(values: pl.Series, n_avg=11): | ||
# if unique, just get that value if it occurs at least n_avg times | ||
if values.n_unique() == 1 & values.len() >= n_avg: | ||
return cls(values.unique()[0]) | ||
class DisclosureConstantMixin(BaseDistribution): | ||
"""Mixin class to overload fit method for constant distributions.""" | ||
|
||
# otherwise get most common value | ||
val_counts = values.value_counts(sort=True) | ||
value = val_counts[0,0] | ||
count = val_counts[0,1] | ||
@classmethod | ||
def fit(cls, series, *args, n_avg: int = 11, **kwargs) -> BaseDistribution: | ||
"""Fit constant distributions with disclosure control rules in place.""" | ||
pl_series: pl.Series = cls._to_series(series) | ||
|
||
if count >= n_avg: | ||
return cls(value) | ||
# if unique, just get that value if it occurs at least n_avg times | ||
if pl_series.n_unique() == 1 and pl_series.len() >= n_avg: | ||
return cls._fit(pl_series, *args, **kwargs) | ||
|
||
return cls.default_distribution() | ||
if pl_series.n_unique() > 1: | ||
# if not unique, ensure most common value occurs at least n_avg times | ||
_value, count = pl_series.value_counts(sort=True).row(0) | ||
if count >= n_avg: | ||
return cls._fit(pl_series, *args, **kwargs) | ||
|
||
setattr(cls, "_fit", _fit) | ||
return cls | ||
return cls.default_distribution() | ||
|
||
|
||
@metadist_disclosure() | ||
@disclosure_constant | ||
class DisclosureConstant(ConstantDistribution): | ||
class DisclosureConstant(DisclosureConstantMixin, ConstantDistribution): | ||
"""Disclosure controlled ConstantDistribution.""" | ||
|
||
|
||
@metadist_disclosure() | ||
@disclosure_constant | ||
class DisclosureDiscreteConstant(DiscreteConstantDistribution): | ||
class DisclosureDiscreteConstant(DisclosureConstantMixin, DiscreteConstantDistribution): | ||
"""Disclosure controlled DiscreteConstantDistribution.""" | ||
|
||
|
||
@metadist_disclosure() | ||
@disclosure_constant | ||
class DisclosureStringConstant(StringConstantDistribution): | ||
class DisclosureStringConstant(DisclosureConstantMixin, StringConstantDistribution): | ||
"""Disclosure controlled StringConstantDistribution.""" | ||
|
||
|
||
@metadist_disclosure() | ||
@disclosure_constant | ||
class DisclosureDateTimeConstant(DateTimeConstantDistribution): | ||
class DisclosureDateTimeConstant(DisclosureConstantMixin, DateTimeConstantDistribution): | ||
"""Disclosure controlled DateTimeConstantDistribution.""" | ||
|
||
|
||
@metadist_disclosure() | ||
@disclosure_constant | ||
class DisclosureTimeConstant(TimeConstantDistribution): | ||
class DisclosureTimeConstant(DisclosureConstantMixin, TimeConstantDistribution): | ||
"""Disclosure controlled TimeConstantDistribution.""" | ||
|
||
|
||
@metadist_disclosure() | ||
@disclosure_constant | ||
class DisclosureDateConstant(DateConstantDistribution): | ||
class DisclosureDateConstant(DisclosureConstantMixin, DateConstantDistribution): | ||
"""Disclosure controlled DateConstantDistribution.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,34 +1,37 @@ | ||
"""Disclosure control implementations for continuous distributions.""" | ||
|
||
from metasyn.distribution.continuous import UniformDistribution | ||
from metasyn.distribution.continuous import NormalDistribution, LogNormalDistribution | ||
from metasyn.distribution.continuous import ExponentialDistribution | ||
from metasyn.distribution.continuous import TruncatedNormalDistribution | ||
from metasyn.distribution.continuous import ( | ||
ExponentialDistribution, | ||
LogNormalDistribution, | ||
NormalDistribution, | ||
TruncatedNormalDistribution, | ||
UniformDistribution, | ||
) | ||
|
||
from metasyncontrib.disclosure.numerical import DisclosureNumerical | ||
from metasyncontrib.disclosure.base import metadist_disclosure | ||
from metasyncontrib.disclosure.numerical import DisclosureNumericalMixin | ||
|
||
|
||
@metadist_disclosure() | ||
class DisclosureUniform(DisclosureNumerical, UniformDistribution): | ||
class DisclosureUniform(DisclosureNumericalMixin, UniformDistribution): | ||
"""Uniform distribution implementation.""" | ||
|
||
|
||
@metadist_disclosure() | ||
class DisclosureTruncatedNormal(DisclosureNumerical, TruncatedNormalDistribution): | ||
"""Truncated normal distribution implementation.""" | ||
class DisclosureNormal(DisclosureNumericalMixin, NormalDistribution): | ||
"""Disclosure normal distribution.""" | ||
|
||
|
||
@metadist_disclosure() | ||
class DisclosureNormal(DisclosureNumerical, NormalDistribution): | ||
"""Disclosure normal distribution.""" | ||
class DisclosureLogNormal(DisclosureNumericalMixin, LogNormalDistribution): | ||
"""Disclosure log-normal distribution.""" | ||
|
||
|
||
@metadist_disclosure() | ||
class DisclosureLogNormal(DisclosureNumerical, LogNormalDistribution): | ||
"""Disclosure log-normal distribution.""" | ||
class DisclosureTruncatedNormal(DisclosureNumericalMixin, TruncatedNormalDistribution): | ||
"""Truncated normal distribution implementation.""" | ||
|
||
|
||
@metadist_disclosure() | ||
class DisclosureExponential(DisclosureNumerical, ExponentialDistribution): | ||
class DisclosureExponential(DisclosureNumericalMixin, ExponentialDistribution): | ||
"""Disclosure exponential distribution.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
"""Module for disclosure control for string distributions.""" | ||
|
||
from metasyn.distribution.faker import ( | ||
FakerDistribution, | ||
FreeTextDistribution, | ||
UniqueFakerDistribution, | ||
) | ||
|
||
from metasyncontrib.disclosure.base import metadist_disclosure | ||
|
||
|
||
@metadist_disclosure() | ||
class DisclosureFaker(FakerDistribution): | ||
"""Faker distribution for disclosure control.""" | ||
|
||
@classmethod | ||
def _fit(cls, values, faker_type: str = "city", locale: str = "en_US", n_avg: int = 11): # pylint: disable=unused-argument | ||
return super()._fit(values, faker_type=faker_type, locale=locale) | ||
|
||
|
||
@metadist_disclosure() | ||
class DisclosureUniqueFaker(UniqueFakerDistribution): | ||
"""Faker distribution for disclosure control that produces unique values.""" | ||
|
||
@classmethod | ||
def _fit(cls, values, faker_type: str = "city", locale: str = "en_US", n_avg: int = 11): # pylint: disable=unused-argument | ||
return super()._fit(values, faker_type=faker_type, locale=locale) | ||
|
||
|
||
@metadist_disclosure() | ||
class DisclosureFreetext(FreeTextDistribution): | ||
"""Disclosure implementation of freetext distribution.""" | ||
|
||
@classmethod | ||
def _fit(cls, values, max_values: int = 50, n_avg: int = 11): # pylint: disable=unused-argument | ||
return super()._fit(values, max_values=max_values) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
"""Module for disclosure-controlled NA distribution.""" | ||
from __future__ import annotations | ||
|
||
from metasyn.distribution.na import NADistribution | ||
|
||
from metasyncontrib.disclosure.base import metadist_disclosure | ||
|
||
|
||
@metadist_disclosure() | ||
class DisclosureNA(NADistribution): | ||
"""Disclosure version of NA distribution.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.