diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 885e653..6f03a60 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -43,7 +43,7 @@ jobs:
pylint metasyncontrib/disclosure
- name: Check docstrings with pydocstyle
run: |
- pydocstyle metasyncontrib/disclosure --convention=numpy --add-select=D417 --add-ignore="D102,D105"
+ pydocstyle metasyncontrib/disclosure --convention=numpy --add-select=D417 --add-ignore="D102,D105,D406"
- name: Check types with MyPy
run: |
mypy metasyncontrib/disclosure
diff --git a/examples/tutorial.ipynb b/examples/tutorial.ipynb
index 0c66bc3..ed270ee 100644
--- a/examples/tutorial.ipynb
+++ b/examples/tutorial.ipynb
@@ -13,23 +13,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"id": "2442cb34",
"metadata": {},
"outputs": [],
"source": [
- "# import required packages\n",
"from collections import defaultdict\n",
- "import datetime as dt\n",
"\n",
"import numpy as np\n",
"import polars as pl\n",
"from matplotlib import pyplot as plt\n",
- "\n",
"from metasyn import MetaFrame, demo_file\n",
- "from metasyncontrib.disclosure import DisclosurePrivacy\n",
"from metasyn.provider import DistributionProviderList\n",
- "#from utils import get_demonstration_fp"
+ "from metasyn.distribution import (\n",
+ " DiscreteUniformDistribution,\n",
+ " FakerDistribution,\n",
+ " RegexDistribution,\n",
+ " MultinoulliDistribution,\n",
+ ")\n",
+ "\n",
+ "from metasyncontrib.disclosure import DisclosurePrivacy"
]
},
{
@@ -45,19 +48,56 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "3c2a44b7",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
shape: (5, 13)PassengerId | Name | Sex | Age | Parch | Ticket | Fare | Cabin | Embarked | Birthday | Board time | Married since | all_NA |
---|
i64 | str | cat | i64 | i64 | str | f64 | str | cat | date | time | datetime[μs] | str |
1 | "Braund, Mr. Ow… | "male" | 22 | 0 | "A/5 21171" | 7.25 | null | "S" | 1937-10-28 | 15:53:04 | 2022-08-05 04:43:34 | null |
2 | "Cumings, Mrs. … | "female" | 38 | 0 | "PC 17599" | 71.2833 | "C85" | "C" | null | 12:26:00 | 2022-08-07 01:56:33 | null |
3 | "Heikkinen, Mis… | "female" | 26 | 0 | "STON/O2. 31012… | 7.925 | null | "S" | 1931-09-24 | 16:08:25 | 2022-08-04 20:27:37 | null |
4 | "Futrelle, Mrs.… | "female" | 35 | 0 | "113803" | 53.1 | "C123" | "S" | 1936-11-30 | null | 2022-08-07 07:05:55 | null |
5 | "Allen, Mr. Wil… | "male" | 35 | 0 | "373450" | 8.05 | null | "S" | 1918-11-07 | 10:59:08 | 2022-08-02 15:13:34 | null |
"
+ ],
+ "text/plain": [
+ "shape: (5, 13)\n",
+ "┌─────────────┬───────────────┬────────┬─────┬───┬────────────┬────────────┬──────────────┬────────┐\n",
+ "│ PassengerId ┆ Name ┆ Sex ┆ Age ┆ … ┆ Birthday ┆ Board time ┆ Married ┆ all_NA │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ since ┆ --- │\n",
+ "│ i64 ┆ str ┆ cat ┆ i64 ┆ ┆ date ┆ time ┆ --- ┆ str │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ datetime[μs] ┆ │\n",
+ "╞═════════════╪═══════════════╪════════╪═════╪═══╪════════════╪════════════╪══════════════╪════════╡\n",
+ "│ 1 ┆ Braund, Mr. ┆ male ┆ 22 ┆ … ┆ 1937-10-28 ┆ 15:53:04 ┆ 2022-08-05 ┆ null │\n",
+ "│ ┆ Owen Harris ┆ ┆ ┆ ┆ ┆ ┆ 04:43:34 ┆ │\n",
+ "│ 2 ┆ Cumings, Mrs. ┆ female ┆ 38 ┆ … ┆ null ┆ 12:26:00 ┆ 2022-08-07 ┆ null │\n",
+ "│ ┆ John Bradley ┆ ┆ ┆ ┆ ┆ ┆ 01:56:33 ┆ │\n",
+ "│ ┆ (Flor… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
+ "│ 3 ┆ Heikkinen, ┆ female ┆ 26 ┆ … ┆ 1931-09-24 ┆ 16:08:25 ┆ 2022-08-04 ┆ null │\n",
+ "│ ┆ Miss. Laina ┆ ┆ ┆ ┆ ┆ ┆ 20:27:37 ┆ │\n",
+ "│ 4 ┆ Futrelle, ┆ female ┆ 35 ┆ … ┆ 1936-11-30 ┆ null ┆ 2022-08-07 ┆ null │\n",
+ "│ ┆ Mrs. Jacques ┆ ┆ ┆ ┆ ┆ ┆ 07:05:55 ┆ │\n",
+ "│ ┆ Heath (Li… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
+ "│ 5 ┆ Allen, Mr. ┆ male ┆ 35 ┆ … ┆ 1918-11-07 ┆ 10:59:08 ┆ 2022-08-02 ┆ null │\n",
+ "│ ┆ William Henry ┆ ┆ ┆ ┆ ┆ ┆ 15:13:34 ┆ │\n",
+ "└─────────────┴───────────────┴────────┴─────┴───┴────────────┴────────────┴──────────────┴────────┘"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "demonstration_fp =demo_file()\n",
+ "titanic_path = demo_file()\n",
"df = pl.read_csv(\n",
- " source=demonstration_fp, \n",
+ " source=titanic_path,\n",
" try_parse_dates=True,\n",
- " dtypes={\n",
- " \"Sex\": pl.Categorical,\n",
- " \"Embarked\": pl.Categorical\n",
- " }\n",
+ " dtypes={\"Sex\": pl.Categorical, \"Embarked\": pl.Categorical},\n",
")\n",
"df.head()"
]
@@ -75,27 +115,91 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "b2f5eadd",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 13/13 [00:01<00:00, 12.23it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Lower bound distribution: 2022-07-15 12:21:15\n",
+ "Lowest value in dataframe: 2022-07-15 12:21:15\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 13)PassengerId | Name | Sex | Age | Parch | Ticket | Fare | Cabin | Embarked | Birthday | Board time | Married since | all_NA |
---|
i64 | str | cat | i64 | i64 | str | f64 | f32 | cat | date | time | datetime[μs] | f32 |
1 | "Brittany Brown… | "male" | 25 | 0 | "36347" | 51.982691 | null | "S" | 1907-12-02 | 17:07:57 | 2022-07-18 17:57:00 | null |
2 | "Steven Wright" | "male" | 25 | 0 | "73559" | 35.181497 | null | "C" | 1933-12-12 | 11:25:19 | 2022-08-14 11:51:18 | null |
3 | "Michael Yang" | "male" | 20 | 0 | "73510" | 77.549664 | null | "S" | 1915-04-27 | 13:42:48 | 2022-07-28 21:00:47 | null |
4 | "Andrew Stout" | "female" | null | 0 | "78506" | 23.700753 | null | "S" | null | 16:17:26 | 2022-08-10 00:55:50 | null |
5 | "Misty Landry" | "female" | null | 0 | "9510" | 15.404263 | null | "S" | 1932-12-07 | 12:42:44 | 2022-07-19 05:43:19 | null |
"
+ ],
+ "text/plain": [
+ "shape: (5, 13)\n",
+ "┌─────────────┬──────────────┬────────┬──────┬───┬────────────┬────────────┬──────────────┬────────┐\n",
+ "│ PassengerId ┆ Name ┆ Sex ┆ Age ┆ … ┆ Birthday ┆ Board time ┆ Married ┆ all_NA │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ since ┆ --- │\n",
+ "│ i64 ┆ str ┆ cat ┆ i64 ┆ ┆ date ┆ time ┆ --- ┆ f32 │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ datetime[μs] ┆ │\n",
+ "╞═════════════╪══════════════╪════════╪══════╪═══╪════════════╪════════════╪══════════════╪════════╡\n",
+ "│ 1 ┆ Brittany ┆ male ┆ 25 ┆ … ┆ 1907-12-02 ┆ 17:07:57 ┆ 2022-07-18 ┆ null │\n",
+ "│ ┆ Browning ┆ ┆ ┆ ┆ ┆ ┆ 17:57:00 ┆ │\n",
+ "│ 2 ┆ Steven ┆ male ┆ 25 ┆ … ┆ 1933-12-12 ┆ 11:25:19 ┆ 2022-08-14 ┆ null │\n",
+ "│ ┆ Wright ┆ ┆ ┆ ┆ ┆ ┆ 11:51:18 ┆ │\n",
+ "│ 3 ┆ Michael Yang ┆ male ┆ 20 ┆ … ┆ 1915-04-27 ┆ 13:42:48 ┆ 2022-07-28 ┆ null │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 21:00:47 ┆ │\n",
+ "│ 4 ┆ Andrew Stout ┆ female ┆ null ┆ … ┆ null ┆ 16:17:26 ┆ 2022-08-10 ┆ null │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 00:55:50 ┆ │\n",
+ "│ 5 ┆ Misty Landry ┆ female ┆ null ┆ … ┆ 1932-12-07 ┆ 12:42:44 ┆ 2022-07-19 ┆ null │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 05:43:19 ┆ │\n",
+ "└─────────────┴──────────────┴────────┴──────┴───┴────────────┴────────────┴──────────────┴────────┘"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "from metasyn.distribution import RegexDistribution, FakerDistribution\n",
- "from metasyn.distribution import DiscreteUniformDistribution\n",
+ "specs = [\n",
+ " # We set PassengerId to unique\n",
+ " {\"name\": \"PassengerId\", \"distribution\": {\"unique\": True}},\n",
+ " # We generate fake names for the Name column\n",
+ " {\"name\": \"Name\", \"distribution\": FakerDistribution(\"name\")},\n",
+ " # For Fare we fit an exponential distribution to the data\n",
+ " {\"name\": \"Fare\", \"distribution\": {\"implements\": \"core.exponential\"}},\n",
+ " # For Age we enforce a specific discrete uniform distribution\n",
+ " {\"name\": \"Age\", \"distribution\": DiscreteUniformDistribution(20, 40)},\n",
+ " # We know Cabin values follow a specific regular expression\n",
+ " {\"name\": \"Cabin\", \"distribution\": RegexDistribution(r\"[ABCDEF][0-9]{2,3}\")},\n",
+ "]\n",
"\n",
- "cabin_distribution = RegexDistribution(r\"[ABCDEF][0-9]{2,3}\")\n",
- "var_spec = {\n",
- " \"PassengerId\": {\"unique\": True}, \n",
- " \"Name\": {\"distribution\": FakerDistribution(\"name\")},\n",
- " \"Fare\": {\"distribution\": \"exponential\"}, # Fit an exponential distribution based on the data\n",
- " \"Age\": {\"distribution\": DiscreteUniformDistribution(20, 40)},\n",
- " \"Cabin\": {\"distribution\": cabin_distribution}\n",
- "}\n",
+ "mf = MetaFrame.fit_dataframe(df, var_specs=specs)\n",
"\n",
- "meta_frame = MetaFrame.fit_dataframe(df, spec=var_spec)\n",
- "print(f\"Lower bound distribution: {meta_frame['Married since'].distribution.start}\\n\"\n",
- " f\"Lowest value in dataframe: {df['Married since'].min()}\")\n",
- "meta_frame.synthesize(5)"
+ "print(\n",
+ " f\"Lower bound distribution: {mf['Married since'].distribution.lower}\\n\"\n",
+ " f\"Lowest value in dataframe: {df['Married since'].min()}\"\n",
+ ")\n",
+ "mf.synthesize(5)"
]
},
{
@@ -120,19 +224,79 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "b8b96c16",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 13/13 [00:02<00:00, 5.16it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Lower bound distribution: 2022-07-15 17:12:24\n",
+ "Lowest value in dataframe: 2022-07-15 12:21:15\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 13)PassengerId | Name | Sex | Age | Parch | Ticket | Fare | Cabin | Embarked | Birthday | Board time | Married since | all_NA |
---|
i64 | str | cat | i64 | i64 | f32 | f64 | f32 | cat | date | time | datetime[μs] | f32 |
3 | "Store. Rise." | "male" | 30 | 0 | null | 135.292425 | null | "Q" | 1921-01-30 | 13:25:14 | null | null |
3 | "Adult." | "female" | 30 | 0 | null | 37.80435 | null | "S" | 1928-09-01 | 12:18:42 | null | null |
3 | "Doctor." | "female" | null | 0 | null | 8.235304 | null | "S" | null | 18:26:39 | 2022-08-14 15:50:52 | null |
3 | "View. The." | "male" | 19 | 0 | null | 10.90519 | null | "C" | null | 13:42:53 | 2022-07-31 18:46:04 | null |
3 | "Some. I. Plant… | "male" | null | 1 | null | 0.539828 | null | "S" | null | 16:37:16 | null | null |
"
+ ],
+ "text/plain": [
+ "shape: (5, 13)\n",
+ "┌─────────────┬──────────────┬────────┬──────┬───┬────────────┬────────────┬──────────────┬────────┐\n",
+ "│ PassengerId ┆ Name ┆ Sex ┆ Age ┆ … ┆ Birthday ┆ Board time ┆ Married ┆ all_NA │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ since ┆ --- │\n",
+ "│ i64 ┆ str ┆ cat ┆ i64 ┆ ┆ date ┆ time ┆ --- ┆ f32 │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ datetime[μs] ┆ │\n",
+ "╞═════════════╪══════════════╪════════╪══════╪═══╪════════════╪════════════╪══════════════╪════════╡\n",
+ "│ 3 ┆ Store. Rise. ┆ male ┆ 30 ┆ … ┆ 1921-01-30 ┆ 13:25:14 ┆ null ┆ null │\n",
+ "│ 3 ┆ Adult. ┆ female ┆ 30 ┆ … ┆ 1928-09-01 ┆ 12:18:42 ┆ null ┆ null │\n",
+ "│ 3 ┆ Doctor. ┆ female ┆ null ┆ … ┆ null ┆ 18:26:39 ┆ 2022-08-14 ┆ null │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 15:50:52 ┆ │\n",
+ "│ 3 ┆ View. The. ┆ male ┆ 19 ┆ … ┆ null ┆ 13:42:53 ┆ 2022-07-31 ┆ null │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 18:46:04 ┆ │\n",
+ "│ 3 ┆ Some. I. ┆ male ┆ null ┆ … ┆ null ┆ 16:37:16 ┆ null ┆ null │\n",
+ "│ ┆ Plant. ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
+ "└─────────────┴──────────────┴────────┴──────┴───┴────────────┴────────────┴──────────────┴────────┘"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"meta_frame = MetaFrame.fit_dataframe(\n",
- " df=df, \n",
- " spec=var_spec,\n",
+ " df=df,\n",
+ " var_specs=specs,\n",
" dist_providers=\"metasyn-disclosure\", # Use the metasyn-disclosure plugin\n",
- " privacy=DisclosurePrivacy() # Use disclosure control\n",
- ") \n",
- "print(f\"Lower bound distribution: {meta_frame['Married since'].distribution.start}\\n\"\n",
- " f\"Lowest value in dataframe: {df['Married since'].min()}\")\n",
+ " privacy=DisclosurePrivacy(), # Use disclosure control\n",
+ ")\n",
+ "print(\n",
+ " f\"Lower bound distribution: {meta_frame['Married since'].distribution.lower}\\n\"\n",
+ " f\"Lowest value in dataframe: {df['Married since'].min()}\"\n",
+ ")\n",
"meta_frame.synthesize(5)"
]
},
@@ -167,17 +331,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"id": "6630b9a3",
"metadata": {},
"outputs": [],
"source": [
- "from metasyn.distribution import MultinoulliDistribution\n",
- "\n",
"def plot_outliers(dist_type, series_size=50):\n",
" dist_providers = DistributionProviderList([\"builtin\", \"metasyn-disclosure\"])\n",
- " disc_distributions = dist_providers.get_distributions(var_type=dist_type, privacy=DisclosurePrivacy())\n",
- " \n",
+ " disc_distributions = dist_providers.get_distributions(\n",
+ " var_type=dist_type, privacy=DisclosurePrivacy()\n",
+ " )\n",
+ "\n",
" for disc_class in disc_distributions:\n",
" if issubclass(disc_class, MultinoulliDistribution):\n",
" continue\n",
@@ -190,6 +354,7 @@
"\n",
" base_param = defaultdict(lambda: [])\n",
" disc_param = defaultdict(lambda: [])\n",
+ "\n",
" def _add(parameters, param, new_val):\n",
" for key, val in param.items():\n",
" parameters[key].append(val)\n",
@@ -205,13 +370,21 @@
" for param in base_param:\n",
" if param == \"new_val\":\n",
" continue\n",
- " plt.plot(base_param[\"new_val\"], np.array(base_param[param])-clean_base_param[param], label=\"base\")\n",
- " plt.plot(disc_param[\"new_val\"], np.array(disc_param[param]) - clean_disc_param[param], label=\"disclosure\")\n",
+ " plt.plot(\n",
+ " base_param[\"new_val\"],\n",
+ " np.array(base_param[param]) - clean_base_param[param],\n",
+ " label=\"base\",\n",
+ " )\n",
+ " plt.plot(\n",
+ " disc_param[\"new_val\"],\n",
+ " np.array(disc_param[param]) - clean_disc_param[param],\n",
+ " label=\"disclosure\",\n",
+ " )\n",
" plt.title(f\"{disc_class.__name__}: {param}\")\n",
" plt.ylabel(\"Difference between dist with and without outlier\")\n",
" plt.xlabel(\"Value of the outlier\")\n",
" plt.legend()\n",
- " plt.show()\n"
+ " plt.show()"
]
},
{
@@ -225,10 +398,131 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"id": "fd6903c2",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "