Add attack class and add receipt test

glassonion1 · Nov 10, 2021 · b0c764d · b0c764d
1 parent 532f121
commit b0c764d
Show file tree

Hide file tree

Showing 7 changed files with 14,295 additions and 6 deletions.
diff --git a/anonypy/anonypy.py b/anonypy/anonypy.py
@@ -45,7 +45,9 @@ def count_t_closeness(self, k, p):
 def agg_categorical_column(series):
     # this is workaround for dtype bug of series
     series.astype("category")
-    return [",".join(set(series))]
+
+    l = [str(n) for n in set(series)]
+    return [",".join(l)]
 
 
 def agg_numerical_column(series):

diff --git a/anonypy/attack.py b/anonypy/attack.py
@@ -30,8 +30,6 @@ def attack(df, knowledge):
 
     di = pd.DataFrame(np.hstack((index, dist)))
 
-    print(di[2].median())
-
-    di.loc[di[2] > di[2].median(), :] = -1
+    di.loc[di[3] > di[3].median(), :] = -1
     # Display the top three
     return di.iloc[:, 0:k].astype(int)
diff --git a/data/NHANES_attack.csv b/data/NHANES_attack.csv
@@ -0,0 +1,13 @@
+Female,30.0,Hispanic,College,Parther,26.6,0,0,0,0,Q4,0
+Male,67.0,Black,Graduate,Widowed,28.8,0,0,0,0,Q3,1
+Female,57.0,Hispanic,9th,Separated,35.4,1,1,0,0,Q3,1
+Female,24.0,Other,Graduate,Never,25.3,0,1,0,0,Q1,0
+Male,33.0,Japanese,11th,Never,25.3,0,1,0,0,Q1,0
+Female,27.0,Black,College,Never,38.0,0,0,0,0,Q1,0
+Male,49.0,Other,11th,Married,25.0,0,0,0,0,Q1,0
+Female,69.0,Hispanic,9th,Separated,30.3,0,1,0,0,Q4,0
+Male,56.0,Other,11th,Married,25.0,1,0,0,0,Q2,1
+Male,22.0,Hispanic,College,Never,25.3,0,0,0,0,Q4,0
+Female,60.0,Hispanic,Graduate,Divorced,35.9,1,0,0,0,Q2,1
+Male,10.0,Japanese,AAA,Nver,35.9,1,0,0,0,Q2,1
+Female,51.0,Other,hoge,Divorced,35.9,1,0,0,0,Q2,1
diff --git a/data/receipt.csv b/data/receipt.csv
diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name="anonypy",
-    version="0.1.4",
+    version="0.1.7",
     packages=find_packages(),
     author="glassonion1",
     author_email="[email protected]",

diff --git a/tests/attack_test.py b/tests/attack_test.py
@@ -12,6 +12,9 @@ def test_attack():
     df = pd.read_csv("data/NHANES.csv", header=None, names=columns)
     print(f"\n{df.head()}")
 
+    df_attack = pd.read_csv("data/NHANES_attack.csv", header=None, names=columns)
+    print(f"\n{df_attack.head()}")
+
     for name in categorical:
         df[name] = df[name].astype("category")
 
@@ -25,7 +28,7 @@ def test_attack():
     dfn = pd.DataFrame(rows).loc[:, ["col1", "col2", "col3"]]
 
     # this is attackers knowledge
-    knowledge = df[40:50].loc[:, ["col1", "col2", "col3"]]
+    knowledge = df_attack.loc[:, ["col1", "col2", "col3"]]
 
     rl = attack.attack(dfn, knowledge)
     print(rl)
diff --git a/tests/receipt_test.py b/tests/receipt_test.py
@@ -0,0 +1,41 @@
+import anonypy
+import pandas as pd
+from datetime import datetime, date
+
+
+def calculate_age(born):
+    born = datetime.strptime(born, "%Y/%m")
+    today = date.today()
+    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
+
+
+def test_receipt():
+    path = "data/receipt.csv"
+    df = pd.read_csv(path)
+
+    # Configure the categorical attributes
+    categorical = set(
+        (
+            "r_type",
+            "sex",
+            "family",
+            "icd10",
+        )
+    )
+    for name in categorical:
+        df[name] = df[name].astype("category")
+
+    print(len(df))
+    print(df.head())
+
+    df["birth_ym"] = df["birth_ym"].map(lambda x: calculate_age(x))
+
+    feature_columns = ["sex", "family", "birth_ym"]
+    sensitive_column = "iid"
+
+    p = anonypy.Preserver(df, feature_columns, sensitive_column)
+    rows = p.anonymize_k_anonymity(k=2)
+
+    dfn = pd.DataFrame(rows)
+    print(len(dfn))
+    print(dfn.head())