forked from jiangnanpro/ppml-workshop
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: compute_results_hz.py
199 lines (133 loc) · 6.54 KB
/
compute_results_hz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import os
import numpy as np
from sklearn.metrics import roc_auc_score
import argparse
import scipy.stats
import pandas as pd
def compute_mean_and_confidence_interval(x, confidence=0.95):
"""
returns the mean and the confidence interval, which are two real numbers
x: iterable
high - mean_ == mean_ - low except some numerical errors in rare cases
"""
mean_ = np.mean(x)
low, high = scipy.stats.t.interval(confidence, len(x) - 1, loc=mean_, scale=scipy.stats.sem(x))
return mean_, high - mean_
def compute_AUC_score(yhat_all, oneStep_yhat_all, df, N, M, name=None):
    """Append a resampled membership-inference AUC summary row to ``df``.

    yhat_all: (2N, C) model output probabilities; rows [0, N) are "defender"
        (member) points, rows [N, 2N) are "reserved" (non-member) points.
    oneStep_yhat_all: (2N, C) probabilities after one fine-tuning step.
    df: list of result tuples; mutated in place and also returned.
    N: number of defender points (the reserved set is assumed equally large).
    M: number of index pairs drawn per resampling seed.
    name: label stored in the result row.  Defaults to the basename of the
        module-level ``folder_name`` for backward compatibility -- the
        original code read that global directly, which raised NameError
        whenever this function was used outside the __main__ loop.

    Returns ``df`` with one appended tuple (name, mean AUC, CI half-width),
    or unchanged if roc_auc_score raised ValueError for every seed.
    """
    # Per-example cross entropy between original and one-step predictions;
    # non-member points are expected to have larger distances.
    distances = -np.sum(yhat_all * np.log(oneStep_yhat_all), axis=1)
    auc_list = []
    for seed in range(10, 20):  # ten fixed resampling seeds for reproducibility
        np.random.seed(seed)
        # Keep the draw order identical to the original implementation so the
        # same seeded random stream yields the same masks.
        mask1_D = np.random.choice(np.arange(0, N), size=M)
        mask2_D = np.random.choice(np.arange(0, N), size=M)
        mask1_R = np.random.choice(np.arange(N, 2 * N), size=M)
        mask2_R = np.random.choice(np.arange(N, 2 * N), size=M)
        # First M pairs are ordered (reserved - defender) -> positive class.
        resample_mem_lbls = np.concatenate((np.ones((1, M)), np.zeros((1, M))), axis=1)
        distance_diff = np.concatenate(
            (distances[mask1_R] - distances[mask1_D],
             distances[mask2_D] - distances[mask2_R]), axis=0)
        try:
            auc_list.append(roc_auc_score(resample_mem_lbls[0, :], distance_diff))
        except ValueError:
            # roc_auc_score raises when only one class is present in the
            # labels; skip that seed rather than aborting the whole summary.
            pass
    if len(auc_list) > 0:
        auc_average, auc_confidence_interval = compute_mean_and_confidence_interval(auc_list)
        if name is None:
            # Legacy fallback to the module-level global used by __main__.
            name = os.path.basename(folder_name)
        df.append((name, auc_average, auc_confidence_interval))
    return df
# cross entropy between probability vectors
def cross_ent1(p, q):
    """Column-wise cross entropy -sum_i p_i * log(q_i) between probability matrices."""
    log_q = np.log(q)
    return -(p * log_q).sum(axis=0)
# instead of probabilities 'q', it uses the outputs before softmax
def cross_ent2(p, xw):
    """Column-wise cross entropy where xw holds pre-softmax outputs (logits).

    Uses the log-sum-exp shift by the column maximum to avoid overflow in exp.
    """
    shift = np.amax(xw, axis=0, keepdims=True)
    # -log softmax(xw) = logsumexp(xw - shift) - xw + shift
    neg_log_prob = np.log(np.exp(xw - shift).sum(axis=0)) - xw + shift
    return (p * neg_log_prob).sum(axis=0)
# cross entropy with buffer, in case any probabilities in q are 0 or 1
def cross_ent3(p, q):
    """Column-wise cross entropy tolerating q values of exactly 0 or 1.

    p: probability matrix (weights)
    q: probability matrix; all entries must lie in [0, 1]

    Raises ValueError if q contains values outside [0, 1].

    Values of exactly 0 are nudged up by a tiny positive amount and the whole
    array is rescaled slightly below 1 when needed, so np.log never sees 0.
    """
    if np.min(q) < 0 or np.max(q) > 1:
        raise ValueError("Array has values not in [0,1]")
    # Work on a copy: the original implementation adjusted q in place, which
    # silently corrupted the caller's array (A_ltu passes a transpose VIEW of
    # the probabilities, so the mutation propagated back to the source data).
    q = np.array(q, dtype=float)
    if np.min(q) == 0:
        low = np.min(q[q > 0])
        q += low / 1e12  # adding a small positive value so log(q) is finite
    if np.max(q) >= 1:
        high = np.max(q)
        q /= high * (1 + 1e-15)  # dividing by a number slightly larger than 1
    return -(p * np.log(q)).sum(axis=0)
# estimates the accuracy A_ltu, based on a sample of M pairs
def A_ltu(yhat_all, OneStep_yhat_all, fun=3, N=None, Nb=None, M=2500):
    """Estimate the A_ltu attack accuracy from M randomly sampled pairs.

    yhat_all: (N + Nb, C) model output probabilities; the first N rows are
        "defender" points, the remaining Nb rows are "reserved" points.
    OneStep_yhat_all: same shape; outputs after one fine-tuning step
        (probabilities, or raw pre-softmax outputs when fun == 2).
    fun: which distance to use between the two outputs per example:
        1 -- plain cross entropy (probabilities strictly in (0, 1)),
        2 -- OneStep_yhat_all holds pre-softmax outputs instead of probabilities,
        3 -- buffered cross entropy tolerating probabilities of exactly 0 or 1.
    N, Nb: defender / reserved set sizes; default splits the rows in half.
    M: number of (defender, reserved) pairs to sample.

    Returns (acc_ltu, se): the estimated accuracy (fraction of pairs where the
    defender point has the smaller distance, ties counting half) and its
    binomial standard error.

    Raises ValueError on shape mismatch, inconsistent N + Nb, or invalid fun.
    """
    # Validate shapes BEFORE computing distances (the original checked after).
    if yhat_all.shape != OneStep_yhat_all.shape:
        raise ValueError("The shapes of yhat_all and OneStep_yhat_all should be the same")
    # Per-example distance between original and one-step predictions.
    if fun == 1:
        dist_to_Mi_Ui = cross_ent1(yhat_all.T, OneStep_yhat_all.T)
    elif fun == 2:
        dist_to_Mi_Ui = cross_ent2(yhat_all.T, OneStep_yhat_all.T)
    elif fun == 3:
        dist_to_Mi_Ui = cross_ent3(yhat_all.T, OneStep_yhat_all.T)
    else:
        # Original code fell through with dist_to_Mi_Ui undefined, producing a
        # confusing NameError; fail fast with a clear message instead.
        raise ValueError("fun must be 1, 2 or 3")
    # By default, the first half of the data are "defender", the rest "reserved".
    if not N:
        N = yhat_all.shape[0] // 2
    if not Nb:
        Nb = yhat_all.shape[0] - N
    if (N + Nb) != yhat_all.shape[0]:
        print(N)
        print(Nb)
        print(yhat_all.shape[0])
        raise ValueError("N + Nb does not equal the size of yhat_all")
    # Masks of randomly chosen defender / reserved points (module-level RNG,
    # as in the original; seed externally for reproducibility).
    mask1_D = np.random.choice(N, size=M)
    mask1_R = np.random.choice(Nb, size=M) + N
    # Accuracy: proportion of pairs where the "defender" distance is smaller;
    # exact ties contribute half a success.
    acc_ltu = (np.mean(dist_to_Mi_Ui[mask1_D] < dist_to_Mi_Ui[mask1_R])
               + 0.5 * np.mean(dist_to_Mi_Ui[mask1_D] == dist_to_Mi_Ui[mask1_R]))
    # Binomial standard error of the accuracy estimate.
    se = np.sqrt(acc_ltu * (1 - acc_ltu) / M)
    return acc_ltu, se
def Privacy(yhat_all, oneStep_yhat_all, N=None, Nb=None, M=2500):
    """Privacy score 2 * (1 - A_ltu accuracy), with its scaled standard error.

    A perfectly private model (attack accuracy 0.5) scores 1; a fully exposed
    model (attack accuracy 1) scores 0.  Uses the buffered cross-entropy
    distance (fun=3).
    """
    accuracy, std_err = A_ltu(yhat_all, oneStep_yhat_all, fun=3, N=N, Nb=Nb, M=M)
    return 2 * (1 - accuracy), 2 * std_err
def get_se(p, sample_size):
    """Standard error of a proportion p estimated from sample_size observations."""
    variance = p * (1 - p)
    return np.sqrt(variance / sample_size)
if __name__ == "__main__":
    # Size of the defender set; the reserved set is assumed equally large,
    # so each *.npy file below is expected to hold 2 * N rows -- TODO confirm.
    N = 3000
    # Maps a results folder (under hz_intermediate_results/) to the reserve-set
    # classification accuracy, in percent, reported for that experiment.
    # Commented-out entries are experiments excluded from the final table.
    folder_names = {#"supervised_normal_fc": 97.186,
                    "supervised_normal_whole": 99.733,
                    #"supervised_long_fc": 97.616,
                    #"supervised_long_whole": 99.727,
                    #"supervised_flipped_fc": 78.223,
                    "supervised_flipped_whole": 77.828,
                    #"small_fake_mnist__attack_mode_forward_target_domain": 98.50556532793307,
                    "large_fake_mnist__attack_mode_forward_target_domain": 98.65239735308174,
                    #"small_fake_mnist__attack_mode_transfer_loss": 98.50556532793307,
                    "large_fake_mnist__attack_mode_transfer_loss": 98.65239735308174,
                    #"small_fake_mnist__attack_mode_total_loss": 98.50556532793307,
                    "large_fake_mnist__attack_mode_total_loss": 98.65239735308174}
    # Accumulates one row per experiment; converted to a DataFrame afterwards.
    df = []
    for folder_name, reserve_set_classification_accuracy in folder_names.items():
        folder_name = os.path.join("hz_intermediate_results", folder_name)
        # Model output probabilities before and after one fine-tuning step.
        with open(os.path.join(folder_name, "yhat_all.npy"), "rb") as f:
            yhat_all = np.load(f)
        with open(os.path.join(folder_name, "oneStep_yhat_all.npy"), "rb") as f:
            oneStep_yhat_all = np.load(f)
        """
        with open(os.path.join(folder_name, "gradNorm_all.npy"), "rb") as f:
            gradNorm_all = np.load(f)
        """
        # Number of defender/reserved pairs sampled for the privacy estimate.
        M = int(N / 2)
        #compute_AUC_score(yhat_all, oneStep_yhat_all, df, N, M)
        # Convert the percentage to a proportion for the standard-error formula.
        reserve_set_classification_accuracy /= 100
        privacy, privacy_standard_error = Privacy(yhat_all, oneStep_yhat_all, N=N, Nb=N, M=M)
        # NOTE(review): 202953 is presumably the evaluation-set size behind the
        # accuracy figures above -- confirm against the producing experiment.
        df.append((os.path.basename(folder_name), reserve_set_classification_accuracy,
                   get_se(reserve_set_classification_accuracy, 202953), privacy, privacy_standard_error))
    df = pd.DataFrame(df, columns=["folder_name", "utility", "utility_standard_error",
                                   "privacy", "privacy_standard_error"])
    # Render every numeric column with two decimals before writing the table.
    for col in ["utility", "utility_standard_error", "privacy", "privacy_standard_error"]:
        df[col] = df[col].apply("{:.2f}".format)
    df.to_csv("hz_table.csv", sep="\t", encoding="utf-8")
    print(df)