-
Notifications
You must be signed in to change notification settings - Fork 196
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Code for the Location Heatmaps paper. #47
base: master
Are you sure you want to change the base?
Changes from 1 commit
5cd1d21
bb7372a
b377c5e
2eef817
b87b69b
559225c
56f0dce
e0e3fbe
ec363b4
009edfd
92826fb
d2697fd
8c6890d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ | |
import dataclasses | ||
import random | ||
from typing import List, Any | ||
from scipy.stats import norm | ||
|
||
import numpy as np | ||
import pygtrie | ||
|
@@ -32,7 +33,16 @@ | |
DEFAULT_CHILDREN = ['00', '01', '10', '11'] | ||
|
||
|
||
def get_default_children(aux_data, split=None): | ||
def get_default_children(aux_data=False, split=None): | ||
"""Returns a quad tree first 4 nodes. If aux_data (boolean) provided expands | ||
to 2 more bits or a specific pos/neg nodes. | ||
Args: | ||
aux_data: a boolean to use additional bit for data, e.g. pos/neg. | ||
split: specific subset of aux_data (pos/neg). | ||
|
||
Returns: | ||
A list of nodes to initialize the tree. | ||
""" | ||
if aux_data: | ||
if split == 'pos': | ||
return ['001', '011', '101', '111'] | ||
|
@@ -85,16 +95,16 @@ def coordinates_to_binary_path(xy_tuple, depth=10): | |
Returns: | ||
binary version of the coordinate. | ||
""" | ||
aux_data = '' | ||
if len(xy_tuple) == 2: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it'd be cleaner to just add another arg for 'pos', and maybe even split the x_coord and y_coord into separate args, but at the very least we should document in the docstring that xy_tuple can actually be an x, y, pos triplet. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this comment applies to aux_data now that that is being pulled from xy_tuple. |
||
x_coord, y_coord = xy_tuple | ||
aux_data = False | ||
pos = '' | ||
else: | ||
x_coord, y_coord, pos = xy_tuple | ||
x_coord, y_coord, aux_data = xy_tuple | ||
path = '' | ||
for j in reversed(range(depth)): | ||
path += f'{(x_coord >> j) & 1}{(y_coord >> j) & 1}{pos}/' | ||
path += f'{(x_coord >> j) & 1}{(y_coord >> j) & 1}{aux_data}/' | ||
path = path[:-1] | ||
|
||
return path | ||
|
||
|
||
|
@@ -189,7 +199,7 @@ def transform_region_to_coordinates(x_coord, | |
|
||
|
||
def rebuild_from_vector(vector, tree, image_size, contour=False, split_threshold=0, | ||
aux_data=False, count_min=False): | ||
aux_data=False, count_min=None): | ||
"""Using coordinate vector and the tree produce a resulting image. | ||
|
||
For each value in the vector it finds the corresponding prefix and plots the | ||
|
@@ -260,9 +270,9 @@ def rebuild_from_vector(vector, tree, image_size, contour=False, split_threshold | |
return current_image, pos_image, neg_image | ||
|
||
|
||
def update_tree(prefix, tree, tree_prefix_list): | ||
def append_to_tree(prefix, tree, tree_prefix_list): | ||
""" | ||
Update tree with new prefix | ||
Append new node to the tree. | ||
Args: | ||
prefix: new path, e.g. '10/01/10' | ||
tree: current tree | ||
|
@@ -284,9 +294,9 @@ def split_regions(tree_prefix_list, | |
split_threshold, | ||
image_bit_level, | ||
collapse_threshold=None, | ||
expand_all=False, | ||
last_result: AlgResult = None, | ||
count_min=None): | ||
count_min=None, | ||
print_output=False): | ||
"""Modify the tree by splitting and collapsing the nodes. | ||
|
||
This implementation collapses and splits nodes of the tree according to | ||
|
@@ -299,23 +309,23 @@ def split_regions(tree_prefix_list, | |
split_threshold: threshold value used to split the nodes. | ||
image_bit_level: stopping criteria once the final resolution is reached. | ||
collapse_threshold: threshold value used to collapse the nodes. | ||
expand_all: expand all regions, | ||
last_result: use previous level results to compute conf intervals, | ||
count_min: use count-min sketch | ||
count_min: use count-min sketch. | ||
print_output: print results of splitting. | ||
Returns: | ||
new_tree, new_tree_prefix_list, fresh_expand | ||
new_tree, new_tree_prefix_list, num_newly_expanded_nodes | ||
""" | ||
collapsed = 0 | ||
created = 0 | ||
fresh_expand = 0 | ||
num_newly_expanded_nodes = 0 | ||
unchanged = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. collapsed, created, and unchanged do not appear to be used for anything anymore. let's delete them xor do something with them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. printing the results in the end of the function now |
||
new_tree_prefix_list = list() | ||
new_tree = pygtrie.StringTrie() | ||
for i in range(len(tree_prefix_list)): | ||
if count_min: | ||
count = count_min.query(tree_prefix_list[i]) | ||
else: | ||
count = vector_counts[i] if not expand_all else np.inf | ||
count = vector_counts[i] if vector_counts else np.inf | ||
prefix = tree_prefix_list[i] | ||
|
||
# check whether the tree has reached the bottom | ||
|
@@ -326,30 +336,32 @@ def split_regions(tree_prefix_list, | |
split_threshold) | ||
else: | ||
cond = count > split_threshold | ||
if expand_all or cond: | ||
if cond: | ||
for child in DEFAULT_CHILDREN: | ||
new_prefix = f'{prefix}/{child}' | ||
fresh_expand += update_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
num_newly_expanded_nodes += append_to_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
else: | ||
if collapse_threshold is not None and \ | ||
count <= collapse_threshold and len(prefix) > 2: | ||
old_prefix = prefix[:-3] | ||
collapsed += 1 | ||
created += update_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
created += append_to_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
else: | ||
unchanged += update_tree(prefix, new_tree, new_tree_prefix_list) | ||
|
||
return new_tree, new_tree_prefix_list, fresh_expand | ||
unchanged += append_to_tree(prefix, new_tree, new_tree_prefix_list) | ||
if print_output: | ||
print(f'New: {num_newly_expanded_nodes}. Collapsed: {collapsed}. ' + \ | ||
f'Created from collapsed: {created}. Unchanged: {unchanged}.') | ||
return new_tree, new_tree_prefix_list, num_newly_expanded_nodes | ||
|
||
|
||
def split_regions_aux(tree_prefix_list, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suspect that even more of these two functions could be shared (in particular, the basic structure of looping over prefixes and adding nodes to the tree as appropriate for the splitting & collapsing criteria), but acknowledge that it may not actually improve readability much more to do further surgery. Please consider sharing that prefix-looping structure, but if you can't see a clean and easy way to do so, that's fine. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, I agree, it's just that I need to look at both bits in data that is hard to unify. Maybe once we go to multiple dimensions we can just unify everything. |
||
vector_counts, | ||
split_threshold, | ||
image_bit_level, | ||
collapse_threshold=None, | ||
expand_all=False, | ||
last_result: AlgResult = None, | ||
count_min=None): | ||
count_min=None, | ||
print_output=False): | ||
"""Use expansion with aux data. | ||
|
||
We check both counts for positive and negative attributes for each location. | ||
|
@@ -360,24 +372,24 @@ def split_regions_aux(tree_prefix_list, | |
split_threshold: threshold value used to split the nodes. | ||
image_bit_level: stopping criteria once the final resolution is reached. | ||
collapse_threshold: threshold value used to collapse the nodes. | ||
expand_all: expand all regions, | ||
last_result: use previous level results to compute conf intervals, | ||
count_min: use count-min sketch | ||
print_output: print results of splitting | ||
Returns: | ||
new_tree, new_tree_prefix_list, fresh_expand | ||
new_tree, new_tree_prefix_list, num_newly_expanded_nodes | ||
""" | ||
new_tree_prefix_list = list() | ||
new_tree = pygtrie.StringTrie() | ||
collapsed = 0 | ||
created = 0 | ||
fresh_expand = 0 | ||
num_newly_expanded_nodes = 0 | ||
unchanged = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. likewise re. collapsed, created, and unchanged being unused There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
|
||
for i in range(0, len(tree_prefix_list), 2): | ||
if count_min: | ||
raise ValueError('CountMin is not implemented for Aux data.') | ||
neg_count = vector_counts[i] if not expand_all else np.inf | ||
pos_count = vector_counts[i + 1] if not expand_all else np.inf | ||
neg_count = vector_counts[i] if vector_counts else np.inf | ||
pos_count = vector_counts[i + 1] if vector_counts else np.inf | ||
neg_prefix = tree_prefix_list[i] | ||
pos_prefix = tree_prefix_list[i + 1] | ||
|
||
|
@@ -392,28 +404,30 @@ def split_regions_aux(tree_prefix_list, | |
cond = p_cond and n_cond | ||
else: | ||
cond = (pos_count > split_threshold and neg_count > split_threshold) | ||
if expand_all or cond: | ||
if cond: | ||
neg_child = get_default_children(aux_data=True, split='neg') | ||
pos_child = get_default_children(aux_data=True, split='pos') | ||
for j in range(len(pos_child)): | ||
new_prefix = f'{neg_prefix}/{neg_child[j]}' | ||
fresh_expand += update_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
num_newly_expanded_nodes += append_to_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
new_prefix = f'{pos_prefix}/{pos_child[j]}' | ||
update_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
append_to_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
else: | ||
if collapse_threshold is not None and \ | ||
(pos_count < collapse_threshold or neg_count < collapse_threshold) \ | ||
and len(pos_prefix) > 3 and len(neg_prefix) > 3: | ||
old_prefix = neg_prefix[:-4] | ||
collapsed += 1 | ||
created += update_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
created += append_to_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
old_prefix = pos_prefix[:-4] | ||
update_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
append_to_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
else: | ||
unchanged += update_tree(neg_prefix, new_tree, new_tree_prefix_list) | ||
update_tree(pos_prefix, new_tree, new_tree_prefix_list) | ||
|
||
return new_tree, new_tree_prefix_list, fresh_expand | ||
unchanged += append_to_tree(neg_prefix, new_tree, new_tree_prefix_list) | ||
append_to_tree(pos_prefix, new_tree, new_tree_prefix_list) | ||
if print_output: | ||
print(f'New: {num_newly_expanded_nodes}. Collapsed: {collapsed}. ' + \ | ||
f'Created from collapsed: {created}. Unchanged: {unchanged}.') | ||
return new_tree, new_tree_prefix_list, num_newly_expanded_nodes | ||
|
||
|
||
def build_from_sample(samples, total_size): | ||
|
@@ -500,8 +514,6 @@ def convert_to_dataset(image, total_size, value=None): | |
|
||
|
||
def compute_conf_intervals(sum_vector: np.ndarray, level=95): | ||
from scipy.stats import norm | ||
|
||
conf_intervals = dict() | ||
conf_interval_weighted = dict() | ||
z = norm.ppf(1-(1-level/100)/2) | ||
|
@@ -521,7 +533,19 @@ def compute_conf_intervals(sum_vector: np.ndarray, level=95): | |
return conf_intervals, conf_interval_weighted | ||
|
||
|
||
def create_confidence_interval_condition(last_result, prefix, count, split_threshold): | ||
def evaluate_confidence_interval_condition(last_result, prefix, count, split_threshold): | ||
"""Evaluate whether the confidence interval is smaller than the the threshold. | ||
We compute the confidence interval by comparing a current value in a sub-region | ||
with its parent region's value from the previous level. | ||
Args: | ||
last_result: a previous level tree results and vector counts | ||
prefix: current node prefix. | ||
count: current node count. | ||
split_threshold: threshold to cutoff confidence interval. | ||
|
||
Returns: | ||
whether the node satisfies confidence interval threshold. | ||
""" | ||
|
||
(last_prefix, last_prefix_pos) = last_result.tree.longest_prefix(prefix) | ||
if last_prefix is None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,15 +32,15 @@ | |
import mechanisms | ||
import metrics | ||
import plotting | ||
from sketches import get_count_min_sketch | ||
from sketches import CountMinSketch | ||
from config import Config | ||
|
||
TOPK = 1000 | ||
TOTAL_SIZE = 1024 | ||
|
||
|
||
def get_data(path, crop_tuple=(512, 100, 1536, 1124), | ||
total_size=1024, save=True): | ||
total_size=1024, save=True, dataset_name='dataset.npy'): | ||
"""Download the map image. | ||
|
||
Downloads the image from a given path, crops it and transforms into a list | ||
|
@@ -61,20 +61,16 @@ def get_data(path, crop_tuple=(512, 100, 1536, 1124), | |
image = Image.open(f).convert('L') | ||
image = image.crop(crop_tuple) | ||
true_image = np.asarray(image) | ||
if os.path.isfile('dataset.npy'): | ||
dataset = np.load('dataset.npy') | ||
if os.path.isfile(dataset_name): | ||
dataset = np.load(dataset_name) | ||
else: | ||
dataset = geo_utils.convert_to_dataset(true_image, total_size) | ||
if save: | ||
np.save('dataset', dataset) | ||
np.save(dataset_name, dataset) | ||
|
||
return true_image, dataset | ||
|
||
|
||
def get_split_data(path): | ||
dataset = np.load(path) | ||
|
||
|
||
def print_output(text, flag): | ||
"""Simple flag to suppress output.""" | ||
|
||
|
@@ -106,8 +102,8 @@ def run_experiment(true_image, | |
start_with_level=0, | ||
ignore_start_eps=False, | ||
last_result_ci=None, | ||
count_min=False) -> List[geo_utils.AlgResult]: | ||
"""The main method to run an experiment using TrieHH. | ||
count_min=None) -> List[geo_utils.AlgResult]: | ||
""" The main method to run the experiments. | ||
|
||
Args: | ||
true_image: original image for comparison | ||
|
@@ -136,7 +132,7 @@ def run_experiment(true_image, | |
start_with_level: skip first levels and always expand them. | ||
ignore_start_eps: ignore spending epsilon when using start_with_level. | ||
last_result_ci: for two label save previous results. | ||
count_min: use count-min sketch. | ||
count_min: to use count-min sketch use dict: {'depth': 20, 'width': 4000} | ||
|
||
Returns: | ||
A list of per level geo_utls.AlgResult objects. | ||
|
@@ -167,7 +163,7 @@ def run_experiment(true_image, | |
tree, tree_prefix_list = geo_utils.init_tree(config.aux_data) | ||
per_level_results = list() | ||
per_level_grid = list() | ||
fresh_expand = None | ||
num_newly_expanded_nodes = None | ||
sum_vector = None | ||
print_output(f'aux_data: {config.aux_data}', config.output_flag) | ||
process_split = geo_utils.split_regions_aux if aux_data else geo_utils.split_regions | ||
|
@@ -181,13 +177,15 @@ def run_experiment(true_image, | |
# define DP round size | ||
dp_round_size = config.min_dp_size if config.min_dp_size else config.secagg_round_size | ||
if config.split_threshold and config.split_threshold_func: | ||
raise ValueError('Specify either `threshold` or `threshold_func`.') | ||
raise ValueError('Specify either `threshold` xor `threshold_func`.') | ||
if collapse_threshold and collapse_func: | ||
raise ValueError( | ||
'Specify either `collapse_threshold` or `collapse_func`.') | ||
'Specify either `collapse_threshold` xor `collapse_func`.') | ||
|
||
# sample devices that will participate in the algorithm (same across levels): | ||
samples = np.random.choice(dataset, config.level_sample_size, replace=False) | ||
if count_min: | ||
count_min_sketch = get_count_min_sketch(depth=20, width=2000) | ||
if count_min is not None: | ||
count_min_sketch = CountMinSketch(depth=count_min['depth'], width=count_min['width']) | ||
sensitivity = 20 | ||
else: | ||
count_min_sketch = None | ||
|
@@ -217,7 +215,7 @@ def run_experiment(true_image, | |
# prevent spilling over the budget | ||
if remaining_budget: | ||
# last round, no progress in tree, or cannot run at least two rounds. | ||
if i == max_levels - 1 or fresh_expand == 0 \ | ||
if i == max_levels - 1 or num_newly_expanded_nodes == 0 \ | ||
or remaining_budget < 2 * eps * samples_len: | ||
print_output( | ||
'Last round. Spending remaining epsilon budget: ' + \ | ||
|
@@ -227,6 +225,7 @@ def run_experiment(true_image, | |
noiser = noise_class(dp_round_size, sensitivity, eps) | ||
if ignore_start_eps and start_with_level <= i: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add a comment here or in the documentation for 'ignore_start_eps' explaining the motivation? If the idea is to only start accounting at level 'start_with_level', shouldn't the second condition be reversed such that spent budget is ignored for the levels before 'start_with_level'? Apologies if I'm just totally misreading this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated and added into docstring |
||
print_output('Ignoring eps spent', flag=output_flag) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: this is a frightening message; it would be nice to have a bit of extra context here (e.g., "Ignoring epsilon spent expanding first {start_with_level} levels, including current level {i}."). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
spent_budget = 0 | ||
else: | ||
spent_budget += eps * samples_len | ||
|
||
|
@@ -242,12 +241,12 @@ def run_experiment(true_image, | |
|
||
# to prevent OOM errors we use vectors of size partial. | ||
if start_with_level > i: | ||
tree, tree_prefix_list, fresh_expand = process_split( | ||
tree, tree_prefix_list, num_newly_expanded_nodes = process_split( | ||
tree_prefix_list=tree_prefix_list, | ||
vector_counts=None, | ||
split_threshold=split_threshold, image_bit_level=10, | ||
split_threshold=-np.inf, image_bit_level=10, | ||
collapse_threshold=collapse_threshold, | ||
expand_all=True, count_min=count_min) | ||
count_min=count_min, print_output=output_flag) | ||
print_output(f"Expanding all at the level: {i}.", output_flag) | ||
continue | ||
|
||
|
@@ -287,12 +286,12 @@ def run_experiment(true_image, | |
else: | ||
last_result = per_level_results[i - 1] | ||
|
||
tree, tree_prefix_list, fresh_expand = process_split( | ||
tree, tree_prefix_list, num_newly_expanded_nodes = process_split( | ||
tree_prefix_list=result.tree_prefix_list, vector_counts=result.sum_vector, | ||
split_threshold=split_threshold, image_bit_level=10, | ||
collapse_threshold=collapse_threshold, | ||
last_result=last_result) | ||
if fresh_expand==0: | ||
last_result=last_result, print_output=output_flag) | ||
if num_newly_expanded_nodes==0: | ||
break | ||
if output_flag: | ||
print(f'Total epsilon-users: {spent_budget:.2f} with ' + \ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO aux_data sounds like an actual data object rather than a boolean parameter; I would prefer a name like "has_aux_data" or even "has_aux_bit" since a single bit is all that's supported here. This also goes for other usages of "aux_data" as a boolean in other functions, below.
Really it would be ideal to just generalize this to support an arbitrary number of extra bits with an automatic encoding from the value specified in "split", rather than a single extra bit with a predefined 'pos'-->1 and 'neg'-->0 encoding, but I understand that is probably out of scope at present.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree, let me change it to has_aux_bit for now, and maybe can expand it later