-
Notifications
You must be signed in to change notification settings - Fork 196
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Code for the Location Heatmaps paper. #47
base: master
Are you sure you want to change the base?
Changes from 1 commit
5cd1d21
bb7372a
b377c5e
2eef817
b87b69b
559225c
56f0dce
e0e3fbe
ec363b4
009edfd
92826fb
d2697fd
8c6890d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ | |
import dataclasses | ||
import random | ||
from typing import List, Any | ||
from scipy.stats import norm | ||
|
||
import numpy as np | ||
import pygtrie | ||
|
@@ -32,7 +33,16 @@ | |
DEFAULT_CHILDREN = ['00', '01', '10', '11'] | ||
|
||
|
||
def get_default_children(aux_data, split=None): | ||
def get_default_children(aux_data=False, split=None): | ||
"""Returns a quad tree first 4 nodes. If aux_data (boolean) provided expands | ||
to 2 more bits or a specific pos/neg nodes. | ||
Args: | ||
aux_data: a boolean to use additional bit for data, e.g. pos/neg. | ||
split: specific subset of aux_data (pos/neg). | ||
|
||
Returns: | ||
A list of nodes to initialize the tree. | ||
""" | ||
if aux_data: | ||
if split == 'pos': | ||
return ['001', '011', '101', '111'] | ||
|
@@ -85,16 +95,16 @@ def coordinates_to_binary_path(xy_tuple, depth=10): | |
Returns: | ||
binary version of the coordinate. | ||
""" | ||
aux_data = '' | ||
if len(xy_tuple) == 2: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it'd be cleaner to just add another arg for 'pos', and maybe even split the x_coord and y_coord into separate args, but at the very least we should document in the docstring that xy_tuple can actually be an x, y, pos triplet. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this comment applies to aux_data now that that is being pulled from xy_tuple. |
||
x_coord, y_coord = xy_tuple | ||
aux_data = False | ||
pos = '' | ||
else: | ||
x_coord, y_coord, pos = xy_tuple | ||
x_coord, y_coord, aux_data = xy_tuple | ||
path = '' | ||
for j in reversed(range(depth)): | ||
path += f'{(x_coord >> j) & 1}{(y_coord >> j) & 1}{pos}/' | ||
path += f'{(x_coord >> j) & 1}{(y_coord >> j) & 1}{aux_data}/' | ||
path = path[:-1] | ||
|
||
return path | ||
|
||
|
||
|
@@ -189,7 +199,7 @@ def transform_region_to_coordinates(x_coord, | |
|
||
|
||
def rebuild_from_vector(vector, tree, image_size, contour=False, split_threshold=0, | ||
aux_data=False, count_min=False): | ||
aux_data=False, count_min=None): | ||
"""Using coordinate vector and the tree produce a resulting image. | ||
|
||
For each value in the vector it finds the corresponding prefix and plots the | ||
|
@@ -260,9 +270,9 @@ def rebuild_from_vector(vector, tree, image_size, contour=False, split_threshold | |
return current_image, pos_image, neg_image | ||
|
||
|
||
def update_tree(prefix, tree, tree_prefix_list): | ||
def append_to_tree(prefix, tree, tree_prefix_list): | ||
""" | ||
Update tree with new prefix | ||
Append new node to the tree. | ||
Args: | ||
prefix: new path, e.g. '10/01/10' | ||
tree: current tree | ||
|
@@ -284,9 +294,9 @@ def split_regions(tree_prefix_list, | |
split_threshold, | ||
image_bit_level, | ||
collapse_threshold=None, | ||
expand_all=False, | ||
last_result: AlgResult = None, | ||
count_min=None): | ||
count_min=None, | ||
print_output=False): | ||
"""Modify the tree by splitting and collapsing the nodes. | ||
|
||
This implementation collapses and splits nodes of the tree according to | ||
|
@@ -299,23 +309,23 @@ def split_regions(tree_prefix_list, | |
split_threshold: threshold value used to split the nodes. | ||
image_bit_level: stopping criteria once the final resolution is reached. | ||
collapse_threshold: threshold value used to collapse the nodes. | ||
expand_all: expand all regions, | ||
last_result: use previous level results to compute conf intervals, | ||
count_min: use count-min sketch | ||
count_min: use count-min sketch. | ||
print_output: print results of splitting. | ||
Returns: | ||
new_tree, new_tree_prefix_list, fresh_expand | ||
new_tree, new_tree_prefix_list, num_newly_expanded_nodes | ||
""" | ||
collapsed = 0 | ||
created = 0 | ||
fresh_expand = 0 | ||
num_newly_expanded_nodes = 0 | ||
unchanged = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. collapsed, created, and unchanged do not appear to be used for anything anymore. let's delete them xor do something with them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. printing the results in the end of the function now |
||
new_tree_prefix_list = list() | ||
new_tree = pygtrie.StringTrie() | ||
for i in range(len(tree_prefix_list)): | ||
if count_min: | ||
count = count_min.query(tree_prefix_list[i]) | ||
else: | ||
count = vector_counts[i] if not expand_all else np.inf | ||
count = vector_counts[i] if vector_counts else np.inf | ||
prefix = tree_prefix_list[i] | ||
|
||
# check whether the tree has reached the bottom | ||
|
@@ -326,30 +336,32 @@ def split_regions(tree_prefix_list, | |
split_threshold) | ||
else: | ||
cond = count > split_threshold | ||
if expand_all or cond: | ||
if cond: | ||
for child in DEFAULT_CHILDREN: | ||
new_prefix = f'{prefix}/{child}' | ||
fresh_expand += update_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
num_newly_expanded_nodes += append_to_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
else: | ||
if collapse_threshold is not None and \ | ||
count <= collapse_threshold and len(prefix) > 2: | ||
old_prefix = prefix[:-3] | ||
collapsed += 1 | ||
created += update_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
created += append_to_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
else: | ||
unchanged += update_tree(prefix, new_tree, new_tree_prefix_list) | ||
|
||
return new_tree, new_tree_prefix_list, fresh_expand | ||
unchanged += append_to_tree(prefix, new_tree, new_tree_prefix_list) | ||
if print_output: | ||
print(f'New: {num_newly_expanded_nodes}. Collapsed: {collapsed}. ' + \ | ||
f'Created from collapsed: {created}. Unchanged: {unchanged}.') | ||
return new_tree, new_tree_prefix_list, num_newly_expanded_nodes | ||
|
||
|
||
def split_regions_aux(tree_prefix_list, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suspect that even more of these two functions could be shared (in particular, the basic structure of looping over prefixes and adding nodes to the tree as appropriate for the splitting & collapsing criteria), but acknowledge that it may not actually improve readability much more to do further surgery. Please consider sharing that prefix-looping structure, but if you can't see a clean and easy way to do so, that's fine. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, I agree, it's just that I need to look at both bits in data that is hard to unify. Maybe once we go to multiple dimensions we can just unify everything. |
||
vector_counts, | ||
split_threshold, | ||
image_bit_level, | ||
collapse_threshold=None, | ||
expand_all=False, | ||
last_result: AlgResult = None, | ||
count_min=None): | ||
count_min=None, | ||
print_output=False): | ||
"""Use expansion with aux data. | ||
|
||
We check both counts for positive and negative attributes for each location. | ||
|
@@ -360,24 +372,24 @@ def split_regions_aux(tree_prefix_list, | |
split_threshold: threshold value used to split the nodes. | ||
image_bit_level: stopping criteria once the final resolution is reached. | ||
collapse_threshold: threshold value used to collapse the nodes. | ||
expand_all: expand all regions, | ||
last_result: use previous level results to compute conf intervals, | ||
count_min: use count-min sketch | ||
print_output: print results of splitting | ||
Returns: | ||
new_tree, new_tree_prefix_list, fresh_expand | ||
new_tree, new_tree_prefix_list, num_newly_expanded_nodes | ||
""" | ||
new_tree_prefix_list = list() | ||
new_tree = pygtrie.StringTrie() | ||
collapsed = 0 | ||
created = 0 | ||
fresh_expand = 0 | ||
num_newly_expanded_nodes = 0 | ||
unchanged = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. likewise re. collapsed, created, and unchanged being unused There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
|
||
for i in range(0, len(tree_prefix_list), 2): | ||
if count_min: | ||
raise ValueError('CountMin is not implemented for Aux data.') | ||
neg_count = vector_counts[i] if not expand_all else np.inf | ||
pos_count = vector_counts[i + 1] if not expand_all else np.inf | ||
neg_count = vector_counts[i] if vector_counts else np.inf | ||
pos_count = vector_counts[i + 1] if vector_counts else np.inf | ||
neg_prefix = tree_prefix_list[i] | ||
pos_prefix = tree_prefix_list[i + 1] | ||
|
||
|
@@ -392,28 +404,30 @@ def split_regions_aux(tree_prefix_list, | |
cond = p_cond and n_cond | ||
else: | ||
cond = (pos_count > split_threshold and neg_count > split_threshold) | ||
if expand_all or cond: | ||
if cond: | ||
neg_child = get_default_children(aux_data=True, split='neg') | ||
pos_child = get_default_children(aux_data=True, split='pos') | ||
for j in range(len(pos_child)): | ||
new_prefix = f'{neg_prefix}/{neg_child[j]}' | ||
fresh_expand += update_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
num_newly_expanded_nodes += append_to_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
new_prefix = f'{pos_prefix}/{pos_child[j]}' | ||
update_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
append_to_tree(new_prefix, new_tree, new_tree_prefix_list) | ||
else: | ||
if collapse_threshold is not None and \ | ||
(pos_count < collapse_threshold or neg_count < collapse_threshold) \ | ||
and len(pos_prefix) > 3 and len(neg_prefix) > 3: | ||
old_prefix = neg_prefix[:-4] | ||
collapsed += 1 | ||
created += update_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
created += append_to_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
old_prefix = pos_prefix[:-4] | ||
update_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
append_to_tree(old_prefix, new_tree, new_tree_prefix_list) | ||
else: | ||
unchanged += update_tree(neg_prefix, new_tree, new_tree_prefix_list) | ||
update_tree(pos_prefix, new_tree, new_tree_prefix_list) | ||
|
||
return new_tree, new_tree_prefix_list, fresh_expand | ||
unchanged += append_to_tree(neg_prefix, new_tree, new_tree_prefix_list) | ||
append_to_tree(pos_prefix, new_tree, new_tree_prefix_list) | ||
if print_output: | ||
print(f'New: {num_newly_expanded_nodes}. Collapsed: {collapsed}. ' + \ | ||
f'Created from collapsed: {created}. Unchanged: {unchanged}.') | ||
return new_tree, new_tree_prefix_list, num_newly_expanded_nodes | ||
|
||
|
||
def build_from_sample(samples, total_size): | ||
|
@@ -500,8 +514,6 @@ def convert_to_dataset(image, total_size, value=None): | |
|
||
|
||
def compute_conf_intervals(sum_vector: np.ndarray, level=95): | ||
from scipy.stats import norm | ||
|
||
conf_intervals = dict() | ||
conf_interval_weighted = dict() | ||
z = norm.ppf(1-(1-level/100)/2) | ||
|
@@ -521,7 +533,19 @@ def compute_conf_intervals(sum_vector: np.ndarray, level=95): | |
return conf_intervals, conf_interval_weighted | ||
|
||
|
||
def create_confidence_interval_condition(last_result, prefix, count, split_threshold): | ||
def evaluate_confidence_interval_condition(last_result, prefix, count, split_threshold): | ||
"""Evaluate whether the confidence interval is smaller than the the threshold. | ||
We compute the confidence interval by comparing a current value in a sub-region | ||
with its parent region's value from the previous level. | ||
Args: | ||
last_result: a previous level tree results and vector counts | ||
prefix: current node prefix. | ||
count: current node count. | ||
split_threshold: threshold to cutoff confidence interval. | ||
|
||
Returns: | ||
whether the node satisfies confidence interval threshold. | ||
""" | ||
|
||
(last_prefix, last_prefix_pos) = last_result.tree.longest_prefix(prefix) | ||
if last_prefix is None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,15 +32,15 @@ | |
import mechanisms | ||
import metrics | ||
import plotting | ||
from sketches import get_count_min_sketch | ||
from sketches import CountMinSketch | ||
from config import Config | ||
|
||
TOPK = 1000 | ||
TOTAL_SIZE = 1024 | ||
|
||
|
||
def get_data(path, crop_tuple=(512, 100, 1536, 1124), | ||
total_size=1024, save=True): | ||
total_size=1024, save=True, dataset_name='dataset.npy'): | ||
"""Download the map image. | ||
|
||
Downloads the image from a given path, crops it and transforms into a list | ||
|
@@ -61,20 +61,16 @@ def get_data(path, crop_tuple=(512, 100, 1536, 1124), | |
image = Image.open(f).convert('L') | ||
image = image.crop(crop_tuple) | ||
true_image = np.asarray(image) | ||
if os.path.isfile('dataset.npy'): | ||
dataset = np.load('dataset.npy') | ||
if os.path.isfile(dataset_name): | ||
dataset = np.load(dataset_name) | ||
else: | ||
dataset = geo_utils.convert_to_dataset(true_image, total_size) | ||
if save: | ||
np.save('dataset', dataset) | ||
np.save(dataset_name, dataset) | ||
|
||
return true_image, dataset | ||
|
||
|
||
def get_split_data(path): | ||
dataset = np.load(path) | ||
|
||
|
||
def print_output(text, flag): | ||
"""Simple flag to suppress output.""" | ||
|
||
|
@@ -106,8 +102,8 @@ def run_experiment(true_image, | |
start_with_level=0, | ||
ignore_start_eps=False, | ||
last_result_ci=None, | ||
count_min=False) -> List[geo_utils.AlgResult]: | ||
"""The main method to run an experiment using TrieHH. | ||
count_min=None) -> List[geo_utils.AlgResult]: | ||
""" The main method to run the experiments. | ||
|
||
Args: | ||
true_image: original image for comparison | ||
|
@@ -136,7 +132,7 @@ def run_experiment(true_image, | |
start_with_level: skip first levels and always expand them. | ||
ignore_start_eps: ignore spending epsilon when using start_with_level. | ||
last_result_ci: for two label save previous results. | ||
count_min: use count-min sketch. | ||
count_min: to use count-min sketch use dict: {'depth': 20, 'width': 4000} | ||
|
||
Returns: | ||
A list of per level geo_utls.AlgResult objects. | ||
|
@@ -167,7 +163,7 @@ def run_experiment(true_image, | |
tree, tree_prefix_list = geo_utils.init_tree(config.aux_data) | ||
per_level_results = list() | ||
per_level_grid = list() | ||
fresh_expand = None | ||
num_newly_expanded_nodes = None | ||
sum_vector = None | ||
print_output(f'aux_data: {config.aux_data}', config.output_flag) | ||
process_split = geo_utils.split_regions_aux if aux_data else geo_utils.split_regions | ||
|
@@ -181,13 +177,15 @@ def run_experiment(true_image, | |
# define DP round size | ||
dp_round_size = config.min_dp_size if config.min_dp_size else config.secagg_round_size | ||
if config.split_threshold and config.split_threshold_func: | ||
raise ValueError('Specify either `threshold` or `threshold_func`.') | ||
raise ValueError('Specify either `threshold` xor `threshold_func`.') | ||
if collapse_threshold and collapse_func: | ||
raise ValueError( | ||
'Specify either `collapse_threshold` or `collapse_func`.') | ||
'Specify either `collapse_threshold` xor `collapse_func`.') | ||
|
||
# sample devices that will participate in the algorithm (same across levels): | ||
samples = np.random.choice(dataset, config.level_sample_size, replace=False) | ||
if count_min: | ||
count_min_sketch = get_count_min_sketch(depth=20, width=2000) | ||
if count_min is not None: | ||
count_min_sketch = CountMinSketch(depth=count_min['depth'], width=count_min['width']) | ||
sensitivity = 20 | ||
else: | ||
count_min_sketch = None | ||
|
@@ -217,7 +215,7 @@ def run_experiment(true_image, | |
# prevent spilling over the budget | ||
if remaining_budget: | ||
# last round, no progress in tree, or cannot run at least two rounds. | ||
if i == max_levels - 1 or fresh_expand == 0 \ | ||
if i == max_levels - 1 or num_newly_expanded_nodes == 0 \ | ||
or remaining_budget < 2 * eps * samples_len: | ||
print_output( | ||
'Last round. Spending remaining epsilon budget: ' + \ | ||
|
@@ -227,6 +225,7 @@ def run_experiment(true_image, | |
noiser = noise_class(dp_round_size, sensitivity, eps) | ||
if ignore_start_eps and start_with_level <= i: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add a comment here or in the documentation for 'ignore_start_eps' explaining the motivation? If the idea is to only start accounting at level 'start_with_level', shouldn't the second condition be reversed such that spent budget is ignored for the levels before 'start_with_level'? Apologies if I'm just totally misreading this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated and added into docstring |
||
print_output('Ignoring eps spent', flag=output_flag) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: this is a frightening message; it would be nice to have a bit of extra context here (e.g., "Ignoring epsilon spent expanding first {start_with_level} levels, including current level {i}."). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
spent_budget = 0 | ||
else: | ||
spent_budget += eps * samples_len | ||
|
||
|
@@ -242,12 +241,12 @@ def run_experiment(true_image, | |
|
||
# to prevent OOM errors we use vectors of size partial. | ||
if start_with_level > i: | ||
tree, tree_prefix_list, fresh_expand = process_split( | ||
tree, tree_prefix_list, num_newly_expanded_nodes = process_split( | ||
tree_prefix_list=tree_prefix_list, | ||
vector_counts=None, | ||
split_threshold=split_threshold, image_bit_level=10, | ||
split_threshold=-np.inf, image_bit_level=10, | ||
collapse_threshold=collapse_threshold, | ||
expand_all=True, count_min=count_min) | ||
count_min=count_min, print_output=output_flag) | ||
print_output(f"Expanding all at the level: {i}.", output_flag) | ||
continue | ||
|
||
|
@@ -287,12 +286,12 @@ def run_experiment(true_image, | |
else: | ||
last_result = per_level_results[i - 1] | ||
|
||
tree, tree_prefix_list, fresh_expand = process_split( | ||
tree, tree_prefix_list, num_newly_expanded_nodes = process_split( | ||
tree_prefix_list=result.tree_prefix_list, vector_counts=result.sum_vector, | ||
split_threshold=split_threshold, image_bit_level=10, | ||
collapse_threshold=collapse_threshold, | ||
last_result=last_result) | ||
if fresh_expand==0: | ||
last_result=last_result, print_output=output_flag) | ||
if num_newly_expanded_nodes==0: | ||
break | ||
if output_flag: | ||
print(f'Total epsilon-users: {spent_budget:.2f} with ' + \ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO aux_data sounds like an actual data object rather than a boolean parameter; I would prefer a name like "has_aux_data" or even "has_aux_bit" since a single bit is all that's supported here. This also goes for other usages of "aux_data" as a boolean in other functions, below.
Really it would be ideal to just generalize this to support an arbitrary number of extra bits with an automatic encoding from the value specified in "split", rather than a single extra bit with a predefined 'pos'-->1 and 'neg'-->0 encoding, but I understand that is probably out of scope at present.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree, let me change it to has_aux_bit for now, and maybe can expand it later