forked from WICG/attribution-reporting-api
-
Notifications
You must be signed in to change notification settings - Fork 0
/
noise_corrector.py
executable file
·70 lines (59 loc) · 2.54 KB
/
noise_corrector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
import collections
import random
# This file provides a helper function to correct noisy groups of reports coming
# from the event level API (i.e. those where conversion metadata is potentially
# randomly selected). It implements an unbiased estimator for the true counts
# of conversions for each metadata bucket.
# TODO(csharrison): This estimator is unbiased, but its variance can be
# improved if we allow bias. This is something we should be able to smoothly
# trade off with another parameter.
def corrected_buckets(buckets, noise_probability=.05):
    """Returns a map of conversion bits --> corrected counts.

    Each report keeps its real bucket with probability
    (1 - noise_probability); otherwise its bucket is replaced by one chosen
    uniformly at random. The expected observed count for a bucket is
    therefore

        observed = true * (1 - p) + total * p / num_buckets

    and this function inverts that relation for every bucket.

    Args:
      buckets: A map from integer conversion metadata to observed conversion
        counts. Note, this needs to include buckets with 0 counts, because
        the number of keys is used as the number of possible buckets.
      noise_probability: The probability the metadata was randomly selected.
        Must satisfy 0 <= noise_probability < 1.

    Returns:
      A dict mapping each bucket to its corrected (float) count. Individual
      values may be negative when a bucket's observed count is below the
      expected amount of noise. An empty input yields an empty dict.

    Raises:
      ValueError: If noise_probability is outside [0, 1). At exactly 1 every
        report is noise and the true counts cannot be recovered.
    """
    if not 0 <= noise_probability < 1:
        raise ValueError(
            "noise_probability must be in [0, 1), got "
            f"{noise_probability!r}")

    # Nothing to correct; also avoids dividing by a bucket count of zero.
    if not buckets:
        return {}

    total_records = sum(buckets.values())
    num_conversion_buckets = len(buckets)

    # |noise_probability| of the reports are noised and uniformly distributed
    # among the conversion buckets, so one can calculate how many of the
    # values in each bucket are expected to have come from noise.
    noised_values_per_bucket = (
        total_records * noise_probability / num_conversion_buckets)

    # Subtract the reports added to each bucket due to noise, and rescale to
    # account for the reports that were shifted away by the initial noise.
    return {
        bucket: (v - noised_values_per_bucket) / (1 - noise_probability)
        for bucket, v in buckets.items()
    }
if __name__ == "__main__":
    # The following is an example showing how to use the function.
    # |example_reports| is a map from bucket --> count of conversions
    # with that bucket.
    example_reports = {
        0: 50,
        1: 150,
        2: 300,
        3: 400,
        4: 700,
        5: 200,
        6: 0,
        7: 2000,
    }

    # Simulate the API randomly flipping reports.
    noise_probability = .05
    buckets = list(example_reports)

    # Seed every bucket with 0 so that buckets which end up with no reports
    # are still passed to corrected_buckets(); it derives the number of
    # possible buckets from the keys it is given.
    noisy_reports = dict.fromkeys(buckets, 0)
    for bucket, count in example_reports.items():
        for _ in range(count):
            # random.random() is in [0.0, 1.0), so the strict comparison
            # selects noise with probability exactly |noise_probability|
            # (and never when it is 0).
            if random.random() < noise_probability:
                noisy_reports[random.choice(buckets)] += 1
            else:
                noisy_reports[bucket] += 1

    corrected = corrected_buckets(noisy_reports, noise_probability)

    column_names = ["Bucket", "True count", "Noisy count", "Corrected count"]
    print("{:<20}{:<20}{:<20}{:<20}".format(*column_names))
    for bucket, count in sorted(corrected.items()):
        print(f"{bucket:<20}{example_reports[bucket]:<20}{noisy_reports[bucket]:<20}{count:<20.2f}")