cache_clearer.py

"""A crude script for clearing out cached profiles. It has mutated a lot, so treat it as a scratchpad and tweak it
as needed. Perhaps we'll eventually figure out some patterns and can make it more robust.

You must set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables to be able to make changes to the S3 bucket.

This script doesn't assume database access; instead, run something like this against a database with our ACS 1-year schema
to get a seed file of geoids:

    \copy (select geoid from acs2016_1yr.geoheader where component = '00' order by geoid) to '/tmp/2016_1yr_geoids.txt';
"""

from boto.s3.connection import S3Connection, OrdinaryCallingFormat
from boto.s3.key import Key
from gzip import GzipFile
from cStringIO import StringIO
import json
import re
import requests
import time
import os
import sys

CACHE_KEY_YEAR = '2016'  # the S3 prefix we're checking for profile JSON which needs to be cleared
OBSOLETE_YEAR = '2015'   # change this when we bump the 5-year release so we can recognize what is no longer welcome
GEOID_LIST = '/tmp/2016_1yr_geoids.txt'  # make a file of 1-year geoids to massively reduce the search space

AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')

if AWS_ACCESS_KEY_ID is None or AWS_SECRET_ACCESS_KEY is None:
    print("You must define AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY as environment variables")
    sys.exit(1)

def decode_key(k):
    """Fetch the key's contents from S3 and return the gunzipped string."""
    sio = StringIO(k.get_contents_as_string())
    return GzipFile(fileobj=sio).read()

def releases(j_string):
    """Return the set of release identifiers (the text between 'ACS ' and '-year') found in a profile JSON string."""
    pat = re.compile('ACS (.+?)-year')
    return set(pat.findall(j_string))
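# Roughly what releases() yields, assuming the cached profile JSON mentions release names
# like "ACS 2015 5-year" (an assumption about the payload, not something checked here):
#   releases('... "ACS 2015 5-year" ... "ACS 2016 1-year" ...') -> set(['2015 5', '2016 1'])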

def get_key(b, geoid):
    """Return the S3 key for a geoid's cached profile JSON under CACHE_KEY_YEAR, or None if it isn't cached."""
    key_path = '1.0/data/profiles/{}/{}.json'.format(CACHE_KEY_YEAR, geoid)
    return b.get_key(key_path)

# OrdinaryCallingFormat is presumably here because the bucket name contains dots,
# which trips up SSL validation with boto's default subdomain-style calling format.
s3 = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, calling_format=OrdinaryCallingFormat())
bucket = s3.get_bucket('embed.censusreporter.org')

deleted = []

def prime_cache(geoid):
    """Fetch the public profile page for a geoid (which should repopulate the cache); return True on a 200 response."""
    url = "https://censusreporter.org/profiles/{}".format(geoid)
    resp = requests.get(url)
    if resp.status_code == 200:
        return True
    else:
        print("Fetch error {} for geoid {}".format(resp.status_code, geoid))
        return False

# with open(GEOID_LIST) as f:
#     for i, geoid in enumerate(f):
#         if i % 100 == 0: print i
#         geoid = geoid[:-1]  # trim newline
#         k = get_key(bucket, geoid)
#         if k:
#             j = decode_key(k)
#             r = releases(j)
#             for release in r:
#                 if OBSOLETE_YEAR in release:
#                     deleted.append(geoid)
#                     k.delete()
#                     print "deleted", geoid
#                     break
#
# errors = 0
# to_restore = list(deleted)
# for i, geoid in enumerate(to_restore):
#     if i % 100 == 0: print "{} of {} - {}".format(i, len(to_restore), geoid)
#     url = "https://censusreporter.org/profiles/{}".format(geoid)
#     resp = requests.get(url)
#     if resp.status_code == 200:
#         deleted.remove(geoid)
#     else:
#         errors += 1
#         if errors > 5:
#             print "5 errors so I quit"
#             break
#     time.sleep(5)

def delete_embed(release_slug, embed_slug, delete=False):
    """Given a 'release_slug' (such as 'ACS_2017_5-year') and an 'embed_slug' (such as
    'social-place_of_birth-distribution'), delete any existing JSON embeds for that
    combination, or just report what would be deleted, depending on the value of 'delete'.

    Note that this JSON is not automatically generated on a 404 from outside callers, so keys
    only exist here because someone is using an embed, and if you delete the JSON without
    fixing it, you break their embed. So be nice: visit each impacted profile page, find the
    right section, and click 'embed' -- or, better, be a mensch and fix
    https://github.com/censusreporter/censusreporter/issues/249 so that missing cached embeds
    are autogenerated when needed!
    """
    to_fix = []
    for key in bucket.list('1.0/data/charts/{}'.format(release_slug)):
        if key.name.endswith('{}.json'.format(embed_slug)):
            if delete:
                print("deleting {}".format(key.name))
                key.delete()
            else:
                print("would delete {}".format(key.name))
            geoid = key.name.split('/')[-1].split('-')[0]
            to_fix.append("https://censusreporter.org/profiles/{}".format(geoid))

    if len(to_fix) > 0:
        if delete:
            word = ''
        else:
            word = 'would '
        print("{}need to fix these embeds for {}:".format(word, embed_slug))
        for x in to_fix:
            print(x)
    else:
        print("nothing to fix")
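
# A sketch of the intended two-step workflow (the slugs are just the docstring's example):
# do a dry run first to see which keys and profile pages are affected, then run it for real.
#   delete_embed('ACS_2017_5-year', 'social-place_of_birth-distribution', delete=False)
#   delete_embed('ACS_2017_5-year', 'social-place_of_birth-distribution', delete=True)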

def delete_all_profiles(year_str):
    """Delete every cached profile JSON under the given year prefix (e.g. '2016'), printing progress as it goes."""
    deleted = []
    for key in bucket.list('1.0/data/profiles/{}'.format(year_str)):
        key.delete()
        deleted.append(key)
        print(key.name)
        # if len(deleted) > 10:
        #     print("stopping at 10 for now")
        #     break
        if len(deleted) % 100 == 0:
            print(len(deleted))
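
# Hypothetical usage -- this really does delete everything under the prefix, so be sure:
#   delete_all_profiles('2016')
# Profile pages appear to repopulate the cache when they're next visited (that's what the
# commented-out restore loop above relies on), but embeds do not; see delete_embed() above.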

# most recently, I wanted to
# delete_embed('ACS_2017_5-year', 'social-place_of_birth-distribution', delete=False)

def delete_by_pattern(key_prefix, patterns, do_it=False):
    """Given a prefix, look at every key matching that prefix and, if the
    non-prefix part of the key matches any pattern, delete it (or, if
    do_it is False, just print what would be deleted).
    """
    regexes = []
    if isinstance(patterns, basestring):
        print("patterns should be a sequence, not a string")
        return
    for pat in patterns:
        if hasattr(pat, 'match'):  # already a compiled regex
            regexes.append(pat)
        else:
            regexes.append(re.compile(pat))

    from collections import defaultdict
    delete_dict = defaultdict(list)
    for key in bucket.list(key_prefix):
        fn = key.name.replace(key_prefix, '')
        if fn.startswith('/'):
            fn = fn[1:]
        for pat in regexes:
            if pat.match(fn):
                delete_dict[pat.pattern].append(fn)
                if do_it:
                    key.delete()
                else:
                    print("would delete {}".format(key.name))
                break  # this key is handled; move on to the next one

    for k, v in delete_dict.items():
        print("Pattern {} - {}".format(k, len(v)))

if __name__ == '__main__':
    delete_by_pattern('1.0/data/profiles/2018/',
                      [
                          '05000US.*',
                          '31000US.*',
                          '33000US.*',
                      ],
                      do_it=True
                      )
    delete_by_pattern('tiger2018/show/',
                      [
                          '05000US.*parents.json',
                          '31000US.*parents.json',
                      ],
                      do_it=True
                      )
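
# A rough, commented-out sketch of re-priming the profiles deleted in __main__ above,
# reusing prime_cache() and the geoid seed file described in the module docstring.
# The startswith filter just mirrors the prefixes passed to delete_by_pattern; adjust as needed.
#
# with open(GEOID_LIST) as f:
#     for geoid in (line.strip() for line in f):
#         if geoid.startswith(('05000US', '31000US', '33000US')):
#             prime_cache(geoid)
#             time.sleep(1)  # be gentle with the site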