forked from danvk/sfhistory
-
Notifications
You must be signed in to change notification settings - Fork 0
/
record_fixer.py
executable file
·50 lines (44 loc) · 1.45 KB
/
record_fixer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/python
# This fixes some of the image URLs in records.pickle.
import record
import re
import cPickle
rs = record.AllRecords()
for idx, r in enumerate(rs):
url = r.photo_url
# Some images have thumbnails but are missing the full photo URL.
# Convert
# http://webbie1.sfpl.org/multimedia/thumbnails/aaa9774_x.jpg
# -> http://webbie1.sfpl.org/multimedia/sfphotos/aaa-9774.jpg
if not url:
url = r.thumbnail_url
if 'aaf' in url:
# There are only two of these:
# http://sflib1.sfpl.org:82/record=b1026391~S0
# http://sflib1.sfpl.org:82/record=b1036043~S0
# They're both links to larger collections of images and don't really fit
# the mold of the other pages. So we remove them.
del rs[idx]
continue
else:
# Two of these... I think they're just omissions.
url = re.sub(r'(.*)/thumbnails/(...)(\d+)_x\.jpg', r'\1/sfphotos/\2-\3.jpg', url)
# Remove trailing spaces from image URLs.
# Maybe ~4 of these.
if url[-1] == ' ':
url = url[0:-1]
# Change 'foojpg' -> 'foo.jpg'
# Just a typo. There are ~4 of these, too.
if url[-3:] == 'jpg' and url[-4:] != '.jpg':
print '%s -> %s' % (url, url[:-3] + '.jpg')
url = url[:-3] + '.jpg'
if url != r.photo_url:
r.photo_url = url
print "Re-pickling"
output_file = "records.pickle"
f = file(output_file, "w")
p = cPickle.Pickler(f, 2)
for i, r in enumerate(rs):
p.dump(r)
if i % 100 == 0:
print "Pickled %d records" % i