from __future__ import print_function
import time
import sys
import json
import re
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
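
# The Flickr API key is read from a local credentials.json file, which (based
# on the keys accessed below) is expected to look like:
#   {"KEY": "...", "SECRET": "..."}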
with open('credentials.json') as infile:
    creds = json.load(infile)
KEY = creds['KEY']
SECRET = creds['SECRET']
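

# Stream a remote file to disk in 1 KB chunks so large images are never held
# fully in memory.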
def download_file(url, local_filename=None):
    if local_filename is None:
        local_filename = url.split('/')[-1]
    r = requests.get(url, stream=True)
    r.raise_for_status()  # fail early rather than saving an HTML error page
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
    return local_filename
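

# Resolve a group URL (e.g. https://www.flickr.com/groups/scenery/) to the
# group NSID that the rest of the API expects, via flickr.urls.lookupGroup.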
def get_group_id_from_url(url):
    params = {
        'method': 'flickr.urls.lookupGroup',
        'url': url,
        'format': 'json',
        'api_key': KEY,
        'nojsoncallback': 1
    }
    results = requests.get('https://api.flickr.com/services/rest', params=params).json()
    return results['group']['id']
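

# Fetch one page (up to 500 results) of photo metadata, from either a text
# search (flickr.photos.search) or a group pool (flickr.groups.pools.getPhotos).
# The 'extras' field asks the API to include a direct image URL in each result:
# url_o (original size) or url_l (large, 1024px).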
def get_photos(qs, qg, page=1, original=False, bbox=None):
    params = {
        'content_type': '7',
        'per_page': '500',
        'media': 'photos',
        'format': 'json',
        'advanced': 1,
        'nojsoncallback': 1,
        'extras': 'media,realname,%s,o_dims,geo,tags,machine_tags,date_taken' % ('url_o' if original else 'url_l'),
        'page': page,
        'api_key': KEY
    }
    if qs is not None:
        params['method'] = 'flickr.photos.search'
        params['text'] = qs
    elif qg is not None:
        params['method'] = 'flickr.groups.pools.getPhotos'
        params['group_id'] = qg
    # bbox order: minimum_longitude, minimum_latitude, maximum_longitude, maximum_latitude
    if bbox is not None and len(bbox) == 4:
        params['bbox'] = ','.join(bbox)
    results = requests.get('https://api.flickr.com/services/rest', params=params).json()
    if 'photos' not in results:
        print(results)
        return None
    return results['photos']
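

# Walk the paginated results, cache the metadata to results<start_page>.json
# inside a folder named after the query, then download each image that is not
# already on disk.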
def search(qs, qg, bbox=None, original=False, max_pages=None, start_page=1, output_dir='images'):
    # create a folder for the query if it does not exist
    foldername = os.path.join(output_dir, re.sub(r'[\W]', '_', qs if qs is not None else 'group_%s' % qg))
    if bbox is not None:
        foldername += '_'.join(bbox)
    if not os.path.exists(foldername):
        os.makedirs(foldername)
    jsonfilename = os.path.join(foldername, 'results' + str(start_page) + '.json')
    if not os.path.exists(jsonfilename):
        # fetch the metadata and save it as a json file
        photos = []
        current_page = start_page
        results = get_photos(qs, qg, page=current_page, original=original, bbox=bbox)
        if results is None:
            return
        total_pages = results['pages']
        if max_pages is not None and total_pages > start_page + max_pages - 1:
            total_pages = start_page + max_pages - 1  # fetch at most max_pages pages
        photos += results['photo']
        while current_page < total_pages:
            print('downloading metadata, page {} of {}'.format(current_page, total_pages))
            current_page += 1
            results = get_photos(qs, qg, page=current_page, original=original, bbox=bbox)
            if results is None:
                break
            photos += results['photo']
            time.sleep(0.5)  # stay polite to the API
        with open(jsonfilename, 'w') as outfile:
            json.dump(photos, outfile)
    else:
        with open(jsonfilename, 'r') as infile:
            photos = json.load(infile)
    # download images
    print('Downloading images')
    for photo in tqdm(photos):
        try:
            url = photo.get('url_o' if original else 'url_l')
            extension = url.split('.')[-1]
            localname = os.path.join(foldername, '{}.{}'.format(photo['id'], extension))
            if not os.path.exists(localname):
                download_file(url, localname)
        except Exception:
            # some photos lack the requested size; skip them
            continue


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Download images from Flickr')
    parser.add_argument('--search', '-s', dest='q_search', default=None, required=False, help='Search term')
    parser.add_argument('--group', '-g', dest='q_group', default=None, required=False, help='Group URL, e.g. https://www.flickr.com/groups/scenery/')
    parser.add_argument('--original', '-o', dest='original', action='store_true', default=False, required=False, help='Download original-size photos if set, large (1024px) otherwise')
    parser.add_argument('--output_dir', '-t', dest='output_dir', default='images', required=False, help='Root directory to download to')
    parser.add_argument('--max-pages', '-m', dest='max_pages', required=False, help='Max pages (default none)')
    parser.add_argument('--start-page', '-st', dest='start_page', required=False, default=1, help='Start page (default 1)')
    parser.add_argument('--bbox', '-b', dest='bbox', required=False, help='Bounding box to search in, separated by spaces: minimum_longitude minimum_latitude maximum_longitude maximum_latitude')
    args = parser.parse_args()

    qs = args.q_search
    qg = args.q_group
    original = args.original
    output_dir = args.output_dir
    if qs is None and qg is None:
        sys.exit('Must specify a search term or group URL')

    bbox = None
    if args.bbox:
        bbox = args.bbox.split(' ')
        if len(bbox) != 4:
            bbox = None

    if qg is not None:
        qg = get_group_id_from_url(qg)
    print('Searching for {}'.format(qs if qs is not None else 'group %s' % qg))
    if bbox:
        print('Within', bbox)

    max_pages = None
    if args.max_pages:
        max_pages = int(args.max_pages)
    start_page = int(args.start_page)

    search(qs, qg, bbox, original, max_pages, start_page, output_dir)
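
# Example invocations (the query strings are placeholders):
#   python scraper.py --search "sunset" --max-pages 2
#   python scraper.py --group https://www.flickr.com/groups/scenery/ --original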