# scraper.py
# forked from antiboredom/flickr-scrape

from __future__ import print_function
import time
import sys
import json
import re
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
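
# credentials.json is expected to provide the Flickr API key and secret.
# A minimal file looks like this (placeholder values):
#
#   {"KEY": "your-api-key", "SECRET": "your-api-secret"}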
with open('credentials.json') as infile:
    creds = json.load(infile)

KEY = creds['KEY']
SECRET = creds['SECRET']  # read for completeness; only KEY is used below

def download_file(url, local_filename=None):
    # Derive a filename from the URL if none was given.
    if local_filename is None:
        local_filename = url.split('/')[-1]
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    return local_filename
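
# Example call (hypothetical image URL and local path, for illustration only):
#
#   download_file('https://live.staticflickr.com/65535/12345_abcdef_b.jpg',
#                 'images/sunset/12345.jpg')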

def get_photos(q, page=1, bbox=None):
    params = {
        'content_type': '7',
        'per_page': '500',  # maximum page size allowed by the Flickr API
        'media': 'photos',
        # 'license': '9,10', # see README.md
        'method': 'flickr.photos.search',
        'format': 'json',
        'advanced': 1,
        'nojsoncallback': 1,
        'extras': 'media,realname,url_l,o_dims,geo,tags,machine_tags,date_taken',  # url_c,url_l,url_m,url_n,url_q,url_s,url_sq,url_t,url_z
        'page': page,
        'text': q,
        'api_key': KEY,
    }

    # bbox should be: minimum_longitude, minimum_latitude, maximum_longitude, maximum_latitude
    if bbox is not None and len(bbox) == 4:
        params['bbox'] = ','.join(bbox)

    results = requests.get('https://api.flickr.com/services/rest', params=params).json()['photos']

    return results
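
# For reference, a flickr.photos.search response has roughly this shape, so
# get_photos() returns the inner 'photos' dict (field values are illustrative):
#
#   {'photos': {'page': 1, 'pages': 42, 'perpage': 500, 'total': '20723',
#               'photo': [{'id': '...', 'url_l': '...', ...}]}}
#
# 'pages' drives the pagination loop in search() below, and 'photo' holds the
# per-image metadata.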

def search(q, bbox=None, max_pages=None):
    # create a folder for the query if it does not exist
    foldername = os.path.join('images', re.sub(r'[\W]', '_', q))

    if bbox is not None:
        foldername += '_'.join(bbox)

    if not os.path.exists(foldername):
        os.makedirs(foldername)

    jsonfilename = os.path.join(foldername, 'results.json')

    if not os.path.exists(jsonfilename):
        # collect paginated results and save them as a single json file
        photos = []
        current_page = 1

        results = get_photos(q, page=current_page, bbox=bbox)
        total_pages = results['pages']
        if max_pages is not None and total_pages > max_pages:
            total_pages = max_pages
        photos += results['photo']

        while current_page < total_pages:
            current_page += 1
            print('downloading metadata, page {} of {}'.format(current_page, total_pages))
            photos += get_photos(q, page=current_page, bbox=bbox)['photo']
            time.sleep(0.5)  # be polite to the API between requests

        with open(jsonfilename, 'w') as outfile:
            json.dump(photos, outfile)
    else:
        with open(jsonfilename, 'r') as infile:
            photos = json.load(infile)

    # download images
    print('Downloading images')
    for photo in tqdm(photos):
        try:
            url = photo.get('url_l')
            extension = url.split('.')[-1]
            localname = os.path.join(foldername, '{}.{}'.format(photo['id'], extension))
            if not os.path.exists(localname):
                download_file(url, localname)
        except Exception:
            # photos without a url_l (or failed downloads) are skipped
            continue
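
# Programmatic usage sketch (query and bounding box are hypothetical):
#
#   search('sunset', max_pages=2)
#   search('sunset', bbox=['2.22', '48.81', '2.47', '48.90'], max_pages=2)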

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Download images from Flickr')
    parser.add_argument('--search', '-s', dest='q', required=True, help='Search term')
    parser.add_argument('--max-pages', '-m', dest='max_pages', required=False, help='Max pages (default none)')
    parser.add_argument('--bbox', '-b', dest='bbox', required=False, help='Bounding box to search in, separated by commas like so: minimum_longitude,minimum_latitude,maximum_longitude,maximum_latitude')

    args = parser.parse_args()

    q = args.q

    bbox = args.bbox.split(',') if args.bbox else None
    if bbox and len(bbox) != 4:
        bbox = None

    print('Searching for {}'.format(q))
    if bbox:
        print('Within', bbox)

    max_pages = None
    if args.max_pages:
        max_pages = int(args.max_pages)

    search(q, bbox, max_pages)
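
# Example command-line invocation (search term and coordinates are
# illustrative; the bbox is roughly central Paris):
#
#   python scraper.py -s "sunset" -m 2 -b 2.22,48.81,2.47,48.90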