grab_urls.py (forked from IATI/IATI-Registry-Refresher)
import sys
import shutil
import json
from os.path import dirname, exists, join
from os import makedirs
from time import sleep

import requests

def request_with_backoff(*args, attempts=100, backoff=0.1, **kwargs):
    for attempt in range(1, attempts + 1):
        # exponential backoff
        wait = (pow(2, attempt) - 1) * backoff
        try:
            result = requests.request(*args, **kwargs)
            if result.status_code == 200:
                sleep(wait)
                return result
        except requests.exceptions.ConnectionError:
            pass
        print(f'Error! Retrying after {wait} seconds')
        sleep(wait)
    raise Exception(f'Failed after {attempts} attempts. Giving up.')

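# Illustrative usage (not part of the original script): request_with_backoff is
# defined but never called in this file. A caller could use it as a drop-in for
# a plain requests call, e.g.:
#
#     resp = request_with_backoff('GET', 'https://registry.codeforiati.org/dataset_list.json')
#     datasets = resp.json()['result']
#
# With the defaults (attempts=100, backoff=0.1), the sleep after a failed
# attempt n is (2**n - 1) * 0.1 seconds: 0.1, 0.3, 0.7, 1.5, ...
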
def main(args):
    cache = '--cache' in args
    skip_metadata = '--skip-metadata' in args
    if skip_metadata:
        print('Skipping metadata')

    # Loop through each page and save the URL end-points of the data files
    # You may need to set up an empty directory called "urls"
    print('Fetching:')
    tmpl = 'curl -L -k -f -s ' + \
        '-H "Accept: application/xhtml+xml, application/xml,*/*;q=0.9" ' + \
        '--retry 4 --retry-delay 10 -y 30 ' + \
        '-Y 1000 -A "IATI data dump 1.0" ' + \
        '--create-dirs -o data/{publisher_id}/{dataset_name}.xml ' + \
        '"{url}" 2>&1 >/dev/null ; exitcode=$? ; ' + \
        'test "$exitcode" != 0 && ' + \
        'echo $exitcode {publisher_id} {dataset_name} "{url}" > ' + \
        'logs/{dataset_name}.log && touch ' + \
        'data/{publisher_id}/{dataset_name}.xml\n'
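    # Illustration with hypothetical values: for publisher_id='example-org',
    # dataset_name='example-org-activities' and url='https://example.org/iati.xml',
    # tmpl.format(...) renders one long shell line (wrapped here for readability):
    #
    #   curl -L -k -f -s -H "Accept: application/xhtml+xml, application/xml,*/*;q=0.9"
    #     --retry 4 --retry-delay 10 -y 30 -Y 1000 -A "IATI data dump 1.0" --create-dirs
    #     -o data/example-org/example-org-activities.xml "https://example.org/iati.xml"
    #     2>&1 >/dev/null ; exitcode=$? ; test "$exitcode" != 0 &&
    #     echo $exitcode example-org example-org-activities "https://example.org/iati.xml"
    #     > logs/example-org-activities.log && touch data/example-org/example-org-activities.xml
    #
    # i.e. on a failed download the curl exit code is logged under logs/ and an
    # empty placeholder file is created with touch, presumably so later steps
    # still find one XML file per dataset.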
    datasets = requests.get(
        "https://registry.codeforiati.org/dataset_list.json").json()["result"]
    publishers = requests.get(
        "https://registry.codeforiati.org/publisher_list.json").json()["result"]
    publishers = {
        publisher["name"]: publisher
        for publisher in publishers
    }
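    # Response shape assumed below, inferred from the field access in this loop
    # (not an authoritative description of the registry API):
    #   dataset_list.json   -> {"result": [{"name": ..., "organization": {"name": ...} or null,
    #                                       "resources": [{"url": ...}, ...]}, ...]}
    #   publisher_list.json -> {"result": [{"name": ...}, ...]}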
    for dataset in datasets:
        organization = dataset['organization']
        # skip datasets with no resources or no publishing organisation
        if dataset['resources'] == [] or not organization:
            continue
        dataset_name = dataset['name']
        url = dataset['resources'][0]['url']
        publisher_id = organization["name"]
        if cache:
            # with --cache, move any previously downloaded copy from cache/ into data/
            filename = join(publisher_id, dataset_name + '.xml')
            cache_file = join('cache', filename)
            if exists(cache_file):
                out_file = join('data', filename)
                out_path = dirname(out_file)
                if not exists(out_path):
                    makedirs(out_path, exist_ok=True)
                shutil.move(cache_file, out_file)
        if not skip_metadata:
            # write publisher-level and dataset-level registry metadata as JSON
            metadata_filepath = f'metadata/{publisher_id}'
            if not exists(metadata_filepath):
                makedirs(metadata_filepath)
            publisher_metadata = publishers[publisher_id]
            publisher_metadata_file = f'{metadata_filepath}.json'
            with open(publisher_metadata_file, 'w') as f:
                json.dump(publisher_metadata, f)
            metadata_file = f'{metadata_filepath}/{dataset_name}.json'
            with open(metadata_file, 'w') as f:
                json.dump(dataset, f)
        # record the dataset URL and append the corresponding curl command
        with open(f'urls/{publisher_id}', 'a') as f:
            f.write(f'{dataset_name} {url}\n')
        output = tmpl.format(
            publisher_id=publisher_id,
            dataset_name=dataset_name,
            url=url.replace(' ', '%20'),
        )
        with open('downloads.curl', 'a') as f:
            f.write(output)

if __name__ == '__main__':
    main(sys.argv[1:])
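Usage note (inferred from the code above, not documented in the file itself): the script is run as python grab_urls.py, optionally with --cache and/or --skip-metadata. It appends to files under urls/ (which, per the comment in main(), may need to be created beforehand), writes JSON under metadata/, and appends one curl command per dataset to downloads.curl; the generated commands in turn download into data/ and log failures under logs/, and are presumably executed by a separate step of the refresh process.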