Skip to content

Commit

Permalink
Use FTS API for scraper, fixes #39
Browse files Browse the repository at this point in the history
  • Loading branch information
markbrough committed Jan 8, 2025
1 parent f9e597c commit adf0262
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 34 deletions.
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
requests
openpyxl
51 changes: 18 additions & 33 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,30 @@
from datetime import datetime
import html
from os.path import join
from io import BytesIO
import csv
from operator import itemgetter

import requests
import openpyxl

def to_date(date_str):
    """Parse a date such as ``'8 January 2025'`` into a ``datetime.date``.

    The expected layout is day of month, full month name, four-digit year
    (``'%d %B %Y'``).

    Args:
        date_str: The textual date to parse.

    Returns:
        The corresponding ``datetime.date``.

    Raises:
        ValueError: If ``date_str`` does not match the expected layout.
    """
    return datetime.strptime(date_str, '%d %B %Y').date()
url = 'https://api.hpc.tools/v2/public/plan'
headers = ['Plan name', 'Plan code', 'Plan type', 'Plan year', 'Start date', 'End date']

# via https://fts.unocha.org/plan-code-list-iati
url = 'https://fts.unocha.org/download/initiate/views_executable/xlsx?uri=/plan-code-list-iati&query%5Buri%5D=/plan-code-list-iati&query%5Bview_id%5D=plan_code_list_for_iati&query%5Bview_display%5D=page&query%5B_wrapper_format%5D=drupal_modal&view_id=plan_code_list_for_iati&view_display=page'
r = requests.get(url)
download_id = r.json()[0].get('download_id')

r = requests.get(f'https://fts.unocha.org/download/{download_id}/download')
wb = openpyxl.load_workbook(BytesIO(r.content))
sheet = wb['Export data']
rows = [
[
html.unescape(str(cell.value)) if cell.value is not None else None
for cell in row
] for row in sheet.rows]

# bin the first 2 rows
rows = rows[2:]

# pop the header row
headers = rows.pop(0)

# standardise headers
headers = [h[0] + h[1:].lower() for h in headers]

# zip it up
rows = [dict(zip(headers, row)) for row in rows]
data = r.json()
rows = []

# fix the dates
for row in rows:
    # Swap the textual start/end dates for date objects via to_date().
    row['Start date'] = to_date(row['Start date'])
    row['End date'] = to_date(row['End date'])
# Build one output record per entry in the API response's 'data' list.
for item in data['data']:
    plan_version = item['planVersion']
    rows.append({
        'Plan name': plan_version['name'],
        'Plan code': plan_version['code'],
        # Only categories in the 'planType' group contribute to the type;
        # several matches are comma-joined.
        'Plan type': ",".join(
            category['name']
            for category in item['categories']
            if category['group'] == 'planType'
        ),
        # An entry may list several years; they are comma-joined as well.
        'Plan year': ",".join(entry['year'] for entry in item['years']),
        'Start date': plan_version['startDate'],
        'End date': plan_version['endDate'],
    })


rows.sort(key=itemgetter('Plan name'))
rows.sort(key=itemgetter('Plan year'), reverse=True)

with open(join('output', 'humanitarian-plan.csv'), 'w') as f:
writer = csv.DictWriter(f, fieldnames=headers)
Expand Down

0 comments on commit adf0262

Please sign in to comment.