Use FTS API for scraper, fixes #39

codeforIATI · Jan 8, 2025 · adf0262
1 parent f9e597c
commit adf0262
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 34 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1 @@
 requests
-openpyxl
diff --git a/scraper.py b/scraper.py
@@ -1,45 +1,30 @@
-from datetime import datetime
-import html
 from os.path import join
-from io import BytesIO
 import csv
+from operator import itemgetter
 
 import requests
-import openpyxl
 
-def to_date(date_str):
-    return datetime.strptime(date_str, '%d %B %Y').date()
+url = 'https://api.hpc.tools/v2/public/plan'
+headers = ['Plan name', 'Plan code', 'Plan type', 'Plan year', 'Start date', 'End date']
 
-# via https://fts.unocha.org/plan-code-list-iati
-url = 'https://fts.unocha.org/download/initiate/views_executable/xlsx?uri=/plan-code-list-iati&query%5Buri%5D=/plan-code-list-iati&query%5Bview_id%5D=plan_code_list_for_iati&query%5Bview_display%5D=page&query%5B_wrapper_format%5D=drupal_modal&view_id=plan_code_list_for_iati&view_display=page'
 r = requests.get(url)
-download_id = r.json()[0].get('download_id')
-
-r = requests.get(f'https://fts.unocha.org/download/{download_id}/download')
-wb = openpyxl.load_workbook(BytesIO(r.content))
-sheet = wb['Export data']
-rows = [
-    [
-        html.unescape(str(cell.value)) if cell.value is not None else None
-        for cell in row
-    ] for row in sheet.rows]
-
-# bin the first 2 rows
-rows = rows[2:]
-
-# pop the header row
-headers = rows.pop(0)
-
-# standardise headers
-headers = [h[0] + h[1:].lower() for h in headers]
-
-# zip it up
-rows = [dict(zip(headers, row)) for row in rows]
+data = r.json()
+rows = []
 
 # fix the dates
-for row in rows:
-    row['Start date'] = to_date(row['Start date'])
-    row['End date'] = to_date(row['End date'])
+for item in data['data']:
+    rows.append({
+        'Plan name': item['planVersion']['name'],
+        'Plan code': item['planVersion']['code'],
+        'Plan type': ",".join([it['name'] for it in item['categories'] if it['group'] == 'planType']),
+        'Plan year': ",".join([it['year'] for it in item['years']]),
+        'Start date': item['planVersion']['startDate'],
+        'End date': item['planVersion']['endDate'],
+    })
+
+
+rows.sort(key=itemgetter('Plan name'))
+rows.sort(key=itemgetter('Plan year'), reverse=True)
 
 with open(join('output', 'humanitarian-plan.csv'), 'w') as f:
     writer = csv.DictWriter(f, fieldnames=headers)