diff --git a/HISTORY.rst b/HISTORY.rst index 1c558d8..758abe3 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -1,9 +1,11 @@ History ======= -unreleased ----------- +v1.2.1 +------ * Update GitHub actions versions +* Update data acquisition notebook, using helper functions to download input data from + zenodo v1.2.0 ------ diff --git a/notebooks/data-acquisition.ipynb b/notebooks/data-acquisition.ipynb index 5b7b3ee..1c377c5 100644 --- a/notebooks/data-acquisition.ipynb +++ b/notebooks/data-acquisition.ipynb @@ -39,18 +39,17 @@ }, "outputs": [], "source": [ - "import tempfile\n", - "from io import BytesIO\n", - "from os import environ\n", "from pathlib import Path\n", - "from zipfile import ZipFile\n", "\n", - "import numpy as np\n", "import pandas as pd\n", "import requests\n", "from cartopy.io import shapereader\n", - "from fsspec import FSTimeoutError\n", - "from fsspec.implementations.zip import ZipFileSystem\n", + "from pyCIAM.io import (\n", + " download_and_extract_from_zenodo,\n", + " download_and_extract_partial_zip,\n", + " get_zenodo_file_list,\n", + ")\n", + "from pyCIAM.utils import copy\n", "from shared import (\n", " DIR_SHP,\n", " DIR_SLR_AR5_IFILES_RAW,\n", @@ -71,9 +70,7 @@ " PATH_SLR_HIST_TREND_MAP,\n", " PATHS_SURGE_LOOKUP,\n", " save,\n", - ")\n", - "\n", - "from pyCIAM.utils import copy" + ")" ] }, { @@ -123,76 +120,6 @@ "Z_URL_SLIIDERS_PC = Z_URL_RECORDS" ] }, - { - "cell_type": "code", - "execution_count": 49, - "id": "8d519f83-eb91-4cb0-b2e7-5918b91d5143", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_download_link(files, prefix):\n", - " links = [\n", - " i[\"links\"]\n", - " for i in files\n", - " if i.get(\"filename\", \"\").startswith(prefix)\n", - " or i.get(\"key\", \"\").startswith(prefix)\n", - " ]\n", - " assert len(links) == 1\n", - " links = links[0]\n", - " return links.get(\"download\", links[\"self\"])\n", - "\n", - "\n", - "def download_and_extract_full_zip(lpath, url):\n", - " if lpath.exists():\n", - " return None\n", - " lpath.parent.mkdir(exist_ok=True, parents=True)\n", - "\n", - " content = BytesIO(requests.get(url, params=PARAMS).content)\n", - " if isinstance(lpath, Path):\n", - " with ZipFile(content, \"r\") as zip_ref:\n", - " zip_ref.extractall(lpath)\n", - " else:\n", - " with tempfile.TemporaryDirectory() as tmpdir:\n", - " with ZipFile(content, \"r\") as zip_ref:\n", - " zip_ref.extractall(tmpdir)\n", - " copy(Path(tmpdir), lpath)\n", - "\n", - "\n", - "def download_and_extract_partial_zip(lpath, url, zip_glob, n_retries=5):\n", - " lpath.mkdir(exist_ok=True, parents=True)\n", - " z = ZipFileSystem(url)\n", - " if isinstance(zip_glob, (list, set, tuple, np.ndarray)):\n", - " files_remote = zip_glob\n", - " else:\n", - " files_remote = [p for p in z.glob(zip_glob) if not p.endswith(\"/\")]\n", - " files_local = [lpath / Path(f).name for f in files_remote]\n", - " for fr, fl in list(zip(files_remote, files_local)):\n", - " if not fl.is_file():\n", - " retries = 0\n", - " while retries < n_retries:\n", - " print(f\"...Downloading {fl.name} (attempt {retries+1}/{n_retries})\")\n", - " try:\n", - " data = z.cat_file(fr)\n", - " break\n", - " except FSTimeoutError:\n", - " if retries < (n_retries - 1):\n", - " retries += 1\n", - " else:\n", - " raise\n", - " print(f\"...Writing {fl.name}\")\n", - " fl.write_bytes(data)\n", - "\n", - "\n", - "def download_and_extract_from_zenodo(lpath, files, prefix, zip_glob=None):\n", - " dl = get_download_link(files, prefix)\n", - " if zip_glob is None:\n", - " return download_and_extract_full_zip(lpath, dl)\n", - " else:\n", - " return download_and_extract_partial_zip(lpath, dl, zip_glob)" - ] - }, { "cell_type": "code", "execution_count": 5, @@ -202,9 +129,7 @@ }, "outputs": [], "source": [ - "pyciam_files = requests.get(\n", - " Z_URL_SLIIDERS_PC.format(doi=Z_PYCIAM_DOI), params=PARAMS\n", - ").json()[\"files\"]" + "pyciam_files = get_zenodo_file_list(Z_PYCIAM_DOI)" ] }, { @@ -628,7 +553,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.12.2" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/pyCIAM/io.py b/pyCIAM/io.py index 258326d..f7f956b 100644 --- a/pyCIAM/io.py +++ b/pyCIAM/io.py @@ -8,12 +8,22 @@ * load_diaz_inputs """ +import tempfile +from collections.abc import Iterable +from io import BytesIO +from pathlib import Path +from zipfile import ZipFile + import dask.array as da import numpy as np import pandas as pd import pint_xarray # noqa: F401 +import requests import xarray as xr +from fsspec import FSTimeoutError +from fsspec.implementations.zip import ZipFileSystem +from pyCIAM.utils import copy from pyCIAM.utils import spherical_nearest_neighbor as snn from .utils import _s2d @@ -783,3 +793,70 @@ def load_diaz_inputs( inputs = inputs.drop_dims("rcp_pt") return inputs, slr + + +def get_zenodo_file_list(doi, params={}): + return requests.get(f"https://zenodo.org/api/records/{doi}", params=params).json()[ + "files" + ] + + +def get_download_link(files, prefix): + links = [ + i["links"] + for i in files + if i.get("filename", "").startswith(prefix) + or i.get("key", "").startswith(prefix) + ] + assert len(links) == 1 + links = links[0] + return links.get("download", links["self"]) + + +def _download_and_extract_full_zip(lpath, url, params={}): + if lpath.exists(): + return None + lpath.parent.mkdir(exist_ok=True, parents=True) + + content = BytesIO(requests.get(url, params=params).content) + if isinstance(lpath, Path): + with ZipFile(content, "r") as zip_ref: + zip_ref.extractall(lpath) + else: + with tempfile.TemporaryDirectory() as tmpdir: + with ZipFile(content, "r") as zip_ref: + zip_ref.extractall(tmpdir) + copy(Path(tmpdir), lpath) + + +def download_and_extract_partial_zip(lpath, url, zip_glob, n_retries=5): + lpath.mkdir(exist_ok=True, parents=True) + z = ZipFileSystem(url) + if isinstance(zip_glob, (list, set, tuple, np.ndarray)): + files_remote = zip_glob + else: + files_remote = [p for p in z.glob(zip_glob) if not p.endswith("/")] + files_local = [lpath / Path(f).name for f in files_remote] + for fr, fl in list(zip(files_remote, files_local)): + if not fl.is_file(): + retries = 0 + while retries < n_retries: + print(f"...Downloading {fl.name} (attempt {retries+1}/{n_retries})") + try: + data = z.cat_file(fr) + break + except FSTimeoutError: + if retries < (n_retries - 1): + retries += 1 + else: + raise + print(f"...Writing {fl.name}") + fl.write_bytes(data) + + +def download_and_extract_from_zenodo(lpath, files, prefix, zip_glob=None): + dl = get_download_link(files, prefix) + if zip_glob is None: + return _download_and_extract_full_zip(lpath, dl) + else: + return download_and_extract_partial_zip(lpath, dl, zip_glob)