update data acquisition notebook #11

Merged: 3 commits, Apr 3, 2024
6 changes: 4 additions & 2 deletions HISTORY.rst
@@ -1,9 +1,11 @@
History
=======

unreleased
----------
v1.2.1
------
* Update GitHub actions versions
* Update data acquisition notebook, using helper functions to download input data from
zenodo

v1.2.0
------
93 changes: 9 additions & 84 deletions notebooks/data-acquisition.ipynb
@@ -39,18 +39,17 @@
},
"outputs": [],
"source": [
"import tempfile\n",
"from io import BytesIO\n",
"from os import environ\n",
"from pathlib import Path\n",
"from zipfile import ZipFile\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import requests\n",
"from cartopy.io import shapereader\n",
"from fsspec import FSTimeoutError\n",
"from fsspec.implementations.zip import ZipFileSystem\n",
"from pyCIAM.io import (\n",
" download_and_extract_from_zenodo,\n",
" download_and_extract_partial_zip,\n",
" get_zenodo_file_list,\n",
")\n",
"from pyCIAM.utils import copy\n",
"from shared import (\n",
" DIR_SHP,\n",
" DIR_SLR_AR5_IFILES_RAW,\n",
@@ -71,9 +70,7 @@
" PATH_SLR_HIST_TREND_MAP,\n",
" PATHS_SURGE_LOOKUP,\n",
" save,\n",
")\n",
"\n",
"from pyCIAM.utils import copy"
")"
]
},
{
@@ -123,76 +120,6 @@
"Z_URL_SLIIDERS_PC = Z_URL_RECORDS"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "8d519f83-eb91-4cb0-b2e7-5918b91d5143",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def get_download_link(files, prefix):\n",
" links = [\n",
" i[\"links\"]\n",
" for i in files\n",
" if i.get(\"filename\", \"\").startswith(prefix)\n",
" or i.get(\"key\", \"\").startswith(prefix)\n",
" ]\n",
" assert len(links) == 1\n",
" links = links[0]\n",
" return links.get(\"download\", links[\"self\"])\n",
"\n",
"\n",
"def download_and_extract_full_zip(lpath, url):\n",
" if lpath.exists():\n",
" return None\n",
" lpath.parent.mkdir(exist_ok=True, parents=True)\n",
"\n",
" content = BytesIO(requests.get(url, params=PARAMS).content)\n",
" if isinstance(lpath, Path):\n",
" with ZipFile(content, \"r\") as zip_ref:\n",
" zip_ref.extractall(lpath)\n",
" else:\n",
" with tempfile.TemporaryDirectory() as tmpdir:\n",
" with ZipFile(content, \"r\") as zip_ref:\n",
" zip_ref.extractall(tmpdir)\n",
" copy(Path(tmpdir), lpath)\n",
"\n",
"\n",
"def download_and_extract_partial_zip(lpath, url, zip_glob, n_retries=5):\n",
" lpath.mkdir(exist_ok=True, parents=True)\n",
" z = ZipFileSystem(url)\n",
" if isinstance(zip_glob, (list, set, tuple, np.ndarray)):\n",
" files_remote = zip_glob\n",
" else:\n",
" files_remote = [p for p in z.glob(zip_glob) if not p.endswith(\"/\")]\n",
" files_local = [lpath / Path(f).name for f in files_remote]\n",
" for fr, fl in list(zip(files_remote, files_local)):\n",
" if not fl.is_file():\n",
" retries = 0\n",
" while retries < n_retries:\n",
" print(f\"...Downloading {fl.name} (attempt {retries+1}/{n_retries})\")\n",
" try:\n",
" data = z.cat_file(fr)\n",
" break\n",
" except FSTimeoutError:\n",
" if retries < (n_retries - 1):\n",
" retries += 1\n",
" else:\n",
" raise\n",
" print(f\"...Writing {fl.name}\")\n",
" fl.write_bytes(data)\n",
"\n",
"\n",
"def download_and_extract_from_zenodo(lpath, files, prefix, zip_glob=None):\n",
" dl = get_download_link(files, prefix)\n",
" if zip_glob is None:\n",
" return download_and_extract_full_zip(lpath, dl)\n",
" else:\n",
" return download_and_extract_partial_zip(lpath, dl, zip_glob)"
]
},
{
"cell_type": "code",
"execution_count": 5,
@@ -202,9 +129,7 @@
},
"outputs": [],
"source": [
"pyciam_files = requests.get(\n",
" Z_URL_SLIIDERS_PC.format(doi=Z_PYCIAM_DOI), params=PARAMS\n",
").json()[\"files\"]"
"pyciam_files = get_zenodo_file_list(Z_PYCIAM_DOI)"
]
},
{
@@ -628,7 +553,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.12.2"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
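For orientation, the one-line replacement in the notebook (`pyciam_files = get_zenodo_file_list(Z_PYCIAM_DOI)`) leans on the new helper added to pyCIAM/io.py below. A minimal sketch of what that call amounts to, using a placeholder DOI rather than the project's real record ID:

import requests

doi = "1234567"  # placeholder Zenodo record ID, not the real pyCIAM DOI
pyciam_files = requests.get(f"https://zenodo.org/api/records/{doi}").json()["files"]
# Equivalent (when no extra query params are needed) to:
#   from pyCIAM.io import get_zenodo_file_list
#   pyciam_files = get_zenodo_file_list(doi)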
77 changes: 77 additions & 0 deletions pyCIAM/io.py
@@ -8,12 +8,22 @@
* load_diaz_inputs
"""

import tempfile
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile

import dask.array as da
import numpy as np
import pandas as pd
import pint_xarray # noqa: F401
import requests
import xarray as xr
from fsspec import FSTimeoutError
from fsspec.implementations.zip import ZipFileSystem

from pyCIAM.utils import copy
from pyCIAM.utils import spherical_nearest_neighbor as snn

from .utils import _s2d
@@ -783,3 +793,70 @@ def load_diaz_inputs(

    inputs = inputs.drop_dims("rcp_pt")
    return inputs, slr


def get_zenodo_file_list(doi, params={}):
    return requests.get(f"https://zenodo.org/api/records/{doi}", params=params).json()[
        "files"
    ]


def get_download_link(files, prefix):
    links = [
        i["links"]
        for i in files
        if i.get("filename", "").startswith(prefix)
        or i.get("key", "").startswith(prefix)
    ]
    assert len(links) == 1
    links = links[0]
    return links.get("download", links["self"])
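As an aside, a hedged illustration of the record structure get_download_link expects: each entry carries either a "key" (current Zenodo API) or a "filename" (legacy API), plus a "links" mapping with a "self" or "download" URL. The payload below is mocked for the example, not taken from a real record:

mock_files = [
    {
        "key": "sliiders-v1.2.zip",
        "links": {"self": "https://zenodo.org/api/files/abc/sliiders-v1.2.zip"},
    },
    {
        "key": "surge-lookup-v1.2.zip",
        "links": {"self": "https://zenodo.org/api/files/abc/surge-lookup-v1.2.zip"},
    },
]

get_download_link(mock_files, "sliiders")
# -> "https://zenodo.org/api/files/abc/sliiders-v1.2.zip"
# A prefix matching zero or several entries would trip the assert.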


def _download_and_extract_full_zip(lpath, url, params={}):
    if lpath.exists():
        return None
    lpath.parent.mkdir(exist_ok=True, parents=True)

    content = BytesIO(requests.get(url, params=params).content)
    if isinstance(lpath, Path):
        with ZipFile(content, "r") as zip_ref:
            zip_ref.extractall(lpath)
    else:
        with tempfile.TemporaryDirectory() as tmpdir:
            with ZipFile(content, "r") as zip_ref:
                zip_ref.extractall(tmpdir)
            copy(Path(tmpdir), lpath)


def download_and_extract_partial_zip(lpath, url, zip_glob, n_retries=5):
    lpath.mkdir(exist_ok=True, parents=True)
    z = ZipFileSystem(url)
    if isinstance(zip_glob, (list, set, tuple, np.ndarray)):
        files_remote = zip_glob
    else:
        files_remote = [p for p in z.glob(zip_glob) if not p.endswith("/")]
    files_local = [lpath / Path(f).name for f in files_remote]
    for fr, fl in list(zip(files_remote, files_local)):
        if not fl.is_file():
            retries = 0
            while retries < n_retries:
                print(f"...Downloading {fl.name} (attempt {retries+1}/{n_retries})")
                try:
                    data = z.cat_file(fr)
                    break
                except FSTimeoutError:
                    if retries < (n_retries - 1):
                        retries += 1
                    else:
                        raise
            print(f"...Writing {fl.name}")
            fl.write_bytes(data)


def download_and_extract_from_zenodo(lpath, files, prefix, zip_glob=None):
    dl = get_download_link(files, prefix)
    if zip_glob is None:
        return _download_and_extract_full_zip(lpath, dl)
    else:
        return download_and_extract_partial_zip(lpath, dl, zip_glob)
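Taken together, a sketch of how the new helpers are meant to compose; the record ID, target directories, prefixes, and glob below are illustrative assumptions, not values taken from this PR:

from pathlib import Path

from pyCIAM.io import download_and_extract_from_zenodo, get_zenodo_file_list

files = get_zenodo_file_list("1234567")  # hypothetical Zenodo record ID

# Full-archive mode: without zip_glob, the zipfile whose name matches the prefix is
# fetched in one request and extracted under the target directory (skipped entirely
# if that directory already exists).
download_and_extract_from_zenodo(Path("data/shapefiles"), files, "shapefiles")

# Partial mode: zip_glob selects members inside the remote archive; each matching
# file is streamed through fsspec's ZipFileSystem and re-requested on FSTimeoutError
# (up to the default of 5 attempts) before being written locally.
download_and_extract_from_zenodo(
    Path("data/slr"), files, "slr-projections", zip_glob="*/*.nc"
)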