coiled · jrbourbeau · Sep 20, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 19, 2024
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -31,7 +31,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         python_version: ["3.10"]
-        pytest_args: [tests --ignore=tests/tpch]
+        pytest_args: [tests/geospatial/test_netcdf_to_zarr.py --ignore=tests/tpch]
         extra-env: [""]
         name_prefix: [tests]
         include:

diff --git a/ci/environment.yml b/ci/environment.yml
@@ -48,6 +48,7 @@ dependencies:
   - gilknocker ==0.4.1
   - openssl >1.1.0g
   - rioxarray ==0.17.0
+  - h5netcdf ==1.3.0
 
 ########################################################
 # PLEASE READ:

diff --git a/tests/geospatial/test_netcdf_to_zarr.py b/tests/geospatial/test_netcdf_to_zarr.py
@@ -0,0 +1,92 @@
+import xarray as xr
+from dask.utils import format_bytes
+
+
+def test_netcdf_to_zarr(
+    scale,
+    s3,
+    s3_url,
+    client_factory,
+    cluster_kwargs={
+        "workspace": "dask-engineering",
+        "region": "us-west-2",
+        "wait_for_workers": True,
+    },
+    scale_kwargs={
+        "small": {"n_workers": 10},
+        "medium": {"n_workers": 100},
+        "large": {"n_workers": 200},
+    },
+):
+    with client_factory(
+        **scale_kwargs[scale], **cluster_kwargs
+    ) as client:  # noqa: F841
+        # Define models and variables of interest
+        models = [
+            "ACCESS-CM2",
+            "ACCESS-ESM1-5",
+            "CMCC-ESM2",
+            "CNRM-CM6-1",
+            "CNRM-ESM2-1",
+            "CanESM5",
+            "EC-Earth3",
+            "EC-Earth3-Veg-LR",
+            "FGOALS-g3",
+            "GFDL-ESM4",
+            "GISS-E2-1-G",
+            "INM-CM4-8",
+            "INM-CM5-0",
+            "KACE-1-0-G",
+            "MIROC-ES2L",
+            "MPI-ESM1-2-HR",
+            "MPI-ESM1-2-LR",
+            "MRI-ESM2-0",
+            "NorESM2-LM",
+            "NorESM2-MM",
+            "TaiESM1",
+            "UKESM1-0-LL",
+        ]
+        variables = [
+            "hurs",
+            "huss",
+            "pr",
+            "rlds",
+            "rsds",
+            "sfcWind",
+            "tas",
+            "tasmax",
+            "tasmin",
+        ]
+
+        if scale == "small":
+            # 130 files (152.83 GiB). One model and one variable.
+            models = models[:1]
+            variables = variables[:1]
+        elif scale == "medium":
+            # 715 files. One model and all variables.
+            # Currently fails after hitting 20 minute idle timeout
+            # sending `to_zarr` graph to the scheduler.
+            models = models[:1]
+        else:
+            # 11635 files. All models and variables.
+            pass
+
+        # Get netCDF data files -- see https://registry.opendata.aws/nex-gddp-cmip6
+        # for dataset details.
+        file_list = []
+        for model in models:
+            for variable in variables:
+                data_dir = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1f1/{variable}/*.nc"
+                file_list += [f"s3://{path}" for path in s3.glob(data_dir)]
+        files = [s3.open(f) for f in file_list]
+        print(f"Processing {len(files)} NetCDF files")
+
+        ds = xr.open_mfdataset(
+            files,
+            engine="h5netcdf",
+            combine="nested",
+            concat_dim="time",
+            parallel=True,
+        )
+        print(f"Converting {format_bytes(ds.nbytes)} from NetCDF to Zarr")
+        ds.to_zarr(s3_url)