
Implement era5 land (#21)
* add era5 land dataset

* add demo notebook

* import era5 land in init

* update alma convention

* move shared parser function to utils

* add tests for utils

* add test for era5 land dataset

* remove unnecessary duplication check

* Revert "remove unnecessary duplication check"

This reverts commit d2242b6.

* era5land inherit from era5

* make a general ecmwf dataset and inherit for era5(land)

* please mypy

* move era5 land to era5.py

* drop py3.8, fix circular import and please mypy with proper protocol usage

* fix dataset import in notebooks

* implement consistent zampy name for ecmwf datasets
geek-yang authored Aug 15, 2023
1 parent 73f36c2 commit 08a7ee4
Showing 28 changed files with 2,048 additions and 398 deletions.
1,388 changes: 1,388 additions & 0 deletions demo/era5-land_dataset_demo.ipynb

Large diffs are not rendered by default.

96 changes: 60 additions & 36 deletions demo/era5_dataset_demo.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions demo/eth_dataset_demo.ipynb
@@ -24,7 +24,7 @@
"outputs": [],
"source": [
"import numpy as np\n",
"from zampy.datasets import EthCanopyHeight\n",
"from zampy.datasets.catalog import EthCanopyHeight\n",
"from zampy.datasets.dataset_protocol import TimeBounds, SpatialBounds\n",
"from pathlib import Path\n",
"\n",
@@ -2660,7 +2660,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
"version": "3.10.0"
},
"orig_nbformat": 4
},
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -45,6 +45,7 @@ classifiers = [
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
]
dependencies = [
"requests",
@@ -135,10 +136,11 @@ testpaths = ["tests"]
[tool.mypy]
ignore_missing_imports = true
disallow_untyped_defs = true
python_version = "3.9"

[tool.black]
line-length = 88
target-version = ['py38', 'py39', 'py310']
target-version = ['py39', 'py310', 'py311']
include = '\.pyi?$'

[tool.ruff]
@@ -169,7 +171,7 @@ line-length = 88
exclude = ["docs", "build"]
# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
target-version = "py38"
target-version = "py39"

[tool.ruff.per-file-ignores]
"tests/**" = ["D"]
8 changes: 8 additions & 0 deletions src/zampy/conventions/ALMA.json
@@ -60,5 +60,13 @@
"total_precipitation": {
"variable": "Rainf",
"units": "millimeter/second"
},
"air_temperature": {
"variable": "Tair",
"units": "kelvin"
},
"dewpoint_temperature": {
"variable": "d2m",
"units": "kelvin"
}
}
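The two new ALMA entries map zampy variable names ("air_temperature", "dewpoint_temperature") to short output names and target units. As an illustration only, a convention table like this can drive a simple rename pass over an xarray dataset; the actual logic lives in zampy.datasets.converter, and the helper below is a hypothetical sketch, not part of this commit:

import json
from pathlib import Path

import xarray as xr


def rename_to_convention(ds: xr.Dataset, convention_file: Path) -> xr.Dataset:
    """Hypothetical sketch: rename data variables using a zampy convention table."""
    mapping = json.loads(convention_file.read_text())
    for zampy_name, entry in mapping.items():
        if zampy_name in ds.data_vars:
            # Record the target units; actual unit conversion is omitted in this sketch.
            ds[zampy_name].attrs["target_units"] = entry["units"]
            ds = ds.rename({zampy_name: entry["variable"]})
    return ds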
13 changes: 2 additions & 11 deletions src/zampy/datasets/__init__.py
@@ -1,16 +1,7 @@
"""Datasets implementations."""
from zampy.datasets import dataset_protocol
from zampy.datasets import validation
from zampy.datasets.era5 import ERA5
from zampy.datasets.eth_canopy_height import EthCanopyHeight
from zampy.datasets.catalog import DATASETS


__all__ = ["dataset_protocol", "validation", "EthCanopyHeight", "ERA5"]


# This object tracks which datasets are available.
DATASETS: dict[str, type[dataset_protocol.Dataset]] = {
# All lowercase key.
"era5": ERA5,
"eth_canopy_height": EthCanopyHeight,
}
__all__ = ["dataset_protocol", "validation", "DATASETS"]
14 changes: 14 additions & 0 deletions src/zampy/datasets/catalog.py
@@ -0,0 +1,14 @@
"""Catalog of datasets."""
from zampy.datasets import dataset_protocol
from zampy.datasets.era5 import ERA5
from zampy.datasets.era5 import ERA5Land
from zampy.datasets.eth_canopy_height import EthCanopyHeight


# This object tracks which datasets are available.
DATASETS: dict[str, type[dataset_protocol.Dataset]] = {
    # All lowercase key.
    "era5": ERA5,
    "era5_land": ERA5Land,
    "eth_canopy_height": EthCanopyHeight,
}
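With the registry centralised here, callers resolve a dataset class from its lowercase key instead of importing it directly. A minimal usage sketch, assuming nothing beyond the dict above:

from zampy.datasets.catalog import DATASETS

# Look up the new ERA5-land dataset by its registry key and instantiate it.
dataset = DATASETS["era5_land"]()
print(type(dataset).__name__)  # ERA5Land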
21 changes: 7 additions & 14 deletions src/zampy/datasets/dataset_protocol.py
@@ -1,14 +1,11 @@
"""Outline of the dataset protocol."""
import json
import shutil
from abc import abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from typing import List
from typing import Optional
from typing import Protocol
from typing import Tuple
import numpy as np
import xarray as xr

@@ -79,21 +76,20 @@ class Dataset(Protocol):
crs: str
license: str
bib: str
raw_variables: Tuple[Variable, ...]
variable_names: Tuple[str, ...]
variables: Tuple[Variable, ...]
raw_variables: list[Variable]
variable_names: list[str]
variables: list[Variable]

def __init__(self) -> None:
"""Init."""
...

@abstractmethod
def download(
self,
download_dir: Path,
time_bounds: TimeBounds,
spatial_bounds: SpatialBounds,
variable_names: List[str],
variable_names: list[str],
overwrite: bool = False,
) -> bool:
"""Download the data.
@@ -111,7 +107,6 @@ def download(
"""
...

@abstractmethod
def ingest(
self,
download_dir: Path,
@@ -130,15 +125,14 @@ def ingest(
"""
...

@abstractmethod
def load(
self,
ingest_dir: Path,
time_bounds: TimeBounds,
spatial_bounds: SpatialBounds,
resolution: float,
regrid_method: str,
variable_names: List[str],
variable_names: list[str],
) -> xr.Dataset:
"""Get the dataset as an xarray Dataset.
@@ -160,7 +154,6 @@ def load(
"""
...

@abstractmethod
def convert(
self,
ingest_dir: Path,
@@ -182,7 +175,7 @@ def write_properties_file(
dataset_folder: Path,
spatial_bounds: SpatialBounds,
time_bounds: TimeBounds,
variable_names: List[str],
variable_names: list[str],
) -> None:
"""Write the (serialized) spatial and time bounds to a json file.
@@ -211,7 +204,7 @@ def write_properties_file(

def read_properties_file(
dataset_folder: Path,
) -> Tuple[SpatialBounds, TimeBounds, List[str]]:
) -> tuple[SpatialBounds, TimeBounds, list[str]]:
"""Load the serialized spatial and time bounds from the json file.
Args:
150 changes: 150 additions & 0 deletions src/zampy/datasets/ecmwf_dataset.py
@@ -0,0 +1,150 @@
"""Base module for datasets available on CDS."""

from pathlib import Path
from typing import Union
import xarray as xr
from zampy.datasets import converter
from zampy.datasets import utils
from zampy.datasets import validation
from zampy.datasets.dataset_protocol import SpatialBounds
from zampy.datasets.dataset_protocol import TimeBounds
from zampy.datasets.dataset_protocol import Variable
from zampy.datasets.dataset_protocol import copy_properties_file
from zampy.datasets.dataset_protocol import write_properties_file
from zampy.utils import regrid


## Ignore missing class/method docstrings: they are implemented in the Dataset class.
# ruff: noqa: D102


class ECMWFDataset:  # noqa: D101
    name: str
    time_bounds: TimeBounds
    spatial_bounds = SpatialBounds(90, 180, -90, -180)
    crs = "EPSG:4326"

    raw_variables: list[Variable]
    cds_var_names: dict[str, str]
    variable_names: list[str]
    variables: list[Variable]
    license = "cc-by-4.0"
    bib = """
    @article{hersbach2020era5,
        title={The ERA5 global reanalysis},
        author={Hersbach, Hans et al.},
        journal={Quarterly Journal of the Royal Meteorological Society},
        volume={146},
        number={730},
        pages={1999--2049},
        year={2020},
        publisher={Wiley Online Library}
    }
    """
    cds_dataset: str

    def __init__(self) -> None:
        """Init."""
        pass

    def download(
        self,
        download_dir: Path,
        time_bounds: TimeBounds,
        spatial_bounds: SpatialBounds,
        variable_names: list[str],
        overwrite: bool = False,
    ) -> bool:
        validation.validate_download_request(
            self,
            download_dir,
            time_bounds,
            spatial_bounds,
            variable_names,
        )

        download_folder = download_dir / self.name
        download_folder.mkdir(parents=True, exist_ok=True)

        utils.cds_request(
            dataset=self.cds_dataset,
            variables=variable_names,
            time_bounds=time_bounds,
            spatial_bounds=spatial_bounds,
            path=download_folder,
            cds_var_names=self.cds_var_names,
            overwrite=overwrite,
        )

        write_properties_file(
            download_folder, spatial_bounds, time_bounds, variable_names
        )

        return True

    def ingest(
        self,
        download_dir: Path,
        ingest_dir: Path,
        overwrite: bool = False,
    ) -> bool:
        download_folder = download_dir / self.name
        ingest_folder = ingest_dir / self.name
        ingest_folder.mkdir(parents=True, exist_ok=True)

        data_file_pattern = f"{self.name}_*.nc"
        data_files = list(download_folder.glob(data_file_pattern))

        for file in data_files:
            utils.convert_to_zampy(
                ingest_folder,
                file=file,
                overwrite=overwrite,
            )

        copy_properties_file(download_folder, ingest_folder)

        return True

    def load(
        self,
        ingest_dir: Path,
        time_bounds: TimeBounds,
        spatial_bounds: SpatialBounds,
        resolution: float,
        regrid_method: str,
        variable_names: list[str],
    ) -> xr.Dataset:
        files: list[Path] = []
        for var in self.variable_names:
            if var in variable_names:
                files += (ingest_dir / self.name).glob(f"{self.name}_{var}*.nc")

        ds = xr.open_mfdataset(files, chunks={"latitude": 200, "longitude": 200})
        ds = ds.sel(time=slice(time_bounds.start, time_bounds.end))
        ds = regrid.regrid_data(ds, spatial_bounds, resolution, regrid_method)

        return ds

    def convert(
        self,
        ingest_dir: Path,
        convention: Union[str, Path],
    ) -> bool:
        converter.check_convention(convention)
        ingest_folder = ingest_dir / self.name

        data_file_pattern = f"{self.name}_*.nc"

        data_files = list(ingest_folder.glob(data_file_pattern))

        for file in data_files:
            # start conversion process
            print(f"Start processing file `{file.name}`.")
            ds = xr.open_dataset(file, chunks={"x": 50, "y": 50})
            ds = converter.convert(ds, dataset=self, convention=convention)
            # TODO: support derived variables
            # TODO: other calculations
            # call ds.compute()

        return True
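Taken together, the new base class gives ERA5 and ERA5-land the same download, ingest, load, and convert flow. A rough usage sketch in the spirit of the demo notebooks follows; the paths, dates, bounds, variable list, regrid method, and convention string are placeholders, and the exact options supported by ERA5Land are defined in era5.py, which is not shown in this excerpt:

import numpy as np
from pathlib import Path

from zampy.datasets.dataset_protocol import SpatialBounds, TimeBounds
from zampy.datasets.era5 import ERA5Land

# Placeholder bounds and directories; adjust to your own setup.
time_bounds = TimeBounds(np.datetime64("2010-01-01"), np.datetime64("2010-01-31"))
spatial_bounds = SpatialBounds(54, 6, 50, 3)  # north, east, south, west
download_dir = Path("./download")
ingest_dir = Path("./ingest")

dataset = ERA5Land()
dataset.download(
    download_dir=download_dir,
    time_bounds=time_bounds,
    spatial_bounds=spatial_bounds,
    variable_names=["air_temperature"],  # assumed to be a supported variable
)
dataset.ingest(download_dir, ingest_dir)
ds = dataset.load(
    ingest_dir=ingest_dir,
    time_bounds=time_bounds,
    spatial_bounds=spatial_bounds,
    resolution=0.25,
    regrid_method="flox",  # placeholder; must match a method supported by zampy.utils.regrid
    variable_names=["air_temperature"],
)
dataset.convert(ingest_dir=ingest_dir, convention="ALMA")  # "ALMA" assumed to resolve to ALMA.json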