📊 climate: era-5 august update (#3081)

owid · Aug 7, 2024 · cfdf9af · cfdf9af
1 parent 2072960
commit cfdf9af
Show file tree

Hide file tree

Showing 6 changed files with 159 additions and 16 deletions.
diff --git a/dag/climate.yml b/dag/climate.yml
@@ -32,7 +32,7 @@ steps:
   # Copernicus Climate Change Service - Surface temperature.
   #
   data://meadow/climate/2023-12-20/surface_temperature:
-  - snapshot://climate/2024-07-08/surface_temperature.gz
+  - snapshot://climate/2024-08-06/surface_temperature.gz
   - snapshot://countries/2023-12-27/world_bank.zip
   data://garden/climate/2023-12-20/surface_temperature:
     - data://meadow/climate/2023-12-20/surface_temperature

diff --git a/etl/steps/data/meadow/climate/2023-12-20/surface_temperature.py b/etl/steps/data/meadow/climate/2023-12-20/surface_temperature.py
@@ -1,5 +1,7 @@
 """Load a snapshot and create a meadow dataset."""
+
 import gzip
+import io
 import zipfile
 
 import geopandas as gpd
@@ -25,8 +27,12 @@
 def _load_data_array(snap: Snapshot) -> xr.DataArray:
     log.info("load_data_array.start")
     # Load data from snapshot.
-    with gzip.open(snap.path, "r") as _file:
-        ds = xr.open_dataset(_file)
+    with gzip.open(snap.path, "rb") as file:
+        file_content = file.read()
+
+    # Create an in-memory bytes file and load the dataset
+    with io.BytesIO(file_content) as memfile:
+        ds = xr.open_dataset(memfile).load()  # .load() ensures data is eagerly loaded
 
     # The latest 3 months in this dataset are made available through ERA5T, which is slightly different to ERA5. In the downloaded file, an extra dimension ‘expver’ indicates which data is ERA5 (expver = 1) and which is ERA5T (expver = 5).
     # If a value is missing in the first dataset, it is filled with the value from the second dataset.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -70,6 +70,7 @@ pymysql = "^1.1.1"
 tiktoken = "^0.7.0"
 earthengine-api = "^0.1.411"
 python-docx = "^1.1.2"
+h5netcdf = "^1.3.0"
 
 [tool.poetry.group.api.dependencies]
 fastapi = "^0.109.0"

diff --git a/snapshots/climate/2024-08-06/surface_temperature.gz.dvc b/snapshots/climate/2024-08-06/surface_temperature.gz.dvc
@@ -0,0 +1,27 @@
+meta:
+  origin:
+    title_snapshot: ERA5 Monthly Averaged Data on Single Levels from 1940 to Present - Monthly Averages of 2m Surface Temperature
+    title: ERA5 monthly averaged data on single levels from 1940 to present
+    description: |-
+      ERA5 is the latest climate reanalysis produced by ECMWF, providing hourly data on many atmospheric, land-surface and sea-state parameters together with estimates of uncertainty.
+
+      ERA5 data are available in the Climate Data Store on regular latitude-longitude grids at 0.25° x 0.25° resolution, with atmospheric parameters on 37 pressure levels.
+
+      ERA5 is available from 1940 and continues to be extended forward in time, with daily updates being made available 5 days behind real time.
+
+      Initial release data, i.e., data no more than three months behind real time, are called ERA5T.
+    producer: Contains modified Copernicus Climate Change Service information
+    version_producer: 2
+    citation_full: |-
+      Hersbach, H., Bell, B., Berrisford, P., Biavati, G., Horányi, A., Muñoz Sabater, J., Nicolas, J., Peubey, C., Radu, R., Rozum, I., Schepers, D., Simmons, A., Soci, C., Dee, D., Thépaut, J-N. (2023): ERA5 monthly averaged data on single levels from 1940 to present. Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.f17050d7 (Accessed on 06-August-2024)
+    url_main: https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=overview
+    date_accessed: 2024-08-06
+    date_published: 2019-04-18
+    license:
+      name: Copernicus License
+      url: https://cds.climate.copernicus.eu/api/v2/terms/static/licence-to-use-copernicus-products.pdf
+
+outs:
+  - md5: e6348789b23c09d9413a856b017acda0
+    size: 2353412191
+    path: surface_temperature.gz
diff --git a/snapshots/climate/2024-08-06/surface_temperature.py b/snapshots/climate/2024-08-06/surface_temperature.py
@@ -0,0 +1,69 @@
+"""Script to create a snapshot of the monthly averaged surface temperature data from 1940 to present from the Copernicus Climate Change Service.
+
+   The script assumes that the data is available on the CDS API.
+   Instructions on how to access the API on a Mac are here: https://confluence.ecmwf.int/display/CKB/How+to+install+and+use+CDS+API+on+macOS
+
+   More information on how to access the data is here: https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=overview
+
+   The data is downloaded as a NetCDF file. Tutorials on using the Copernicus API and working with the NetCDF format are here: https://ecmwf-projects.github.io/copernicus-training-c3s/cds-tutorial.html
+   """
+
+import gzip
+import shutil
+import tempfile
+from pathlib import Path
+
+# CDS API
+import cdsapi
+import click
+import xarray as xr
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Snapshot target for this version: climate/<SNAPSHOT_VERSION>/surface_temperature.gz.
+    snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/surface_temperature.gz")
+    # Download, convert and compress inside a temporary directory that is removed when the block exits.
+    with tempfile.TemporaryDirectory() as temp_dir:
+        c = cdsapi.Client()
+        output_file = Path(temp_dir) / "era5_monthly_t2m_eur.nc"
+
+        c.retrieve(
+            "reanalysis-era5-single-levels-monthly-means",
+            {
+                "product_type": "monthly_averaged_reanalysis",
+                "variable": "2m_temperature",
+                "year": [str(year) for year in range(1940, 2025)],  # 1940 through 2024 inclusive
+                "month": ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"],
+                "time": "00:00",
+                "area": [90, -180, -90, 180],  # presumably [N, W, S, E], i.e. the whole globe — confirm against CDS API docs
+                "format": "netcdf",
+            },
+            output_file,
+        )
+
+        # Convert data to float32. NOTE(review): to_netcdf() below writes to the same path that is still open for reading — confirm the backend in use tolerates this.
+        with xr.open_dataset(output_file) as ds:
+            # Use a smaller dtype for the temperature variable ("t2m").
+            ds["t2m"] = ds["t2m"].astype("float32")
+
+            ds.to_netcdf(output_file)
+
+        # Compress the file: gzip the NetCDF into "<output_file>.gz" in the same temporary directory.
+        with open(output_file, "rb") as f_in:
+            with gzip.open(str(output_file) + ".gz", "wb") as f_out:
+                shutil.copyfileobj(f_in, f_out)
+
+        gzip_file = str(output_file) + ".gz"
+        # Create the snapshot from the gzipped file; `upload` comes from the --upload/--skip-upload flag.
+        snap.create_snapshot(filename=gzip_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()