Skip to content

Commit

Permalink
📊 climate: era-5 august update (#3081)
Browse files Browse the repository at this point in the history
  • Loading branch information
veronikasamborska1994 authored Aug 7, 2024
1 parent 2072960 commit cfdf9af
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 16 deletions.
2 changes: 1 addition & 1 deletion dag/climate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ steps:
# Copernicus Climate Change Service - Surface temperature.
#
data://meadow/climate/2023-12-20/surface_temperature:
- snapshot://climate/2024-07-08/surface_temperature.gz
- snapshot://climate/2024-08-06/surface_temperature.gz
- snapshot://countries/2023-12-27/world_bank.zip
data://garden/climate/2023-12-20/surface_temperature:
- data://meadow/climate/2023-12-20/surface_temperature
Expand Down
10 changes: 8 additions & 2 deletions etl/steps/data/meadow/climate/2023-12-20/surface_temperature.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Load a snapshot and create a meadow dataset."""

import gzip
import io
import zipfile

import geopandas as gpd
Expand All @@ -25,8 +27,12 @@
def _load_data_array(snap: Snapshot) -> xr.DataArray:
log.info("load_data_array.start")
# Load data from snapshot.
with gzip.open(snap.path, "r") as _file:
ds = xr.open_dataset(_file)
with gzip.open(snap.path, "rb") as file:
file_content = file.read()

# Create an in-memory bytes file and load the dataset
with io.BytesIO(file_content) as memfile:
ds = xr.open_dataset(memfile).load() # .load() ensures data is eagerly loaded

# The latest 3 months in this dataset are made available through ERA5T, which is slightly different to ERA5. In the downloaded file, an extra dimension ‘expver’ indicates which data is ERA5 (expver = 1) and which is ERA5T (expver = 5).
# If a value is missing in the first dataset, it is filled with the value from the second dataset.
Expand Down
66 changes: 53 additions & 13 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ pymysql = "^1.1.1"
tiktoken = "^0.7.0"
earthengine-api = "^0.1.411"
python-docx = "^1.1.2"
h5netcdf = "^1.3.0"

[tool.poetry.group.api.dependencies]
fastapi = "^0.109.0"
Expand Down
27 changes: 27 additions & 0 deletions snapshots/climate/2024-08-06/surface_temperature.gz.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
meta:
origin:
title_snapshot: ERA5 Monthly Averaged Data on Single Levels from 1940 to Present - Monthly Averages of 2m Surface Temperature
title: ERA5 monthly averaged data on single levels from 1940 to present
description: |-
ERA5 is the latest climate reanalysis produced by ECMWF, providing hourly data on many atmospheric, land-surface and sea-state parameters together with estimates of uncertainty.

ERA5 data are available in the Climate Data Store on regular latitude-longitude grids at 0.25° x 0.25° resolution, with atmospheric parameters on 37 pressure levels.

ERA5 is available from 1940 and continues to be extended forward in time, with daily updates being made available 5 days behind real time

Initial release data, i.e., data no more than three months behind real time, are called ERA5T.
producer: Contains modified Copernicus Climate Change Service information
version_producer: 2
citation_full: |-
Hersbach, H., Bell, B., Berrisford, P., Biavati, G., Horányi, A., Muñoz Sabater, J., Nicolas, J., Peubey, C., Radu, R., Rozum, I., Schepers, D., Simmons, A., Soci, C., Dee, D., Thépaut, J-N. (2023): ERA5 monthly averaged data on single levels from 1940 to present. Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.f17050d7 (Accessed on 06-August-2024)
url_main: https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=overview
date_accessed: 2024-08-06
date_published: 2019-04-18
license:
name: Copernicus License
url: https://cds.climate.copernicus.eu/api/v2/terms/static/licence-to-use-copernicus-products.pdf

outs:
- md5: e6348789b23c09d9413a856b017acda0
size: 2353412191
path: surface_temperature.gz
69 changes: 69 additions & 0 deletions snapshots/climate/2024-08-06/surface_temperature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Script to create a snapshot of the monthly averaged surface temperature data from 1950 to present from the Copernicus Climate Change Service.
The script assumes that the data is available on the CDS API.
Instructions on how to access the API on a Mac are here: https://confluence.ecmwf.int/display/CKB/How+to+install+and+use+CDS+API+on+macOS
More information on how to access the data is here: https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=overview
The data is downloaded as a NetCDF file. Tutorials for using the Copernicus API and for working with the NetCDF format are here: https://ecmwf-projects.github.io/copernicus-training-c3s/cds-tutorial.html
"""

import gzip
import shutil
import tempfile
from pathlib import Path

# CDS API
import cdsapi
import click
import xarray as xr

from etl.snapshot import Snapshot

# Version for current snapshot dataset, taken from the name of the directory that contains this script.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    """Download ERA5 monthly 2m temperature from the CDS API and save it as a gzipped snapshot."""
    # Create a new snapshot.
    snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/surface_temperature.gz")

    # All intermediate files live in a temporary directory that is removed on exit,
    # so the snapshot must be created before leaving this block.
    with tempfile.TemporaryDirectory() as temp_dir:
        # The raw download and the converted file use different paths so that the
        # NetCDF file being read is never the one being written.
        raw_file = Path(temp_dir) / "era5_monthly_t2m_raw.nc"
        output_file = Path(temp_dir) / "era5_monthly_t2m_eur.nc"
        gzip_file = str(output_file) + ".gz"

        client = cdsapi.Client()
        client.retrieve(
            "reanalysis-era5-single-levels-monthly-means",
            {
                "product_type": "monthly_averaged_reanalysis",
                "variable": "2m_temperature",
                # range() excludes its end, so this requests the years 1940 to 2024.
                "year": [str(year) for year in range(1940, 2025)],
                # Zero-padded month numbers "01" to "12".
                "month": [f"{month:02d}" for month in range(1, 13)],
                "time": "00:00",
                "area": [90, -180, -90, 180],
                "format": "netcdf",
            },
            raw_file,
        )

        # Convert data to float32 (a smaller type) and write the result to a
        # separate file while the raw dataset is still open for reading.
        with xr.open_dataset(raw_file) as ds:
            ds["t2m"] = ds["t2m"].astype("float32")
            ds.to_netcdf(output_file)

        # The raw download is no longer needed; free the disk space before compressing.
        raw_file.unlink()

        # Compress the converted file.
        with open(output_file, "rb") as f_in, gzip.open(gzip_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

        # Upload snapshot.
        snap.create_snapshot(filename=gzip_file, upload=upload)


# Run the snapshot command only when this file is executed directly as a script.
if __name__ == "__main__":
    main()

0 comments on commit cfdf9af

Please sign in to comment.