Add AQ-NO2-S5P

microsoft · adamjstewart · Mar 26, 2025 · Feb 23, 2025 · Mar 25, 2025 · Mar 25, 2025
commit ef154e865132635d23a3c12cdf8357d9d8da0c57
diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
@@ -601,6 +601,7 @@ Copernicus-Bench is a comprehensive evaluation benchmark with 15 downstream task
 .. autoclass:: CopernicusBenchFloodS1
 .. autoclass:: CopernicusBenchLCZS2
 .. autoclass:: CopernicusBenchBiomassS3
+.. autoclass:: CopernicusBenchAQNO2S5P
 
 Base Classes
 ------------

diff --git a/docs/api/datasets/copernicus_bench.csv b/docs/api/datasets/copernicus_bench.csv
@@ -12,3 +12,4 @@ L2,DFC2020-S2,S,Sentinel-2,CC-BY-4.0,"5,128",10,256x256,10,MSI
 L3,Flood-S1,CD,Sentinel-1,MIT,"5,000",3,224x224,10,SAR
 L3,LCZ-S2,C,Sentinel-2,CC-BY-4.0,"25,000",17,32x32,10,MSI
 L3,Biomass-S3,R,Sentinel-3,CC-BY-4.0,"5,000",-,96x96,300,MSI
+L3,AQ-NO2-S5P,R,Sentinel-5P,CC-BY-4.0,"2,467",-,56x56,"1,000",-
diff --git a/tests/data/copernicus/l3_airquality_s5p/airquality_s5p.zip b/tests/data/copernicus/l3_airquality_s5p/airquality_s5p.zip
diff --git a/...1_no2_avg_34_13/2021-10-01_2021-12-31.tif → ...1_no2_avg_34_13/2021-01-01_2021-12-31.tif b/...1_no2_avg_34_13/2021-10-01_2021-12-31.tif → ...1_no2_avg_34_13/2021-01-01_2021-12-31.tif
diff --git a/tests/data/copernicus/l3_airquality_s5p/data.py b/tests/data/copernicus/l3_airquality_s5p/data.py
@@ -48,6 +48,7 @@
 }
 
 Z = np.random.random(size=(profile['height'], profile['width']))
+file = '2021-01-01_2021-12-31.tif'
 files = [
     '2021-01-01_2021-04-01.tif',
     '2021-04-01_2021-07-01.tif',
@@ -60,7 +61,7 @@
     # Image (annual)
     directory = os.path.join('airquality_s5p', variable, 's5p_annual', pid)
     os.makedirs(directory, exist_ok=True)
-    path = os.path.join(directory, files[-1])
+    path = os.path.join(directory, file)
     with rio.open(path, 'w', **profile) as src:
         src.write(Z, 1)
 

diff --git a/tests/datasets/test_copernicus.py b/tests/datasets/test_copernicus.py
@@ -40,6 +40,8 @@ class TestCopernicusBench:
             ('lcz_s2', 'l3_lcz_s2', {}),
             ('biomass_s3', 'l3_biomass_s3', {'mode': 'static'}),
             ('biomass_s3', 'l3_biomass_s3', {'mode': 'time-series'}),
+            ('aq_no2_s5p', 'l3_airquality_s5p', {'mode': 'annual'}),
+            ('aq_no2_s5p', 'l3_airquality_s5p', {'mode': 'seasonal'}),
         ]
     )
     def dataset(self, request: SubRequest) -> CopernicusBench:
@@ -108,6 +110,8 @@ def test_not_rgb(self, dataset: CopernicusBench) -> None:
 
         if dataset.name.endswith('s1'):
             all_bands = ['VV']
+        elif dataset.name.endswith('s5p'):
+            pytest.skip('single-band dataset')
 
         dataset = CopernicusBench(dataset.name, dataset.root, bands=all_bands)
         match = 'Dataset does not contain some of the RGB bands'

diff --git a/torchgeo/datasets/__init__.py b/torchgeo/datasets/__init__.py
@@ -32,6 +32,7 @@
 from .cms_mangrove_canopy import CMSGlobalMangroveCanopy
 from .copernicus import (
     CopernicusBench,
+    CopernicusBenchAQNO2S5P,
     CopernicusBenchBase,
     CopernicusBenchBigEarthNetS1,
     CopernicusBenchBigEarthNetS2,
@@ -175,7 +176,6 @@
 from .western_usa_live_fuel_moisture import WesternUSALiveFuelMoisture
 from .xview import XView2
 from .zuericrop import ZueriCrop
-from .senbench_airquality_s5p import SenBenchAQNO2S5P, SenBenchAQO3S5P
 
 __all__ = (
     'ADVANCE',
@@ -234,6 +234,7 @@
     'ChesapeakeWV',
     'CloudCoverDetection',
     'CopernicusBench',
+    'CopernicusBenchAQNO2S5P',
     'CopernicusBenchBase',
     'CopernicusBenchBigEarthNetS1',
     'CopernicusBenchBigEarthNetS2',
@@ -352,6 +353,4 @@
     'stack_samples',
     'time_series_split',
     'unbind_samples',
-    'SenBenchAQNO2S5P',
-    'SenBenchNO2S5P',
 )
diff --git a/torchgeo/datasets/copernicus/__init__.py b/torchgeo/datasets/copernicus/__init__.py
@@ -8,6 +8,7 @@
 from torch import Tensor
 
 from ..geo import NonGeoDataset
+from .aq_no2_s5p import CopernicusBenchAQNO2S5P
 from .base import CopernicusBenchBase
 from .bigearthnet_s1 import CopernicusBenchBigEarthNetS1
 from .bigearthnet_s2 import CopernicusBenchBigEarthNetS2
@@ -25,6 +26,7 @@
 
 __all__ = (
     'CopernicusBench',
+    'CopernicusBenchAQNO2S5P',
     'CopernicusBenchBase',
     'CopernicusBenchBigEarthNetS1',
     'CopernicusBenchBigEarthNetS2',
@@ -55,6 +57,7 @@
     'flood_s1': CopernicusBenchFloodS1,
     'lcz_s2': CopernicusBenchLCZS2,
     'biomass_s3': CopernicusBenchBiomassS3,
+    'aq_no2_s5p': CopernicusBenchAQNO2S5P,
 }
 
 
@@ -86,6 +89,7 @@ def __init__(
             'flood_s1',
             'lcz_s2',
             'biomass_s3',
+            'aq_no2_s5p',
         ],
         *args: Any,
         **kwargs: Any,

diff --git a/torchgeo/datasets/copernicus/aq_no2_s5p.py b/torchgeo/datasets/copernicus/aq_no2_s5p.py
@@ -0,0 +1,108 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""Copernicus-Bench AQ-NO2-S5P dataset."""
+
+import os
+from collections.abc import Callable, Sequence
+from typing import Literal
+
+import torch
+from torch import Tensor
+
+from ..utils import Path, stack_samples
+from .base import CopernicusBenchBase
+
+
+class CopernicusBenchAQNO2S5P(CopernicusBenchBase):
+    """Copernicus-Bench AQ-NO2-S5P dataset.
+
+    AQ-NO2-S5P is a regression dataset based on Sentinel-5P NO2 images and
+    EEA air quality data products. Specifically, this dataset combines 2021
+    measurements of NO2 (annual average concentration) from EEA with S5P NO2
+    ("tropospheric NO2 column number density") from GEE.
+
+    This benchmark supports both annual (1 image/location) and seasonal
+    (4 images/location) modes; the former is used in the original benchmark.
+
+    If you use this dataset in your research, please cite the following papers:
+
+    * https://arxiv.org/abs/2503.11849
+    * https://www.researchgate.net/profile/Jan-Horalek/publication/389165501_Air_quality_maps_of_EEA_member_and_cooperating_countries_for_2022/links/67b72628207c0c20fa8ec116/Air-quality-maps-of-EEA-member-and-cooperating-countries-for-2022.pdf
+
+    .. versionadded:: 0.7
+    """
+
+    url = 'https://hf.co/datasets/wangyi111/Copernicus-Bench/resolve/9d252acd3aa0e3da3128e05c6f028647f0e48e5f/l3_airquality_s5p/airquality_s5p.zip'
+    md5 = '92081c7437c5c1daf783868ad7669877'
+    zipfile = 'airquality_s5p.zip'
+    directory = os.path.join('airquality_s5p', 'no2')
+    filename = '{}.csv'
+    dtype = torch.float  # regression labels are cast to float, not long
+    filename_regex = r'(?P<start>\d{4}-\d{2}-\d{2})_(?P<stop>\d{4}-\d{2}-\d{2})'
+    date_format = '%Y-%m-%d'  # layout of the start/stop dates in filename_regex
+    all_bands = ('NO2',)
+    rgb_bands = ('NO2',)
+    cmap = 'Wistia'
+
+    def __init__(
+        self,
+        root: Path = 'data',
+        split: Literal['train', 'val', 'test'] = 'train',
+        mode: Literal['annual', 'seasonal'] = 'annual',
+        bands: Sequence[str] | None = None,
+        transforms: Callable[[dict[str, Tensor]], dict[str, Tensor]] | None = None,
+        download: bool = False,
+        checksum: bool = False,
+    ) -> None:
+        """Initialize a new CopernicusBenchAQNO2S5P instance.
+
+        Args:
+            root: Root directory where dataset can be found.
+            split: One of 'train', 'val', or 'test'.
+            mode: 'annual' (1 image/location) or 'seasonal' (4 images/location).
+            bands: Sequence of band names to load (defaults to all bands).
+            transforms: A function/transform that takes input sample and its target as
+                entry and returns a transformed version.
+            download: If True, download dataset and store it in the root directory.
+            checksum: If True, check the MD5 of the downloaded files (may be slow).
+
+        Raises:
+            DatasetNotFoundError: If dataset is not found and *download* is False.
+        """
+        self.mode = mode  # not validated here; only the type hint restricts it
+        super().__init__(root, split, bands, transforms, download, checksum)
+
+    def __getitem__(self, index: int) -> dict[str, Tensor]:
+        """Return the sample at the given index.
+
+        Args:
+            index: Position in ``self.files`` of the location to load.
+
+        Returns:
+            The loaded image sample merged with its label under the 'mask' key.
+        """
+        pid = self.files[index]  # ID naming both the image folder and label file
+        match self.mode:  # NOTE(review): no fallback case for other modes
+            case 'annual':
+                file = '2021-01-01_2021-12-31.tif'  # one image spanning 2021
+                path = os.path.join(self.root, self.directory, 's5p_annual', pid, file)
+                sample = self._load_image(path)
+            case 'seasonal':
+                files = [  # one image per quarter of 2021
+                    '2021-01-01_2021-04-01.tif',
+                    '2021-04-01_2021-07-01.tif',
+                    '2021-07-01_2021-10-01.tif',
+                    '2021-10-01_2021-12-31.tif',
+                ]
+                root = os.path.join(self.root, self.directory, 's5p_seasonal', pid)
+                samples = [self._load_image(os.path.join(root, file)) for file in files]
+                sample = stack_samples(samples)  # collate the four seasons
+
+        path = os.path.join(self.root, self.directory, 'label_annual', f'{pid}.tif')
+        sample |= self._load_mask(path)  # label is annual in both modes
+
+        if self.transforms is not None:
+            sample = self.transforms(sample)
+
+        return sample
diff --git a/torchgeo/datasets/copernicus/base.py b/torchgeo/datasets/copernicus/base.py
@@ -59,6 +59,10 @@ def url(self) -> str:
     #: Filename format of split files.
     filename = '{}.csv'
 
+    #: Mask dtype to cast to, either torch.long for classification
+    #: or torch.float for regression.
+    dtype: torch.dtype = torch.long
+
     #: Regular expression used to extract date from filename.
     filename_regex = '.*'
 
@@ -156,6 +160,13 @@ def _load_image(self, path: str) -> dict[str, Tensor]:
                     mint, maxt = disambiguate_timestamp(date_str, self.date_format)
                     time = (mint + maxt) / 2
                     sample['time'] = torch.tensor(time)
+                elif 'start' in match.groupdict() and 'stop' in match.groupdict():
+                    start = match.group('start')
+                    stop = match.group('stop')
+                    mint, _ = disambiguate_timestamp(start, self.date_format)
+                    _, maxt = disambiguate_timestamp(stop, self.date_format)
+                    time = (mint + maxt) / 2
+                    sample['time'] = torch.tensor(time)
 
         return sample
 
@@ -170,7 +181,7 @@ def _load_mask(self, path: str) -> dict[str, Tensor]:
         """
         sample: dict[str, Tensor] = {}
         with rio.open(path) as f:
-            sample['mask'] = torch.tensor(f.read(1).astype(np.int64))
+            sample['mask'] = torch.tensor(f.read(1)).to(self.dtype)
 
         return sample
 

diff --git a/torchgeo/datasets/copernicus/biomass_s3.py b/torchgeo/datasets/copernicus/biomass_s3.py
@@ -9,6 +9,7 @@
 from typing import Literal
 
 import pandas as pd
+import torch
 from torch import Tensor
 
 from ..utils import Path, stack_samples
@@ -37,8 +38,9 @@ class CopernicusBenchBiomassS3(CopernicusBenchBase):
     url = 'https://hf.co/datasets/wangyi111/Copernicus-Bench/resolve/9d252acd3aa0e3da3128e05c6f028647f0e48e5f/l3_biomass_s3/biomass_s3.zip'
     md5 = '4769ab8c2c23cd8957b99e15e071931c'
     zipfile = 'biomass_s3.zip'
-    filename = 'static_fnames-{}.csv'
     directory = 'biomass_s3'
+    filename = 'static_fnames-{}.csv'
+    dtype = torch.float
     filename_regex = r'S3[AB]_(?P<date>\d{8}T\d{6})'
     all_bands = (
         'Oa01_radiance',