Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Copernicus-Bench: Add Level-3 Sentinel-5P #2607

Merged
merged 6 commits into from
Mar 26, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add AQ-NO2-S5P
  • Loading branch information
adamjstewart committed Mar 26, 2025
commit ef154e865132635d23a3c12cdf8357d9d8da0c57
1 change: 1 addition & 0 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ Copernicus-Bench is a comprehensive evaluation benchmark with 15 downstream task
.. autoclass:: CopernicusBenchFloodS1
.. autoclass:: CopernicusBenchLCZS2
.. autoclass:: CopernicusBenchBiomassS3
.. autoclass:: CopernicusBenchAQNO2S5P

Base Classes
------------
Expand Down
1 change: 1 addition & 0 deletions docs/api/datasets/copernicus_bench.csv
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ L2,DFC2020-S2,S,Sentinel-2,CC-BY-4.0,"5,128",10,256x256,10,MSI
L3,Flood-S1,CD,Sentinel-1,MIT,"5,000",3,224x224,10,SAR
L3,LCZ-S2,C,Sentinel-2,CC-BY-4.0,"25,000",17,32x32,10,MSI
L3,Biomass-S3,R,Sentinel-3,CC-BY-4.0,"5,000",-,96x96,300,MSI
L3,AQ-NO2-S5P,R,Sentinel-5P,CC-BY-4.0,"2,467",-,56x56,"1,000",-
Binary file modified tests/data/copernicus/l3_airquality_s5p/airquality_s5p.zip
Binary file not shown.
3 changes: 2 additions & 1 deletion tests/data/copernicus/l3_airquality_s5p/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
}

Z = np.random.random(size=(profile['height'], profile['width']))
file = '2021-01-01_2021-12-31.tif'
files = [
'2021-01-01_2021-04-01.tif',
'2021-04-01_2021-07-01.tif',
Expand All @@ -60,7 +61,7 @@
# Image (annual)
directory = os.path.join('airquality_s5p', variable, 's5p_annual', pid)
os.makedirs(directory, exist_ok=True)
path = os.path.join(directory, files[-1])
path = os.path.join(directory, file)
with rio.open(path, 'w', **profile) as src:
src.write(Z, 1)

Expand Down
4 changes: 4 additions & 0 deletions tests/datasets/test_copernicus.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class TestCopernicusBench:
('lcz_s2', 'l3_lcz_s2', {}),
('biomass_s3', 'l3_biomass_s3', {'mode': 'static'}),
('biomass_s3', 'l3_biomass_s3', {'mode': 'time-series'}),
('aq_no2_s5p', 'l3_airquality_s5p', {'mode': 'annual'}),
('aq_no2_s5p', 'l3_airquality_s5p', {'mode': 'seasonal'}),
]
)
def dataset(self, request: SubRequest) -> CopernicusBench:
Expand Down Expand Up @@ -108,6 +110,8 @@ def test_not_rgb(self, dataset: CopernicusBench) -> None:

if dataset.name.endswith('s1'):
all_bands = ['VV']
elif dataset.name.endswith('s5p'):
pytest.skip('single-band dataset')

dataset = CopernicusBench(dataset.name, dataset.root, bands=all_bands)
match = 'Dataset does not contain some of the RGB bands'
Expand Down
5 changes: 2 additions & 3 deletions torchgeo/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from .cms_mangrove_canopy import CMSGlobalMangroveCanopy
from .copernicus import (
CopernicusBench,
CopernicusBenchAQNO2S5P,
CopernicusBenchBase,
CopernicusBenchBigEarthNetS1,
CopernicusBenchBigEarthNetS2,
Expand Down Expand Up @@ -175,7 +176,6 @@
from .western_usa_live_fuel_moisture import WesternUSALiveFuelMoisture
from .xview import XView2
from .zuericrop import ZueriCrop
from .senbench_airquality_s5p import SenBenchAQNO2S5P, SenBenchAQO3S5P

__all__ = (
'ADVANCE',
Expand Down Expand Up @@ -234,6 +234,7 @@
'ChesapeakeWV',
'CloudCoverDetection',
'CopernicusBench',
'CopernicusBenchAQNO2S5P',
'CopernicusBenchBase',
'CopernicusBenchBigEarthNetS1',
'CopernicusBenchBigEarthNetS2',
Expand Down Expand Up @@ -352,6 +353,4 @@
'stack_samples',
'time_series_split',
'unbind_samples',
'SenBenchAQNO2S5P',
'SenBenchNO2S5P',
)
4 changes: 4 additions & 0 deletions torchgeo/datasets/copernicus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from torch import Tensor

from ..geo import NonGeoDataset
from .aq_no2_s5p import CopernicusBenchAQNO2S5P
from .base import CopernicusBenchBase
from .bigearthnet_s1 import CopernicusBenchBigEarthNetS1
from .bigearthnet_s2 import CopernicusBenchBigEarthNetS2
Expand All @@ -25,6 +26,7 @@

__all__ = (
'CopernicusBench',
'CopernicusBenchAQNO2S5P',
'CopernicusBenchBase',
'CopernicusBenchBigEarthNetS1',
'CopernicusBenchBigEarthNetS2',
Expand Down Expand Up @@ -55,6 +57,7 @@
'flood_s1': CopernicusBenchFloodS1,
'lcz_s2': CopernicusBenchLCZS2,
'biomass_s3': CopernicusBenchBiomassS3,
'aq_no2_s5p': CopernicusBenchAQNO2S5P,
}


Expand Down Expand Up @@ -86,6 +89,7 @@ def __init__(
'flood_s1',
'lcz_s2',
'biomass_s3',
'aq_no2_s5p',
],
*args: Any,
**kwargs: Any,
Expand Down
108 changes: 108 additions & 0 deletions torchgeo/datasets/copernicus/aq_no2_s5p.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Copernicus-Bench AQ-NO2-S5P dataset."""

import os
from collections.abc import Callable, Sequence
from typing import Literal

import torch
from torch import Tensor

from ..utils import Path, stack_samples
from .base import CopernicusBenchBase


class CopernicusBenchAQNO2S5P(CopernicusBenchBase):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could make a shared CopernicusBenchAQS5P base class, but then we need to document it as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be useful once we have more variables and they can be aligned; it could come in a future update of Copernicus-Bench.

"""Copernicus-Bench AQ-NO2-S5P dataset.

AQ-NO2-S5P is a regression dataset based on Sentinel-5P NO2 images and
EEA air quality data products. Specifically, this dataset combines 2021
measurements of NO2 (annual average concentration) from EEA with S5P NO2
("tropospheric NO2 column number density") from GEE.

This benchmark supports both annual (1 image/location) and seasonal
(4 images/location) modes, the former is used in the original benchmark.

If you use this dataset in your research, please cite the following papers:

* https://arxiv.org/abs/2503.11849
* https://www.researchgate.net/profile/Jan-Horalek/publication/389165501_Air_quality_maps_of_EEA_member_and_cooperating_countries_for_2022/links/67b72628207c0c20fa8ec116/Air-quality-maps-of-EEA-member-and-cooperating-countries-for-2022.pdf

.. versionadded:: 0.7
"""

url = 'https://hf.co/datasets/wangyi111/Copernicus-Bench/resolve/9d252acd3aa0e3da3128e05c6f028647f0e48e5f/l3_airquality_s5p/airquality_s5p.zip'
md5 = '92081c7437c5c1daf783868ad7669877'
zipfile = 'airquality_s5p.zip'
directory = os.path.join('airquality_s5p', 'no2')
filename = '{}.csv'
dtype = torch.float
filename_regex = r'(?P<start>\d{4}-\d{2}-\d{2})_(?P<stop>\d{4}-\d{2}-\d{2})'
date_format = '%Y-%m-%d'
all_bands = ('NO2',)
rgb_bands = ('NO2',)
cmap = 'Wistia'

def __init__(
    self,
    root: Path = 'data',
    split: Literal['train', 'val', 'test'] = 'train',
    mode: Literal['annual', 'seasonal'] = 'annual',
    bands: Sequence[str] | None = None,
    transforms: Callable[[dict[str, Tensor]], dict[str, Tensor]] | None = None,
    download: bool = False,
    checksum: bool = False,
) -> None:
    """Create a CopernicusBenchAQNO2S5P dataset.

    Args:
        root: Root directory where the dataset can be found.
        split: Which split to load: 'train', 'val', or 'test'.
        mode: 'annual' to load one image per location, or 'seasonal' to
            load four images per location.
        bands: Sequence of band names to load (defaults to all bands).
        transforms: Callable that receives a sample dictionary and returns
            a transformed sample dictionary.
        download: If True, download the dataset and store it in *root*.
        checksum: If True, check the MD5 of the downloaded files
            (may be slow).

    Raises:
        DatasetNotFoundError: If dataset is not found and *download* is False.
    """
    # The mode is stored before delegating to the base initializer;
    # __getitem__ reads it to decide which images to load.
    self.mode = mode
    super().__init__(
        root, split, bands, transforms, download, checksum
    )

def __getitem__(self, index: int) -> dict[str, Tensor]:
    """Load the image(s) and the annual label for one location.

    Args:
        index: Position of the location ID within ``self.files``.

    Returns:
        The image sample merged with the entries produced by
        ``_load_mask`` for the annual label, passed through
        ``self.transforms`` when transforms are set. In 'annual' mode the
        image sample comes from a single file; in 'seasonal' mode the four
        seasonal files are loaded in chronological order and combined with
        ``stack_samples``.
    """
    pid = self.files[index]
    # Every image and label path lives under <root>/<directory>.
    base_dir = os.path.join(self.root, self.directory)
    mode = self.mode

    if mode == 'annual':
        annual_path = os.path.join(
            base_dir, 's5p_annual', pid, '2021-01-01_2021-12-31.tif'
        )
        sample = self._load_image(annual_path)
    elif mode == 'seasonal':
        seasonal_dir = os.path.join(base_dir, 's5p_seasonal', pid)
        seasonal_names = (
            '2021-01-01_2021-04-01.tif',
            '2021-04-01_2021-07-01.tif',
            '2021-07-01_2021-10-01.tif',
            '2021-10-01_2021-12-31.tif',
        )
        per_season = []
        for name in seasonal_names:
            per_season.append(self._load_image(os.path.join(seasonal_dir, name)))
        sample = stack_samples(per_season)

    # The label is always the annual product, regardless of the image mode.
    label_path = os.path.join(base_dir, 'label_annual', f'{pid}.tif')
    sample |= self._load_mask(label_path)

    if self.transforms is not None:
        sample = self.transforms(sample)

    return sample
13 changes: 12 additions & 1 deletion torchgeo/datasets/copernicus/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ def url(self) -> str:
#: Filename format of split files.
filename = '{}.csv'

#: Mask dtype to cast to, either torch.long for classification
#: or torch.float for regression.
dtype: torch.dtype = torch.long

#: Regular expression used to extract date from filename.
filename_regex = '.*'

Expand Down Expand Up @@ -156,6 +160,13 @@ def _load_image(self, path: str) -> dict[str, Tensor]:
mint, maxt = disambiguate_timestamp(date_str, self.date_format)
time = (mint + maxt) / 2
sample['time'] = torch.tensor(time)
elif 'start' in match.groupdict() and 'stop' in match.groupdict():
start = match.group('start')
stop = match.group('stop')
mint, _ = disambiguate_timestamp(start, self.date_format)
_, maxt = disambiguate_timestamp(stop, self.date_format)
time = (mint + maxt) / 2
sample['time'] = torch.tensor(time)

return sample

Expand All @@ -170,7 +181,7 @@ def _load_mask(self, path: str) -> dict[str, Tensor]:
"""
sample: dict[str, Tensor] = {}
with rio.open(path) as f:
sample['mask'] = torch.tensor(f.read(1).astype(np.int64))
sample['mask'] = torch.tensor(f.read(1)).to(self.dtype)

return sample

Expand Down
4 changes: 3 additions & 1 deletion torchgeo/datasets/copernicus/biomass_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import Literal

import pandas as pd
import torch
from torch import Tensor

from ..utils import Path, stack_samples
Expand Down Expand Up @@ -37,8 +38,9 @@ class CopernicusBenchBiomassS3(CopernicusBenchBase):
url = 'https://hf.co/datasets/wangyi111/Copernicus-Bench/resolve/9d252acd3aa0e3da3128e05c6f028647f0e48e5f/l3_biomass_s3/biomass_s3.zip'
md5 = '4769ab8c2c23cd8957b99e15e071931c'
zipfile = 'biomass_s3.zip'
filename = 'static_fnames-{}.csv'
directory = 'biomass_s3'
filename = 'static_fnames-{}.csv'
dtype = torch.float
filename_regex = r'S3[AB]_(?P<date>\d{8}T\d{6})'
all_bands = (
'Oa01_radiance',
Expand Down
Loading
Loading