From 584c45725aaf411d3cfaff81b3cce02c311e30ea Mon Sep 17 00:00:00 2001 From: Pete Gadomski Date: Fri, 4 Nov 2022 10:30:49 -0600 Subject: [PATCH] feat: allow skipping of cog creation --- src/stactools/noaa_cdr/cog.py | 14 ++---- .../noaa_cdr/ocean_heat_content/cog.py | 33 ++++++++----- .../noaa_cdr/ocean_heat_content/commands.py | 2 +- .../noaa_cdr/ocean_heat_content/stac.py | 9 ++-- src/stactools/noaa_cdr/profile.py | 14 ++++-- .../noaa_cdr/sea_ice_concentration/cog.py | 11 +++-- .../sea_surface_temperature_whoi/stac.py | 4 +- tests/ocean_heat_content/test_stac.py | 49 ++++++++++++++++--- 8 files changed, 94 insertions(+), 42 deletions(-) diff --git a/src/stactools/noaa_cdr/cog.py b/src/stactools/noaa_cdr/cog.py index 400648e..b475879 100644 --- a/src/stactools/noaa_cdr/cog.py +++ b/src/stactools/noaa_cdr/cog.py @@ -6,7 +6,7 @@ import rasterio.shutil import xarray from numpy.typing import NDArray -from pystac import Asset, MediaType +from pystac import Asset from rasterio import MemoryFile from . import dataset @@ -32,13 +32,12 @@ def cogify( if profile.needs_longitude_remap: values = numpy.roll(values, int(profile.width / 2), 1) path = os.path.join(directory, f"{file_name}-{variable}.tif") - asset = write( + write( values, path, profile, ) - asset = profile.update_cog_asset(variable, asset) - assets[variable] = asset + assets[variable] = profile.cog_asset(path) return assets @@ -46,13 +45,8 @@ def write( values: NDArray[Any], path: str, profile: BandProfile, -) -> Asset: +) -> None: with MemoryFile() as memory_file: with memory_file.open(**profile.gtiff()) as open_memory_file: open_memory_file.write(values, 1) rasterio.shutil.copy(open_memory_file, path, **profile.cog()) - asset = Asset( - title=profile.title, href=path, media_type=MediaType.COG, roles=["data"] - ) - asset.extra_fields["raster:bands"] = [profile.raster_band().to_dict()] - return asset diff --git a/src/stactools/noaa_cdr/ocean_heat_content/cog.py b/src/stactools/noaa_cdr/ocean_heat_content/cog.py index 36a7b6d..1d989d8 100644 --- a/src/stactools/noaa_cdr/ocean_heat_content/cog.py +++ b/src/stactools/noaa_cdr/ocean_heat_content/cog.py @@ -19,7 +19,7 @@ class Cog: """Dataclass to hold the result of a cogification operation.""" - asset: Asset + href: str profile: BandProfile time_resolution: TimeResolution start_datetime: datetime.datetime @@ -27,6 +27,9 @@ class Cog: datetime: datetime.datetime attributes: Dict[Hashable, Any] + def asset(self) -> Asset: + return self.profile.cog_asset(self.href) + def time_interval_as_str(self) -> str: """Returns this COG's time interval as a string.""" return self.time_resolution.as_str(self.datetime) @@ -52,6 +55,7 @@ def cogify( outdir: Optional[str] = None, latest_only: bool = False, read_href_modifier: Optional[ReadHrefModifier] = None, + cog_hrefs: Optional[List[str]] = None, ) -> List[Cog]: if outdir is None: outdir = os.path.dirname(href) @@ -60,6 +64,10 @@ def cogify( maybe_modified_href = read_href_modifier(href) else: maybe_modified_href = href + if cog_hrefs: + cog_file_names = dict((os.path.basename(h), h) for h in cog_hrefs) + else: + cog_file_names = dict() with fsspec.open(maybe_modified_href) as file: with xarray.open_dataset(file, decode_times=False) as ds: time_resolution = TimeResolution.from_value(ds.time_coverage_resolution) @@ -71,22 +79,25 @@ def cogify( dt = time.add_months_to_datetime(BASE_TIME, month_offset) start_datetime, end_datetime = time_resolution.datetime_bounds(dt) suffix = time_resolution.as_str(dt) - path = os.path.join( - outdir, - f"{os.path.splitext(os.path.basename(href))[0]}_{suffix}.tif", + file_name = ( + f"{os.path.splitext(os.path.basename(href))[0]}_{suffix}.tif" ) profile = BandProfile.build( ds, variable, lambda d: d.isel(time=i).squeeze() ) - values = numpy.flipud(ds[variable].isel(time=i).values.squeeze()) - asset = cog.write( - values, - path, - profile, - ) + if file_name in cog_file_names: + cog_href = cog_file_names[file_name] + else: + cog_href = os.path.join(outdir, file_name) + values = numpy.flipud(ds[variable].isel(time=i).values.squeeze()) + cog.write( + values, + cog_href, + profile, + ) cogs.append( Cog( - asset=asset, + href=cog_href, profile=profile, time_resolution=time_resolution, datetime=dt, diff --git a/src/stactools/noaa_cdr/ocean_heat_content/commands.py b/src/stactools/noaa_cdr/ocean_heat_content/commands.py index 10b6a66..2f7edce 100644 --- a/src/stactools/noaa_cdr/ocean_heat_content/commands.py +++ b/src/stactools/noaa_cdr/ocean_heat_content/commands.py @@ -170,6 +170,6 @@ def cogify_command(infile: str, outdir: Optional[Path]) -> None: if outdir: os.makedirs(str(outdir), exist_ok=True) cogs = cog.cogify(infile, None if outdir is None else str(outdir)) - print(f"Wrote {len(cogs)} COGs to {os.path.dirname(cogs[0].asset.href)}") + print(f"Wrote {len(cogs)} COGs to {os.path.dirname(cogs[0].asset().href)}") return ocean_heat_content diff --git a/src/stactools/noaa_cdr/ocean_heat_content/stac.py b/src/stactools/noaa_cdr/ocean_heat_content/stac.py index 2db4e4b..fe15379 100644 --- a/src/stactools/noaa_cdr/ocean_heat_content/stac.py +++ b/src/stactools/noaa_cdr/ocean_heat_content/stac.py @@ -128,6 +128,7 @@ def create_collection( def create_items( hrefs: List[str], directory: str, + cog_hrefs: Optional[List[str]] = None, latest_only: bool = False, read_href_modifier: Optional[ReadHrefModifier] = None, ) -> List[Item]: @@ -144,6 +145,7 @@ def create_items( cogs = cog.cogify( href, directory, + cog_hrefs=cog_hrefs, latest_only=latest_only, read_href_modifier=read_href_modifier, ) @@ -175,11 +177,12 @@ def _update_items(items: List[Item], cogs: List[Cog]) -> List[Item]: title = c.attributes["title"].split(" : ")[0] min_depth = int(c.attributes["geospatial_vertical_min"]) max_depth = int(c.attributes["geospatial_vertical_max"]) - c.asset.title = f"{title} : {min_depth}-{max_depth}m {c.time_interval_as_str()}" - item.add_asset(c.asset_key(), c.asset) + asset = c.asset() + asset.title = f"{title} : {min_depth}-{max_depth}m {c.time_interval_as_str()}" + item.add_asset(c.asset_key(), asset) # The asset has the raster extension, but we need to make sure the item # has the schema url. - _ = RasterExtension.ext(c.asset, add_if_missing=True) + _ = RasterExtension.ext(asset, add_if_missing=True) items_as_dict[id] = item return list(items_as_dict.values()) diff --git a/src/stactools/noaa_cdr/profile.py b/src/stactools/noaa_cdr/profile.py index 6ac22a1..0e6d0f6 100644 --- a/src/stactools/noaa_cdr/profile.py +++ b/src/stactools/noaa_cdr/profile.py @@ -6,7 +6,7 @@ import shapely.geometry from pyproj import CRS from pyproj.enums import WktVersion -from pystac import Asset +from pystac import Asset, MediaType from pystac.extensions.raster import DataType, NoDataStrings, RasterBand from rasterio import Affine from xarray import DataArray, Dataset @@ -110,6 +110,7 @@ class BandProfile: attrs: Dict[Hashable, Any] title: str dataset_profile: DatasetProfile + variable: str @classmethod def build( @@ -159,8 +160,16 @@ def build( attrs=data_array.attrs, title=title, dataset_profile=dataset_profile, + variable=variable, ) + def cog_asset(self, href: str) -> Asset: + asset = Asset( + title=self.title, href=href, media_type=MediaType.COG, roles=["data"] + ) + asset.extra_fields["raster:bands"] = [self.raster_band().to_dict()] + return asset + def gtiff(self) -> Dict[str, Any]: return { "crs": self.crs, @@ -173,9 +182,6 @@ def gtiff(self) -> Dict[str, Any]: "driver": "GTiff", } - def update_cog_asset(self, key: str, asset: Asset) -> Asset: - return asset - def raster_band(self) -> RasterBand: if math.isnan(self.nodata): nodata = NoDataStrings.NAN diff --git a/src/stactools/noaa_cdr/sea_ice_concentration/cog.py b/src/stactools/noaa_cdr/sea_ice_concentration/cog.py index 4ee9369..4bca840 100644 --- a/src/stactools/noaa_cdr/sea_ice_concentration/cog.py +++ b/src/stactools/noaa_cdr/sea_ice_concentration/cog.py @@ -6,13 +6,13 @@ from ..profile import BandProfile from .constants import SPATIAL_RESOLUTION -KEYS_WITH_CLASSES = [ +VARIABLES_WITH_CLASSES = [ "cdr_seaice_conc", "nsidc_bt_seaice_conc", "stdev_of_cdr_seaice_conc", "temporal_interpolation_flag", ] -KEYS_WITH_BITFIELDS = ["qa_of_cdr_seaice_conc", "spatial_interpolation_flag"] +VARIABLES_WITH_BITFIELDS = ["qa_of_cdr_seaice_conc", "spatial_interpolation_flag"] def cogify(href: str, directory: str) -> Dict[str, Asset]: @@ -20,11 +20,12 @@ def cogify(href: str, directory: str) -> Dict[str, Asset]: class SeaIceConcentrationBandProfile(BandProfile): - def update_cog_asset(self, key: str, asset: Asset) -> Asset: + def cog_asset(self, href: str) -> Asset: + asset = super().cog_asset(href) asset.extra_fields["raster:bands"][0]["spatial_resolution"] = SPATIAL_RESOLUTION - if key in KEYS_WITH_CLASSES: + if self.variable in VARIABLES_WITH_CLASSES: asset.extra_fields["classification:classes"] = self.classes() - elif key in KEYS_WITH_BITFIELDS: + elif self.variable in VARIABLES_WITH_BITFIELDS: asset.extra_fields["classification:bitfields"] = self.bitfield() return asset diff --git a/src/stactools/noaa_cdr/sea_surface_temperature_whoi/stac.py b/src/stactools/noaa_cdr/sea_surface_temperature_whoi/stac.py index 25af57e..82312f4 100644 --- a/src/stactools/noaa_cdr/sea_surface_temperature_whoi/stac.py +++ b/src/stactools/noaa_cdr/sea_surface_temperature_whoi/stac.py @@ -62,8 +62,8 @@ def create_cog_items(href: str, directory: str) -> List[Item]: values = numpy.flipud(ds[variable].isel(time=i).values.squeeze()) values = numpy.roll(values, int(profiles[variable].width / 2), 1) path = Path(directory) / f"{item.id}-{variable}.tif" - asset = cog.write(values, str(path), profiles[variable]) - item.assets[variable] = asset + cog.write(values, str(path), profiles[variable]) + item.assets[variable] = profiles[variable].cog_asset(str(path)) items.append(item) return items diff --git a/tests/ocean_heat_content/test_stac.py b/tests/ocean_heat_content/test_stac.py index cc37060..5a5820b 100644 --- a/tests/ocean_heat_content/test_stac.py +++ b/tests/ocean_heat_content/test_stac.py @@ -1,4 +1,6 @@ import datetime +import os +import shutil from pathlib import Path from tempfile import TemporaryDirectory @@ -32,10 +34,9 @@ def test_create_collection() -> None: collection.validate_all() -def test_create_items_one_netcdf() -> None: +def test_create_items_one_netcdf(tmp_path: Path) -> None: path = test_data.get_external_data("heat_content_anomaly_0-2000_yearly.nc") - with TemporaryDirectory() as temporary_directory: - items = stac.create_items([path], temporary_directory) + items = stac.create_items([path], str(tmp_path)) assert len(items) == 17 for item in items: assert len(item.assets) == 1 @@ -67,6 +68,22 @@ def test_create_items_one_netcdf() -> None: item.validate() +def test_create_items_one_netcdf_cog_hrefs(tmp_path: Path) -> None: + path = test_data.get_external_data("heat_content_anomaly_0-2000_yearly.nc") + items = stac.create_items([path], str(tmp_path)) + subdirectory = tmp_path / "subdirectory" + subdirectory.mkdir() + new_paths = list() + for p in tmp_path.iterdir(): + if p.suffix == ".tif": + new_path = subdirectory / p.name + p.rename(new_path) + new_paths.append(str(new_path)) + new_items = stac.create_items([path], str(tmp_path), cog_hrefs=new_paths) + assert not any(p.suffix == ".tif" for p in tmp_path.iterdir()) + assert len(new_items) == len(items) + + def test_create_items_two_netcdfs_same_items(tmp_path: Path) -> None: paths = [ test_data.get_external_data("heat_content_anomaly_0-2000_yearly.nc"), @@ -124,7 +141,7 @@ def test_cogify(tmp_path: Path, infile: str, num_cogs: int) -> None: # Because these netcdfs grow in place, we can never be sure of how many there should be. assert len(cogs) >= num_cogs for c in cogs: - assert Path(c.asset.href).exists() + assert Path(c.asset().href).exists() def test_cogify_href(tmp_path: Path) -> None: @@ -135,7 +152,7 @@ def test_cogify_href(tmp_path: Path) -> None: cogs = cog.cogify(href, str(tmp_path)) assert len(cogs) == 17 for c in cogs: - assert Path(c.asset.href).exists() + assert Path(c.asset().href).exists() def test_cogify_href_no_output_directory() -> None: @@ -150,4 +167,24 @@ def test_cogify_href_no_output_directory() -> None: def test_unitless(tmp_path: Path) -> None: path = test_data.get_external_data("mean_salinity_anomaly_0-2000_yearly.nc") cogs = cog.cogify(path, str(tmp_path)) - assert "unit" not in cogs[0].asset.extra_fields["raster:bands"][0] + assert "unit" not in cogs[0].asset().extra_fields["raster:bands"][0] + + +def test_cogify_cog_href(tmp_path: Path) -> None: + path = test_data.get_external_data("heat_content_anomaly_0-2000_yearly.nc") + cogs = cog.cogify(path, str(tmp_path)) + href = cogs[0].asset().href + subdirectory = tmp_path / "subdirectory" + subdirectory.mkdir() + href = shutil.move(href, subdirectory) + for p in tmp_path.iterdir(): + if p.suffix == ".tif": + p.unlink() + new_cogs = cog.cogify(path, str(tmp_path), cog_hrefs=[href]) + assert not (tmp_path / os.path.basename(href)).exists() + assert ( + sum(1 for f in os.listdir(tmp_path) if os.path.splitext(f)[1] == ".tif") + == len(new_cogs) - 1 + ) + assert os.path.exists(href) + assert href in [new_cog.asset().href for new_cog in new_cogs]