STAC Catalog Functionality #184

Merged · 12 commits · Mar 27, 2024
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ authors = [
requires-python = ">=3.8"
keywords = ["geospatial", "evaluations"]
license = {text = "MIT"}
version = "0.2.5"
version = "0.2.6"
dynamic = ["readme", "dependencies"]

[project.optional-dependencies]
9 changes: 6 additions & 3 deletions src/gval/accessors/gval_xarray.py
@@ -93,15 +93,18 @@ def __handle_attribute_tracking(
else:
del attribute_tracking_kwargs["agreement_map"]

results = candidate_map.gval.attribute_tracking_xarray(
results = _attribute_tracking_xarray(
candidate_map=candidate_map,
benchmark_map=benchmark_map,
agreement_map=agreement_map,
**attribute_tracking_kwargs,
)

else:
results = candidate_map.gval.attribute_tracking_xarray(
benchmark_map=benchmark_map, agreement_map=agreement_map
results = _attribute_tracking_xarray(
candidate_map=candidate_map,
benchmark_map=benchmark_map,
agreement_map=agreement_map,
)

return results
9 changes: 9 additions & 0 deletions src/gval/comparison/tabulation.py
@@ -243,6 +243,15 @@ def _crosstab_Datasets(agreement_map: xr.DataArray) -> DataFrame[Crosstab_df]:
# loop variables
previous_crosstab_df = None # initializing to avoid having unset
for i, b in enumerate(agreement_variable_names):
# Pass pairing dictionary to variable if necessary
if (
agreement_map[b].attrs.get("pairing_dictionary") is None
and agreement_map.attrs.get("pairing_dictionary") is not None
):
agreement_map[b].attrs["pairing_dictionary"] = agreement_map.attrs[
"pairing_dictionary"
]

crosstab_df = _crosstab_2d_DataArrays(
agreement_map=agreement_map[b], band_value=b
)
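
The block added above copies a Dataset-level `pairing_dictionary` attribute down to each variable before cross-tabulating it. A minimal sketch of why that fallback is needed, assuming a hypothetical two-band Dataset (xarray variable attrs do not inherit Dataset attrs):

```python
import numpy as np
import xarray as xr

# Hypothetical two-band agreement Dataset; names and values are illustrative only.
ds = xr.Dataset(
    {
        "band_1": (("y", "x"), np.zeros((2, 2))),
        "band_2": (("y", "x"), np.ones((2, 2))),
    },
    attrs={"pairing_dictionary": {(1, 1): 1, (1, 2): 2}},
)

# Variable-level attrs start empty; they do not inherit the Dataset-level attribute.
assert ds["band_1"].attrs.get("pairing_dictionary") is None

# Same propagation pattern as the added block: fall back to the Dataset attribute.
for name in ds.data_vars:
    if (
        ds[name].attrs.get("pairing_dictionary") is None
        and ds.attrs.get("pairing_dictionary") is not None
    ):
        ds[name].attrs["pairing_dictionary"] = ds.attrs["pairing_dictionary"]

assert ds["band_1"].attrs["pairing_dictionary"] == {(1, 1): 1, (1, 2): 2}
```
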
184 changes: 178 additions & 6 deletions src/gval/utils/loading_datasets.py
@@ -9,6 +9,7 @@
from numbers import Number
import ast

import pandas as pd
import rioxarray as rxr
import xarray as xr
import numpy as np
@@ -323,6 +324,58 @@ def _set_crs(stack: xr.DataArray, band_metadata: list = None) -> Number:
return stack.rio.write_crs(f"EPSG:{band_metadata['epsg'].values}")


def query_stac(
url: str,
collections: Union[str, list],
time: str,
query: str = None,
max_items: int = None,
intersects: dict = None,
bbox: list = None,
) -> list:
"""Return items from a stac query
GregoryPetrochenkov-NOAA marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
url : str
Address hosting the STAC API
collections : Union[str, list]
Name of collection to get (currently limited to one)
time : str
Single or range of values to query in the time dimension
query : str, default = None
String command to filter data
max_items : int, default = None
The maximum number of records to retrieve
intersects : dict, default = None
Dictionary representing the type of geometry and its respective coordinates
bbox : list, default = None
Coordinates to filter the spatial range of the request

Returns
-------
list
An iterable of STAC items

"""

# A string is itself an Iterable, so normalize on str explicitly
if isinstance(collections, str):
collections = [collections]

catalog = pystac_client.Client.open(url)

return catalog.search(
datetime=time,
collections=collections,
max_items=max_items,
intersects=intersects,
bbox=bbox,
query=query,
).item_collection()
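
A hedged usage sketch of the new `query_stac` helper, using the Earth Search endpoint, collection, and bbox from the test fixtures at the bottom of this PR; results depend on the live service, and the range-string form of `time` is one of the formats pystac-client accepts:

```python
# Sketch only: endpoint, collection, and bbox come from the test fixtures below.
items = query_stac(
    url="https://earth-search.aws.element84.com/v1",
    collections="sentinel-2-l2a",
    time="2020-04-01/2020-04-03",
    bbox=[-105.78, 35.79, -105.72, 35.84],
    max_items=10,
)

# Each entry is a pystac Item with an id, timestamp, and a dict of assets.
for item in items:
    print(item.id, item.datetime, sorted(item.assets))
```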


def get_stac_data(
url: str,
collection: str,
@@ -372,17 +425,17 @@ def get_stac_data(

with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Call cataloging url, search, and convert to xarray
catalog = pystac_client.Client.open(url)

stac_items = catalog.search(
datetime=time,
collections=[collection],
# Call cataloging url, search, and convert to xarray
stac_items = query_stac(
url=url,
time=time,
collections=collection,
max_items=max_items,
intersects=intersects,
bbox=bbox,
query=query,
).get_all_items()
)

stack = stackstac.stack(stac_items, resolution=resolution)

@@ -439,6 +492,125 @@ def get_stac_data(
return stack
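
With this refactor, `get_stac_data` delegates the search to `query_stac` and only handles stacking. A rough sketch of the equivalent standalone flow, assuming stackstac is installed and a resolution of 100 (in the items' CRS units, meters for UTM-projected Sentinel-2):

```python
import stackstac

# Search with the shared helper, then build a lazy (dask-backed) xarray stack,
# mirroring what the refactored get_stac_data does internally.
stac_items = query_stac(
    url="https://earth-search.aws.element84.com/v1",
    collections="sentinel-2-l2a",
    time="2020-04-01/2020-04-03",
    bbox=[-105.78, 35.79, -105.72, 35.84],
)

stack = stackstac.stack(stac_items, resolution=100)
print(stack.dims, stack.shape)  # (time, band, y, x) and the corresponding sizes
```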


def _stac_to_df(stac_items: list, assets: list = None) -> pd.DataFrame:
"""Convert a list of stac items to a DataFrame

Parameters
----------
stac_items: list
List of STAC items to create a catalog with
assets : list, default = None
Assets to keep (keep all if None)

Returns
-------
pd.DataFrame
A DataFrame with rows for each unique item/asset combination

Raises
------
ValueError
No entries in DataFrame due to nonexistent asset

"""

dfs, compare_idx = [], 1
for stac_item in stac_items:
map_name, map_id, compare_id = [], [], []
for key, item in stac_item.assets.items():
if assets is None or key in assets:
map_name.append(key)
map_id.append(item.href)
compare_id.append(compare_idx)
compare_idx += 1

len_assets = len(map_name)

df_contents = {
"collection_id": [stac_item.collection_id] * len_assets,
"item_id": [stac_item.id] * len_assets,
"item_time": [stac_item.get_datetime()] * len_assets,
"create_time": [stac_item.properties["created"]] * len_assets,
"map_id": map_id,
"map_name": map_name,
"compare_id": compare_id,
"coverage_geometry_type": [stac_item.geometry["type"]] * len_assets,
"coverage_geometry_coords": [stac_item.geometry["coordinates"]]
* len_assets,
"coverage_epsg": ["4326"] * len_assets,
"asset_epsg": [stac_item.properties["proj:epsg"]] * len_assets,
}

dfs.append(pd.DataFrame(df_contents))

combined_df = pd.concat(dfs)
if combined_df.empty:
raise ValueError("No entries in DataFrame due to nonexistent asset")

return combined_df
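
`_stac_to_df` explodes each item into one row per kept asset, with `map_id` holding the asset href and `compare_id` a running counter. A hedged sketch of inspecting that layout (it requires items carrying the `created` and `proj:epsg` properties, as Sentinel-2 L2A items on Earth Search do):

```python
items = query_stac(
    url="https://earth-search.aws.element84.com/v1",
    collections="sentinel-2-l2a",
    time="2020-04-01/2020-04-03",
    bbox=[-105.78, 35.79, -105.72, 35.84],
)

# One row per item/asset combination; keep only the "aot" asset.
df = _stac_to_df(items, assets=["aot"])
print(df[["item_id", "map_name", "map_id", "compare_id"]])
```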


def stac_catalog(
url: str,
collections: Union[str, list],
time: str,
query: str = None,
max_items: int = None,
intersects: dict = None,
bbox: list = None,
assets: list = None,
) -> pd.DataFrame:
"""Create a STAC Catalog from a STAC query

Parameters
----------
url : str
Address hosting the STAC API
collections : Union[str, list]
Name of collection(s) to get
time : str
Single or range of values to query in the time dimension
query : str, default = None
String command to filter data
max_items : int, default = None
The maximum number of records to retrieve
intersects : dict, default = None
Dictionary representing the type of geometry and its respective coordinates
bbox : list, default = None
Coordinates to filter the spatial range of the request
assets : list, default = None
Assets to keep (keep all if None)

Returns
-------
pd.DataFrame
DataFrame representing a catalog based on STAC query

Raises
------
JSONDecodeError
Unable to make STAC query
ValueError
No items returned from query

"""

stac_items = query_stac(
url=url,
time=time,
collections=collections,
max_items=max_items,
intersects=intersects,
bbox=bbox,
query=query,
)

if len(stac_items) == 0:
raise ValueError("No items returned from query")

return _stac_to_df(stac_items, assets).reset_index(drop=True)
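
A hedged usage sketch of `stac_catalog` built from the fixtures in the test file below (url, collection, bbox, assets); the exact rows returned depend on the live Earth Search service:

```python
catalog_df = stac_catalog(
    url="https://earth-search.aws.element84.com/v1",
    collections="sentinel-2-l2a",
    time="2020-04-01/2020-04-03",
    bbox=[-105.78, 35.79, -105.72, 35.84],
    assets=["aot"],
)

# Expect one row per matching item/asset pair (the test data below shows the
# S2*_13SDV_2020040*_L2A items with their AOT assets), indexed 0..n-1.
print(len(catalog_df))
print(catalog_df[["item_id", "map_name", "asset_epsg"]])
```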


def _create_circle_mask(
sizes: int | Tuple[int], center: Tuple[Number, Number], radius: Number
) -> np.ndarray:
119 changes: 119 additions & 0 deletions tests/cases_catalogs.py
@@ -7,9 +7,11 @@
from pytest_cases import parametrize

import pandas as pd
from pandas import Timestamp
from dask import dataframe as dd
import xarray as xr
import numpy as np
from json import JSONDecodeError

from tests.conftest import TEST_DATA_DIR

@@ -461,3 +463,120 @@ def case_compare_catalogs_fail(
agreement_map_field,
expected_exception,
)


url = "https://earth-search.aws.element84.com/v1"
collection = "sentinel-2-l2a"
times = ["2020-04-01", "2020-04-03"]
bbox = [-105.78, 35.79, -105.72, 35.84]
assets = ["aot"]

expected_stac_df = {
"collection_id_candidate": ["sentinel-2-l2a", "sentinel-2-l2a"],
"item_id_candidate": ["S2B_13SDV_20200401_1_L2A", "S2B_13SDV_20200401_0_L2A"],
"item_time_candidate": [
Timestamp("2020-04-01 18:04:04.327000+0000", tz="utc"),
Timestamp("2020-04-01 18:04:04.327000+0000", tz="utc"),
],
"create_time_candidate": ["2023-10-07T12:09:07.273Z", "2022-11-06T10:14:16.681Z"],
"map_id_candidate": [
"https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/13/S/DV/2020/4/S2B_13SDV_20200401_1_L2A/AOT.tif",
"https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/13/S/DV/2020/4/S2B_13SDV_20200401_0_L2A/AOT.tif",
],
"map_name_candidate": ["aot", "aot"],
"compare_id": [1, 2],
"coverage_geometry_type_candidate": ["Polygon", "Polygon"],
"coverage_geometry_coords_candidate": [
[
[
[-106.11191385205835, 36.13972769324406],
[-106.0982941692468, 35.150822790058335],
[-105.85393882465051, 35.15385918076215],
[-105.68102037327243, 35.69893282033011],
[-105.59450557216445, 35.97641506053815],
[-105.56226433775963, 36.07584727804726],
[-105.53827534157463, 36.14367977962539],
[-106.11191385205835, 36.13972769324406],
]
],
[
[
[-106.11191385205835, 36.13972769324406],
[-105.53527335203748, 36.14369323431527],
[-105.56579262097148, 36.05653228802631],
[-105.68980719734964, 35.659112338538634],
[-105.85157080324588, 35.15190642354915],
[-106.09828205302668, 35.1499211736146],
[-106.11191385205835, 36.13972769324406],
]
],
],
"coverage_epsg_candidate": ["4326", "4326"],
"asset_epsg_candidate": [32613, 32613],
"collection_id_benchmark": ["sentinel-2-l2a", "sentinel-2-l2a"],
"item_id_benchmark": ["S2A_13SDV_20200403_1_L2A", "S2A_13SDV_20200403_0_L2A"],
"item_time_benchmark": [
Timestamp("2020-04-03 17:54:07.524000+0000", tz="utc"),
Timestamp("2020-04-03 17:54:07.524000+0000", tz="utc"),
],
"create_time_benchmark": ["2023-10-08T00:32:51.304Z", "2022-11-06T07:21:36.990Z"],
"map_id_benchmark": [
"https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/13/S/DV/2020/4/S2A_13SDV_20200403_1_L2A/AOT.tif",
"https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/13/S/DV/2020/4/S2A_13SDV_20200403_0_L2A/AOT.tif",
],
"map_name_benchmark": ["aot", "aot"],
"coverage_geometry_type_benchmark": ["Polygon", "Polygon"],
"coverage_geometry_coords_benchmark": [
[
[
[-106.11191385205835, 36.13972769324406],
[-106.09828205302668, 35.1499211736146],
[-104.89285176524281, 35.154851672138626],
[-104.89152152018616, 36.14484027029347],
[-106.11191385205835, 36.13972769324406],
]
],
[
[
[-106.11191385205835, 36.13972769324406],
[-104.89152152018616, 36.14484027029347],
[-104.89285176524281, 35.154851672138626],
[-106.09828205302668, 35.1499211736146],
[-106.11191385205835, 36.13972769324406],
]
],
],
"coverage_epsg_benchmark": ["4326", "4326"],
"asset_epsg_benchmark": [32613, 32613],
"band": ["1", "1"],
"coefficient_of_determination": [-1.449866533279419, -0.46196842193603516],
"mean_absolute_error": [25.393386840820312, 11.947012901306152],
"mean_absolute_percentage_error": [0.2044084370136261, 0.15074075758457184],
}


def case_stac_catalog_comparison_success():
return url, collection, times, bbox, assets, pd.DataFrame(expected_stac_df)


bad_url = "https://google.com"
bad_times = ["2018-04-01", "1940-04-03", "2020-04-01"]
bad_assets = ["aot", "aot", "surface_water"]
exceptions = [JSONDecodeError, ValueError, ValueError]


@parametrize(
"url, collection, time, bbox, assets, exception",
list(
zip(
[bad_url, url, url],
[collection] * 3,
bad_times,
[bbox] * 3,
bad_assets,
exceptions,
)
),
)
def case_stac_catalog_comparison_fail(url, collection, time, bbox, assets, exception):
return url, collection, time, bbox, assets, exception
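
These case functions follow the pytest-cases convention used elsewhere in this file. The test that consumes them is not shown in this diff; a hedged sketch of how a test module might wire up the failure cases (the test name and `glob` pattern are hypothetical):

```python
from pytest import raises
from pytest_cases import parametrize_with_cases

from gval.utils.loading_datasets import stac_catalog


@parametrize_with_cases(
    "url, collection, time, bbox, assets, exception",
    cases="tests.cases_catalogs",
    glob="stac_catalog_comparison_fail",
)
def test_stac_catalog_fail(url, collection, time, bbox, assets, exception):
    # Each bad input (a URL that is not a STAC API, a time with no items,
    # a nonexistent asset) should raise the exception paired with it above.
    with raises(exception):
        stac_catalog(
            url=url,
            collections=collection,
            time=time,
            bbox=bbox,
            assets=assets,
        )
```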