STAC Catalog Functionality #184

Merged · 12 commits · Mar 27, 2024
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ authors = [
requires-python = ">=3.8"
keywords = ["geospatial", "evaluations"]
license = {text = "MIT"}
version = "0.2.5"
version = "0.2.6"
dynamic = ["readme", "dependencies"]

[project.optional-dependencies]
9 changes: 6 additions & 3 deletions src/gval/accessors/gval_xarray.py
@@ -93,15 +93,18 @@ def __handle_attribute_tracking(
else:
del attribute_tracking_kwargs["agreement_map"]

results = candidate_map.gval.attribute_tracking_xarray(
results = _attribute_tracking_xarray(
candidate_map=candidate_map,
benchmark_map=benchmark_map,
agreement_map=agreement_map,
**attribute_tracking_kwargs,
)

else:
results = candidate_map.gval.attribute_tracking_xarray(
benchmark_map=benchmark_map, agreement_map=agreement_map
results = _attribute_tracking_xarray(
candidate_map=candidate_map,
benchmark_map=benchmark_map,
agreement_map=agreement_map,
)

return results
9 changes: 9 additions & 0 deletions src/gval/comparison/tabulation.py
@@ -243,6 +243,15 @@ def _crosstab_Datasets(agreement_map: xr.DataArray) -> DataFrame[Crosstab_df]:
# loop variables
previous_crosstab_df = None # initializing to avoid having unset
for i, b in enumerate(agreement_variable_names):
# Pass pairing dictionary to variable if necessary
if (
agreement_map[b].attrs.get("pairing_dictionary") is None
and agreement_map.attrs.get("pairing_dictionary") is not None
):
agreement_map[b].attrs["pairing_dictionary"] = agreement_map.attrs[
"pairing_dictionary"
]

crosstab_df = _crosstab_2d_DataArrays(
agreement_map=agreement_map[b], band_value=b
)
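
The block added above copies a Dataset-level `pairing_dictionary` attribute down to each variable before cross-tabulating it. A minimal sketch of why that fallback is needed, assuming a hypothetical two-band Dataset (xarray variable attrs do not inherit Dataset attrs):

```python
import numpy as np
import xarray as xr

# Hypothetical two-band agreement Dataset; names and values are illustrative only.
ds = xr.Dataset(
    {
        "band_1": (("y", "x"), np.zeros((2, 2))),
        "band_2": (("y", "x"), np.ones((2, 2))),
    },
    attrs={"pairing_dictionary": {(1, 1): 1, (1, 2): 2}},
)

# Variable-level attrs start empty; they do not inherit the Dataset-level attribute.
assert ds["band_1"].attrs.get("pairing_dictionary") is None

# Same propagation pattern as the added block: fall back to the Dataset attribute.
for name in ds.data_vars:
    if (
        ds[name].attrs.get("pairing_dictionary") is None
        and ds.attrs.get("pairing_dictionary") is not None
    ):
        ds[name].attrs["pairing_dictionary"] = ds.attrs["pairing_dictionary"]

assert ds["band_1"].attrs["pairing_dictionary"] == {(1, 1): 1, (1, 2): 2}
```
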
184 changes: 178 additions & 6 deletions src/gval/utils/loading_datasets.py
@@ -9,6 +9,7 @@
from numbers import Number
import ast

import pandas as pd
import rioxarray as rxr
import xarray as xr
import numpy as np
@@ -323,6 +324,58 @@ def _set_crs(stack: xr.DataArray, band_metadata: list = None) -> Number:
return stack.rio.write_crs(f"EPSG:{band_metadata['epsg'].values}")


def query_stac(
url: str,
collections: Union[str, list],
time: str,
query: str = None,
max_items: int = None,
intersects: dict = None,
bbox: list = None,
) -> list:
"""Return items from a stac query
GregoryPetrochenkov-NOAA marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
url : str
Address hosting the STAC API
collections : Union[str, list]
Name of collection to get (currently limited to one)
time : str
Single or range of values to query in the time dimension
query : str, default = None
String command to filter data
max_items : int, default = None
The maximum number of records to retrieve
intersects : dict, default = None
Dictionary representing the type of geometry and its respective coordinates
bbox : list, default = None
Coordinates to filter the spatial range of the request

Returns
-------
list
An iterable of STAC items

"""

# A string is itself an Iterable, so normalize on str explicitly
if isinstance(collections, str):
collections = [collections]

catalog = pystac_client.Client.open(url)

return catalog.search(
datetime=time,
collections=collections,
max_items=max_items,
intersects=intersects,
bbox=bbox,
query=query,
).item_collection()
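
A hedged usage sketch of the new `query_stac` helper, using the Earth Search endpoint, collection, and bbox from the test fixtures at the bottom of this PR; results depend on the live service, and the range-string form of `time` is one of the formats pystac-client accepts:

```python
# Sketch only: endpoint, collection, and bbox come from the test fixtures below.
items = query_stac(
    url="https://earth-search.aws.element84.com/v1",
    collections="sentinel-2-l2a",
    time="2020-04-01/2020-04-03",
    bbox=[-105.78, 35.79, -105.72, 35.84],
    max_items=10,
)

# Each entry is a pystac Item with an id, timestamp, and a dict of assets.
for item in items:
    print(item.id, item.datetime, sorted(item.assets))
```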


def get_stac_data(
url: str,
collection: str,
@@ -372,17 +425,17 @@ def get_stac_data(

with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Call cataloging url, search, and convert to xarray
catalog = pystac_client.Client.open(url)

stac_items = catalog.search(
datetime=time,
collections=[collection],
# Call cataloging url, search, and convert to xarray
stac_items = query_stac(
url=url,
time=time,
collections=collection,
max_items=max_items,
intersects=intersects,
bbox=bbox,
query=query,
).get_all_items()
)

stack = stackstac.stack(stac_items, resolution=resolution)

@@ -439,6 +492,125 @@ def get_stac_data(
return stack
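
With this refactor, `get_stac_data` delegates the search to `query_stac` and only handles stacking. A rough sketch of the equivalent standalone flow, assuming stackstac is installed and a resolution of 100 (in the items' CRS units, meters for UTM-projected Sentinel-2):

```python
import stackstac

# Search with the shared helper, then build a lazy (dask-backed) xarray stack,
# mirroring what the refactored get_stac_data does internally.
stac_items = query_stac(
    url="https://earth-search.aws.element84.com/v1",
    collections="sentinel-2-l2a",
    time="2020-04-01/2020-04-03",
    bbox=[-105.78, 35.79, -105.72, 35.84],
)

stack = stackstac.stack(stac_items, resolution=100)
print(stack.dims, stack.shape)  # (time, band, y, x) and the corresponding sizes
```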


def _stac_to_df(stac_items: list, assets: list = None) -> pd.DataFrame:
"""Convert a list of stac items to a DataFrame

Parameters
----------
stac_items: list
List of STAC items to create a catalog with
assets : list, default = None
Assets to keep (keep all if None)

Returns
-------
pd.DataFrame
A DataFrame with rows for each unique item/asset combination

Raises
------
ValueError
No entries in DataFrame due to nonexistent asset

"""

dfs, compare_idx = [], 1
for stac_item in stac_items:
map_name, map_id, compare_id = [], [], []
for key, item in stac_item.assets.items():
if assets is None or key in assets:
map_name.append(key)
map_id.append(item.href)
compare_id.append(compare_idx)
compare_idx += 1

len_assets = len(map_name)

df_contents = {
"collection_id": [stac_item.collection_id] * len_assets,
"item_id": [stac_item.id] * len_assets,
"item_time": [stac_item.get_datetime()] * len_assets,
"create_time": [stac_item.properties["created"]] * len_assets,
"map_id": map_id,
"map_name": map_name,
"compare_id": compare_id,
"coverage_geometry_type": [stac_item.geometry["type"]] * len_assets,
"coverage_geometry_coords": [stac_item.geometry["coordinates"]]
* len_assets,
"coverage_epsg": ["4326"] * len_assets,
"asset_epsg": [stac_item.properties["proj:epsg"]] * len_assets,
}

dfs.append(pd.DataFrame(df_contents))

combined_df = pd.concat(dfs)
if combined_df.empty:
raise ValueError("No entries in DataFrame due to nonexistent asset")

return combined_df
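
`_stac_to_df` explodes each item into one row per kept asset, with `map_id` holding the asset href and `compare_id` a running counter. A hedged sketch of inspecting that layout (it requires items carrying the `created` and `proj:epsg` properties, as Sentinel-2 L2A items on Earth Search do):

```python
items = query_stac(
    url="https://earth-search.aws.element84.com/v1",
    collections="sentinel-2-l2a",
    time="2020-04-01/2020-04-03",
    bbox=[-105.78, 35.79, -105.72, 35.84],
)

# One row per item/asset combination; keep only the "aot" asset.
df = _stac_to_df(items, assets=["aot"])
print(df[["item_id", "map_name", "map_id", "compare_id"]])
```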


def stac_catalog(
url: str,
collections: Union[str, list],
time: str,
query: str = None,
max_items: int = None,
intersects: dict = None,
bbox: list = None,
assets: list = None,
) -> pd.DataFrame:
"""Create a STAC Catalog from a STAC query

Parameters
----------
url : str
Address hosting the STAC API
collections : Union[str, list]
Name of collection(s) to get
time : str
Single or range of values to query in the time dimension
query : str, default = None
String command to filter data
max_items : int, default = None
The maximum number of records to retrieve
intersects : dict, default = None
Dictionary representing the type of geometry and its respective coordinates
bbox : list, default = None
Coordinates to filter the spatial range of the request
assets : list, default = None
Assets to keep (keep all if None)

Returns
-------
pd.DataFrame
DataFrame representing a catalog based on STAC query

Raises
------
JSONDecodeError
Unable to make STAC query
ValueError
No items returned from query

"""

stac_items = query_stac(
url=url,
time=time,
collections=collections,
max_items=max_items,
intersects=intersects,
bbox=bbox,
query=query,
)

if len(stac_items) == 0:
raise ValueError("No items returned from query")

return _stac_to_df(stac_items, assets).reset_index(drop=True)
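
A hedged usage sketch of `stac_catalog` built from the fixtures in the test file below (url, collection, bbox, assets); the exact rows returned depend on the live Earth Search service:

```python
catalog_df = stac_catalog(
    url="https://earth-search.aws.element84.com/v1",
    collections="sentinel-2-l2a",
    time="2020-04-01/2020-04-03",
    bbox=[-105.78, 35.79, -105.72, 35.84],
    assets=["aot"],
)

# Expect one row per matching item/asset pair (the test data below shows the
# S2*_13SDV_2020040*_L2A items with their AOT assets), indexed 0..n-1.
print(len(catalog_df))
print(catalog_df[["item_id", "map_name", "asset_epsg"]])
```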


def _create_circle_mask(
sizes: int | Tuple[int], center: Tuple[Number, Number], radius: Number
) -> np.ndarray:
119 changes: 119 additions & 0 deletions tests/cases_catalogs.py
@@ -7,9 +7,11 @@
from pytest_cases import parametrize

import pandas as pd
from pandas import Timestamp
from dask import dataframe as dd
import xarray as xr
import numpy as np
from json import JSONDecodeError

from tests.conftest import TEST_DATA_DIR

@@ -461,3 +463,120 @@ def case_compare_catalogs_fail(
agreement_map_field,
expected_exception,
)


url = "https://earth-search.aws.element84.com/v1"
collection = "sentinel-2-l2a"
times = ["2020-04-01", "2020-04-03"]
bbox = [-105.78, 35.79, -105.72, 35.84]
assets = ["aot"]

expected_stac_df = {
"collection_id_candidate": ["sentinel-2-l2a", "sentinel-2-l2a"],
"item_id_candidate": ["S2B_13SDV_20200401_1_L2A", "S2B_13SDV_20200401_0_L2A"],
"item_time_candidate": [
Timestamp("2020-04-01 18:04:04.327000+0000", tz="utc"),
Timestamp("2020-04-01 18:04:04.327000+0000", tz="utc"),
],
"create_time_candidate": ["2023-10-07T12:09:07.273Z", "2022-11-06T10:14:16.681Z"],
"map_id_candidate": [
"https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/13/S/DV/2020/4/S2B_13SDV_20200401_1_L2A/AOT.tif",
"https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/13/S/DV/2020/4/S2B_13SDV_20200401_0_L2A/AOT.tif",
],
"map_name_candidate": ["aot", "aot"],
"compare_id": [1, 2],
"coverage_geometry_type_candidate": ["Polygon", "Polygon"],
"coverage_geometry_coords_candidate": [
[
[
[-106.11191385205835, 36.13972769324406],
[-106.0982941692468, 35.150822790058335],
[-105.85393882465051, 35.15385918076215],
[-105.68102037327243, 35.69893282033011],
[-105.59450557216445, 35.97641506053815],
[-105.56226433775963, 36.07584727804726],
[-105.53827534157463, 36.14367977962539],
[-106.11191385205835, 36.13972769324406],
]
],
[
[
[-106.11191385205835, 36.13972769324406],
[-105.53527335203748, 36.14369323431527],
[-105.56579262097148, 36.05653228802631],
[-105.68980719734964, 35.659112338538634],
[-105.85157080324588, 35.15190642354915],
[-106.09828205302668, 35.1499211736146],
[-106.11191385205835, 36.13972769324406],
]
],
],
"coverage_epsg_candidate": ["4326", "4326"],
"asset_epsg_candidate": [32613, 32613],
"collection_id_benchmark": ["sentinel-2-l2a", "sentinel-2-l2a"],
"item_id_benchmark": ["S2A_13SDV_20200403_1_L2A", "S2A_13SDV_20200403_0_L2A"],
"item_time_benchmark": [
Timestamp("2020-04-03 17:54:07.524000+0000", tz="utc"),
Timestamp("2020-04-03 17:54:07.524000+0000", tz="utc"),
],
"create_time_benchmark": ["2023-10-08T00:32:51.304Z", "2022-11-06T07:21:36.990Z"],
"map_id_benchmark": [
"https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/13/S/DV/2020/4/S2A_13SDV_20200403_1_L2A/AOT.tif",
"https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/13/S/DV/2020/4/S2A_13SDV_20200403_0_L2A/AOT.tif",
],
"map_name_benchmark": ["aot", "aot"],
"coverage_geometry_type_benchmark": ["Polygon", "Polygon"],
"coverage_geometry_coords_benchmark": [
[
[
[-106.11191385205835, 36.13972769324406],
[-106.09828205302668, 35.1499211736146],
[-104.89285176524281, 35.154851672138626],
[-104.89152152018616, 36.14484027029347],
[-106.11191385205835, 36.13972769324406],
]
],
[
[
[-106.11191385205835, 36.13972769324406],
[-104.89152152018616, 36.14484027029347],
[-104.89285176524281, 35.154851672138626],
[-106.09828205302668, 35.1499211736146],
[-106.11191385205835, 36.13972769324406],
]
],
],
"coverage_epsg_benchmark": ["4326", "4326"],
"asset_epsg_benchmark": [32613, 32613],
"band": ["1", "1"],
"coefficient_of_determination": [-1.449866533279419, -0.46196842193603516],
"mean_absolute_error": [25.393386840820312, 11.947012901306152],
"mean_absolute_percentage_error": [0.2044084370136261, 0.15074075758457184],
}


def case_stac_catalog_comparison_success():
return url, collection, times, bbox, assets, pd.DataFrame(expected_stac_df)


bad_url = "https://google.com"
bad_times = ["2018-04-01", "1940-04-03", "2020-04-01"]
bad_assets = ["aot", "aot", "surface_water"]
exceptions = [JSONDecodeError, ValueError, ValueError]


@parametrize(
"url, collection, time, bbox, assets, exception",
list(
zip(
[bad_url, url, url],
[collection] * 3,
bad_times,
[bbox] * 3,
bad_assets,
exceptions,
)
),
)
def case_stac_catalog_comparison_fail(url, collection, time, bbox, assets, exception):
return url, collection, time, bbox, assets, exception
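
These case functions follow the pytest-cases convention used elsewhere in this file. The test that consumes them is not shown in this diff; a hedged sketch of how a test module might wire up the failure cases (the test name and `glob` pattern are hypothetical):

```python
from pytest import raises
from pytest_cases import parametrize_with_cases

from gval.utils.loading_datasets import stac_catalog


@parametrize_with_cases(
    "url, collection, time, bbox, assets, exception",
    cases="tests.cases_catalogs",
    glob="stac_catalog_comparison_fail",
)
def test_stac_catalog_fail(url, collection, time, bbox, assets, exception):
    # Each bad input (a URL that is not a STAC API, a time with no items,
    # a nonexistent asset) should raise the exception paired with it above.
    with raises(exception):
        stac_catalog(
            url=url,
            collections=collection,
            time=time,
            bbox=bbox,
            assets=assets,
        )
```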