Compute area-weighted statistics (#640)

* add fractional coverage statistics * add helper method to create coverage array * update type hint * update docs
cogeotiff · Sep 27, 2023 · 7bed9de · 7bed9de
1 parent fdc332b
commit 7bed9de
Show file tree

Hide file tree

Showing 8 changed files with 318 additions and 94 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,4 +1,36 @@
 
+# 6.2.0 (TBD)
+
+* allow area-weighted statistics by adding `coverage` option in `rio_tiler.utils.get_array_statistics`
+
+    ```python
+    # Data Array
+    # 1, 2
+    # 3, 4
+    data = numpy.ma.array((1, 2, 3, 4)).reshape((1, 2, 2))
+
+    # Coverage Array
+    # 0.5, 0
+    # 1, 0.25
+    coverage = numpy.array((0.5, 0, 1, 0.25)).reshape((2, 2))
+
+    stats = utils.get_array_statistics(data, coverage=coverage)
+    assert len(stats) == 1
+    assert stats[0]["min"] == 1
+    assert stats[0]["max"] == 4
+    assert stats[0]["mean"] == 1.125  # (1 * 0.5 + 2 * 0.0 + 3 * 1.0 + 4 * 0.25) / 4
+    assert stats[0]["count"] == 1.75  # (0.5 + 0 + 1 + 0.25) sum of the coverage array
+
+    stats = utils.get_array_statistics(data)
+    assert len(stats) == 1
+    assert stats[0]["min"] == 1
+    assert stats[0]["max"] == 4
+    assert stats[0]["mean"] == 2.5
+    assert stats[0]["count"] == 4
+    ```
+
+* add `rio_tiler.utils.get_coverage_array` method to create a `coverage %` array
+
 # 6.1.0 (2023-09-15)
 
 * add `width`, `height` and `count` properties in `MosaicMethodBase`

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -34,7 +34,7 @@ nav:
   - Advanced Topics:
     - Base classes and custom readers: 'advanced/custom_readers.md'
     - Read Polygon-shaped regions: 'advanced/feature.md'
-    - Zonal statistics: 'advanced/zonal_stats.md'
+    - Statistics: 'advanced/statistics.md'
     - Create a Dynamic Tiler: 'advanced/dynamic_tiler.md'
     - TileMatrixSet: 'advanced/tms.md'
   - Examples:

diff --git a/docs/src/advanced/statistics.md b/docs/src/advanced/statistics.md
@@ -0,0 +1,166 @@
+
+### Statistics
+
+`rio-tiler`'s Readers provide simple `.statistics` method to retrieve dataset global statistics
+
+```python
+with Reader("my.tif") as src:
+    stats = src.statistics()
+
+# Statistics result is in form of Dict[str, rio_tiler.models.BandStatistics]
+print(stats.keys())
+>>> ["b1", "b2", "b3"]
+
+# rio_tiler.models.BandStatistics is a pydantic model
+print(stats["1"].model_dump().keys())
+[
+    "min",
+    "max",
+    "mean",
+    "count",
+    "sum",
+    "std",
+    "median",
+    "majority",
+    "minority",
+    "unique",
+    "histogram",
+    "valid_percent",
+    "masked_pixels",
+    "valid_pixels",
+    # Percentile entries depend on user inputs
+    "percentile_2",
+    "percentile_98",
+]
+```
+
+### Zonal Statistics
+
+You can easily extend the `statistics()` method to create a `.zonal_statistics` one which will accept input features to get statistics from.
+
+```python
+
+import attr
+from typing import Any, Union, Optional, List, Dict
+
+from rio_tiler import io
+from rio_tiler.models import BandStatistics
+
+from geojson_pydantic.features import Feature, FeatureCollection
+from geojson_pydantic.geometries import Polygon
+
+class Reader(io.Reader):
+    """Custom Reader with zonal_statistics method."""
+
+    def zonal_statistics(
+            self,
+            geojson: Union[FeatureCollection, Feature],
+            categorical: bool = False,
+            categories: Optional[List[float]] = None,
+            percentiles: Optional[List[int]] = None,
+            hist_options: Optional[Dict] = None,
+            max_size: int = None,
+            **kwargs: Any,
+        ) -> Union[FeatureCollection, Feature]:
+            """Return statistics from GeoJSON features.
+
+            Args:
+                geojson (Feature or FeatureCollection): a GeoJSON Feature or FeatureCollection.
+                categorical (bool): treat input data as categorical data. Defaults to False.
+                categories (list of numbers, optional): list of categories to return value for.
+                percentiles (list of numbers, optional): list of percentile values to calculate. Defaults to `[2, 98]`.
+                hist_options (dict, optional): Options to forward to numpy.histogram function.
+                max_size (int, optional): Limit the size of the longest dimension of the dataset read, respecting bounds X/Y aspect ratio. Defaults to None.
+                kwargs (optional): Options to forward to `self.preview`.
+
+            Returns:
+                Feature or FeatureCollection
+
+            """
+            kwargs = {**self.options, **kwargs}
+
+            hist_options = hist_options or {}
+
+            fc = geojson
+            # We transform the input Feature to a FeatureCollection
+            if isinstance(fc, Feature):
+                fc = FeatureCollection(type="FeatureCollection", features=[geojson])
+
+            for feature in fc:
+                # Get data overlapping with the feature (using Reader.feature method)
+                data = self.feature(
+                    feature.model_dump(exclude_none=True),
+                    max_size=max_size,
+                    **kwargs,
+                )
+                stats = data.statistics(
+                    categorical=categorical,
+                    categories=categories,
+                    percentiles=percentiles,
+                    hist_options=hist_options,
+                )
+
+                # Update input feature properties and add the statistics
+                feature.properties = feature.properties or {}
+                feature.properties.update({"statistics": stats})
+
+            return fc.features[0] if isinstance(geojson, Feature) else fc
+```
+
+
+### Area Weighted Statistics
+
+When getting statistics from `feature`, you may want to calculate values from the pixels which intersect with the geometry but also take the pixel intersection percentage into account. Starting with rio-tiler `6.2.0`, we've added a `coverage` option to the `statistics` utility which enable the user to pass an array representing the coverage percentage such as:
+
+```python
+import numpy
+from rio_tiler.utils import get_array_statistics
+# Data Array
+# 1, 2
+# 3, 4
+data = numpy.ma.array((1, 2, 3, 4)).reshape((1, 2, 2))
+
+# Coverage Array
+# 0.5, 0
+# 1, 0.25
+coverage = numpy.array((0.5, 0, 1, 0.25)).reshape((2, 2))
+
+stats = get_array_statistics(data, coverage=coverage)
+assert len(stats) == 1
+assert stats[0]["min"] == 1
+assert stats[0]["max"] == 4
+assert stats[0]["mean"] == 1.125  # (1 * 0.5 + 2 * 0.0 + 3 * 1.0 + 4 * 0.25) / 4
+assert stats[0]["count"] == 1.75  # (0.5 + 0 + 1 + 0.25) sum of the coverage array
+```
+
+When using with a `feature`, your code might look something like:
+
+```python
+from rio_tiler.io import Reader
+from rio_tiler.constants import WGS84_CRS
+
+with Reader(path) as src:
+    # First get the array for the feature
+    data = src_dst.feature(
+        shape,
+        shape_crs=WGS84_CRS,
+    )
+
+    # Get the coverage % array, using ImageData.get_coverage_array method
+    coverage_array = data.get_coverage_array(
+        shape, shape_crs=WGS84_CRS
+    )
+
+    # Get statistics (ImageData.statistics is calling `rio_tiler.utils.get_array_statistics`)
+    stats = data.statistics(coverage=coverage_array)
+```
+
+!!! warnings
+    The coverage weights will only have influence on specific statistics:
+
+    - `mean`
+    - `count`
+    - `sum`
+    - `std`
+    - `median`
+
diff --git a/docs/src/advanced/zonal_stats.md b/docs/src/advanced/zonal_stats.md
diff --git a/rio_tiler/models.py b/rio_tiler/models.py
@@ -8,18 +8,22 @@
 import numpy
 from affine import Affine
 from color_operations import parse_operations, scale_dtype, to_math_type
+from numpy.typing import NDArray
 from pydantic import BaseModel
 from rasterio import windows
 from rasterio.coords import BoundingBox
 from rasterio.crs import CRS
 from rasterio.dtypes import dtype_ranges
 from rasterio.enums import ColorInterp
 from rasterio.errors import NotGeoreferencedWarning
+from rasterio.features import rasterize
 from rasterio.io import MemoryFile
 from rasterio.plot import reshape_as_image
 from rasterio.transform import from_bounds
+from rasterio.warp import transform_geom
 
 from rio_tiler.colormap import apply_cmap
+from rio_tiler.constants import WGS84_CRS
 from rio_tiler.errors import InvalidDatatypeWarning, InvalidPointDataError
 from rio_tiler.expression import apply_expression, get_expression_blocks
 from rio_tiler.types import (
@@ -769,6 +773,7 @@ def statistics(
         categories: Optional[List[float]] = None,
         percentiles: Optional[List[int]] = None,
         hist_options: Optional[Dict] = None,
+        coverage: Optional[numpy.ndarray] = None,
     ) -> Dict[str, BandStatistics]:
         """Return statistics from ImageData."""
         hist_options = hist_options or {}
@@ -778,10 +783,51 @@ def statistics(
             categorical=categorical,
             categories=categories,
             percentiles=percentiles,
+            coverage=coverage,
             **hist_options,
         )
 
         return {
             f"{self.band_names[ix]}": BandStatistics(**stats[ix])
             for ix in range(len(stats))
         }
+
+    def get_coverage_array(
+        self,
+        shape: Dict,
+        shape_crs: CRS = WGS84_CRS,
+        cover_scale: int = 10,
+    ) -> NDArray[numpy.floating]:
+
+        """
+        cover_scale: int, optional
+            Scale used when generating coverage estimates of each
+            raster cell by vector feature. Coverage is generated by
+            rasterizing the feature at a finer resolution than the raster then using a summation to aggregate
+            to the raster resolution and dividing by the square of cover_scale
+            to get coverage value for each cell. Increasing cover_scale
+            will increase the accuracy of coverage values; three orders
+            magnitude finer resolution (cover_scale=1000) is usually enough to
+            get coverage estimates with <1% error in individual edge cells coverage
+            estimates, though much smaller values (e.g., cover_scale=10) are often
+            sufficient (<10% error) and require less memory.
+
+        Note: code adapted from https://github.com/perrygeo/python-rasterstats/pull/136 by @sgoodm
+
+        """
+        if self.crs != shape_crs:
+            shape = transform_geom(shape_crs, self.crs, shape)
+
+        cover_array = rasterize(
+            [(shape, 1)],
+            out_shape=(self.height * cover_scale, self.width * cover_scale),
+            transform=self.transform * Affine.scale(1 / cover_scale),
+            all_touched=True,
+            fill=0,
+            dtype="uint8",
+        )
+        cover_array = cover_array.reshape(
+            (self.height, cover_scale, self.width, cover_scale)
+        ).astype("float32")
+
+        return cover_array.sum(-1).sum(1) / (cover_scale**2)