diff --git a/CHANGES.md b/CHANGES.md index 689c74b5..c0dfd730 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,36 @@ +# 6.2.0 (TBD) + +* allow area-weighted statistics by adding `coverage` option in `rio_tiler.utils.get_array_statistics` + + ```python + # Data Array + # 1, 2 + # 3, 4 + data = numpy.ma.array((1, 2, 3, 4)).reshape((1, 2, 2)) + + # Coverage Array + # 0.5, 0 + # 1, 0.25 + coverage = numpy.array((0.5, 0, 1, 0.25)).reshape((2, 2)) + + stats = utils.get_array_statistics(data, coverage=coverage) + assert len(stats) == 1 + assert stats[0]["min"] == 1 + assert stats[0]["max"] == 4 + assert stats[0]["mean"] == 1.125 # (1 * 0.5 + 2 * 0.0 + 3 * 1.0 + 4 * 0.25) / 4 + assert stats[0]["count"] == 1.75 # (0.5 + 0 + 1 + 0.25) sum of the coverage array + + stats = utils.get_array_statistics(data) + assert len(stats) == 1 + assert stats[0]["min"] == 1 + assert stats[0]["max"] == 4 + assert stats[0]["mean"] == 2.5 + assert stats[0]["count"] == 4 + ``` + +* add `rio_tiler.utils.get_coverage_array` method to create a `coverage %` array + # 6.1.0 (2023-09-15) * add `width`, `height` and `count` properties in `MosaicMethodBase` diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 42eddccf..ae1ba924 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -34,7 +34,7 @@ nav: - Advanced Topics: - Base classes and custom readers: 'advanced/custom_readers.md' - Read Polygon-shaped regions: 'advanced/feature.md' - - Zonal statistics: 'advanced/zonal_stats.md' + - Statistics: 'advanced/statistics.md' - Create a Dynamic Tiler: 'advanced/dynamic_tiler.md' - TileMatrixSet: 'advanced/tms.md' - Examples: diff --git a/docs/src/advanced/statistics.md b/docs/src/advanced/statistics.md new file mode 100644 index 00000000..8209b50c --- /dev/null +++ b/docs/src/advanced/statistics.md @@ -0,0 +1,166 @@ + +### Statistics + +`rio-tiler`'s Readers provide simple `.statistics` method to retrieve dataset global statistics + +```python +with Reader("my.tif") as src: + stats = src.statistics() + 
+# Statistics result is in the form of Dict[str, rio_tiler.models.BandStatistics] +print(stats.keys()) +>>> ["b1", "b2", "b3"] + +# rio_tiler.models.BandStatistics is a pydantic model +print(stats["b1"].model_dump().keys()) +[ + "min", + "max", + "mean", + "count", + "sum", + "std", + "median", + "majority", + "minority", + "unique", + "histogram", + "valid_percent", + "masked_pixels", + "valid_pixels", + # Percentile entries depend on user inputs + "percentile_2", + "percentile_98", +] +``` + +### Zonal Statistics + +You can easily extend the `statistics()` method to create a `.zonal_statistics` one which will accept input features to get statistics from. + +```python + +import attr +from typing import Any, Union, Optional, List, Dict + +from rio_tiler import io +from rio_tiler.models import BandStatistics + +from geojson_pydantic.features import Feature, FeatureCollection +from geojson_pydantic.geometries import Polygon + +class Reader(io.Reader): + """Custom Reader with zonal_statistics method.""" + + def zonal_statistics( + self, + geojson: Union[FeatureCollection, Feature], + categorical: bool = False, + categories: Optional[List[float]] = None, + percentiles: Optional[List[int]] = None, + hist_options: Optional[Dict] = None, + max_size: Optional[int] = None, + **kwargs: Any, + ) -> Union[FeatureCollection, Feature]: + """Return statistics from GeoJSON features. + + Args: + geojson (Feature or FeatureCollection): a GeoJSON Feature or FeatureCollection. + categorical (bool): treat input data as categorical data. Defaults to False. + categories (list of numbers, optional): list of categories to return value for. + percentiles (list of numbers, optional): list of percentile values to calculate. Defaults to `[2, 98]`. + hist_options (dict, optional): Options to forward to numpy.histogram function. + max_size (int, optional): Limit the size of the longest dimension of the dataset read, respecting bounds X/Y aspect ratio. Defaults to None.
+ kwargs (optional): Options to forward to `self.preview`. + + Returns: + Feature or FeatureCollection + + """ + kwargs = {**self.options, **kwargs} + + hist_options = hist_options or {} + + fc = geojson + # We transform the input Feature to a FeatureCollection + if isinstance(fc, Feature): + fc = FeatureCollection(type="FeatureCollection", features=[geojson]) + + for feature in fc: + # Get data overlapping with the feature (using Reader.feature method) + data = self.feature( + feature.model_dump(exclude_none=True), + max_size=max_size, + **kwargs, + ) + stats = data.statistics( + categorical=categorical, + categories=categories, + percentiles=percentiles, + hist_options=hist_options, + ) + + # Update input feature properties and add the statistics + feature.properties = feature.properties or {} + feature.properties.update({"statistics": stats}) + + return fc.features[0] if isinstance(geojson, Feature) else fc +``` + + +### Area Weighted Statistics + +When getting statistics from `feature`, you may want to calculate values from the pixels which intersect with the geometry but also take the pixel intersection percentage into account. 
Starting with rio-tiler `6.2.0`, we've added a `coverage` option to the `statistics` utility which enables the user to pass an array representing the coverage percentage such as: + +```python +import numpy +from rio_tiler.utils import get_array_statistics +# Data Array +# 1, 2 +# 3, 4 +data = numpy.ma.array((1, 2, 3, 4)).reshape((1, 2, 2)) + +# Coverage Array +# 0.5, 0 +# 1, 0.25 +coverage = numpy.array((0.5, 0, 1, 0.25)).reshape((2, 2)) + +stats = get_array_statistics(data, coverage=coverage) +assert len(stats) == 1 +assert stats[0]["min"] == 1 +assert stats[0]["max"] == 4 +assert stats[0]["mean"] == 1.125 # (1 * 0.5 + 2 * 0.0 + 3 * 1.0 + 4 * 0.25) / 4 +assert stats[0]["count"] == 1.75 # (0.5 + 0 + 1 + 0.25) sum of the coverage array +``` + +When using with a `feature`, your code might look something like: + +```python +from rio_tiler.io import Reader +from rio_tiler.constants import WGS84_CRS + +with Reader(path) as src: + # First get the array for the feature + data = src.feature( + shape, + shape_crs=WGS84_CRS, + ) + + # Get the coverage % array, using ImageData.get_coverage_array method + coverage_array = data.get_coverage_array( + shape, shape_crs=WGS84_CRS + ) + + # Get statistics (ImageData.statistics is calling `rio_tiler.utils.get_array_statistics`) + stats = data.statistics(coverage=coverage_array) +``` + +!!! warning + The coverage weights will only have influence on specific statistics: + + - `mean` + - `count` + - `sum` + - `std` + - `median` + diff --git a/docs/src/advanced/zonal_stats.md b/docs/src/advanced/zonal_stats.md deleted file mode 100644 index 6824fdad..00000000 --- a/docs/src/advanced/zonal_stats.md +++ /dev/null @@ -1,82 +0,0 @@ -`rio-tiler`'s Readers provide simple `.statistics` method to retrieve dataset statistics (min, max, histogram...). We can easily extend this to create a `.zonal_statistics` method which will accept input features to get statistics from.
- -```python - -import attr -from typing import Any, Union, Optional, List, Dict - -from rio_tiler import io -from rio_tiler.utils import get_array_statistics -from rio_tiler.models import BandStatistics - -from geojson_pydantic.features import Feature, FeatureCollection -from geojson_pydantic.geometries import Polygon - -class Reader(io.Reader): - """Custom Reader with zonal_statistics method.""" - - def zonal_statistics( - self, - geojson: Union[FeatureCollection, Feature], - categorical: bool = False, - categories: Optional[List[float]] = None, - percentiles: List[int] = [2, 98], - hist_options: Optional[Dict] = None, - max_size: int = None, - **kwargs: Any, - ) -> FeatureCollection: - """Return statistics from GeoJSON features. - - Args: - geojson (Feature or FeatureCollection): a GeoJSON Feature or FeatureCollection. - categorical (bool): treat input data as categorical data. Defaults to False. - categories (list of numbers, optional): list of categories to return value for. - percentiles (list of numbers, optional): list of percentile values to calculate. Defaults to `[2, 98]`. - hist_options (dict, optional): Options to forward to numpy.histogram function. - max_size (int, optional): Limit the size of the longest dimension of the dataset read, respecting bounds X/Y aspect ratio. Defaults to None. - kwargs (optional): Options to forward to `self.preview`. 
- - Returns: - FeatureCollection - - """ - kwargs = {**self._kwargs, **kwargs} - - hist_options = hist_options or {} - - # We transform the input Feature to a FeatureCollection - if not isinstance(geojson, FeatureCollection): - geojson = FeatureCollection(features=[geojson]) - - for feature in geojson: - # Get data overlapping with the feature (using Reader.feature method) - data = self.feature( - feature.model_dump(exclude_none=True), - max_size=max_size, - **kwargs, - ) - - # Get band statistics for the data - stats = get_array_statistics( - data.as_masked(), - categorical=categorical, - categories=categories, - percentiles=percentiles, - **hist_options, - ) - - # Update input feature properties and add the statistics - feature.properties = feature.properties or {} - feature.properties.update( - { - "statistics": { - f"{data.band_names[ix]}": BandStatistics( - **stats[ix] - ) - for ix in range(len(stats)) - } - } - ) - - return geojson -``` diff --git a/rio_tiler/models.py b/rio_tiler/models.py index 7ccf2225..f151b0fa 100644 --- a/rio_tiler/models.py +++ b/rio_tiler/models.py @@ -8,6 +8,7 @@ import numpy from affine import Affine from color_operations import parse_operations, scale_dtype, to_math_type +from numpy.typing import NDArray from pydantic import BaseModel from rasterio import windows from rasterio.coords import BoundingBox @@ -15,11 +16,14 @@ from rasterio.dtypes import dtype_ranges from rasterio.enums import ColorInterp from rasterio.errors import NotGeoreferencedWarning +from rasterio.features import rasterize from rasterio.io import MemoryFile from rasterio.plot import reshape_as_image from rasterio.transform import from_bounds +from rasterio.warp import transform_geom from rio_tiler.colormap import apply_cmap +from rio_tiler.constants import WGS84_CRS from rio_tiler.errors import InvalidDatatypeWarning, InvalidPointDataError from rio_tiler.expression import apply_expression, get_expression_blocks from rio_tiler.types import ( @@ -769,6 +773,7 @@ 
def statistics( categories: Optional[List[float]] = None, percentiles: Optional[List[int]] = None, hist_options: Optional[Dict] = None, + coverage: Optional[numpy.ndarray] = None, ) -> Dict[str, BandStatistics]: """Return statistics from ImageData.""" hist_options = hist_options or {} @@ -778,6 +783,7 @@ def statistics( categorical=categorical, categories=categories, percentiles=percentiles, + coverage=coverage, **hist_options, ) @@ -785,3 +791,43 @@ def statistics( f"{self.band_names[ix]}": BandStatistics(**stats[ix]) for ix in range(len(stats)) } + + def get_coverage_array( + self, + shape: Dict, + shape_crs: CRS = WGS84_CRS, + cover_scale: int = 10, + ) -> NDArray[numpy.floating]: + + """ + cover_scale: int, optional + Scale used when generating coverage estimates of each + raster cell by vector feature. Coverage is generated by + rasterizing the feature at a finer resolution than the raster then using a summation to aggregate + to the raster resolution and dividing by the square of cover_scale + to get coverage value for each cell. Increasing cover_scale + will increase the accuracy of coverage values; three orders of + magnitude finer resolution (cover_scale=1000) is usually enough to + get coverage estimates with <1% error in individual edge cells coverage + estimates, though much smaller values (e.g., cover_scale=10) are often + sufficient (<10% error) and require less memory.
+ + Note: code adapted from https://github.com/perrygeo/python-rasterstats/pull/136 by @sgoodm + + """ + if self.crs != shape_crs: + shape = transform_geom(shape_crs, self.crs, shape) + + cover_array = rasterize( + [(shape, 1)], + out_shape=(self.height * cover_scale, self.width * cover_scale), + transform=self.transform * Affine.scale(1 / cover_scale), + all_touched=True, + fill=0, + dtype="uint8", + ) + cover_array = cover_array.reshape( + (self.height, cover_scale, self.width, cover_scale) + ).astype("float32") + + return cover_array.sum(-1).sum(1) / (cover_scale**2) diff --git a/rio_tiler/utils.py b/rio_tiler/utils.py index 47cbc4a0..281a0b58 100644 --- a/rio_tiler/utils.py +++ b/rio_tiler/utils.py @@ -7,6 +7,7 @@ import numpy import rasterio from affine import Affine +from numpy.typing import NDArray from rasterio import windows from rasterio.crs import CRS from rasterio.dtypes import _gdal_typename @@ -36,6 +37,7 @@ def get_array_statistics( categorical: bool = False, categories: Optional[List[float]] = None, percentiles: Optional[List[int]] = None, + coverage: Optional[NDArray[numpy.floating]] = None, **kwargs: Any, ) -> List[Dict[Any, Any]]: """Calculate per band array statistics. @@ -45,6 +47,7 @@ def get_array_statistics( categorical (bool): treat input data as categorical data. Defaults to `False`. categories (list of numbers, optional): list of categories to return value for. percentiles (list of numbers, optional): list of percentile values to calculate. Defaults to `[2, 98]`. + coverage (numpy.array, optional): Data coverage fraction. kwargs (optional): options to forward to `numpy.histogram` function (only applies for non-categorical data). 
Returns: @@ -121,21 +124,36 @@ def get_array_statistics( else: percentiles_values = (numpy.nan,) * len(percentiles_names) + if coverage is not None: + assert coverage.shape == ( + data.shape[1], + data.shape[2], + ), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}" + + array = data[b] * coverage + + else: + array = data[b] + + count = array.count() if coverage is None else coverage.sum() + if valid_pixels: + majority = float(keys[counts.tolist().index(counts.max())].tolist()) + minority = float(keys[counts.tolist().index(counts.min())].tolist()) + else: + majority = numpy.nan + minority = numpy.nan + output.append( { "min": float(data[b].min()), "max": float(data[b].max()), - "mean": float(data[b].mean()), - "count": float(data[b].count()), - "sum": float(data[b].sum()), - "std": float(data[b].std()), - "median": float(numpy.ma.median(data[b])), - "majority": float(keys[counts.tolist().index(counts.max())].tolist()) - if valid_pixels - else numpy.nan, - "minority": float(keys[counts.tolist().index(counts.min())].tolist()) - if valid_pixels - else numpy.nan, + "mean": float(array.mean()), + "count": float(count), + "sum": float(array.sum()), + "std": float(array.std()), + "median": float(numpy.ma.median(array)), + "majority": majority, + "minority": minority, "unique": float(counts.size), **dict(zip(percentiles_names, percentiles_values)), "histogram": histogram, diff --git a/tests/test_models.py b/tests/test_models.py index e4aebeb7..aa02d761 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -439,3 +439,20 @@ def test_apply_color_formula(): assert img.count == 3 assert img.width == 256 assert img.height == 256 + + +def test_imagedata_coverage(): + """test coverage array.""" + im = ImageData( + numpy.ma.array((1, 2, 3, 4)).reshape((1, 2, 2)), + crs="epsg:4326", + bounds=(-180, -90, 180, 90), + ) + poly = { + "type": "Polygon", + "coordinates": [ + [[-90.0, -45.0], [90.0, -45.0], [90.0, 45.0], [-90.0, 45.0], 
[-90.0, -45.0]] + ], + } + coverage = im.get_coverage_array(poly) + assert numpy.unique(coverage).tolist() == [0.25] diff --git a/tests/test_utils.py b/tests/test_utils.py index 07dc0e6e..f2106a7b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -561,3 +561,30 @@ def parse(content): assert ColorInterp.alpha in color assert ColorInterp.red not in color assert ColorInterp.gray in color + + +def test_get_array_statistics_coverage(): + """Test statistics with coverage array.""" + # Data Array + # 1, 2 + # 3, 4 + data = np.ma.array((1, 2, 3, 4)).reshape((1, 2, 2)) + + # Coverage Array + # 0.5, 0 + # 1, 0.25 + coverage = np.array((0.5, 0, 1, 0.25)).reshape((2, 2)) + + stats = utils.get_array_statistics(data, coverage=coverage) + assert len(stats) == 1 + assert stats[0]["min"] == 1 + assert stats[0]["max"] == 4 + assert stats[0]["mean"] == 1.125 # (1 * 0.5 + 2 * 0.0 + 3 * 1.0 + 4 * 0.25) / 4 + assert stats[0]["count"] == 1.75 + + stats = utils.get_array_statistics(data) + assert len(stats) == 1 + assert stats[0]["min"] == 1 + assert stats[0]["max"] == 4 + assert stats[0]["mean"] == 2.5 + assert stats[0]["count"] == 4