From b1e8d7318d0bbe2f2d977f9d04591b272e0840f4 Mon Sep 17 00:00:00 2001 From: vincentsarago Date: Tue, 12 Mar 2024 15:44:13 +0100 Subject: [PATCH] fix statistics for coverage --- rio_tiler/utils.py | 112 ++++++++++++++++++++++++++++++++------------ tests/test_utils.py | 24 +++++++++- 2 files changed, 105 insertions(+), 31 deletions(-) diff --git a/rio_tiler/utils.py b/rio_tiler/utils.py index 414f66f0..852ac11e 100644 --- a/rio_tiler/utils.py +++ b/rio_tiler/utils.py @@ -33,6 +33,35 @@ def _chunks(my_list: Sequence, chuck_size: int) -> Generator[Sequence, None, Non yield my_list[i : i + chuck_size] +# Ref: https://stackoverflow.com/posts/73905572 +def _weighted_quantiles( + values: NDArray[numpy.floating], + weights: NDArray[numpy.floating], + quantiles: float = 0.5, +) -> float: + i = numpy.argsort(values) + c = numpy.cumsum(weights[i]) + return values[i[numpy.searchsorted(c, numpy.array(quantiles) * c[-1])]] + + +# Ref: https://stackoverflow.com/questions/2413522 +def _weighted_stdev( + values: NDArray[numpy.floating], + weights: NDArray[numpy.floating], +) -> float: + """ + Return the weighted average and standard deviation. + + They weights are in effect first normalized so that they + sum to 1 (and so they must not all be 0). + + values, weights -- NumPy ndarrays with the same shape. + """ + average = numpy.average(values, weights=weights) + variance = numpy.average((values - average) ** 2, weights=weights) + return float(math.sqrt(variance)) + + def get_array_statistics( data: numpy.ma.MaskedArray, categorical: bool = False, @@ -85,25 +114,31 @@ def get_array_statistics( percentiles = percentiles or [2, 98] if len(data.shape) < 3: - data = numpy.expand_dims(data, axis=0) + data = numpy.ma.expand_dims(data, axis=0) output: List[Dict[Any, Any]] = [] percentiles_names = [f"percentile_{int(p)}" for p in percentiles] + if coverage is not None: + assert coverage.shape == ( + data.shape[1], + data.shape[2], + ), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}" + + else: + coverage = numpy.ones((data.shape[1], data.shape[2])) + # Avoid non masked nan/inf values numpy.ma.fix_invalid(data, copy=False) for b in range(data.shape[0]): - keys, counts = numpy.unique(data[b].compressed(), return_counts=True) + data_comp = data[b].compressed() + + keys, counts = numpy.unique(data_comp, return_counts=True) valid_pixels = float(numpy.ma.count(data[b])) masked_pixels = float(numpy.ma.count_masked(data[b])) valid_percent = round((valid_pixels / data[b].size) * 100, 2) - info_px = { - "valid_pixels": valid_pixels, - "masked_pixels": masked_pixels, - "valid_percent": valid_percent, - } if categorical: out_dict = dict(zip(keys.tolist(), counts.tolist())) @@ -115,28 +150,26 @@ def get_array_statistics( h_keys, ] else: - h_counts, h_keys = numpy.histogram(data[b].compressed(), **kwargs) + h_counts, h_keys = numpy.histogram(data_comp, **kwargs) histogram = [h_counts.tolist(), h_keys.tolist()] - if valid_pixels: - percentiles_values = numpy.percentile( - data[b].compressed(), percentiles - ).tolist() - else: - percentiles_values = (numpy.nan,) * len(percentiles_names) - - if coverage is not None: - assert coverage.shape == ( - data.shape[1], - data.shape[2], - ), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}" - - array = data[b] * coverage + # Data coverage fractions + data_cov = data[b] * coverage + # Coverage Array + data mask + masked_coverage = numpy.ma.MaskedArray(coverage, mask=data_cov.mask) + if valid_pixels: + # TODO: when switching to numpy~=2.0 + # percentiles_values = numpy.percentile( + # data_comp, percentiles, weights=coverage.flatten() + # ).tolist() + percentiles_values = [ + _weighted_quantiles(data_comp, masked_coverage.compressed(), pp / 100.0) + for pp in percentiles + ] else: - array = data[b] + percentiles_values = [numpy.nan] * len(percentiles_names) - count = array.count() if coverage is None else coverage.sum() if valid_pixels: majority = float(keys[counts.tolist().index(counts.max())].tolist()) minority = float(keys[counts.tolist().index(counts.min())].tolist()) @@ -144,21 +177,40 @@ def get_array_statistics( majority = numpy.nan minority = numpy.nan + _count = masked_coverage.sum() + _sum = data_cov.sum() + output.append( { + # Minimum value, not taking coverage fractions into account. "min": float(data[b].min()), + # Maximum value, not taking coverage fractions into account. "max": float(data[b].max()), - "mean": float(array.mean()), - "count": float(count), - "sum": float(array.sum()), - "std": float(array.std()), - "median": float(numpy.ma.median(array)), + # Mean value, weighted by the percent of each cell that is covered. + "mean": float(_sum / _count), + # Sum of all non-masked cell coverage fractions. + "count": float(_count), + # Sum of values, weighted by their coverage fractions. + "sum": float(_sum), + # Population standard deviation of cell values, taking into account coverage fraction. + "std": _weighted_stdev(data_comp, masked_coverage.compressed()), + # Median value of cells, weighted by the percent of each cell that is covered. + "median": _weighted_quantiles(data_comp, masked_coverage.compressed()), + # The value occupying the greatest number of cells. "majority": majority, + # The value occupying the least number of cells. "minority": minority, + # Unique values. "unique": float(counts.size), + # quantiles **dict(zip(percentiles_names, percentiles_values)), "histogram": histogram, - **info_px, + # Number of non-masked cells, not taking coverage fractions into account. + "valid_pixels": valid_pixels, + # Number of masked cells, not taking coverage fractions into account. + "masked_pixels": masked_pixels, + # Percent of valid cells + "valid_percent": valid_percent, } ) diff --git a/tests/test_utils.py b/tests/test_utils.py index cb31540f..c7751a26 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -566,6 +566,7 @@ def parse(content): def test_get_array_statistics_coverage(): """Test statistics with coverage array.""" + # same test as https://github.com/isciences/exactextract?tab=readme-ov-file#supported-statistics # Data Array # 1, 2 # 3, 4 @@ -580,8 +581,12 @@ def test_get_array_statistics_coverage(): assert len(stats) == 1 assert stats[0]["min"] == 1 assert stats[0]["max"] == 4 - assert stats[0]["mean"] == 1.125 # (1 * 0.5 + 2 * 0.0 + 3 * 1.0 + 4 * 0.25) / 4 + assert ( + round(stats[0]["mean"], 4) == 2.5714 + ) # sum of weighted array / sum of weights | 4.5 / 1.75 = 2.57 assert stats[0]["count"] == 1.75 + assert stats[0]["median"] == 3 # 2 in exactextract + assert stats[0]["std"] == 1.05 stats = utils.get_array_statistics(data) assert len(stats) == 1 @@ -590,6 +595,23 @@ def test_get_array_statistics_coverage(): assert stats[0]["mean"] == 2.5 assert stats[0]["count"] == 4 + # same test as https://github.com/isciences/exactextract/blob/0883cd585d9c7b6b4e936aeca4aa84a15adf82d2/python/tests/test_exact_extract.py#L48-L110 + data = np.ma.arange(1, 10, dtype=np.int32).reshape(3, 3) + coverage = np.array([0.25, 0.5, 0.25, 0.5, 1.0, 0.5, 0.25, 0.5, 0.25]).reshape(3, 3) + stats = utils.get_array_statistics(data, coverage=coverage, percentiles=[25, 75]) + assert len(stats) == 1 + assert stats[0]["count"] == 4 + assert stats[0]["mean"] == 5 + assert stats[0]["median"] == 5 + assert stats[0]["min"] == 1 + assert stats[0]["max"] == 9 + # exactextract takes coverage into account, we don't + assert stats[0]["minority"] == 1 # 1 in exactextract + assert stats[0]["majority"] == 1 # 5 in exactextract + assert stats[0]["percentile_25"] == 3 + assert stats[0]["percentile_75"] == 6 + assert stats[0]["std"] == math.sqrt(5) + def test_get_vrt_transform_world_file(dataset_fixture): """Should return correct transform and size."""