fix statistics for coverage (#684)
* fix statistics for coverage

* round

* make sure we can serialize the stats
vincentsarago authored Mar 22, 2024
1 parent 6343b57 commit 2e76ad7
Showing 2 changed files with 100 additions and 31 deletions.
rio_tiler/utils.py (104 changes: 74 additions & 30 deletions)
@@ -33,6 +33,27 @@ def _chunks(my_list: Sequence, chuck_size: int) -> Generator[Sequence, None, None]:
         yield my_list[i : i + chuck_size]
 
 
+# Ref: https://stackoverflow.com/posts/73905572
+def _weighted_quantiles(
+    values: NDArray[numpy.floating],
+    weights: NDArray[numpy.floating],
+    quantiles: float = 0.5,
+) -> float:
+    i = numpy.argsort(values)
+    c = numpy.cumsum(weights[i])
+    return float(values[i[numpy.searchsorted(c, numpy.array(quantiles) * c[-1])]])
+
+
+# Ref: https://stackoverflow.com/questions/2413522
+def _weighted_stdev(
+    values: NDArray[numpy.floating],
+    weights: NDArray[numpy.floating],
+) -> float:
+    average = numpy.average(values, weights=weights)
+    variance = numpy.average((values - average) ** 2, weights=weights)
+    return float(math.sqrt(variance))
+
+
 def get_array_statistics(
     data: numpy.ma.MaskedArray,
     categorical: bool = False,
@@ -85,25 +106,31 @@ def get_array_statistics(
     percentiles = percentiles or [2, 98]
 
     if len(data.shape) < 3:
-        data = numpy.expand_dims(data, axis=0)
+        data = numpy.ma.expand_dims(data, axis=0)
 
     output: List[Dict[Any, Any]] = []
     percentiles_names = [f"percentile_{int(p)}" for p in percentiles]
 
+    if coverage is not None:
+        assert coverage.shape == (
+            data.shape[1],
+            data.shape[2],
+        ), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}"
+
+    else:
+        coverage = numpy.ones((data.shape[1], data.shape[2]))
+
     # Avoid non masked nan/inf values
     numpy.ma.fix_invalid(data, copy=False)
 
     for b in range(data.shape[0]):
-        keys, counts = numpy.unique(data[b].compressed(), return_counts=True)
+        data_comp = data[b].compressed()
+
+        keys, counts = numpy.unique(data_comp, return_counts=True)
 
         valid_pixels = float(numpy.ma.count(data[b]))
         masked_pixels = float(numpy.ma.count_masked(data[b]))
         valid_percent = round((valid_pixels / data[b].size) * 100, 2)
-        info_px = {
-            "valid_pixels": valid_pixels,
-            "masked_pixels": masked_pixels,
-            "valid_percent": valid_percent,
-        }
 
         if categorical:
             out_dict = dict(zip(keys.tolist(), counts.tolist()))
@@ -115,50 +142,67 @@ def get_array_statistics(
                 h_keys,
             ]
         else:
-            h_counts, h_keys = numpy.histogram(data[b].compressed(), **kwargs)
+            h_counts, h_keys = numpy.histogram(data_comp, **kwargs)
             histogram = [h_counts.tolist(), h_keys.tolist()]
 
-        if valid_pixels:
-            percentiles_values = numpy.percentile(
-                data[b].compressed(), percentiles
-            ).tolist()
-        else:
-            percentiles_values = (numpy.nan,) * len(percentiles_names)
-
-        if coverage is not None:
-            assert coverage.shape == (
-                data.shape[1],
-                data.shape[2],
-            ), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}"
-
-            array = data[b] * coverage
-        else:
-            array = data[b]
-
-        count = array.count() if coverage is None else coverage.sum()
+        # Data coverage fractions
+        data_cov = data[b] * coverage
+        # Coverage Array + data mask
+        masked_coverage = numpy.ma.MaskedArray(coverage, mask=data_cov.mask)
+
+        if valid_pixels:
+            # TODO: when switching to numpy~=2.0
+            # percentiles_values = numpy.percentile(
+            #     data_comp, percentiles, weights=coverage.flatten()
+            # ).tolist()
+            percentiles_values = [
+                _weighted_quantiles(data_comp, masked_coverage.compressed(), pp / 100.0)
+                for pp in percentiles
+            ]
+        else:
+            percentiles_values = [numpy.nan] * len(percentiles_names)
+
         if valid_pixels:
             majority = float(keys[counts.tolist().index(counts.max())].tolist())
             minority = float(keys[counts.tolist().index(counts.min())].tolist())
         else:
             majority = numpy.nan
             minority = numpy.nan
 
+        _count = masked_coverage.sum()
+        _sum = data_cov.sum()
+
         output.append(
             {
+                # Minimum value, not taking coverage fractions into account.
                 "min": float(data[b].min()),
+                # Maximum value, not taking coverage fractions into account.
                 "max": float(data[b].max()),
-                "mean": float(array.mean()),
-                "count": float(count),
-                "sum": float(array.sum()),
-                "std": float(array.std()),
-                "median": float(numpy.ma.median(array)),
+                # Mean value, weighted by the percent of each cell that is covered.
+                "mean": float(_sum / _count),
+                # Sum of all non-masked cell coverage fractions.
+                "count": float(_count),
+                # Sum of values, weighted by their coverage fractions.
+                "sum": float(_sum),
+                # Population standard deviation of cell values, taking into account coverage fraction.
+                "std": _weighted_stdev(data_comp, masked_coverage.compressed()),
+                # Median value of cells, weighted by the percent of each cell that is covered.
+                "median": _weighted_quantiles(data_comp, masked_coverage.compressed()),
+                # The value occupying the greatest number of cells.
                 "majority": majority,
+                # The value occupying the least number of cells.
                 "minority": minority,
+                # Unique values.
                 "unique": float(counts.size),
+                # quantiles
                 **dict(zip(percentiles_names, percentiles_values)),
                 "histogram": histogram,
-                **info_px,
+                # Number of non-masked cells, not taking coverage fractions into account.
+                "valid_pixels": valid_pixels,
+                # Number of masked cells, not taking coverage fractions into account.
+                "masked_pixels": masked_pixels,
+                # Percent of valid cells
+                "valid_percent": valid_percent,
             }
         )

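For reference, here is a minimal standalone sketch (an illustration, not part of the commit) of the weighted statistics that the new _weighted_quantiles and _weighted_stdev helpers compute, checked against the 2x2 data/coverage example asserted in tests/test_utils.py below; the variable names are mine:

```python
import math

import numpy

# Same 2x2 example as in the tests below:
# data = [[1, 2], [3, 4]], coverage = [[0.5, 0.0], [1.0, 0.25]]
values = numpy.array([1.0, 2.0, 3.0, 4.0])
weights = numpy.array([0.5, 0.0, 1.0, 0.25])

# Weighted mean: sum(values * coverage) / sum(coverage) = 4.5 / 1.75
mean = numpy.sum(values * weights) / numpy.sum(weights)
assert round(float(mean), 4) == 2.5714

# Weighted median, as in _weighted_quantiles: sort the values, take the
# cumulative weights, and return the value where the cumulative weight
# first reaches half of the total weight.
i = numpy.argsort(values)
c = numpy.cumsum(weights[i])
median = values[i[numpy.searchsorted(c, 0.5 * c[-1])]]
assert median == 3.0

# Weighted population standard deviation, as in _weighted_stdev.
variance = numpy.average((values - mean) ** 2, weights=weights)
assert round(math.sqrt(variance), 2) == 1.05
```

Note that this weighted median is non-interpolating: it returns the first sorted value whose cumulative coverage reaches half the total coverage, which is why the test below gets 3 where exactextract reports 2.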
tests/test_utils.py (27 changes: 26 additions & 1 deletion)
@@ -1,5 +1,6 @@
 """tests rio_tiler.utils"""
 
+import json
 import math
 import os
 import warnings
@@ -460,6 +461,8 @@ def test_get_array_statistics():
         "masked_pixels",
         "valid_percent",
     ]
+    # Make sure the statistics object is JSON serializable
+    assert json.dumps(stats[0])
 
     stats = utils.get_array_statistics(arr, percentiles=[2, 3, 4])
     assert "percentile_2" in stats[0]
Expand Down Expand Up @@ -566,6 +569,7 @@ def parse(content):

def test_get_array_statistics_coverage():
"""Test statistics with coverage array."""
# same test as https://github.com/isciences/exactextract?tab=readme-ov-file#supported-statistics
# Data Array
# 1, 2
# 3, 4
@@ -580,8 +584,12 @@ def test_get_array_statistics_coverage():
     assert len(stats) == 1
     assert stats[0]["min"] == 1
     assert stats[0]["max"] == 4
-    assert stats[0]["mean"] == 1.125  # (1 * 0.5 + 2 * 0.0 + 3 * 1.0 + 4 * 0.25) / 4
+    assert (
+        round(stats[0]["mean"], 4) == 2.5714
+    )  # sum of weighted array / sum of weights | 4.5 / 1.75 = 2.57
     assert stats[0]["count"] == 1.75
+    assert stats[0]["median"] == 3  # 2 in exactextract
+    assert round(stats[0]["std"], 2) == 1.05
 
     stats = utils.get_array_statistics(data)
     assert len(stats) == 1
@@ -590,6 +598,23 @@ def test_get_array_statistics_coverage():
     assert stats[0]["mean"] == 2.5
     assert stats[0]["count"] == 4
 
+    # same test as https://github.com/isciences/exactextract/blob/0883cd585d9c7b6b4e936aeca4aa84a15adf82d2/python/tests/test_exact_extract.py#L48-L110
+    data = np.ma.arange(1, 10, dtype=np.int32).reshape(3, 3)
+    coverage = np.array([0.25, 0.5, 0.25, 0.5, 1.0, 0.5, 0.25, 0.5, 0.25]).reshape(3, 3)
+    stats = utils.get_array_statistics(data, coverage=coverage, percentiles=[25, 75])
+    assert len(stats) == 1
+    assert stats[0]["count"] == 4
+    assert stats[0]["mean"] == 5
+    assert stats[0]["median"] == 5
+    assert stats[0]["min"] == 1
+    assert stats[0]["max"] == 9
+    # exactextract takes coverage into account, we don't
+    assert stats[0]["minority"] == 1  # 1 in exactextract
+    assert stats[0]["majority"] == 1  # 5 in exactextract
+    assert stats[0]["percentile_25"] == 3
+    assert stats[0]["percentile_75"] == 6
+    assert stats[0]["std"] == math.sqrt(5)
+
 
 def test_get_vrt_transform_world_file(dataset_fixture):
     """Should return correct transform and size."""
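A short sketch (illustration only, not from the commit) of what the new json.dumps(stats[0]) assertion guards against: the standard-library json encoder rejects numpy integer scalars, which is why the statistics are cast to plain Python floats before being returned:

```python
import json

import numpy

# If a raw numpy scalar leaks into the dict, json.dumps() raises.
try:
    json.dumps({"count": numpy.int64(4)})
except TypeError as err:
    print(err)  # Object of type int64 is not JSON serializable

# Casting through float(), as get_array_statistics does, keeps the
# output dict JSON serializable.
print(json.dumps({"count": float(numpy.int64(4))}))  # {"count": 4.0}
```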
