-
Notifications
You must be signed in to change notification settings - Fork 108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix statistics for coverage #684
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,27 @@ def _chunks(my_list: Sequence, chuck_size: int) -> Generator[Sequence, None, Non | |
yield my_list[i : i + chuck_size] | ||
|
||
|
||
# Ref: https://stackoverflow.com/posts/73905572 | ||
def _weighted_quantiles( | ||
values: NDArray[numpy.floating], | ||
weights: NDArray[numpy.floating], | ||
quantiles: float = 0.5, | ||
) -> float: | ||
i = numpy.argsort(values) | ||
c = numpy.cumsum(weights[i]) | ||
return values[i[numpy.searchsorted(c, numpy.array(quantiles) * c[-1])]] | ||
|
||
|
||
# Ref: https://stackoverflow.com/questions/2413522 | ||
def _weighted_stdev( | ||
values: NDArray[numpy.floating], | ||
weights: NDArray[numpy.floating], | ||
) -> float: | ||
average = numpy.average(values, weights=weights) | ||
variance = numpy.average((values - average) ** 2, weights=weights) | ||
return float(math.sqrt(variance)) | ||
|
||
|
||
def get_array_statistics( | ||
data: numpy.ma.MaskedArray, | ||
categorical: bool = False, | ||
|
@@ -85,25 +106,31 @@ def get_array_statistics( | |
percentiles = percentiles or [2, 98] | ||
|
||
if len(data.shape) < 3: | ||
data = numpy.expand_dims(data, axis=0) | ||
data = numpy.ma.expand_dims(data, axis=0) | ||
|
||
output: List[Dict[Any, Any]] = [] | ||
percentiles_names = [f"percentile_{int(p)}" for p in percentiles] | ||
|
||
if coverage is not None: | ||
assert coverage.shape == ( | ||
data.shape[1], | ||
data.shape[2], | ||
), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}" | ||
|
||
else: | ||
coverage = numpy.ones((data.shape[1], data.shape[2])) | ||
|
||
# Avoid non masked nan/inf values | ||
numpy.ma.fix_invalid(data, copy=False) | ||
|
||
for b in range(data.shape[0]): | ||
keys, counts = numpy.unique(data[b].compressed(), return_counts=True) | ||
data_comp = data[b].compressed() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
keys, counts = numpy.unique(data_comp, return_counts=True) | ||
|
||
valid_pixels = float(numpy.ma.count(data[b])) | ||
masked_pixels = float(numpy.ma.count_masked(data[b])) | ||
valid_percent = round((valid_pixels / data[b].size) * 100, 2) | ||
info_px = { | ||
"valid_pixels": valid_pixels, | ||
"masked_pixels": masked_pixels, | ||
"valid_percent": valid_percent, | ||
} | ||
|
||
if categorical: | ||
out_dict = dict(zip(keys.tolist(), counts.tolist())) | ||
|
@@ -115,50 +142,67 @@ def get_array_statistics( | |
h_keys, | ||
] | ||
else: | ||
h_counts, h_keys = numpy.histogram(data[b].compressed(), **kwargs) | ||
h_counts, h_keys = numpy.histogram(data_comp, **kwargs) | ||
histogram = [h_counts.tolist(), h_keys.tolist()] | ||
|
||
if valid_pixels: | ||
percentiles_values = numpy.percentile( | ||
data[b].compressed(), percentiles | ||
).tolist() | ||
else: | ||
percentiles_values = (numpy.nan,) * len(percentiles_names) | ||
|
||
if coverage is not None: | ||
assert coverage.shape == ( | ||
data.shape[1], | ||
data.shape[2], | ||
), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}" | ||
|
||
array = data[b] * coverage | ||
# Data coverage fractions | ||
data_cov = data[b] * coverage | ||
# Coverage Array + data mask | ||
masked_coverage = numpy.ma.MaskedArray(coverage, mask=data_cov.mask) | ||
|
||
if valid_pixels: | ||
# TODO: when switching to numpy~=2.0 | ||
# percentiles_values = numpy.percentile( | ||
# data_comp, percentiles, weights=coverage.flatten() | ||
# ).tolist() | ||
percentiles_values = [ | ||
_weighted_quantiles(data_comp, masked_coverage.compressed(), pp / 100.0) | ||
for pp in percentiles | ||
] | ||
else: | ||
array = data[b] | ||
percentiles_values = [numpy.nan] * len(percentiles_names) | ||
|
||
count = array.count() if coverage is None else coverage.sum() | ||
if valid_pixels: | ||
majority = float(keys[counts.tolist().index(counts.max())].tolist()) | ||
minority = float(keys[counts.tolist().index(counts.min())].tolist()) | ||
else: | ||
majority = numpy.nan | ||
minority = numpy.nan | ||
|
||
_count = masked_coverage.sum() | ||
_sum = data_cov.sum() | ||
|
||
output.append( | ||
{ | ||
# Minimum value, not taking coverage fractions into account. | ||
"min": float(data[b].min()), | ||
# Maximum value, not taking coverage fractions into account. | ||
"max": float(data[b].max()), | ||
"mean": float(array.mean()), | ||
"count": float(count), | ||
"sum": float(array.sum()), | ||
"std": float(array.std()), | ||
"median": float(numpy.ma.median(array)), | ||
# Mean value, weighted by the percent of each cell that is covered. | ||
"mean": float(_sum / _count), | ||
# Sum of all non-masked cell coverage fractions. | ||
"count": float(_count), | ||
# Sum of values, weighted by their coverage fractions. | ||
"sum": float(_sum), | ||
# Population standard deviation of cell values, taking into account coverage fraction. | ||
"std": _weighted_stdev(data_comp, masked_coverage.compressed()), | ||
# Median value of cells, weighted by the percent of each cell that is covered. | ||
"median": _weighted_quantiles(data_comp, masked_coverage.compressed()), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
# The value occupying the greatest number of cells. | ||
"majority": majority, | ||
# The value occupying the least number of cells. | ||
"minority": minority, | ||
# Unique values. | ||
"unique": float(counts.size), | ||
# quantiles | ||
**dict(zip(percentiles_names, percentiles_values)), | ||
"histogram": histogram, | ||
**info_px, | ||
# Number of non-masked cells, not taking coverage fractions into account. | ||
"valid_pixels": valid_pixels, | ||
# Number of masked cells, not taking coverage fractions into account. | ||
"masked_pixels": masked_pixels, | ||
# Percent of valid cells | ||
"valid_percent": valid_percent, | ||
} | ||
) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -566,6 +566,7 @@ def parse(content): | |
|
||
def test_get_array_statistics_coverage(): | ||
"""Test statistics with coverage array.""" | ||
# same test as https://github.com/isciences/exactextract?tab=readme-ov-file#supported-statistics | ||
# Data Array | ||
# 1, 2 | ||
# 3, 4 | ||
|
@@ -580,8 +581,12 @@ def test_get_array_statistics_coverage(): | |
assert len(stats) == 1 | ||
assert stats[0]["min"] == 1 | ||
assert stats[0]["max"] == 4 | ||
assert stats[0]["mean"] == 1.125 # (1 * 0.5 + 2 * 0.0 + 3 * 1.0 + 4 * 0.25) / 4 | ||
assert ( | ||
round(stats[0]["mean"], 4) == 2.5714 | ||
) # sum of weighted array / sum of weights | 4.5 / 1.75 = 2.57 | ||
assert stats[0]["count"] == 1.75 | ||
assert stats[0]["median"] == 3 # 2 in exactextract | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have no idea why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For a raster of type There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oO that makes sense 🙏 thanks for having a look |
||
assert round(stats[0]["std"], 2) == 1.05 | ||
|
||
stats = utils.get_array_statistics(data) | ||
assert len(stats) == 1 | ||
|
@@ -590,6 +595,23 @@ def test_get_array_statistics_coverage(): | |
assert stats[0]["mean"] == 2.5 | ||
assert stats[0]["count"] == 4 | ||
|
||
# same test as https://github.com/isciences/exactextract/blob/0883cd585d9c7b6b4e936aeca4aa84a15adf82d2/python/tests/test_exact_extract.py#L48-L110 | ||
data = np.ma.arange(1, 10, dtype=np.int32).reshape(3, 3) | ||
coverage = np.array([0.25, 0.5, 0.25, 0.5, 1.0, 0.5, 0.25, 0.5, 0.25]).reshape(3, 3) | ||
stats = utils.get_array_statistics(data, coverage=coverage, percentiles=[25, 75]) | ||
assert len(stats) == 1 | ||
assert stats[0]["count"] == 4 | ||
assert stats[0]["mean"] == 5 | ||
assert stats[0]["median"] == 5 | ||
assert stats[0]["min"] == 1 | ||
assert stats[0]["max"] == 9 | ||
# exactextract takes coverage into account, we don't | ||
assert stats[0]["minority"] == 1 # 1 in exactextract | ||
assert stats[0]["majority"] == 1 # 5 in exactextract | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
assert stats[0]["percentile_25"] == 3 | ||
assert stats[0]["percentile_75"] == 6 | ||
assert stats[0]["std"] == math.sqrt(5) | ||
|
||
|
||
def test_get_vrt_transform_world_file(dataset_fixture): | ||
"""Should return correct transform and size.""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
will be removed with numpy 2.0