Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix statistics for coverage #684

Merged
merged 3 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 74 additions & 30 deletions rio_tiler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,27 @@ def _chunks(my_list: Sequence, chuck_size: int) -> Generator[Sequence, None, Non
yield my_list[i : i + chuck_size]


# Ref: https://stackoverflow.com/posts/73905572
def _weighted_quantiles(
values: NDArray[numpy.floating],
weights: NDArray[numpy.floating],
quantiles: float = 0.5,
) -> float:
i = numpy.argsort(values)
c = numpy.cumsum(weights[i])
return values[i[numpy.searchsorted(c, numpy.array(quantiles) * c[-1])]]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will be removed with numpy 2.0



# Ref: https://stackoverflow.com/questions/2413522
def _weighted_stdev(
values: NDArray[numpy.floating],
weights: NDArray[numpy.floating],
) -> float:
average = numpy.average(values, weights=weights)
variance = numpy.average((values - average) ** 2, weights=weights)
return float(math.sqrt(variance))


def get_array_statistics(
data: numpy.ma.MaskedArray,
categorical: bool = False,
Expand Down Expand Up @@ -85,25 +106,31 @@ def get_array_statistics(
percentiles = percentiles or [2, 98]

if len(data.shape) < 3:
data = numpy.expand_dims(data, axis=0)
data = numpy.ma.expand_dims(data, axis=0)

output: List[Dict[Any, Any]] = []
percentiles_names = [f"percentile_{int(p)}" for p in percentiles]

if coverage is not None:
assert coverage.shape == (
data.shape[1],
data.shape[2],
), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}"

else:
coverage = numpy.ones((data.shape[1], data.shape[2]))

# Avoid non masked nan/inf values
numpy.ma.fix_invalid(data, copy=False)

for b in range(data.shape[0]):
keys, counts = numpy.unique(data[b].compressed(), return_counts=True)
data_comp = data[b].compressed()
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

data[b].compressed() was called multiple times


keys, counts = numpy.unique(data_comp, return_counts=True)

valid_pixels = float(numpy.ma.count(data[b]))
masked_pixels = float(numpy.ma.count_masked(data[b]))
valid_percent = round((valid_pixels / data[b].size) * 100, 2)
info_px = {
"valid_pixels": valid_pixels,
"masked_pixels": masked_pixels,
"valid_percent": valid_percent,
}

if categorical:
out_dict = dict(zip(keys.tolist(), counts.tolist()))
Expand All @@ -115,50 +142,67 @@ def get_array_statistics(
h_keys,
]
else:
h_counts, h_keys = numpy.histogram(data[b].compressed(), **kwargs)
h_counts, h_keys = numpy.histogram(data_comp, **kwargs)
histogram = [h_counts.tolist(), h_keys.tolist()]

if valid_pixels:
percentiles_values = numpy.percentile(
data[b].compressed(), percentiles
).tolist()
else:
percentiles_values = (numpy.nan,) * len(percentiles_names)

if coverage is not None:
assert coverage.shape == (
data.shape[1],
data.shape[2],
), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}"

array = data[b] * coverage
# Data coverage fractions
data_cov = data[b] * coverage
# Coverage Array + data mask
masked_coverage = numpy.ma.MaskedArray(coverage, mask=data_cov.mask)

if valid_pixels:
# TODO: when switching to numpy~=2.0
# percentiles_values = numpy.percentile(
# data_comp, percentiles, weights=coverage.flatten()
# ).tolist()
percentiles_values = [
_weighted_quantiles(data_comp, masked_coverage.compressed(), pp / 100.0)
for pp in percentiles
]
else:
array = data[b]
percentiles_values = [numpy.nan] * len(percentiles_names)

count = array.count() if coverage is None else coverage.sum()
if valid_pixels:
majority = float(keys[counts.tolist().index(counts.max())].tolist())
minority = float(keys[counts.tolist().index(counts.min())].tolist())
else:
majority = numpy.nan
minority = numpy.nan

_count = masked_coverage.sum()
_sum = data_cov.sum()

output.append(
{
# Minimum value, not taking coverage fractions into account.
"min": float(data[b].min()),
# Maximum value, not taking coverage fractions into account.
"max": float(data[b].max()),
"mean": float(array.mean()),
"count": float(count),
"sum": float(array.sum()),
"std": float(array.std()),
"median": float(numpy.ma.median(array)),
# Mean value, weighted by the percent of each cell that is covered.
"mean": float(_sum / _count),
# Sum of all non-masked cell coverage fractions.
"count": float(_count),
# Sum of values, weighted by their coverage fractions.
"sum": float(_sum),
# Population standard deviation of cell values, taking into account coverage fraction.
"std": _weighted_stdev(data_comp, masked_coverage.compressed()),
# Median value of cells, weighted by the percent of each cell that is covered.
"median": _weighted_quantiles(data_comp, masked_coverage.compressed()),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

std and median are now weighted by the coverage array

# The value occupying the greatest number of cells.
"majority": majority,
# The value occupying the least number of cells.
"minority": minority,
# Unique values.
"unique": float(counts.size),
# quantiles
**dict(zip(percentiles_names, percentiles_values)),
"histogram": histogram,
**info_px,
# Number of non-masked cells, not taking coverage fractions into account.
"valid_pixels": valid_pixels,
# Number of masked cells, not taking coverage fractions into account.
"masked_pixels": masked_pixels,
# Percent of valid cells
"valid_percent": valid_percent,
}
)

Expand Down
24 changes: 23 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@ def parse(content):

def test_get_array_statistics_coverage():
"""Test statistics with coverage array."""
# same test as https://github.com/isciences/exactextract?tab=readme-ov-file#supported-statistics
# Data Array
# 1, 2
# 3, 4
Expand All @@ -580,8 +581,12 @@ def test_get_array_statistics_coverage():
assert len(stats) == 1
assert stats[0]["min"] == 1
assert stats[0]["max"] == 4
assert stats[0]["mean"] == 1.125 # (1 * 0.5 + 2 * 0.0 + 3 * 1.0 + 4 * 0.25) / 4
assert (
round(stats[0]["mean"], 4) == 2.5714
) # sum of weighted array / sum of weights | 4.5 / 1.75 = 2.57
assert stats[0]["count"] == 1.75
assert stats[0]["median"] == 3 # 2 in exactextract
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no idea why median gives a different results. I've tested a new numpy 2.0 method and it gives 3 while exactextract give 2. I don't want to over engineer the median calculation

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For a raster of type T exactextract is returning type T for the quantile and median calculations. T here is int64, so the median of 2.5 is getting truncated to 2. Maybe quantile/median should be returning float64 instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oO that makes sense 🙏 thanks for having a look

assert round(stats[0]["std"], 2) == 1.05

stats = utils.get_array_statistics(data)
assert len(stats) == 1
Expand All @@ -590,6 +595,23 @@ def test_get_array_statistics_coverage():
assert stats[0]["mean"] == 2.5
assert stats[0]["count"] == 4

# same test as https://github.com/isciences/exactextract/blob/0883cd585d9c7b6b4e936aeca4aa84a15adf82d2/python/tests/test_exact_extract.py#L48-L110
data = np.ma.arange(1, 10, dtype=np.int32).reshape(3, 3)
coverage = np.array([0.25, 0.5, 0.25, 0.5, 1.0, 0.5, 0.25, 0.5, 0.25]).reshape(3, 3)
stats = utils.get_array_statistics(data, coverage=coverage, percentiles=[25, 75])
assert len(stats) == 1
assert stats[0]["count"] == 4
assert stats[0]["mean"] == 5
assert stats[0]["median"] == 5
assert stats[0]["min"] == 1
assert stats[0]["max"] == 9
# exactextract takes coverage into account, we don't
assert stats[0]["minority"] == 1 # 1 in exactextract
assert stats[0]["majority"] == 1 # 5 in exactextract
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minority and majority do not take coverage into account. We might do this later if needed

assert stats[0]["percentile_25"] == 3
assert stats[0]["percentile_75"] == 6
assert stats[0]["std"] == math.sqrt(5)


def test_get_vrt_transform_world_file(dataset_fixture):
"""Should return correct transform and size."""
Expand Down
Loading