cogeotiff · vincentsarago · Mar 22, 2024 · Mar 12, 2024 · Mar 12, 2024 · Mar 18, 2024
diff --git a/rio_tiler/utils.py b/rio_tiler/utils.py
@@ -33,6 +33,27 @@ def _chunks(my_list: Sequence, chuck_size: int) -> Generator[Sequence, None, Non
         yield my_list[i : i + chuck_size]
 
 
+# Ref: https://stackoverflow.com/posts/73905572
+def _weighted_quantiles(
+    values: NDArray[numpy.floating],
+    weights: NDArray[numpy.floating],
+    quantiles: float = 0.5,
+) -> float:
+    i = numpy.argsort(values)
+    c = numpy.cumsum(weights[i])
+    return values[i[numpy.searchsorted(c, numpy.array(quantiles) * c[-1])]]
+
+
+# Ref: https://stackoverflow.com/questions/2413522
+def _weighted_stdev(
+    values: NDArray[numpy.floating],
+    weights: NDArray[numpy.floating],
+) -> float:
+    average = numpy.average(values, weights=weights)
+    variance = numpy.average((values - average) ** 2, weights=weights)
+    return float(math.sqrt(variance))
+
+
 def get_array_statistics(
     data: numpy.ma.MaskedArray,
     categorical: bool = False,
@@ -85,25 +106,31 @@ def get_array_statistics(
     percentiles = percentiles or [2, 98]
 
     if len(data.shape) < 3:
-        data = numpy.expand_dims(data, axis=0)
+        data = numpy.ma.expand_dims(data, axis=0)
 
     output: List[Dict[Any, Any]] = []
     percentiles_names = [f"percentile_{int(p)}" for p in percentiles]
 
+    if coverage is not None:
+        assert coverage.shape == (
+            data.shape[1],
+            data.shape[2],
+        ), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}"
+
+    else:
+        coverage = numpy.ones((data.shape[1], data.shape[2]))
+
     # Avoid non masked nan/inf values
     numpy.ma.fix_invalid(data, copy=False)
 
     for b in range(data.shape[0]):
-        keys, counts = numpy.unique(data[b].compressed(), return_counts=True)
+        data_comp = data[b].compressed()
+
+        keys, counts = numpy.unique(data_comp, return_counts=True)
 
         valid_pixels = float(numpy.ma.count(data[b]))
         masked_pixels = float(numpy.ma.count_masked(data[b]))
         valid_percent = round((valid_pixels / data[b].size) * 100, 2)
-        info_px = {
-            "valid_pixels": valid_pixels,
-            "masked_pixels": masked_pixels,
-            "valid_percent": valid_percent,
-        }
 
         if categorical:
             out_dict = dict(zip(keys.tolist(), counts.tolist()))
@@ -115,50 +142,67 @@ def get_array_statistics(
                 h_keys,
             ]
         else:
-            h_counts, h_keys = numpy.histogram(data[b].compressed(), **kwargs)
+            h_counts, h_keys = numpy.histogram(data_comp, **kwargs)
             histogram = [h_counts.tolist(), h_keys.tolist()]
 
-        if valid_pixels:
-            percentiles_values = numpy.percentile(
-                data[b].compressed(), percentiles
-            ).tolist()
-        else:
-            percentiles_values = (numpy.nan,) * len(percentiles_names)
-
-        if coverage is not None:
-            assert coverage.shape == (
-                data.shape[1],
-                data.shape[2],
-            ), f"Invalid shape ({coverage.shape}) for Coverage, expected {(data.shape[1], data.shape[2])}"
-
-            array = data[b] * coverage
+        # Data coverage fractions
+        data_cov = data[b] * coverage
+        # Coverage Array + data mask
+        masked_coverage = numpy.ma.MaskedArray(coverage, mask=data_cov.mask)
 
+        if valid_pixels:
+            # TODO: when switching to numpy~=2.0
+            # percentiles_values = numpy.percentile(
+            #     data_comp, percentiles, weights=coverage.flatten()
+            # ).tolist()
+            percentiles_values = [
+                _weighted_quantiles(data_comp, masked_coverage.compressed(), pp / 100.0)
+                for pp in percentiles
+            ]
         else:
-            array = data[b]
+            percentiles_values = [numpy.nan] * len(percentiles_names)
 
-        count = array.count() if coverage is None else coverage.sum()
         if valid_pixels:
             majority = float(keys[counts.tolist().index(counts.max())].tolist())
             minority = float(keys[counts.tolist().index(counts.min())].tolist())
         else:
             majority = numpy.nan
             minority = numpy.nan
 
+        _count = masked_coverage.sum()
+        _sum = data_cov.sum()
+
         output.append(
             {
+                # Minimum value, not taking coverage fractions into account.
                 "min": float(data[b].min()),
+                # Maximum value, not taking coverage fractions into account.
                 "max": float(data[b].max()),
-                "mean": float(array.mean()),
-                "count": float(count),
-                "sum": float(array.sum()),
-                "std": float(array.std()),
-                "median": float(numpy.ma.median(array)),
+                # Mean value, weighted by the percent of each cell that is covered.
+                "mean": float(_sum / _count),
+                # Sum of all non-masked cell coverage fractions.
+                "count": float(_count),
+                # Sum of values, weighted by their coverage fractions.
+                "sum": float(_sum),
+                # Population standard deviation of cell values, taking into account coverage fraction.
+                "std": _weighted_stdev(data_comp, masked_coverage.compressed()),
+                # Median value of cells, weighted by the percent of each cell that is covered.
+                "median": _weighted_quantiles(data_comp, masked_coverage.compressed()),
+                # The value occupying the greatest number of cells.
                 "majority": majority,
+                # The value occupying the least number of cells.
                 "minority": minority,
+                # Unique values.
                 "unique": float(counts.size),
+                # quantiles
                 **dict(zip(percentiles_names, percentiles_values)),
                 "histogram": histogram,
-                **info_px,
+                # Number of non-masked cells, not taking coverage fractions into account.
+                "valid_pixels": valid_pixels,
+                # Number of masked cells, not taking coverage fractions into account.
+                "masked_pixels": masked_pixels,
+                # Percent of valid cells
+                "valid_percent": valid_percent,
             }
         )
 

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -566,6 +566,7 @@ def parse(content):
 
 def test_get_array_statistics_coverage():
     """Test statistics with coverage array."""
+    # same test as https://github.com/isciences/exactextract?tab=readme-ov-file#supported-statistics
     # Data Array
     # 1, 2
     # 3, 4
@@ -580,8 +581,12 @@ def test_get_array_statistics_coverage():
     assert len(stats) == 1
     assert stats[0]["min"] == 1
     assert stats[0]["max"] == 4
-    assert stats[0]["mean"] == 1.125  # (1 * 0.5 + 2 * 0.0 + 3 * 1.0 + 4 * 0.25) / 4
+    assert (
+        round(stats[0]["mean"], 4) == 2.5714
+    )  # sum of weighted array / sum of weights | 4.5 / 1.75 = 2.57
     assert stats[0]["count"] == 1.75
+    assert stats[0]["median"] == 3  # 2 in exactextract
+    assert round(stats[0]["std"], 2) == 1.05
 
     stats = utils.get_array_statistics(data)
     assert len(stats) == 1
@@ -590,6 +595,23 @@ def test_get_array_statistics_coverage():
     assert stats[0]["mean"] == 2.5
     assert stats[0]["count"] == 4
 
+    # same test as https://github.com/isciences/exactextract/blob/0883cd585d9c7b6b4e936aeca4aa84a15adf82d2/python/tests/test_exact_extract.py#L48-L110
+    data = np.ma.arange(1, 10, dtype=np.int32).reshape(3, 3)
+    coverage = np.array([0.25, 0.5, 0.25, 0.5, 1.0, 0.5, 0.25, 0.5, 0.25]).reshape(3, 3)
+    stats = utils.get_array_statistics(data, coverage=coverage, percentiles=[25, 75])
+    assert len(stats) == 1
+    assert stats[0]["count"] == 4
+    assert stats[0]["mean"] == 5
+    assert stats[0]["median"] == 5
+    assert stats[0]["min"] == 1
+    assert stats[0]["max"] == 9
+    # exactextract takes coverage into account, we don't
+    assert stats[0]["minority"] == 1  # 1 in exactextract
+    assert stats[0]["majority"] == 1  # 5 in exactextract
+    assert stats[0]["percentile_25"] == 3
+    assert stats[0]["percentile_75"] == 6
+    assert stats[0]["std"] == math.sqrt(5)
+
 
 def test_get_vrt_transform_world_file(dataset_fixture):
     """Should return correct transform and size."""