From 0058c6e703281b7614f21a02d35d74da05c887f3 Mon Sep 17 00:00:00 2001
From: atmorling <atmorling@gmail.com>
Date: Thu, 31 Oct 2024 22:52:55 +1100
Subject: [PATCH] Allow classifier to label ranges (#269)

---
 ecoscope/analysis/classifier.py | 36 ++++++++++++++++++++++++++++++---
 tests/test_classifier.py        | 26 +++++++++++++++++++++++-
 tests/test_ecomap.py            |  9 ++++++++-
 3 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/ecoscope/analysis/classifier.py b/ecoscope/analysis/classifier.py
index 0213e298..acc18973 100644
--- a/ecoscope/analysis/classifier.py
+++ b/ecoscope/analysis/classifier.py
@@ -25,19 +25,36 @@
 
 # pass in a dataframe and output a series
 def apply_classification(
-    dataframe, input_column_name, output_column_name=None, labels=None, scheme="natural_breaks", **kwargs
+    dataframe,
+    input_column_name,
+    output_column_name=None,
+    labels=None,
+    scheme="natural_breaks",
+    label_prefix="",
+    label_suffix="",
+    label_ranges=False,
+    label_decimals=1,
+    **kwargs,
 ):
     """
-    Classifies the data in a GeoDataFrame column using specified classification scheme.
+    Classifies the data in a DataFrame column using specified classification scheme.
 
     Args:
     dataframe (pd.DatFrame): The data.
     input_column_name (str): The dataframe column to classify.
-    output_column_name (str): The dataframe column that will contain the classification.
+    output_column_name (str): The dataframe column that will contain the classification.
         Defaults to "<input_column_name>_classified"
     labels (list[str]): labels of bins, use bin edges if labels==None.
     scheme (str): Classification scheme to use [equal_interval, natural_breaks, quantile, std_mean, max_breaks,
     fisher_jenks]
+    label_prefix (str): Prepends the provided string to each label.
+    label_suffix (str): Appends the provided string to each label.
+    label_ranges (bool): Applicable only when 'labels' is not set.
+                         If True, generated labels will be the range between bin edges,
+                         rather than the bin edges themselves.
+    label_decimals (int): Applicable only when 'labels' is not set.
+                          Specifies the number of decimal places in the label.
+
 
     **kwargs:
         Additional keyword arguments specific to the classification scheme, passed to mapclassify.
@@ -75,7 +92,20 @@ def apply_classification(
     classifier = classifier_class(dataframe[input_column_name].to_numpy(), **kwargs)
     if labels is None:
         labels = classifier.bins
+
+        if label_ranges:
+            # We could do this using mapclassify.get_legend_classes, but this generates a cleaner label
+            ranges = [f"{dataframe[input_column_name].min():.{label_decimals}f} - {labels[0]:.{label_decimals}f}"]
+            ranges.extend(
+                [f"{labels[i]:.{label_decimals}f} - {labels[i + 1]:.{label_decimals}f}" for i in range(len(labels) - 1)]
+            )
+            labels = ranges
+        else:
+            labels = [round(label, label_decimals) for label in labels]
+
     assert len(labels) == len(classifier.bins)
+    if label_prefix or label_suffix:
+        labels = [f"{label_prefix}{label}{label_suffix}" for label in labels]
     dataframe[output_column_name] = [labels[i] for i in classifier.yb]
     return dataframe
 
diff --git a/tests/test_classifier.py b/tests/test_classifier.py
index ec943717..4b5491e5 100644
--- a/tests/test_classifier.py
+++ b/tests/test_classifier.py
@@ -21,7 +21,7 @@ def sample_df():
         (
             "std_mean",
             {"multiples": [-2, -1, 1, 2]},
-            [1.4188611699158102, 4.58113883008419, 4.58113883008419, 4.58113883008419, 6.16227766016838],
+            [1.4, 4.6, 4.6, 4.6, 6.2],
         ),
         ("max_breaks", {"k": 4}, [2.5, 2.5, 3.5, 4.5, 5.0]),
         ("fisher_jenks", {"k": 5}, [1.0, 2.0, 3.0, 4.0, 5.0]),
@@ -38,6 +38,19 @@ def test_classify_with_labels(sample_df):
     assert result["value_classified"].values.tolist() == ["1", "1", "1", "2", "2"]
 
 
+def test_classify_with_labels_prefix_suffix(sample_df):
+    result = apply_classification(
+        sample_df,
+        input_column_name="value",
+        labels=["1", "2"],
+        label_prefix="_",
+        label_suffix="_",
+        scheme="equal_interval",
+        k=2,
+    )
+    assert result["value_classified"].values.tolist() == ["_1_", "_1_", "_1_", "_2_", "_2_"]
+
+
 def test_classify_with_invalid_labels(sample_df):
     with pytest.raises(AssertionError):
         apply_classification(sample_df, input_column_name="value", labels=[0], scheme="std_mean")
@@ -104,3 +117,14 @@ def test_apply_colormap_cmap_user_defined_bad(movebank_relocations):
 
     with pytest.raises(AssertionError):
         apply_color_map(classified, "speed_bins", cmap)
+
+
+def test_classify_with_ranges(sample_df):
+    result = apply_classification(sample_df, input_column_name="value", scheme="equal_interval", label_ranges=True, k=5)
+    assert result["value_classified"].values.tolist() == [
+        "1.0 - 1.8",
+        "1.8 - 2.6",
+        "2.6 - 3.4",
+        "3.4 - 4.2",
+        "4.2 - 5.0",
+    ]
diff --git a/tests/test_ecomap.py b/tests/test_ecomap.py
index 0863634f..773907cb 100644
--- a/tests/test_ecomap.py
+++ b/tests/test_ecomap.py
@@ -279,13 +279,20 @@ def test_add_polyline_with_color(movebank_relocations):
     trajectory = ecoscope.base.Trajectory.from_relocations(movebank_relocations)
     # this is effectively a reimplementation of SpeedDataFrame
     apply_classification(
-        trajectory, input_column_name="speed_kmhr", output_column_name="speed_bins", scheme="equal_interval", k=6
+        trajectory,
+        input_column_name="speed_kmhr",
+        output_column_name="speed_bins",
+        scheme="equal_interval",
+        label_suffix=" km/h",
+        label_ranges=True,
+        k=6,
     )
     cmap = ["#1a9850", "#91cf60", "#d9ef8b", "#fee08b", "#fc8d59", "#d73027"]
     apply_color_map(trajectory, "speed_bins", cmap=cmap, output_column_name="speed_colors")
 
     m = EcoMap()
     m.add_layer(m.polyline_layer(trajectory, color_column="speed_colors", get_width=2000))
+    m.add_legend(labels=trajectory["speed_bins"], colors=trajectory["speed_colors"])
 
     assert len(m.layers) == 2
     assert isinstance(m.layers[1], PathLayer)