From 0058c6e703281b7614f21a02d35d74da05c887f3 Mon Sep 17 00:00:00 2001 From: atmorling Date: Thu, 31 Oct 2024 22:52:55 +1100 Subject: [PATCH] Allow classifier to label ranges (#269) --- ecoscope/analysis/classifier.py | 36 ++++++++++++++++++++++++++++++--- tests/test_classifier.py | 26 +++++++++++++++++++++++- tests/test_ecomap.py | 9 ++++++++- 3 files changed, 66 insertions(+), 5 deletions(-) diff --git a/ecoscope/analysis/classifier.py b/ecoscope/analysis/classifier.py index 0213e298..acc18973 100644 --- a/ecoscope/analysis/classifier.py +++ b/ecoscope/analysis/classifier.py @@ -25,19 +25,36 @@ # pass in a dataframe and output a series def apply_classification( - dataframe, input_column_name, output_column_name=None, labels=None, scheme="natural_breaks", **kwargs + dataframe, + input_column_name, + output_column_name=None, + labels=None, + scheme="natural_breaks", + label_prefix="", + label_suffix="", + label_ranges=False, + label_decimals=1, + **kwargs, ): """ - Classifies the data in a GeoDataFrame column using specified classification scheme. + Classifies the data in a DataFrame column using specified classification scheme. Args: dataframe (pd.DatFrame): The data. input_column_name (str): The dataframe column to classify. - output_column_name (str): The dataframe column that will contain the classification. + output_column_name (str): The dataframe column that will contain the classification. Defaults to "<input_column_name>_classified" labels (list[str]): labels of bins, use bin edges if labels==None. scheme (str): Classification scheme to use [equal_interval, natural_breaks, quantile, std_mean, max_breaks, fisher_jenks] + label_prefix (str): Prepends provided string to each label + label_suffix (str): Appends provided string to each label + label_ranges (bool): Applicable only when 'labels' is not set + If True, generated labels will be the range between bin edges, + rather than the bin edges themselves.
+ label_decimals (int): Applicable only when 'labels' is not set + Specifies the number of decimal places in the label + **kwargs: Additional keyword arguments specific to the classification scheme, passed to mapclassify. @@ -75,7 +92,20 @@ def apply_classification( classifier = classifier_class(dataframe[input_column_name].to_numpy(), **kwargs) if labels is None: labels = classifier.bins + + if label_ranges: + # We could do this using mapclassify.get_legend_classes, but this generates a cleaner label + ranges = [f"{dataframe[input_column_name].min():.{label_decimals}f} - {labels[0]:.{label_decimals}f}"] + ranges.extend( + [f"{labels[i]:.{label_decimals}f} - {labels[i + 1]:.{label_decimals}f}" for i in range(len(labels) - 1)] + ) + labels = ranges + else: + labels = [round(label, label_decimals) for label in labels] + assert len(labels) == len(classifier.bins) + if label_prefix or label_suffix: + labels = [f"{label_prefix}{label}{label_suffix}" for label in labels] dataframe[output_column_name] = [labels[i] for i in classifier.yb] return dataframe diff --git a/tests/test_classifier.py b/tests/test_classifier.py index ec943717..4b5491e5 100644 --- a/tests/test_classifier.py +++ b/tests/test_classifier.py @@ -21,7 +21,7 @@ def sample_df(): ( "std_mean", {"multiples": [-2, -1, 1, 2]}, - [1.4188611699158102, 4.58113883008419, 4.58113883008419, 4.58113883008419, 6.16227766016838], + [1.4, 4.6, 4.6, 4.6, 6.2], ), ("max_breaks", {"k": 4}, [2.5, 2.5, 3.5, 4.5, 5.0]), ("fisher_jenks", {"k": 5}, [1.0, 2.0, 3.0, 4.0, 5.0]), @@ -38,6 +38,19 @@ def test_classify_with_labels(sample_df): assert result["value_classified"].values.tolist() == ["1", "1", "1", "2", "2"] +def test_classify_with_labels_prefix_suffix(sample_df): + result = apply_classification( + sample_df, + input_column_name="value", + labels=["1", "2"], + label_prefix="_", + label_suffix="_", + scheme="equal_interval", + k=2, + ) + assert result["value_classified"].values.tolist() == ["_1_", "_1_", "_1_", "_2_", 
"_2_"] + + def test_classify_with_invalid_labels(sample_df): with pytest.raises(AssertionError): apply_classification(sample_df, input_column_name="value", labels=[0], scheme="std_mean") @@ -104,3 +117,14 @@ def test_apply_colormap_cmap_user_defined_bad(movebank_relocations): with pytest.raises(AssertionError): apply_color_map(classified, "speed_bins", cmap) + + +def test_classify_with_ranges(sample_df): + result = apply_classification(sample_df, input_column_name="value", scheme="equal_interval", label_ranges=True, k=5) + assert result["value_classified"].values.tolist() == [ + "1.0 - 1.8", + "1.8 - 2.6", + "2.6 - 3.4", + "3.4 - 4.2", + "4.2 - 5.0", + ] diff --git a/tests/test_ecomap.py b/tests/test_ecomap.py index 0863634f..773907cb 100644 --- a/tests/test_ecomap.py +++ b/tests/test_ecomap.py @@ -279,13 +279,20 @@ def test_add_polyline_with_color(movebank_relocations): trajectory = ecoscope.base.Trajectory.from_relocations(movebank_relocations) # this is effectively a reimplementation of SpeedDataFrame apply_classification( - trajectory, input_column_name="speed_kmhr", output_column_name="speed_bins", scheme="equal_interval", k=6 + trajectory, + input_column_name="speed_kmhr", + output_column_name="speed_bins", + scheme="equal_interval", + label_suffix=" km/h", + label_ranges=True, + k=6, ) cmap = ["#1a9850", "#91cf60", "#d9ef8b", "#fee08b", "#fc8d59", "#d73027"] apply_color_map(trajectory, "speed_bins", cmap=cmap, output_column_name="speed_colors") m = EcoMap() m.add_layer(m.polyline_layer(trajectory, color_column="speed_colors", get_width=2000)) + m.add_legend(labels=trajectory["speed_bins"], colors=trajectory["speed_colors"]) assert len(m.layers) == 2 assert isinstance(m.layers[1], PathLayer)