diff --git a/pyproject.toml b/pyproject.toml index 32184ea..fe70d61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ classifiers = [ "Operating System :: OS Independent", ] -dependencies = ["narwhals", "pydantic"] +dependencies = ["narwhals", "pydantic", "scikit-learn"] [project.optional-dependencies] dev = [ diff --git a/sklearo/encoding/target.py b/sklearo/encoding/target.py index 8e8e68c..a89b31a 100644 --- a/sklearo/encoding/target.py +++ b/sklearo/encoding/target.py @@ -88,10 +88,13 @@ def __init__( nw.Categorical, nw.String, ), - unseen: Literal["raise", "ignore"] = "raise", - fill_value_unseen: int | float | None | Literal["mean"] = "mean", + unseen: Literal["raise", "ignore", "fill"] = "raise", + fill_value_unseen: float | None | Literal["mean"] = "mean", missing_values: Literal["encode", "ignore", "raise"] = "encode", + underrepresented_categories: Literal["raise", "fill"] = "raise", + fill_values_underrepresented: float | None | Literal["mean"] = "mean", target_type: Literal["auto", "binary", "multiclass", "continuous"] = "auto", + smooth: Literal["auto"] | float = "auto", ) -> None: self.columns = columns @@ -99,12 +102,90 @@ def __init__( self.unseen = unseen self.fill_value_unseen = fill_value_unseen self.target_type = target_type + self.smooth = smooth + self.underrepresented_categories = underrepresented_categories + self.fill_values_underrepresented = fill_values_underrepresented def _calculate_target_statistic( self, x_y: IntoFrameT, target_col: str, column: str ) -> dict: - mean_target_all_categories = ( - x_y.group_by(column).agg(nw.col(target_col).mean()).rows() + + if column in ( + "category_count", + "sum_target", + "std_target", + "smoothing", + "shrinkage", + "smoothed_target", + ): + # rename the column to avoid conflict + original_column_name = column + x_y = x_y.rename(columns={column: f"{column}_original"}) + column = f"{column}_original" + else: + original_column_name = column + + x_y_grouped = x_y.group_by(column, drop_null_keys=True).agg( + category_count=nw.col(target_col).count(), + sum_target=nw.col(target_col).sum(), + **( + {"std_target": nw.col(target_col).std()} + if self.smooth == "auto" + else {} + ), + ) + underrepresented_categories = x_y_grouped.filter(nw.col("category_count") == 1)[ + column + ].to_list() + if underrepresented_categories: + if self.underrepresented_categories == "raise": + raise ValueError( + f"Found underrepresented categories for the column {original_column_name}: " + f"{underrepresented_categories}. Please consider handling underrepresented " + "categories by using a RareLabelEncoder. Alternatively, set " + "underrepresented_categories to 'fill'." + ) + else: + if self.fill_values_underrepresented == "mean": + fill_values_underrepresented = x_y[target_col].mean() + else: + fill_values_underrepresented = self.fill_values_underrepresented + + x_y_grouped = x_y_grouped.filter( + ~nw.col(column).is_in(underrepresented_categories) + ) + encoding_dict = { + category: fill_values_underrepresented + for category in underrepresented_categories + } + else: + encoding_dict = {} + + if self.smooth == "auto": + target_std = x_y[target_col].std() + x_y_grouped = x_y_grouped.with_columns( + smoothing=nw.col("std_target") / target_std + ) + else: + x_y_grouped = x_y_grouped.with_columns(smoothing=nw.lit(self.smooth)) + + categories_encoding_as_list = ( + x_y_grouped.with_columns( + shrinkage=nw.col("category_count") + / (nw.col("category_count") + nw.col("smoothing")) + ) + .with_columns( + smoothed_target=nw.col("shrinkage") + * nw.col("sum_target") + / nw.col("category_count") + + (1 - nw.col("shrinkage")) + * nw.col("sum_target") + / nw.col("category_count") + ) + .select(column, "smoothed_target") + .rows() ) - mean_target = dict(mean_target_all_categories) - return mean_target + + encoding_dict.update(dict(categories_encoding_as_list)) + + return encoding_dict diff --git a/tests/encoding/test_target.py b/tests/encoding/test_target.py index d7a0e76..ddb19ac 100644 --- a/tests/encoding/test_target.py +++ b/tests/encoding/test_target.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pandas as pd import polars as pl @@ -27,7 +29,26 @@ def multi_class_data(self): } return data - def test_woe_encoder_fit_binary(self, binary_class_data, DataFrame): + @pytest.fixture + def regression_data(self): + data = { + "category": ["A"] * 4 + ["B"] * 6, + "target": [ + 100.0, + 200.0, + 300.0, + 400.0, + 500.0, + 600.0, + 700.0, + 800.0, + 900.0, + 10000.0, + ], + } + return data + + def test_target_encoder_fit_binary(self, binary_class_data, DataFrame): binary_class_data = DataFrame(binary_class_data) encoder = TargetEncoder() encoder.fit(binary_class_data[["category"]], binary_class_data["target"]) @@ -35,7 +56,42 @@ def test_woe_encoder_fit_binary(self, binary_class_data, DataFrame): assert encoder.columns_ == ["category"] assert "category" in encoder.encoding_map_ - def test_woe_encoder_fit_multiclass_non_int_target( + def test_target_encoder_fit_regression(self, regression_data, DataFrame): + regression_data = DataFrame(regression_data) + encoder = TargetEncoder() + encoder.fit(regression_data[["category"]], regression_data["target"]) + + assert encoder.columns_ == ["category"] + assert "category" in encoder.encoding_map_ + + def test_target_encoder_unseen_value_fill_unseen_multiclass( + self, multi_class_data, DataFrame + ): + multi_class_data = DataFrame(multi_class_data) + encoder = TargetEncoder(unseen="fill", fill_value_unseen="mean") + encoder.fit(multi_class_data[["category"]], multi_class_data["target"]) + + new_data = DataFrame({"category": ["A", "B", "D"]}) + with pytest.warns(UserWarning, match="Unseen categories"): + transformed = encoder.transform(new_data) + + np.testing.assert_allclose( + transformed["category_mean_target_class_1"].to_list(), + [0.4, 0.2, 0.3], + rtol=1e-5, + ) + np.testing.assert_allclose( + transformed["category_mean_target_class_2"].to_list(), + [0.2, 0.4, 0.3], + rtol=1e-5, + ) + np.testing.assert_allclose( + transformed["category_mean_target_class_3"].to_list(), + [0.4, 0.4, 0.4], + rtol=1e-5, + ) + + def test_target_encoder_fit_multiclass_non_int_target( self, binary_class_data, DataFrame ): binary_class_data = DataFrame(binary_class_data) @@ -52,7 +108,9 @@ def test_woe_encoder_fit_multiclass_non_int_target( rtol=1e-5, ) - def test_woe_encoder_fit_binary_non_int_target(self, multi_class_data, DataFrame): + def test_target_encoder_fit_binary_non_int_target( + self, multi_class_data, DataFrame + ): multi_class_data = DataFrame(multi_class_data) encoder = TargetEncoder(columns=["target"]) encoder.fit(multi_class_data[["target"]], multi_class_data["category"]) @@ -84,7 +142,7 @@ def test_woe_encoder_fit_binary_non_int_target(self, multi_class_data, DataFrame rtol=1e-5, ) - def test_woe_encoder_fit_binary_non_int_target_classes_1_and_2( + def test_target_encoder_fit_binary_non_int_target_classes_1_and_2( self, binary_class_data, DataFrame ): binary_class_data["target"] = [ @@ -121,7 +179,7 @@ def test_woe_encoder_fit_binary_non_int_target_classes_1_and_2( rtol=1e-5, ) - def test_woe_encoder_fit_with_target_in_X_binary( + def test_target_encoder_fit_with_target_in_X_binary( self, binary_class_data, DataFrame ): binary_class_data = DataFrame(binary_class_data) @@ -132,7 +190,7 @@ def test_woe_encoder_fit_with_target_in_X_binary( assert encoder.columns_ == ["category", "target"] assert "category" in encoder.encoding_map_ - def test_woe_encoder_fit_with_target_in_X_multi_class( + def test_target_encoder_fit_with_target_in_X_multi_class( self, multi_class_data, DataFrame ): multi_class_data = DataFrame(multi_class_data) @@ -143,7 +201,7 @@ def test_woe_encoder_fit_with_target_in_X_multi_class( assert encoder.columns_ == ["category", "target"] assert "category" in encoder.encoding_map_ - def test_woe_encoder_fit_with_empty_columns(self, multi_class_data, DataFrame): + def test_target_encoder_fit_with_empty_columns(self, multi_class_data, DataFrame): multi_class_data = DataFrame(multi_class_data) encoder = TargetEncoder(columns=[]) encoder.fit(multi_class_data[["category"]], multi_class_data["target"]) @@ -151,7 +209,7 @@ def test_woe_encoder_fit_with_empty_columns(self, multi_class_data, DataFrame): assert encoder.columns_ == [] assert encoder.encoding_map_ == {} - def test_woe_encoder_fit_multi_class(self, multi_class_data, DataFrame): + def test_target_encoder_fit_multi_class(self, multi_class_data, DataFrame): multi_class_data = DataFrame(multi_class_data) encoder = TargetEncoder() encoder.fit(multi_class_data[["category"]], multi_class_data["target"]) @@ -159,7 +217,7 @@ def test_woe_encoder_fit_multi_class(self, multi_class_data, DataFrame): assert encoder.columns_ == ["category"] assert "category" in encoder.encoding_map_ - def test_woe_encoder_transform_binary(self, binary_class_data, DataFrame): + def test_target_encoder_transform_binary(self, binary_class_data, DataFrame): binary_class_data = DataFrame(binary_class_data) encoder = TargetEncoder() encoder.fit(binary_class_data[["category"]], binary_class_data["target"]) @@ -181,7 +239,30 @@ def test_woe_encoder_transform_binary(self, binary_class_data, DataFrame): ) assert isinstance(transformed, DataFrame) - def test_woe_encoder_transform_multi_class(self, multi_class_data, DataFrame): + def test_target_encoder_transform_regression(self, regression_data, DataFrame): + regression_data = DataFrame(regression_data) + encoder = TargetEncoder() + encoder.fit(regression_data[["category"]], regression_data["target"]) + transformed = encoder.transform(regression_data[["category"]]) + + expected_values = [ + 250.0, + 250.0, + 250.0, + 250.0, + 2250.0, + 2250.0, + 2250.0, + 2250.0, + 2250.0, + 2250.0, + ] + np.testing.assert_allclose( + transformed["category"].to_list(), expected_values, rtol=1e-5 + ) + assert isinstance(transformed, DataFrame) + + def test_target_encoder_transform_multi_class(self, multi_class_data, DataFrame): multi_class_data = DataFrame(multi_class_data) encoder = TargetEncoder() encoder.fit(multi_class_data[["category"]], multi_class_data["target"]) @@ -209,12 +290,12 @@ def test_woe_encoder_transform_multi_class(self, multi_class_data, DataFrame): np.testing.assert_allclose( transformed["category_mean_target_class_3"], - # For class 3 A counts : 2/5, B counts : 2/ + # For class 3 A counts : 2/5, B counts : 2/5 [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4], rtol=1e-5, ) - def test_woe_encoder_handle_missing_values_binary( + def test_target_encoder_underrepresented_categories_binary( self, binary_class_data, DataFrame ): binary_class_data["category"][0] = None @@ -222,24 +303,121 @@ def test_woe_encoder_handle_missing_values_binary( encoder = TargetEncoder(missing_values="encode") + with pytest.raises( + ValueError, + match=re.escape( + "Found underrepresented categories for the column category: " + "['MISSING']. Please consider handling underrepresented categories by using a " + "RareLabelEncoder. Alternatively, set underrepresented_categories to 'fill'." + ), + ): + encoder.fit(binary_class_data[["category"]], binary_class_data["target"]) + + def test_target_encoder_underrepresented_categories_multi_class( + self, multi_class_data, DataFrame + ): + multi_class_data["category"][0] = None + multi_class_data = DataFrame(multi_class_data) + + encoder = TargetEncoder(missing_values="encode") + with pytest.raises( + ValueError, + match=re.escape( + "Found underrepresented categories for the column category: " + "['MISSING']. Please consider handling underrepresented categories by using a " + "RareLabelEncoder. Alternatively, set underrepresented_categories to 'fill'." + ), + ): + encoder.fit(multi_class_data[["category"]], multi_class_data["target"]) + + def test_target_encoder_handle_missing_values_binary( + self, binary_class_data, DataFrame + ): + binary_class_data["category"][0] = None + binary_class_data = DataFrame(binary_class_data) + + encoder = TargetEncoder( + missing_values="encode", underrepresented_categories="fill" + ) encoder.fit(binary_class_data[["category"]], binary_class_data["target"]) - transformed = encoder.transform(binary_class_data[["category"]]) - assert "MISSING" in encoder.encoding_map_["category"] + transformed = encoder.transform(binary_class_data[["category"]]) + np.testing.assert_allclose( + transformed["category"].to_list(), + [ + 0.555556, + 0.0, + 0.0, + 0.666667, + 0.666667, + 0.666667, + 0.666667, + 0.666667, + 0.666667, + ], + rtol=1e-5, + ) - def test_woe_encoder_handle_missing_values_multi_class( + def test_target_encoder_handle_missing_values_multi_class( self, multi_class_data, DataFrame ): multi_class_data["category"][0] = None multi_class_data = DataFrame(multi_class_data) - encoder = TargetEncoder(missing_values="encode") + encoder = TargetEncoder( + missing_values="encode", underrepresented_categories="fill" + ) + encoder.fit(multi_class_data[["category"]], multi_class_data["target"]) + transformed = encoder.transform(multi_class_data[["category"]]) + np.testing.assert_allclose( + transformed["category_mean_target_class_1"].to_list(), + [0.3, 0.25, 0.25, 0.25, 0.25, 0.2, 0.2, 0.2, 0.2, 0.2], + rtol=1e-5, + ) + np.testing.assert_allclose( + transformed["category_mean_target_class_2"].to_list(), + [0.3, 0.25, 0.25, 0.25, 0.25, 0.4, 0.4, 0.4, 0.4, 0.4], + rtol=1e-5, + ) + np.testing.assert_allclose( + transformed["category_mean_target_class_3"].to_list(), + [0.4, 0.5, 0.5, 0.5, 0.5, 0.4, 0.4, 0.4, 0.4, 0.4], + rtol=1e-5, + ) - assert "MISSING" in encoder.encoding_map_["category"][1] + def test_target_encoder_unnderrepresented_categories_binary_fill_binary_set_value( + self, binary_class_data, DataFrame + ): + binary_class_data["category"][0] = None + binary_class_data = DataFrame(binary_class_data) + + encoder = TargetEncoder( + missing_values="encode", + underrepresented_categories="fill", + fill_values_underrepresented=999, + ) + encoder.fit(binary_class_data[["category"]], binary_class_data["target"]) + + transformed = encoder.transform(binary_class_data[["category"]]) + np.testing.assert_allclose( + transformed["category"].to_list(), + [ + 999, + 0.0, + 0.0, + 0.666667, + 0.666667, + 0.666667, + 0.666667, + 0.666667, + 0.666667, + ], + rtol=1e-5, + ) - def test_woe_encoder_unseen_category_binary(self, binary_class_data, DataFrame): + def test_target_encoder_unseen_category_binary(self, binary_class_data, DataFrame): binary_class_data = DataFrame(binary_class_data) encoder = TargetEncoder(unseen="ignore", fill_value_unseen=-999) encoder.fit(binary_class_data[["category"]], binary_class_data["target"]) @@ -252,7 +430,7 @@ def test_woe_encoder_unseen_category_binary(self, binary_class_data, DataFrame): transformed["category"].to_list(), [0.3333333, 0.6666667, -999], rtol=1e-5 ) - def test_woe_encoder_unseen_category_binary_raise( + def test_target_encoder_unseen_category_binary_raise( self, binary_class_data, DataFrame ): binary_class_data = DataFrame(binary_class_data) @@ -327,7 +505,7 @@ def test_X_y_wrong_size(self, binary_class_data, DataFrame): binary_class_data[["category"]].head(), binary_class_data["target"] ) - def test_woe_encoder_handle_missing_values_raise_in_fit( + def test_target_encoder_handle_missing_values_raise_in_fit( self, binary_class_data, DataFrame ): binary_class_data["category"][0] = None @@ -337,7 +515,7 @@ def test_woe_encoder_handle_missing_values_raise_in_fit( with pytest.raises(ValueError, match="Some columns have missing values."): encoder.fit(binary_class_data[["category"]], binary_class_data["target"]) - def test_woe_encoder_handle_missing_values_raise_in_transform( + def test_target_encoder_handle_missing_values_raise_in_transform( self, binary_class_data, DataFrame ): binary_class_data_df = DataFrame(binary_class_data) @@ -351,7 +529,7 @@ def test_woe_encoder_handle_missing_values_raise_in_transform( with pytest.raises(ValueError, match="Some columns have missing values."): encoder.transform(binary_class_data_df[["category"]]) - def test_woe_encoder_handle_missing_values_ignore_in_fit( + def test_target_encoder_handle_missing_values_ignore_in_fit( self, binary_class_data, DataFrame ): binary_class_data["category"][1] = None @@ -362,7 +540,7 @@ def test_woe_encoder_handle_missing_values_ignore_in_fit( # Ensure that fitting does not raise an error assert encoder is not None - def test_woe_encoder_handle_missing_values_ignore_in_transform( + def test_target_encoder_handle_missing_values_ignore_in_transform( self, binary_class_data, DataFrame ): binary_class_data_df = DataFrame(binary_class_data) @@ -385,7 +563,7 @@ def test_missing_values_in_target_variable(self, binary_class_data, DataFrame): with pytest.raises(ValueError, match="y contains missing values."): encoder.fit(binary_class_data[["category"]], binary_class_data["target"]) - def test_woe_encoder_fit_transform(self, binary_class_data, DataFrame): + def test_target_encoder_fit_transform(self, binary_class_data, DataFrame): binary_class_data_df = DataFrame(binary_class_data) encoder = TargetEncoder() @@ -396,3 +574,27 @@ def test_woe_encoder_fit_transform(self, binary_class_data, DataFrame): # Ensure that the transformed data is not None and has the expected shape assert transformed is not None assert transformed.shape[0] == binary_class_data_df.shape[0] + + def test_target_encoder_explicitly_set_target_type( + self, multi_class_data, DataFrame + ): + multi_class_data = DataFrame(multi_class_data) + encoder = TargetEncoder(target_type="continuous") + encoder.fit(multi_class_data[["category"]], multi_class_data["target"]) + + assert encoder.columns_ == ["category"] + assert "category" in encoder.encoding_map_ + + transformed_data = encoder.transform(multi_class_data[["category"]]) + + assert ( + encoder.get_feature_names_out() + == ["category"] + == list(transformed_data.columns) + ) + + np.testing.assert_allclose( + transformed_data["category"].to_list(), + [2.0, 2.0, 2.0, 2.0, 2.0, 2.2, 2.2, 2.2, 2.2, 2.2], + rtol=1e-5, + ) diff --git a/tests/encoding/test_woe.py b/tests/encoding/test_woe.py index 3b8f5a3..d0e3344 100644 --- a/tests/encoding/test_woe.py +++ b/tests/encoding/test_woe.py @@ -478,3 +478,25 @@ def test_woe_encoder_fit_transform(self, binary_class_data, DataFrame): # Ensure that the transformed data is not None and has the expected shape assert transformed is not None assert transformed.shape[0] == binary_class_data_df.shape[0] + + def test_woe_encoder_with_regression_target_type_raises_error(self, DataFrame): + regression_data = { + "category": ["A"] * 4 + ["B"] * 6, + "target": [ + 100.0, + 200.0, + 300.0, + 400.0, + 500.0, + 600.0, + 700.0, + 800.0, + 900.0, + 10000.0, + ], + } + regression_data_df = DataFrame(regression_data) + + encoder = WOEEncoder() + with pytest.raises(ValueError, match="Invalid type of target 'continuous'."): + encoder.fit(regression_data_df[["category"]], regression_data_df["target"])