Skip to content

Commit

Permalink
Applies the changes to the LGBM regressor
Browse files Browse the repository at this point in the history
Also adds a unit test.
  • Loading branch information
fberanizo committed Aug 29, 2022
1 parent 03e84e0 commit 5a498d9
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 5 deletions.
16 changes: 11 additions & 5 deletions src/fklearn/training/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,7 @@ def lgbm_regression_learner(df: pd.DataFrame,
learning_rate: float = 0.1,
num_estimators: int = 100,
extra_params: Dict[str, Any] = None,
categorical_features: Union[List[str], str] = "auto",
prediction_column: str = "prediction",
weight_column: str = None,
encode_extra_cols: bool = True) -> LearnerReturnType:
Expand Down Expand Up @@ -458,6 +459,11 @@ def lgbm_regression_learner(df: pd.DataFrame,
https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
If not passed, the default will be used.
categorical_features : list of str, or 'auto', optional (default="auto")
A list of column names that should be treated as categorical features.
See the categorical_feature hyper-parameter in:
https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
prediction_column : str
The name of the column with the predictions from the model.
Expand All @@ -474,17 +480,17 @@ def lgbm_regression_learner(df: pd.DataFrame,
params = assoc(params, "eta", learning_rate)
params = params if "objective" in params else assoc(params, "objective", 'regression')

weights = df[weight_column].values if weight_column else None
weights = df[weight_column] if weight_column else None

features = features if not encode_extra_cols else expand_features_encoded(df, features)

dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights,
silent=True)
dtrain = lgbm.Dataset(df[features], label=df[target], feature_name=list(map(str, features)), weight=weights,
silent=True, categorical_feature=categorical_features)

bst = lgbm.train(params, dtrain, num_estimators)
bst = lgbm.train(params, dtrain, num_estimators, categorical_feature=categorical_features)

def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
col_dict = {prediction_column: bst.predict(new_df[features].values)}
col_dict = {prediction_column: bst.predict(new_df[features])}

if apply_shap:
import shap
Expand Down
30 changes: 30 additions & 0 deletions tests/training/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ def test_lgbm_regression_learner():
assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
assert (pred_test.columns == pred_train.columns).all()
assert all(tree['num_cat'] == 0 for tree in log['object'].dump_model()['tree_info'])
assert "prediction" in pred_test.columns

# SHAP test
Expand All @@ -177,6 +178,35 @@ def test_lgbm_regression_learner():
assert "shap_expected_value" in pred_shap.columns
assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)

learner = lgbm_regression_learner(features=features,
target="y",
learning_rate=0.1,
num_estimators=1,
categorical_features=["x2"],
extra_params={"max_depth": 2,
"min_data_in_leaf": 1,
"min_data_per_group": 1,
"seed": 42},
prediction_column="prediction")

predict_fn, pred_train, log = learner(df_train)

pred_test = predict_fn(df_test)

expected_col_train = df_train.columns.tolist() + ["prediction"]
expected_col_test = df_test.columns.tolist() + ["prediction"]

assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
assert (pred_test.columns == pred_train.columns).all()
assert any(tree['num_cat'] > 0 for tree in log['object'].dump_model()['tree_info'])

# SHAP test
pred_shap = predict_fn(df_test, apply_shap=True)
assert "shap_values" in pred_shap.columns
assert "shap_expected_value" in pred_shap.columns
assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)


def test_catboost_regressor_learner():
df_train = pd.DataFrame({
Expand Down

0 comments on commit 5a498d9

Please sign in to comment.