From e3cc12054a33bd634b93ff4628b713827e85fccf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?=
Date: Tue, 12 Nov 2024 16:53:58 -0600
Subject: [PATCH] do not copy column-major numpy arrays when creating Dataset

Flatten 2D numpy input with ravel(order="A") so that Fortran-contiguous
arrays are handed to the C API without a copy and flagged as column-major.

Only flag the data as column-major when the input really is
Fortran-contiguous. A strided view is neither C- nor F-contiguous, and
ravel(order="A") copies it in row-major order, so it has to be flagged as
row-major; deriving the flag from "not C-contiguous" mislabels that copy.

---
 python-package/lightgbm/basic.py         | 22 +++++++++++++++++-----
 tests/python_package_test/test_basic.py  | 13 +++++++++++++
 tests/python_package_test/test_engine.py | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index cf3723aadc63..6d52f670c0fd 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -188,6 +188,14 @@ def _get_sample_count(total_nrow: int, params: str) -> int:
     return sample_cnt.value
 
 
+def _np2d_to_np1d(mat: np.ndarray) -> np.ndarray:
+    data = mat.ravel(order="A")  # keeps memory layout
+    if data.dtype not in (np.float32, np.float64):
+        # change non-float data to float data, need to copy
+        data = data.astype(np.float32)
+    return data
+
+
 class _MissingType(Enum):
     NONE = "None"
     NAN = "NaN"
@@ -685,6 +693,7 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va
 _C_API_DTYPE_INT64 = 3
 
 """Matrix is row major in Python"""
+_C_API_IS_COL_MAJOR = 0
 _C_API_IS_ROW_MAJOR = 1
 
 """Macro definition of prediction type in C API of LightGBM"""
@@ -2297,10 +2306,13 @@ def __init_from_np2d(
             raise ValueError("Input numpy.ndarray must be 2 dimensional")
 
         self._handle = ctypes.c_void_p()
-        if mat.dtype == np.float32 or mat.dtype == np.float64:
-            data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
-        else:  # change non-float data to float data, need to copy
-            data = np.asarray(mat.reshape(mat.size), dtype=np.float32)
+        data = _np2d_to_np1d(mat)
+        # ravel(order="A") flattens in column-major order only for Fortran-contiguous input;
+        # everything else, including strided views (which get copied), comes out row-major
+        if mat.flags["F_CONTIGUOUS"] and not mat.flags["C_CONTIGUOUS"]:
+            layout = _C_API_IS_COL_MAJOR
+        else:
+            layout = _C_API_IS_ROW_MAJOR
 
         ptr_data, type_ptr_data, _ = _c_float_array(data)
         _safe_call(
@@ -2309,7 +2321,7 @@ def __init_from_np2d(
                 ctypes.c_int(type_ptr_data),
                 ctypes.c_int32(mat.shape[0]),
                 ctypes.c_int32(mat.shape[1]),
-                ctypes.c_int(_C_API_IS_ROW_MAJOR),
+                ctypes.c_int(layout),
                 _c_str(params_str),
                 ref_dataset,
                 ctypes.byref(self._handle),
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 0dfe3e47fa11..72ef2866338a 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -947,3 +947,16 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c
         "in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
     )
     assert expected_warning in capsys.readouterr().out
+
+
+@pytest.mark.parametrize("order", ["C", "F"])
+@pytest.mark.parametrize("dtype", ["float32", "int64"])
+def test_no_copy_in_dataset_from_numpy_2d(order, dtype):
+    X = np.random.rand(100, 3)
+    X = np.require(X, dtype=dtype, requirements=order)
+    X1d = lgb.basic._np2d_to_np1d(X)
+    if dtype == "float32":
+        assert np.shares_memory(X, X1d)
+    else:
+        # makes a copy
+        assert not np.shares_memory(X, X1d)
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 9ae471e7f4b9..07735daeec86 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -4611,3 +4611,37 @@ def test_bagging_by_query_in_lambdarank():
     ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"]
     assert ndcg_score_bagging_by_query >= ndcg_score - 0.1
     assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1
+
+
+def test_train_with_column_major_dataset():
+    params = {"num_leaves": 16}
+    rounds = 2
+
+    X_row, y = make_synthetic_regression()
+    assert X_row.flags["C_CONTIGUOUS"]
+    ds_row = lgb.Dataset(X_row, y)
+    bst_row = lgb.train(params, ds_row, num_boost_round=rounds)
+    pred_row = bst_row.predict(X_row)
+    # check that we didn't get a trivial model
+    dumped_row = bst_row.dump_model()
+    assert len(dumped_row["tree_info"]) == rounds
+    for tree in dumped_row["tree_info"]:
+        assert tree["num_leaves"] > 1
+
+    X_col = np.asfortranarray(X_row)
+    assert X_col.flags["F_CONTIGUOUS"]
+    ds_col = lgb.Dataset(X_col, y)
+    bst_col = lgb.train(params, ds_col, num_boost_round=rounds)
+    dumped_col = bst_col.dump_model()
+
+    assert dumped_row == dumped_col
+    np.testing.assert_allclose(bst_col.predict(X_col), pred_row)
+
+    # a strided view is neither C- nor F-contiguous, so it is copied in row-major order
+    X_view = np.concatenate([X_row, X_row], axis=1)[:, : X_row.shape[1]]
+    assert not X_view.flags["C_CONTIGUOUS"]
+    assert not X_view.flags["F_CONTIGUOUS"]
+    ds_view = lgb.Dataset(X_view, y)
+    bst_view = lgb.train(params, ds_view, num_boost_round=rounds)
+
+    assert dumped_row == bst_view.dump_model()