Skip to content

Commit

Permalink
do not copy column-major numpy arrays when creating Dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
jmoralez committed Nov 12, 2024
1 parent 5151fe8 commit e3cc120
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 5 deletions.
20 changes: 15 additions & 5 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,14 @@ def _get_sample_count(total_nrow: int, params: str) -> int:
return sample_cnt.value


def _np2d_to_np1d(mat: np.ndarray) -> np.ndarray:
data = mat.ravel(order="A") # keeps memory layout
if data.dtype not in (np.float32, np.float64):
# change non-float data to float data, need to copy
data = data.astype(np.float32)
return data


class _MissingType(Enum):
NONE = "None"
NAN = "NaN"
Expand Down Expand Up @@ -685,6 +693,7 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va
_C_API_DTYPE_INT64 = 3

"""Matrix is row major in Python"""
_C_API_IS_COL_MAJOR = 0
_C_API_IS_ROW_MAJOR = 1

"""Macro definition of prediction type in C API of LightGBM"""
Expand Down Expand Up @@ -2297,10 +2306,11 @@ def __init_from_np2d(
raise ValueError("Input numpy.ndarray must be 2 dimensional")

self._handle = ctypes.c_void_p()
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
else: # change non-float data to float data, need to copy
data = np.asarray(mat.reshape(mat.size), dtype=np.float32)
data = _np2d_to_np1d(mat)
if mat.flags["C_CONTIGUOUS"]:
layout = _C_API_IS_ROW_MAJOR
else:
layout = _C_API_IS_COL_MAJOR

ptr_data, type_ptr_data, _ = _c_float_array(data)
_safe_call(
Expand All @@ -2309,7 +2319,7 @@ def __init_from_np2d(
ctypes.c_int(type_ptr_data),
ctypes.c_int32(mat.shape[0]),
ctypes.c_int32(mat.shape[1]),
ctypes.c_int(_C_API_IS_ROW_MAJOR),
ctypes.c_int(layout),
_c_str(params_str),
ref_dataset,
ctypes.byref(self._handle),
Expand Down
13 changes: 13 additions & 0 deletions tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -947,3 +947,16 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c
"in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
)
assert expected_warning in capsys.readouterr().out


@pytest.mark.parametrize("order", ["C", "F"])
@pytest.mark.parametrize("dtype", ["float32", "int64"])
def test_no_copy_in_dataset_from_numpy_2d(order, dtype):
    """Flattening a float matrix must be zero-copy; non-float input must copy."""
    mat = np.require(np.random.rand(100, 3), dtype=dtype, requirements=order)
    flat = lgb.basic._np2d_to_np1d(mat)
    shares = np.shares_memory(mat, flat)
    if dtype == "float32":
        assert shares
    else:
        # dtype conversion to float32 forces an allocation
        assert not shares
24 changes: 24 additions & 0 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4611,3 +4611,27 @@ def test_bagging_by_query_in_lambdarank():
ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"]
assert ndcg_score_bagging_by_query >= ndcg_score - 0.1
assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1


def test_train_with_column_major_dataset():
    """Training on an F-contiguous matrix must match row-major training.

    Builds one model from a C-contiguous feature matrix and one from its
    Fortran-ordered copy, then asserts the dumped models are identical.
    """
    params = {"num_leaves": 16}
    rounds = 2

    X_row, y = make_synthetic_regression()
    assert X_row.flags["C_CONTIGUOUS"]
    ds_row = lgb.Dataset(X_row, y)
    bst_row = lgb.train(params, ds_row, num_boost_round=rounds)
    dumped_row = bst_row.dump_model()
    # check that we didn't get a trivial model
    assert len(dumped_row["tree_info"]) == rounds
    for tree in dumped_row["tree_info"]:
        assert tree["num_leaves"] > 1

    X_col = np.asfortranarray(X_row)
    assert X_col.flags["F_CONTIGUOUS"]
    ds_col = lgb.Dataset(X_col, y)
    bst_col = lgb.train(params, ds_col, num_boost_round=rounds)
    dumped_col = bst_col.dump_model()

    assert dumped_row == dumped_col

0 comments on commit e3cc120

Please sign in to comment.