Skip to content

Commit

Permalink
do not copy column-major numpy arrays when creating Dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
jmoralez committed Nov 12, 2024
1 parent 5151fe8 commit e3cc120
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 5 deletions.
20 changes: 15 additions & 5 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,14 @@ def _get_sample_count(total_nrow: int, params: str) -> int:
return sample_cnt.value


def _np2d_to_np1d(mat: np.ndarray) -> np.ndarray:
data = mat.ravel(order="A") # keeps memory layout
if data.dtype not in (np.float32, np.float64):
# change non-float data to float data, need to copy
data = data.astype(np.float32)
return data


class _MissingType(Enum):
NONE = "None"
NAN = "NaN"
Expand Down Expand Up @@ -685,6 +693,7 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va
_C_API_DTYPE_INT64 = 3

"""Matrix is row major in Python"""
_C_API_IS_COL_MAJOR = 0
_C_API_IS_ROW_MAJOR = 1

"""Macro definition of prediction type in C API of LightGBM"""
Expand Down Expand Up @@ -2297,10 +2306,11 @@ def __init_from_np2d(
raise ValueError("Input numpy.ndarray must be 2 dimensional")

self._handle = ctypes.c_void_p()
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
else: # change non-float data to float data, need to copy
data = np.asarray(mat.reshape(mat.size), dtype=np.float32)
data = _np2d_to_np1d(mat)
if mat.flags["C_CONTIGUOUS"]:
layout = _C_API_IS_ROW_MAJOR
else:
layout = _C_API_IS_COL_MAJOR

ptr_data, type_ptr_data, _ = _c_float_array(data)
_safe_call(
Expand All @@ -2309,7 +2319,7 @@ def __init_from_np2d(
ctypes.c_int(type_ptr_data),
ctypes.c_int32(mat.shape[0]),
ctypes.c_int32(mat.shape[1]),
ctypes.c_int(_C_API_IS_ROW_MAJOR),
ctypes.c_int(layout),
_c_str(params_str),
ref_dataset,
ctypes.byref(self._handle),
Expand Down
13 changes: 13 additions & 0 deletions tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -947,3 +947,16 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c
"in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
)
assert expected_warning in capsys.readouterr().out


@pytest.mark.parametrize("order", ["C", "F"])
@pytest.mark.parametrize("dtype", ["float32", "int64"])
def test_no_copy_in_dataset_from_numpy_2d(order, dtype):
    """Flattening a float matrix must be zero-copy; non-float input must copy."""
    mat = np.require(np.random.rand(100, 3), dtype=dtype, requirements=order)
    flat = lgb.basic._np2d_to_np1d(mat)
    shares = np.shares_memory(mat, flat)
    if dtype == "float32":
        assert shares
    else:
        # dtype conversion to float32 forces an allocation
        assert not shares
24 changes: 24 additions & 0 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4611,3 +4611,27 @@ def test_bagging_by_query_in_lambdarank():
ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"]
assert ndcg_score_bagging_by_query >= ndcg_score - 0.1
assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1


def test_train_with_column_major_dataset():
    """Training on an F-contiguous matrix must match row-major training.

    Builds one model from a C-contiguous feature matrix and one from its
    Fortran-ordered copy, then asserts the dumped models are identical.
    """
    params = {"num_leaves": 16}
    rounds = 2

    X_row, y = make_synthetic_regression()
    assert X_row.flags["C_CONTIGUOUS"]
    ds_row = lgb.Dataset(X_row, y)
    bst_row = lgb.train(params, ds_row, num_boost_round=rounds)
    dumped_row = bst_row.dump_model()
    # check that we didn't get a trivial model
    assert len(dumped_row["tree_info"]) == rounds
    for tree in dumped_row["tree_info"]:
        assert tree["num_leaves"] > 1

    X_col = np.asfortranarray(X_row)
    assert X_col.flags["F_CONTIGUOUS"]
    ds_col = lgb.Dataset(X_col, y)
    bst_col = lgb.train(params, ds_col, num_boost_round=rounds)
    dumped_col = bst_col.dump_model()

    assert dumped_row == dumped_col

0 comments on commit e3cc120

Please sign in to comment.