[python-package] Add support for NumPy 2.0, test against nightly versions of dependencies (fixes #6454) (#6467)
jameslamb authored Jun 13, 2024
1 parent 6392682 commit 1e7ebc5
Showing 7 changed files with 99 additions and 14 deletions.
50 changes: 50 additions & 0 deletions .ci/test-python-latest.sh
@@ -0,0 +1,50 @@
#!/bin/bash

set -e -E -u -o pipefail

# latest versions of lightgbm's dependencies,
# including pre-releases and nightlies
#
# ref: https://github.com/pydata/xarray/blob/31111b3afe44fd6f7dac363264e94186cc5168d2/.github/workflows/upstream-dev-ci.yaml
echo "installing testing dependencies"
python -m pip install \
    cloudpickle \
    psutil \
    pytest
echo "done installing testing dependencies"

echo "installing lightgbm's dependencies"
python -m pip install \
    --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \
    --prefer-binary \
    --pre \
    --upgrade \
    'numpy>=2.0.0.dev0' \
    'matplotlib>=3.10.0.dev0' \
    'pandas>=3.0.0.dev0' \
    'scikit-learn>=1.6.dev0' \
    'scipy>=1.15.0.dev0'

python -m pip install \
    --extra-index-url https://pypi.fury.io/arrow-nightlies/ \
    --prefer-binary \
    --pre \
    --upgrade \
    'pyarrow>=17.0.0.dev0'

python -m pip install \
    'cffi>=1.15.1'

echo "done installing lightgbm's dependencies"

echo "installing lightgbm"
pip install --no-deps dist/*.whl
echo "done installing lightgbm"

echo "installed package versions:"
pip freeze

echo ""
echo "running tests"
pytest tests/c_api_test/
pytest tests/python_package_test/
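A quick way to confirm the extra indexes actually supplied pre-release wheels (a minimal sketch, not part of the commit; it assumes it runs in the environment the script just set up):

import numpy as np

# if the default PyPI index had won, this would be a stable 1.x release
major = int(np.__version__.split(".")[0])
assert major >= 2, f"expected a NumPy 2.x nightly, got {np.__version__}"
print(f"testing against numpy=={np.__version__}")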
30 changes: 29 additions & 1 deletion .github/workflows/python_package.yml
@@ -75,6 +75,33 @@ jobs:
          export PATH=${CONDA}/bin:${PATH}
          $GITHUB_WORKSPACE/.ci/setup.sh || exit 1
          $GITHUB_WORKSPACE/.ci/test.sh || exit 1
+  test-latest-versions:
+    name: Python - latest versions (ubuntu-latest)
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 5
+          submodules: true
+      - name: Create wheel
+        run: |
+          docker run \
+            --rm \
+            --env CMAKE_BUILD_PARALLEL_LEVEL=${{ env.CMAKE_BUILD_PARALLEL_LEVEL }} \
+            -v $(pwd):/opt/lgb-build \
+            -w /opt/lgb-build \
+            lightgbm/vsts-agent:manylinux_2_28_x86_64 \
+            /bin/bash -c 'PATH=/opt/miniforge/bin:$PATH sh ./build-python.sh bdist_wheel --nomp'
+      - name: Test compatibility
+        run: |
+          docker run \
+            --rm \
+            -v $(pwd):/opt/lgb-build \
+            -w /opt/lgb-build \
+            python:3.11 \
+            /bin/bash ./.ci/test-python-latest.sh
  test-oldest-versions:
    name: Python - oldest supported versions (ubuntu-latest)
    runs-on: ubuntu-latest
@@ -89,6 +116,7 @@
        run: |
          docker run \
            --rm \
+            --env CMAKE_BUILD_PARALLEL_LEVEL=${{ env.CMAKE_BUILD_PARALLEL_LEVEL }} \
            -v $(pwd):/opt/lgb-build \
            -w /opt/lgb-build \
            lightgbm/vsts-agent:manylinux_2_28_x86_64 \
@@ -104,7 +132,7 @@
  all-python-package-jobs-successful:
    if: always()
    runs-on: ubuntu-latest
-    needs: [test, test-oldest-versions]
+    needs: [test, test-latest-versions, test-oldest-versions]
    steps:
      - name: Note that all tests succeeded
        uses: re-actors/[email protected]
16 changes: 8 additions & 8 deletions python-package/lightgbm/basic.py
@@ -356,10 +356,10 @@ def _list_to_1d_numpy(
        array = data.ravel()
        return _cast_numpy_array_to_dtype(array, dtype)
    elif _is_1d_list(data):
-        return np.array(data, dtype=dtype, copy=False)
+        return np.asarray(data, dtype=dtype)
    elif isinstance(data, pd_Series):
        _check_for_bad_pandas_dtypes(data.to_frame().dtypes)
-        return np.array(data, dtype=dtype, copy=False)  # SparseArray should be supported as well
+        return np.asarray(data, dtype=dtype)  # SparseArray should be supported as well
    else:
        raise TypeError(
            f"Wrong type({type(data).__name__}) for {name}.\n" "It should be list, numpy 1-D array or pandas Series"
@@ -728,7 +728,7 @@ def _convert_from_sliced_object(data: np.ndarray) -> np.ndarray:
def _c_float_array(data: np.ndarray) -> Tuple[_ctypes_float_ptr, int, np.ndarray]:
    """Get pointer of float numpy array / list."""
    if _is_1d_list(data):
-        data = np.array(data, copy=False)
+        data = np.asarray(data)
    if _is_numpy_1d_array(data):
        data = _convert_from_sliced_object(data)
        assert data.flags.c_contiguous
@@ -749,7 +749,7 @@ def _c_float_array(data: np.ndarray) -> Tuple[_ctypes_float_ptr, int, np.ndarray]:
def _c_int_array(data: np.ndarray) -> Tuple[_ctypes_int_ptr, int, np.ndarray]:
    """Get pointer of int numpy array / list."""
    if _is_1d_list(data):
-        data = np.array(data, copy=False)
+        data = np.asarray(data)
    if _is_numpy_1d_array(data):
        data = _convert_from_sliced_object(data)
        assert data.flags.c_contiguous
@@ -1270,7 +1270,7 @@ def __inner_predict_np2d(
        preds: Optional[np.ndarray],
    ) -> Tuple[np.ndarray, int]:
        if mat.dtype == np.float32 or mat.dtype == np.float64:
-            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
+            data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
        else:  # change non-float data to float data, need to copy
            data = np.array(mat.reshape(mat.size), dtype=np.float32)
        ptr_data, type_ptr_data, _ = _c_float_array(data)
@@ -2285,9 +2285,9 @@ def __init_from_np2d(

        self._handle = ctypes.c_void_p()
        if mat.dtype == np.float32 or mat.dtype == np.float64:
-            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
+            data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
        else:  # change non-float data to float data, need to copy
-            data = np.array(mat.reshape(mat.size), dtype=np.float32)
+            data = np.asarray(mat.reshape(mat.size), dtype=np.float32)

        ptr_data, type_ptr_data, _ = _c_float_array(data)
        _safe_call(
@@ -2332,7 +2332,7 @@ def __init_from_list_np2d(
            nrow[i] = mat.shape[0]

            if mat.dtype == np.float32 or mat.dtype == np.float64:
-                mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
+                mats[i] = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
            else:  # change non-float data to float data, need to copy
                mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)
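The pattern behind every np.array(..., copy=False) -> np.asarray(...) replacement above: NumPy 2.0 changed copy=False from "copy only if unavoidable" to "never copy, raise if a copy would be required", while np.asarray keeps the old copy-on-demand behavior on both major versions. A minimal sketch of the difference (not part of the commit):

import numpy as np

values = [1.0, 2.0, 3.0]  # a plain Python list can never be wrapped without copying

# NumPy 1.x: copy=False meant "avoid a copy when possible", so this succeeded.
# NumPy 2.0: copy=False means "never copy", so this raises ValueError.
try:
    arr = np.array(values, dtype=np.float64, copy=False)
except ValueError:
    # np.asarray preserves the old semantics: copy only when required
    arr = np.asarray(values, dtype=np.float64)

assert arr.dtype == np.float64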
4 changes: 2 additions & 2 deletions python-package/lightgbm/engine.py
@@ -512,7 +512,7 @@ def _make_n_folds(
    if hasattr(folds, "split"):
        group_info = full_data.get_group()
        if group_info is not None:
-            group_info = np.array(group_info, dtype=np.int32, copy=False)
+            group_info = np.asarray(group_info, dtype=np.int32)
            flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
        else:
            flatted_group = np.zeros(num_data, dtype=np.int32)
@@ -526,7 +526,7 @@
            if not SKLEARN_INSTALLED:
                raise LightGBMError("scikit-learn is required for ranking cv")
            # ranking task, split according to groups
-            group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
+            group_info = np.asarray(full_data.get_group(), dtype=np.int32)
            flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            group_kfold = _LGBMGroupKFold(n_splits=nfold)
            folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group)
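For context on the unchanged surrounding logic: get_group() returns per-query group sizes, and np.repeat expands them into one group id per row so that _LGBMGroupKFold can keep each query's rows in the same fold. A small illustration with a hypothetical two-query dataset (not part of the commit):

import numpy as np

# hypothetical ranking data: query 0 contributes 2 rows, query 1 contributes 3
group_info = np.asarray([2, 3], dtype=np.int32)

# one group id per data row, as _make_n_folds builds flatted_group
flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
print(flatted_group)  # [0 0 1 1 1]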
4 changes: 2 additions & 2 deletions tests/c_api_test/test_.py
@@ -125,7 +125,7 @@ def load_from_mat(filename, reference):
    mat = np.loadtxt(str(filename), dtype=np.float64)
    label = mat[:, 0].astype(np.float32)
    mat = mat[:, 1:]
-    data = np.array(mat.reshape(mat.size), dtype=np.float64, copy=False)
+    data = np.asarray(mat.reshape(mat.size), dtype=np.float64)
    handle = ctypes.c_void_p()
    ref = None
    if reference is not None:
@@ -203,7 +203,7 @@ def test_booster():
    mat = data[:, 1:]
    preb = np.empty(mat.shape[0], dtype=np.float64)
    num_preb = ctypes.c_int64(0)
-    data = np.array(mat.reshape(mat.size), dtype=np.float64, copy=False)
+    data = np.asarray(mat.reshape(mat.size), dtype=np.float64)
    LIB.LGBM_BoosterPredictForMat(
        booster2,
        data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
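Both call sites flatten the matrix before handing a pointer to the C API. reshape on a C-contiguous array returns a view, and np.asarray with a matching dtype returns its input unchanged, so the whole path stays copy-free; a short demonstration (not part of the commit):

import numpy as np

mat = np.arange(6, dtype=np.float64).reshape(2, 3)  # C-contiguous matrix

flat = np.asarray(mat.reshape(mat.size), dtype=np.float64)
assert np.shares_memory(mat, flat)  # reshape of a contiguous array is a view
assert flat.flags.c_contiguous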
4 changes: 4 additions & 0 deletions tests/python_package_test/test_arrow.py
@@ -20,6 +20,10 @@
else:
    import pyarrow as pa  # type: ignore

+assert (
+    lgb.compat.PYARROW_INSTALLED is True
+), "'pyarrow' and its dependencies must be installed to run the arrow tests"
+
# ----------------------------------------------------------------------------------------------- #
#                                            UTILITIES                                             #
# ----------------------------------------------------------------------------------------------- #
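The new assertion makes the arrow tests fail loudly, rather than silently skip, if pyarrow is missing from the latest-versions environment. PYARROW_INSTALLED comes from lightgbm's compat module; a hypothetical sketch of the guarded-import pattern it relies on (illustrative, not the module's exact code):

try:
    import pyarrow as pa

    PYARROW_INSTALLED = True
except ImportError:
    PYARROW_INSTALLED = False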
5 changes: 4 additions & 1 deletion tests/python_package_test/test_basic.py
@@ -777,7 +777,10 @@ def test_custom_objective_safety(rng):
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name, rng):
    pd = pytest.importorskip("pandas")
    X = rng.uniform(size=(10, 2)).astype(dtype)
-    df = pd.DataFrame(X)
+    # copy=False is necessary because starting with pandas 3.0, pd.DataFrame() creates
+    # a copy of the input numpy array by default
+    # ref: https://github.com/pandas-dev/pandas/issues/58913
+    df = pd.DataFrame(X, copy=False)
    built_data = lgb.basic._data_from_pandas(
        data=df, feature_name=feature_name, categorical_feature="auto", pandas_categorical=None
    )[0]
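The test's intent is that a single-dtype float DataFrame hands LightGBM the original NumPy buffer rather than a copy. A minimal sketch of what copy=False preserves, assuming pandas keeps zero-copy construction for a homogeneous float block (not part of the commit):

import numpy as np
import pandas as pd

X = np.random.default_rng(42).uniform(size=(10, 2)).astype(np.float64)

df = pd.DataFrame(X, copy=False)  # wrap X instead of copying it
assert np.shares_memory(X, df.values)  # the DataFrame still aliases X's buffer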
