From f5b6bd60d9d752c8e5a75b11ab771d0422214bb4 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Mon, 4 Dec 2023 19:26:55 +0000 Subject: [PATCH] [python-package] Allow to pass Arrow table and array as init scores (#6167) --- include/LightGBM/c_api.h | 5 +-- include/LightGBM/dataset.h | 4 +++ python-package/lightgbm/basic.py | 28 ++++++++++----- python-package/lightgbm/compat.py | 2 ++ src/io/dataset.cpp | 2 ++ src/io/metadata.cpp | 28 ++++++++++----- tests/python_package_test/test_arrow.py | 45 ++++++++++++++++++++++++- 7 files changed, 95 insertions(+), 19 deletions(-) diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index eafe6fab7825..ada2e4109638 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -559,9 +559,10 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle, * \brief Set vector to a content in info. * \note * - \a group converts input datatype into ``int32``; - * - \a label and \a weight convert input datatype into ``float32``. + * - \a label and \a weight convert input datatype into ``float32``; + * - \a init_score converts input datatype into ``float64``. * \param handle Handle of dataset - * \param field_name Field name, can be \a label, \a weight, \a group + * \param field_name Field name, can be \a label, \a weight, \a init_score, \a group * \param n_chunks The number of Arrow arrays passed to this function * \param chunks Pointer to the list of Arrow arrays * \param schema Pointer to the schema of all Arrow arrays diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index bf8264276a5f..220a1f9f009c 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -125,6 +125,7 @@ class Metadata { * \param init_score Initial scores, this class will manage memory for init_score. */ void SetInitScore(const double* init_score, data_size_t len); + void SetInitScore(const ArrowChunkedArray& array); /*! @@ -347,6 +348,9 @@ class Metadata { void SetWeightsFromIterator(It first, It last); /*! \brief Insert initial scores at the given index */ void InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size); + /*! \brief Set init scores from pointers to the first element and the end of an iterator. */ + template + void SetInitScoresFromIterator(It first, It last); /*! \brief Insert queries at the given index */ void InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len); /*! \brief Set queries from pointers to the first element and the end of an iterator. */ diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index b55546941f77..31ae5182ee9e 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -19,8 +19,8 @@ import scipy.sparse from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat, - dt_DataTable, pa_Array, pa_ChunkedArray, pa_compute, pa_Table, pd_CategoricalDtype, pd_DataFrame, - pd_Series) + dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table, + pd_CategoricalDtype, pd_DataFrame, pd_Series) from .libpath import find_lib_path if TYPE_CHECKING: @@ -84,6 +84,9 @@ np.ndarray, pd_Series, pd_DataFrame, + pa_Table, + pa_Array, + pa_ChunkedArray, ] _LGBM_TrainDataType = Union[ str, @@ -1660,7 +1663,7 @@ def __init__( sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for Dataset. feature_name : list of str, or 'auto', optional (default="auto") Feature names. @@ -2440,7 +2443,7 @@ def create_valid( sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for Dataset. params : dict or None, optional (default=None) Other parameters for validation Dataset. @@ -2547,7 +2550,7 @@ def _reverse_update_params(self) -> "Dataset": def set_field( self, field_name: str, - data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Array, pa_ChunkedArray]] + data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Table, pa_Array, pa_ChunkedArray]] ) -> "Dataset": """Set property into the Dataset. @@ -2576,7 +2579,16 @@ def set_field( return self # If the data is a arrow data, we can just pass it to C - if _is_pyarrow_array(data): + if _is_pyarrow_array(data) or _is_pyarrow_table(data): + # If a table is being passed, we concatenate the columns. This is only valid for + # 'init_score'. + if _is_pyarrow_table(data): + if field_name != "init_score": + raise ValueError(f"pyarrow tables are not supported for field '{field_name}'") + data = pa_chunked_array([ + chunk for array in data.columns for chunk in array.chunks # type: ignore + ]) + c_array = _export_arrow_to_c(data) _safe_call(_LIB.LGBM_DatasetSetFieldFromArrow( self._handle, @@ -2869,7 +2881,7 @@ def set_init_score( Parameters ---------- - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None Init score for Booster. Returns @@ -4443,7 +4455,7 @@ def refit( .. versionadded:: 4.0.0 - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for ``data``. .. versionadded:: 4.0.0 diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index dc48dbf792cf..bd1b29a1e802 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -201,6 +201,7 @@ def __init__(self, *args, **kwargs): from pyarrow import Array as pa_Array from pyarrow import ChunkedArray as pa_ChunkedArray from pyarrow import Table as pa_Table + from pyarrow import chunked_array as pa_chunked_array from pyarrow.cffi import ffi as arrow_cffi from pyarrow.types import is_floating as arrow_is_floating from pyarrow.types import is_integer as arrow_is_integer @@ -243,6 +244,7 @@ class pa_compute: # type: ignore all = None equal = None + pa_chunked_array = None arrow_is_integer = None arrow_is_floating = None diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 78dd5e4319a5..058d7bd328ad 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -904,6 +904,8 @@ bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray metadata_.SetLabel(ca); } else if (name == std::string("weight") || name == std::string("weights")) { metadata_.SetWeights(ca); + } else if (name == std::string("init_score")) { + metadata_.SetInitScore(ca); } else if (name == std::string("query") || name == std::string("group")) { metadata_.SetQuery(ca); } else { diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index d94b0ed3f2f7..55440649f55e 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -355,32 +355,44 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector +void Metadata::SetInitScoresFromIterator(It first, It last) { std::lock_guard lock(mutex_); - // save to nullptr - if (init_score == nullptr || len == 0) { + // Clear init scores on empty input + if (last - first == 0) { init_score_.clear(); num_init_score_ = 0; return; } - if ((len % num_data_) != 0) { + if (((last - first) % num_data_) != 0) { Log::Fatal("Initial score size doesn't match data size"); } - if (init_score_.empty()) { init_score_.resize(len); } - num_init_score_ = len; + if (init_score_.empty()) { + init_score_.resize(last - first); + } + num_init_score_ = last - first; #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_init_score_ >= 1024) for (int64_t i = 0; i < num_init_score_; ++i) { - init_score_[i] = Common::AvoidInf(init_score[i]); + init_score_[i] = Common::AvoidInf(first[i]); } init_score_load_from_file_ = false; + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { - cuda_metadata_->SetInitScore(init_score_.data(), len); + cuda_metadata_->SetInitScore(init_score_.data(), init_score_.size()); } #endif // USE_CUDA } +void Metadata::SetInitScore(const double* init_score, data_size_t len) { + SetInitScoresFromIterator(init_score, init_score + len); +} + +void Metadata::SetInitScore(const ArrowChunkedArray& array) { + SetInitScoresFromIterator(array.begin(), array.end()); +} + void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) { if (num_init_score_ <= 0) { Log::Fatal("Inserting initial score data into dataset with no initial scores"); diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py index 38b053e94fd5..fd20df25dd87 100644 --- a/tests/python_package_test/test_arrow.py +++ b/tests/python_package_test/test_arrow.py @@ -178,7 +178,7 @@ def test_dataset_construct_weights_none(): ["array_type", "weight_data"], [(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]])], ) -@pytest.mark.parametrize("arrow_type", [pa.float32(), pa.float64()]) +@pytest.mark.parametrize("arrow_type", _FLOAT_TYPES) def test_dataset_construct_weights(array_type, weight_data, arrow_type): data = generate_dummy_arrow_table() weights = array_type(weight_data, type=arrow_type) @@ -210,3 +210,46 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type): expected = np.array([0, 2, 5], dtype=np.int32) np_assert_array_equal(expected, dataset.get_field("group"), strict=True) + + +# ----------------------------------------- INIT SCORES ----------------------------------------- # + + +@pytest.mark.parametrize( + ["array_type", "init_score_data"], + [ + (pa.array, [0, 1, 2, 3, 3]), + (pa.chunked_array, [[0, 1, 2], [3, 3]]), + (pa.chunked_array, [[], [0, 1, 2], [3, 3]]), + (pa.chunked_array, [[0, 1], [], [], [2], [3, 3], []]), + ], +) +@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) +def test_dataset_construct_init_scores_array( + array_type: Any, init_score_data: Any, arrow_type: Any +): + data = generate_dummy_arrow_table() + init_scores = array_type(init_score_data, type=arrow_type) + dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) + dataset.construct() + + expected = np.array([0, 1, 2, 3, 3], dtype=np.float64) + np_assert_array_equal(expected, dataset.get_init_score(), strict=True) + + +def test_dataset_construct_init_scores_table(): + data = generate_dummy_arrow_table() + init_scores = pa.Table.from_arrays( + [ + generate_random_arrow_array(5, seed=1), + generate_random_arrow_array(5, seed=2), + generate_random_arrow_array(5, seed=3), + ], + names=["a", "b", "c"], + ) + dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) + dataset.construct() + + actual = dataset.get_init_score() + expected = init_scores.to_pandas().to_numpy().astype(np.float64) + np_assert_array_equal(expected, actual, strict=True)