diff --git a/.appveyor.yml b/.appveyor.yml
index 4cff03d571a1..58a536d17a98 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -1,4 +1,4 @@
-version: 4.1.0.99.{build}
+version: 4.2.0.99.{build}
image: Visual Studio 2015
platform: x64
diff --git a/.ci/lint-cpp.sh b/.ci/lint-cpp.sh
index 56489ecf3325..2d91f8e85f00 100755
--- a/.ci/lint-cpp.sh
+++ b/.ci/lint-cpp.sh
@@ -30,8 +30,7 @@ get_omp_pragmas_without_num_threads() {
    --include='*.h' \
    --include='*.hpp' \
    'pragma omp parallel' \
-    | grep -v ' num_threads' \
-    | grep -v 'openmp_wrapper.h'
+    | grep -v ' num_threads'
}
PROBLEMATIC_LINES=$(
    get_omp_pragmas_without_num_threads
diff --git a/.ci/setup.sh b/.ci/setup.sh
index f7da21286d7d..bc17fee03308 100755
--- a/.ci/setup.sh
+++ b/.ci/setup.sh
@@ -54,6 +54,14 @@ else  # Linux
        sudo apt-get install --no-install-recommends -y \
            clang \
            libomp-dev
+    elif [[ $COMPILER == "clang-17" ]]; then
+        sudo apt-get install wget
+        wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
+        sudo apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main"
+        sudo apt-add-repository "deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main"
+        sudo apt-get update
+        sudo apt-get install -y clang-17
+        sudo apt-get install --no-install-recommends -y libomp-17-dev
    fi
    export LANG="en_US.UTF-8"
diff --git a/.ci/test.sh b/.ci/test.sh
index 9ffd48efe452..472fd7d8c6e5 100755
--- a/.ci/test.sh
+++ b/.ci/test.sh
@@ -6,6 +6,9 @@ if [[ $OS_NAME == "macos" ]] && [[ $COMPILER == "gcc" ]]; then
elif [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "clang" ]]; then
    export CXX=clang++
    export CC=clang
+elif [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "clang-17" ]]; then
+    export CXX=clang++-17
+    export CC=clang-17
fi
if [[ $IN_UBUNTU_BASE_CONTAINER == "true" ]]; then
diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh
index 9eddec2deed1..bd8f1f71f9f1 100755
--- a/.ci/test_r_package_valgrind.sh
+++ b/.ci/test_r_package_valgrind.sh
@@ -1,9 +1,11 @@
#!/bin/bash
-RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" || exit -1
+RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'RhpcBLASctl', 'testthat'), repos = 'https://cran.rstudio.com')" || exit -1
sh build-cran-package.sh \
    --r-executable=RDvalgrind \
+    --no-build-vignettes \
    || exit -1
+
RDvalgrind CMD INSTALL --preclean --install-tests lightgbm_*.tar.gz || exit -1
cd R-package/tests
@@ -68,7 +70,7 @@ bytes_possibly_lost=$(
    | tr -d ","
)
echo "valgrind found ${bytes_possibly_lost} bytes possibly lost"
-if [[ ${bytes_possibly_lost} -gt 352 ]]; then
+if [[ ${bytes_possibly_lost} -gt 1056 ]]; then
    exit -1
fi
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 02b5cfbdae23..b6885cad0503 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -7,4 +7,4 @@
# offer a reasonable automatic best-guess
# catch-all rule (this only gets matched if no rules below match)
-* @guolinke @jameslamb @shiyu1994 @jmoralez
+* @guolinke @jameslamb @shiyu1994 @jmoralez @borchero
diff --git a/.vsts-ci.yml b/.vsts-ci.yml
index 8c4e3e6a4949..c750dfa502ac 100644
--- a/.vsts-ci.yml
+++ b/.vsts-ci.yml
@@ -34,7 +34,7 @@ jobs:
    SETUP_CONDA: 'false'
    OS_NAME: 'linux'
    PRODUCES_ARTIFACTS: 'true'
-  pool: sh-ubuntu
+  pool: sh-mariner
  container: linux-artifact-builder
  strategy:
    matrix:
@@ -82,12 +82,12 @@ jobs:
- job: Linux_latest
########################################### variables: - COMPILER: clang + COMPILER: clang-17 DEBIAN_FRONTEND: 'noninteractive' IN_UBUNTU_BASE_CONTAINER: 'true' OS_NAME: 'linux' SETUP_CONDA: 'true' - pool: sh-ubuntu + pool: sh-mariner container: ubuntu-latest strategy: matrix: diff --git a/CMakeLists.txt b/CMakeLists.txt index 7bfa2adf541d..49f67d806c22 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -439,6 +439,7 @@ file( src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp + src/utils/*.cpp if(USE_CUDA) src/treelearner/*.cu src/boosting/cuda/*.cpp @@ -647,6 +648,7 @@ if(BUILD_CPP_TEST) file(GLOB CPP_TEST_SOURCES tests/cpp_tests/*.cpp) if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /permissive-") set( CompilerFlags CMAKE_CXX_FLAGS diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 1193c0d463b9..62b479530b4a 100755 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -4,10 +4,10 @@ Title: Light Gradient Boosting Machine Version: ~~VERSION~~ Date: ~~DATE~~ Authors@R: c( - person("Yu", "Shi", email = "yushi2@microsoft.com", role = c("aut", "cre")), + person("Yu", "Shi", email = "yushi2@microsoft.com", role = c("aut")), person("Guolin", "Ke", email = "guolin.ke@outlook.com", role = c("aut")), person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("aut")), - person("James", "Lamb", email="jaylamb20@gmail.com", role = c("aut")), + person("James", "Lamb", email="jaylamb20@gmail.com", role = c("aut", "cre")), person("Qi", "Meng", role = c("aut")), person("Thomas", "Finley", role = c("aut")), person("Taifeng", "Wang", role = c("aut")), diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index e07af84d8824..718f0e55a0d7 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -9,6 +9,7 @@ S3method(print,lgb.Booster) S3method(set_field,lgb.Dataset) S3method(slice,lgb.Dataset) S3method(summary,lgb.Booster) +export(getLGBMthreads) export(get_field) export(lgb.Dataset) export(lgb.Dataset.construct) @@ -33,8 +34,7 @@ export(lgb.restore_handle) export(lgb.save) export(lgb.train) export(lightgbm) -export(readRDS.lgb.Booster) -export(saveRDS.lgb.Booster) +export(setLGBMthreads) export(set_field) export(slice) import(methods) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 17da9545ae19..4437c6fa552e 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -917,6 +917,8 @@ NULL #' the factor levels not being present in the output. #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1082,6 +1084,8 @@ predict.lgb.Booster <- function(object, #' \link{predict.lgb.Booster}. #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' library(lightgbm) #' data(mtcars) #' X <- as.matrix(mtcars[, -1L]) @@ -1224,6 +1228,8 @@ summary.lgb.Booster <- function(object, ...) 
{ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1289,6 +1295,8 @@ lgb.load <- function(filename = NULL, model_str = NULL) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -1346,6 +1354,8 @@ lgb.save <- function(booster, filename, num_iteration = NULL) { #' @examples #' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1396,6 +1406,8 @@ lgb.dump <- function(booster, num_iteration = NULL) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' # train a regression model #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index ddc338d2cae3..ff9b0b4fa38a 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -780,6 +780,8 @@ Dataset <- R6::R6Class( #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -837,6 +839,8 @@ lgb.Dataset <- function(data, #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -913,6 +917,8 @@ lgb.Dataset.create.valid <- function(dataset, #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -942,6 +948,8 @@ lgb.Dataset.construct <- function(dataset) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -975,6 +983,8 @@ dim.lgb.Dataset <- function(x) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1045,6 +1055,8 @@ dimnames.lgb.Dataset <- function(x) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1089,6 +1101,8 @@ slice.lgb.Dataset <- function(dataset, idxset) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1138,6 +1152,8 @@ get_field.lgb.Dataset <- function(dataset, field_name) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' 
\dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1177,6 +1193,8 @@ set_field.lgb.Dataset <- function(dataset, field_name, data) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1207,6 +1225,8 @@ lgb.Dataset.set.categorical <- function(dataset, categorical_feature) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' # create training Dataset #' data(agaricus.train, package ="lightgbm") #' train <- agaricus.train @@ -1240,6 +1260,8 @@ lgb.Dataset.set.reference <- function(dataset, reference) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 11768c5bfa0b..0545fbf71899 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -51,6 +51,8 @@ CVBooster <- R6::R6Class( #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.importance.R b/R-package/R/lgb.importance.R index 27efb17392df..7c76131f4f53 100644 --- a/R-package/R/lgb.importance.R +++ b/R-package/R/lgb.importance.R @@ -14,6 +14,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.interprete.R b/R-package/R/lgb.interprete.R index 976315262792..8f93d45429f1 100644 --- a/R-package/R/lgb.interprete.R +++ b/R-package/R/lgb.interprete.R @@ -17,6 +17,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' Logit <- function(x) log(x / (1.0 - x)) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train diff --git a/R-package/R/lgb.model.dt.tree.R b/R-package/R/lgb.model.dt.tree.R index 5d994accfa7f..bf4562e41018 100644 --- a/R-package/R/lgb.model.dt.tree.R +++ b/R-package/R/lgb.model.dt.tree.R @@ -29,6 +29,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.plot.importance.R b/R-package/R/lgb.plot.importance.R index fc59ebd0efec..b8a90ca158ae 100644 --- a/R-package/R/lgb.plot.importance.R +++ b/R-package/R/lgb.plot.importance.R @@ -19,6 +19,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.plot.interpretation.R b/R-package/R/lgb.plot.interpretation.R index 8b95371eb3c2..97650f30a7d3 100644 --- a/R-package/R/lgb.plot.interpretation.R +++ b/R-package/R/lgb.plot.interpretation.R @@ 
-16,6 +16,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' Logit <- function(x) { #' log(x / (1.0 - x)) #' } diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index 0ed25ef26f3d..8a24cc628ca9 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -16,7 +16,10 @@ #' @return \code{lgb.Booster} (the same `model` object that was passed as input, invisibly). #' @seealso \link{lgb.make_serializable}, \link{lgb.drop_serialized}. #' @examples +#' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data("agaricus.train") #' model <- lightgbm( #' agaricus.train$data @@ -33,6 +36,7 @@ #' model_new$check_null_handle() #' lgb.restore_handle(model_new) #' model_new$check_null_handle() +#' } #' @export lgb.restore_handle <- function(model) { if (!.is_Booster(x = model)) { diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 6979558d22cd..8a299fb6b8ac 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -19,6 +19,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/multithreading.R b/R-package/R/multithreading.R new file mode 100644 index 000000000000..a8d6b51a8968 --- /dev/null +++ b/R-package/R/multithreading.R @@ -0,0 +1,51 @@ +#' @name setLGBMThreads +#' @title Set maximum number of threads used by LightGBM +#' @description LightGBM attempts to speed up many operations by using multi-threading. +#' The number of threads used in those operations can be controlled via the +#' \code{num_threads} parameter passed through \code{params} to functions like +#' \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing +#' a model from a text file) are done via code paths that don't explicitly accept thread-control +#' configuration. +#' +#' Use this function to set the maximum number of threads LightGBM will use for such operations. +#' +#' This function affects all LightGBM operations in the same process. +#' +#' So, for example, if you call \code{setLGBMthreads(4)}, no other multi-threaded LightGBM +#' operation in the same process will use more than 4 threads. +#' +#' Call \code{setLGBMthreads(-1)} to remove this limitation. +#' @param num_threads maximum number of threads to be used by LightGBM in multi-threaded operations +#' @return NULL +#' @seealso \link{getLGBMthreads} +#' @export +setLGBMthreads <- function(num_threads) { + .Call( + LGBM_SetMaxThreads_R, + num_threads + ) + return(invisible(NULL)) +} + +#' @name getLGBMThreads +#' @title Get default number of threads used by LightGBM +#' @description LightGBM attempts to speed up many operations by using multi-threading. +#' The number of threads used in those operations can be controlled via the +#' \code{num_threads} parameter passed through \code{params} to functions like +#' \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing +#' a model from a text file) are done via code paths that don't explicitly accept thread-control +#' configuration. +#' +#' Use this function to see the default number of threads LightGBM will use for such operations. +#' @return number of threads as an integer. 
\code{-1} means that in situations where parameter \code{num_threads} is +#' not explicitly supplied, LightGBM will choose a number of threads to use automatically. +#' @seealso \link{setLGBMthreads} +#' @export +getLGBMthreads <- function() { + out <- 0L + .Call( + LGBM_GetMaxThreads_R, + out + ) + return(out) +} diff --git a/R-package/R/readRDS.lgb.Booster.R b/R-package/R/readRDS.lgb.Booster.R deleted file mode 100644 index a8abac642c24..000000000000 --- a/R-package/R/readRDS.lgb.Booster.R +++ /dev/null @@ -1,48 +0,0 @@ -#' @name readRDS.lgb.Booster -#' @title readRDS for \code{lgb.Booster} models (DEPRECATED) -#' @description Calls \code{readRDS} in what is expected to be a serialized \code{lgb.Booster} object, -#' and then restores its handle through \code{lgb.restore_handle}. -#' -#' \bold{This function throws a warning and will be removed in future versions.} -#' @param file a connection or the name of the file where the R object is saved to or read from. -#' @param refhook a hook function for handling reference objects. -#' -#' @return \code{lgb.Booster} -#' -#' @examples -#' \donttest{ -#' library(lightgbm) -#' data(agaricus.train, package = "lightgbm") -#' train <- agaricus.train -#' dtrain <- lgb.Dataset(train$data, label = train$label) -#' data(agaricus.test, package = "lightgbm") -#' test <- agaricus.test -#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) -#' params <- list( -#' objective = "regression" -#' , metric = "l2" -#' , min_data = 1L -#' , learning_rate = 1.0 -#' , num_threads = 2L -#' ) -#' valids <- list(test = dtest) -#' model <- lgb.train( -#' params = params -#' , data = dtrain -#' , nrounds = 10L -#' , valids = valids -#' , early_stopping_rounds = 5L -#' ) -#' model_file <- tempfile(fileext = ".rds") -#' saveRDS.lgb.Booster(model, model_file) -#' new_model <- readRDS.lgb.Booster(model_file) -#' } -#' @export -readRDS.lgb.Booster <- function(file, refhook = NULL) { - - warning("'readRDS.lgb.Booster' is deprecated and will be removed in a future release. Use readRDS() instead.") - - object <- readRDS(file = file, refhook = refhook) - lgb.restore_handle(object) - return(object) -} diff --git a/R-package/R/saveRDS.lgb.Booster.R b/R-package/R/saveRDS.lgb.Booster.R deleted file mode 100644 index d75056e69734..000000000000 --- a/R-package/R/saveRDS.lgb.Booster.R +++ /dev/null @@ -1,78 +0,0 @@ -#' @name saveRDS.lgb.Booster -#' @title saveRDS for \code{lgb.Booster} models (DEPRECATED) -#' @description Calls \code{saveRDS} on an \code{lgb.Booster} object, making it serializable before the call if -#' it isn't already. -#' -#' \bold{This function throws a warning and will be removed in future versions.} -#' @param object \code{lgb.Booster} object to serialize. -#' @param file a connection or the name of the file where the R object is saved to or read from. -#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), -#' a binary one is used. See the comments in the help for save. -#' @param version the workspace format version to use. \code{NULL} specifies the current default -#' version (2). Versions prior to 2 are not supported, so this will only be relevant -#' when there are later versions. -#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, -#' or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of -#' compression to be used. Ignored if file is a connection. -#' @param refhook a hook function for handling reference objects. 
-#' @param raw whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}. -#' -#' @return NULL invisibly. -#' -#' @examples -#' \donttest{ -#' library(lightgbm) -#' data(agaricus.train, package = "lightgbm") -#' train <- agaricus.train -#' dtrain <- lgb.Dataset(train$data, label = train$label) -#' data(agaricus.test, package = "lightgbm") -#' test <- agaricus.test -#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) -#' params <- list( -#' objective = "regression" -#' , metric = "l2" -#' , min_data = 1L -#' , learning_rate = 1.0 -#' , num_threads = 2L -#' ) -#' valids <- list(test = dtest) -#' model <- lgb.train( -#' params = params -#' , data = dtrain -#' , nrounds = 10L -#' , valids = valids -#' , early_stopping_rounds = 5L -#' ) -#' model_file <- tempfile(fileext = ".rds") -#' saveRDS.lgb.Booster(model, model_file) -#' } -#' @export -saveRDS.lgb.Booster <- function(object, - file, - ascii = FALSE, - version = NULL, - compress = TRUE, - refhook = NULL, - raw = TRUE) { - - warning("'saveRDS.lgb.Booster' is deprecated and will be removed in a future release. Use saveRDS() instead.") - - if (!.is_Booster(x = object)) { - stop("saveRDS.lgb.Booster: object should be an ", sQuote("lgb.Booster")) - } - - if (is.null(object$raw)) { - lgb.make_serializable(object) - } - - saveRDS( - object - , file = file - , ascii = ascii - , version = version - , compress = compress - , refhook = refhook - ) - - return(invisible(NULL)) -} diff --git a/R-package/configure b/R-package/configure index 39a18d669833..37dfdbfbf6c7 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for lightgbm 4.1.0.99. +# Generated by GNU Autoconf 2.71 for lightgbm 4.2.0.99. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -607,8 +607,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='lightgbm' PACKAGE_TARNAME='lightgbm' -PACKAGE_VERSION='4.1.0.99' -PACKAGE_STRING='lightgbm 4.1.0.99' +PACKAGE_VERSION='4.2.0.99' +PACKAGE_STRING='lightgbm 4.2.0.99' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1211,7 +1211,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures lightgbm 4.1.0.99 to adapt to many kinds of systems. +\`configure' configures lightgbm 4.2.0.99 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1273,7 +1273,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of lightgbm 4.1.0.99:";; + short | recursive ) echo "Configuration of lightgbm 4.2.0.99:";; esac cat <<\_ACEOF @@ -1341,7 +1341,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -lightgbm configure 4.1.0.99 +lightgbm configure 4.2.0.99 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1378,7 +1378,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by lightgbm $as_me 4.1.0.99, which was +It was created by lightgbm $as_me 4.2.0.99, which was generated by GNU Autoconf 2.71. 
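Both wrappers deleted above steer users to base R's own serialization functions. A minimal sketch of the supported replacement workflow, combining plain saveRDS()/readRDS() with lgb.restore_handle() as in the lgb.restore_handle() example elsewhere in this diff (the parameter values below are illustrative only):

    library(lightgbm)
    data(agaricus.train, package = "lightgbm")
    dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
    model <- lgb.train(
        params = list(objective = "binary", num_threads = 2L)
        , data = dtrain
        , nrounds = 5L
    )

    # base R serialization now works directly; no LightGBM-specific wrapper is needed
    model_file <- tempfile(fileext = ".rds")
    saveRDS(model, model_file)

    model_new <- readRDS(model_file)
    # re-attach the C++ handle eagerly; predict() would also restore it automatically
    lgb.restore_handle(model_new)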
Invocation command line was $ $0$ac_configure_args_raw @@ -2454,7 +2454,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by lightgbm $as_me 4.1.0.99, which was +This file was extended by lightgbm $as_me 4.2.0.99, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -2509,7 +2509,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -lightgbm config.status 4.1.0.99 +lightgbm config.status 4.2.0.99 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/R-package/cran-comments.md b/R-package/cran-comments.md index 44b8ed391bfc..80f84b924a48 100644 --- a/R-package/cran-comments.md +++ b/R-package/cran-comments.md @@ -1,5 +1,16 @@ # CRAN Submission History +## v4.2.0 - Submission 1 - (December 7, 2023) + +### CRAN response + +Accepted to CRAN + +### Maintainer Notes + +This submission included many changes from the last 2 years, as well as fixes for a warning +CRAN said could cause the package to be archived: https://github.com/microsoft/LightGBM/issues/6221. + ## v4.1.0 - not submitted v4.1.0 was not submitted to CRAN, because https://github.com/microsoft/LightGBM/issues/5987 had not been resolved. diff --git a/R-package/man/dim.Rd b/R-package/man/dim.Rd index 94ca192d8291..69332d0ec397 100644 --- a/R-package/man/dim.Rd +++ b/R-package/man/dim.Rd @@ -21,6 +21,8 @@ be directly used with an \code{lgb.Dataset} object. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/dimnames.lgb.Dataset.Rd b/R-package/man/dimnames.lgb.Dataset.Rd index ec01a04f607b..85f2085f1d77 100644 --- a/R-package/man/dimnames.lgb.Dataset.Rd +++ b/R-package/man/dimnames.lgb.Dataset.Rd @@ -28,6 +28,8 @@ Since row names are irrelevant, it is recommended to use \code{colnames} directl } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/getLGBMThreads.Rd b/R-package/man/getLGBMThreads.Rd new file mode 100644 index 000000000000..21af4f4849d4 --- /dev/null +++ b/R-package/man/getLGBMThreads.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multithreading.R +\name{getLGBMThreads} +\alias{getLGBMThreads} +\alias{getLGBMthreads} +\title{Get default number of threads used by LightGBM} +\usage{ +getLGBMthreads() +} +\value{ +number of threads as an integer. \code{-1} means that in situations where parameter \code{num_threads} is + not explicitly supplied, LightGBM will choose a number of threads to use automatically. +} +\description{ +LightGBM attempts to speed up many operations by using multi-threading. + The number of threads used in those operations can be controlled via the + \code{num_threads} parameter passed through \code{params} to functions like + \link{lgb.train} and \link{lgb.Dataset}. 
However, some operations (like materializing + a model from a text file) are done via code paths that don't explicitly accept thread-control + configuration. + + Use this function to see the default number of threads LightGBM will use for such operations. +} +\seealso{ +\link{setLGBMthreads} +} diff --git a/R-package/man/get_field.Rd b/R-package/man/get_field.Rd index 1b6692fcf807..e2562cc21364 100644 --- a/R-package/man/get_field.Rd +++ b/R-package/man/get_field.Rd @@ -32,6 +32,8 @@ Get one attribute of a \code{lgb.Dataset} } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index 4895600ff922..2605657b060a 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -65,6 +65,8 @@ Construct \code{lgb.Dataset} object from dense matrix, sparse matrix } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.construct.Rd b/R-package/man/lgb.Dataset.construct.Rd index 97c9e7887602..e400e0a5f8d5 100644 --- a/R-package/man/lgb.Dataset.construct.Rd +++ b/R-package/man/lgb.Dataset.construct.Rd @@ -17,6 +17,8 @@ Construct Dataset explicitly } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.create.valid.Rd b/R-package/man/lgb.Dataset.create.valid.Rd index ab8ca753c2b9..fc50dff19986 100644 --- a/R-package/man/lgb.Dataset.create.valid.Rd +++ b/R-package/man/lgb.Dataset.create.valid.Rd @@ -48,6 +48,8 @@ Construct validation data according to training data } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.save.Rd b/R-package/man/lgb.Dataset.save.Rd index 5ea38227ba66..b03c2c5e0ac5 100644 --- a/R-package/man/lgb.Dataset.save.Rd +++ b/R-package/man/lgb.Dataset.save.Rd @@ -20,6 +20,8 @@ Please note that \code{init_score} is not saved in binary file. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.set.categorical.Rd b/R-package/man/lgb.Dataset.set.categorical.Rd index 26eb10770e47..5dfcc9a771e8 100644 --- a/R-package/man/lgb.Dataset.set.categorical.Rd +++ b/R-package/man/lgb.Dataset.set.categorical.Rd @@ -22,6 +22,8 @@ Set the categorical features of an \code{lgb.Dataset} object. 
Use this function } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.set.reference.Rd b/R-package/man/lgb.Dataset.set.reference.Rd index 349b0b22913e..a4efbfac5962 100644 --- a/R-package/man/lgb.Dataset.set.reference.Rd +++ b/R-package/man/lgb.Dataset.set.reference.Rd @@ -19,6 +19,8 @@ If you want to use validation data, you should set reference to training data } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} # create training Dataset data(agaricus.train, package ="lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.configure_fast_predict.Rd b/R-package/man/lgb.configure_fast_predict.Rd index 39fe6afa6b18..e02600451df5 100644 --- a/R-package/man/lgb.configure_fast_predict.Rd +++ b/R-package/man/lgb.configure_fast_predict.Rd @@ -114,6 +114,8 @@ Calling this function multiple times with different parameters might not overrid } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} library(lightgbm) data(mtcars) X <- as.matrix(mtcars[, -1L]) diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index 555cb11c7bb3..7ea2928c6166 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -152,6 +152,8 @@ Cross validation logic used by LightGBM \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.dump.Rd b/R-package/man/lgb.dump.Rd index f4e90242fd75..39f0e3018ac7 100644 --- a/R-package/man/lgb.dump.Rd +++ b/R-package/man/lgb.dump.Rd @@ -20,6 +20,8 @@ Dump LightGBM model to json \examples{ \donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.get.eval.result.Rd b/R-package/man/lgb.get.eval.result.Rd index 9c2293a0f909..0dc7eb0845c3 100644 --- a/R-package/man/lgb.get.eval.result.Rd +++ b/R-package/man/lgb.get.eval.result.Rd @@ -33,6 +33,8 @@ Given a \code{lgb.Booster}, return evaluation results for a } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} # train a regression model data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.importance.Rd b/R-package/man/lgb.importance.Rd index 89a3d4e6b5b7..79cb82f5d8ef 100644 --- a/R-package/man/lgb.importance.Rd +++ b/R-package/man/lgb.importance.Rd @@ -25,6 +25,8 @@ Creates a \code{data.table} of feature importances in a model. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.interprete.Rd b/R-package/man/lgb.interprete.Rd index c1166b2c1cc9..3acc27955c46 100644 --- a/R-package/man/lgb.interprete.Rd +++ b/R-package/man/lgb.interprete.Rd @@ -30,6 +30,8 @@ Computes feature contribution components of rawscore prediction. 
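Every man-page change in this batch applies the same preamble. In roxygen form the pattern looks like the sketch below; code inside \dontshow{} is executed during R CMD check but hidden from the rendered help page, so the examples stay readable while keeping thread usage low on CRAN's shared check machines:

    #' @examples
    #' \donttest{
    #' \dontshow{setLGBMthreads(2L)}
    #' \dontshow{data.table::setDTthreads(1L)}
    #' data(agaricus.train, package = "lightgbm")
    #' train <- agaricus.train
    #' dtrain <- lgb.Dataset(train$data, label = train$label)
    #' }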
} \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} Logit <- function(x) log(x / (1.0 - x)) data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.load.Rd b/R-package/man/lgb.load.Rd index c1a00a20974b..f145db5a245e 100644 --- a/R-package/man/lgb.load.Rd +++ b/R-package/man/lgb.load.Rd @@ -20,6 +20,8 @@ Load LightGBM takes in either a file path or model string. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.model.dt.tree.Rd b/R-package/man/lgb.model.dt.tree.Rd index 4d02ede9a001..60ef8cdac133 100644 --- a/R-package/man/lgb.model.dt.tree.Rd +++ b/R-package/man/lgb.model.dt.tree.Rd @@ -40,6 +40,8 @@ Parse a LightGBM model json dump into a \code{data.table} structure. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.plot.importance.Rd b/R-package/man/lgb.plot.importance.Rd index 302f46460e3f..bdf354da0385 100644 --- a/R-package/man/lgb.plot.importance.Rd +++ b/R-package/man/lgb.plot.importance.Rd @@ -38,6 +38,8 @@ Features are shown ranked in a decreasing importance order. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.plot.interpretation.Rd b/R-package/man/lgb.plot.interpretation.Rd index a914071e896f..6f168e120a4e 100644 --- a/R-package/man/lgb.plot.interpretation.Rd +++ b/R-package/man/lgb.plot.interpretation.Rd @@ -35,6 +35,8 @@ contribution of a feature. Features are shown ranked in a decreasing contributio } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} Logit <- function(x) { log(x / (1.0 - x)) } diff --git a/R-package/man/lgb.restore_handle.Rd b/R-package/man/lgb.restore_handle.Rd index 95cbdc64485d..37922c077642 100644 --- a/R-package/man/lgb.restore_handle.Rd +++ b/R-package/man/lgb.restore_handle.Rd @@ -27,7 +27,10 @@ function. If you wish to make fast single-row predictions using a \code{lgb.Boos call \link{lgb.configure_fast_predict} on the loaded \code{lgb.Booster} object. } \examples{ +\donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data("agaricus.train") model <- lightgbm( agaricus.train$data @@ -45,6 +48,7 @@ model_new$check_null_handle() lgb.restore_handle(model_new) model_new$check_null_handle() } +} \seealso{ \link{lgb.make_serializable}, \link{lgb.drop_serialized}. } diff --git a/R-package/man/lgb.save.Rd b/R-package/man/lgb.save.Rd index efd110c7d816..62ec0ed462f6 100644 --- a/R-package/man/lgb.save.Rd +++ b/R-package/man/lgb.save.Rd @@ -21,6 +21,8 @@ Save LightGBM model } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 0f2961edc415..557c85b7f9dc 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -130,6 +130,8 @@ Low-level R interface to train a LightGBM model. 
Unlike \code{\link{lightgbm}}, \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index 2df13b9bc374..bcb2f3f980fb 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -121,6 +121,8 @@ If the model object has been configured for fast single-row predictions through } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/readRDS.lgb.Booster.Rd b/R-package/man/readRDS.lgb.Booster.Rd deleted file mode 100644 index 6a8e4c80ca91..000000000000 --- a/R-package/man/readRDS.lgb.Booster.Rd +++ /dev/null @@ -1,51 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readRDS.lgb.Booster.R -\name{readRDS.lgb.Booster} -\alias{readRDS.lgb.Booster} -\title{readRDS for \code{lgb.Booster} models (DEPRECATED)} -\usage{ -readRDS.lgb.Booster(file, refhook = NULL) -} -\arguments{ -\item{file}{a connection or the name of the file where the R object is saved to or read from.} - -\item{refhook}{a hook function for handling reference objects.} -} -\value{ -\code{lgb.Booster} -} -\description{ -Calls \code{readRDS} in what is expected to be a serialized \code{lgb.Booster} object, - and then restores its handle through \code{lgb.restore_handle}. - - \bold{This function throws a warning and will be removed in future versions.} -} -\examples{ -\donttest{ -library(lightgbm) -data(agaricus.train, package = "lightgbm") -train <- agaricus.train -dtrain <- lgb.Dataset(train$data, label = train$label) -data(agaricus.test, package = "lightgbm") -test <- agaricus.test -dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) -params <- list( - objective = "regression" - , metric = "l2" - , min_data = 1L - , learning_rate = 1.0 - , num_threads = 2L -) -valids <- list(test = dtest) -model <- lgb.train( - params = params - , data = dtrain - , nrounds = 10L - , valids = valids - , early_stopping_rounds = 5L -) -model_file <- tempfile(fileext = ".rds") -saveRDS.lgb.Booster(model, model_file) -new_model <- readRDS.lgb.Booster(model_file) -} -} diff --git a/R-package/man/saveRDS.lgb.Booster.Rd b/R-package/man/saveRDS.lgb.Booster.Rd deleted file mode 100644 index a8664243dce2..000000000000 --- a/R-package/man/saveRDS.lgb.Booster.Rd +++ /dev/null @@ -1,73 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/saveRDS.lgb.Booster.R -\name{saveRDS.lgb.Booster} -\alias{saveRDS.lgb.Booster} -\title{saveRDS for \code{lgb.Booster} models (DEPRECATED)} -\usage{ -saveRDS.lgb.Booster( - object, - file, - ascii = FALSE, - version = NULL, - compress = TRUE, - refhook = NULL, - raw = TRUE -) -} -\arguments{ -\item{object}{\code{lgb.Booster} object to serialize.} - -\item{file}{a connection or the name of the file where the R object is saved to or read from.} - -\item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), -a binary one is used. See the comments in the help for save.} - -\item{version}{the workspace format version to use. \code{NULL} specifies the current default -version (2). 
Versions prior to 2 are not supported, so this will only be relevant -when there are later versions.} - -\item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression, -or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of -compression to be used. Ignored if file is a connection.} - -\item{refhook}{a hook function for handling reference objects.} - -\item{raw}{whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}.} -} -\value{ -NULL invisibly. -} -\description{ -Calls \code{saveRDS} on an \code{lgb.Booster} object, making it serializable before the call if - it isn't already. - - \bold{This function throws a warning and will be removed in future versions.} -} -\examples{ -\donttest{ -library(lightgbm) -data(agaricus.train, package = "lightgbm") -train <- agaricus.train -dtrain <- lgb.Dataset(train$data, label = train$label) -data(agaricus.test, package = "lightgbm") -test <- agaricus.test -dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) -params <- list( - objective = "regression" - , metric = "l2" - , min_data = 1L - , learning_rate = 1.0 - , num_threads = 2L -) -valids <- list(test = dtest) -model <- lgb.train( - params = params - , data = dtrain - , nrounds = 10L - , valids = valids - , early_stopping_rounds = 5L -) -model_file <- tempfile(fileext = ".rds") -saveRDS.lgb.Booster(model, model_file) -} -} diff --git a/R-package/man/setLGBMThreads.Rd b/R-package/man/setLGBMThreads.Rd new file mode 100644 index 000000000000..53336fc2548e --- /dev/null +++ b/R-package/man/setLGBMThreads.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multithreading.R +\name{setLGBMThreads} +\alias{setLGBMThreads} +\alias{setLGBMthreads} +\title{Set maximum number of threads used by LightGBM} +\usage{ +setLGBMthreads(num_threads) +} +\arguments{ +\item{num_threads}{maximum number of threads to be used by LightGBM in multi-threaded operations} +} +\description{ +LightGBM attempts to speed up many operations by using multi-threading. + The number of threads used in those operations can be controlled via the + \code{num_threads} parameter passed through \code{params} to functions like + \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing + a model from a text file) are done via code paths that don't explicitly accept thread-control + configuration. + + Use this function to set the maximum number of threads LightGBM will use for such operations. + + This function affects all LightGBM operations in the same process. + + So, for example, if you call \code{setLGBMthreads(4)}, no other multi-threaded LightGBM + operation in the same process will use more than 4 threads. + + Call \code{setLGBMthreads(-1)} to remove this limitation. 
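A short usage sketch of the two new helpers, based on the documentation above and on the expectations encoded in R-package/tests/testthat/test_multithreading.R later in this diff:

    library(lightgbm)

    setLGBMthreads(2L)     # cap multi-threaded LightGBM operations in this process at 2 threads
    getLGBMthreads()       # 2L

    setLGBMthreads(1.0)    # numeric input is coerced to integer
    getLGBMthreads()       # 1L

    setLGBMthreads(-312L)  # any negative value removes the cap...
    getLGBMthreads()       # ...and reads back as -1L: LightGBM picks thread counts automatically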
+} +\seealso{ +\link{getLGBMthreads} +} diff --git a/R-package/man/set_field.Rd b/R-package/man/set_field.Rd index f9901e27eefd..2ceebfb87753 100644 --- a/R-package/man/set_field.Rd +++ b/R-package/man/set_field.Rd @@ -34,6 +34,8 @@ Set one attribute of a \code{lgb.Dataset} } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 1d7bec08de0f..a65809a239d8 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -23,6 +23,8 @@ Get a new \code{lgb.Dataset} containing the specified rows of } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/pkgdown/_pkgdown.yml b/R-package/pkgdown/_pkgdown.yml index 233a31f0ead9..99a3b1010d41 100644 --- a/R-package/pkgdown/_pkgdown.yml +++ b/R-package/pkgdown/_pkgdown.yml @@ -85,8 +85,6 @@ reference: - '`lgb.save`' - '`lgb.load`' - '`lgb.model.dt.tree`' - - '`saveRDS.lgb.Booster`' - - '`readRDS.lgb.Booster`' - title: Model Interpretation desc: Analyze your models contents: diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index ba9ef054bfab..c04263f62c1c 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -53,5 +53,6 @@ OBJECTS = \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ treelearner/voting_parallel_tree_learner.o \ + utils/openmp_wrapper.o \ c_api.o \ lightgbm_R.o diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 14f5afde002f..86d56fecdf34 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -54,5 +54,6 @@ OBJECTS = \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ treelearner/voting_parallel_tree_learner.o \ + utils/openmp_wrapper.o \ c_api.o \ lightgbm_R.o diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 21ba801a3a60..4799f8540497 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -40,7 +40,7 @@ void LGBM_R_save_exception_msg(const std::string &err); catch(std::exception& ex) { LGBM_R_save_exception_msg(ex); } \ catch(std::string& ex) { LGBM_R_save_exception_msg(ex); } \ catch(...) 
{ Rf_error("unknown exception"); } \ - Rf_error(R_errmsg_buffer); \ + Rf_error("%s", R_errmsg_buffer); \ return R_NilValue; /* <- won't be reached */ #define CHECK_CALL(x) \ @@ -226,9 +226,10 @@ SEXP LGBM_DatasetGetSubset_R(SEXP handle, int32_t len = static_cast(Rf_asInteger(len_used_row_indices)); std::vector idxvec(len); // convert from one-based to zero-based index + const int *used_row_indices_ = INTEGER(used_row_indices); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (len >= 1024) for (int32_t i = 0; i < len; ++i) { - idxvec[i] = static_cast(INTEGER(used_row_indices)[i] - 1); + idxvec[i] = static_cast(used_row_indices_[i] - 1); } const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters))); DatasetHandle res = nullptr; @@ -339,18 +340,20 @@ SEXP LGBM_DatasetSetField_R(SEXP handle, const char* name = CHAR(PROTECT(Rf_asChar(field_name))); if (!strcmp("group", name) || !strcmp("query", name)) { std::vector vec(len); + const int *field_data_ = INTEGER(field_data); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (len >= 1024) for (int i = 0; i < len; ++i) { - vec[i] = static_cast(INTEGER(field_data)[i]); + vec[i] = static_cast(field_data_[i]); } CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, vec.data(), len, C_API_DTYPE_INT32)); } else if (!strcmp("init_score", name)) { CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, REAL(field_data), len, C_API_DTYPE_FLOAT64)); } else { std::vector vec(len); + const double *field_data_ = REAL(field_data); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (len >= 1024) for (int i = 0; i < len; ++i) { - vec[i] = static_cast(REAL(field_data)[i]); + vec[i] = static_cast(field_data_[i]); } CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, vec.data(), len, C_API_DTYPE_FLOAT32)); } @@ -372,21 +375,24 @@ SEXP LGBM_DatasetGetField_R(SEXP handle, if (!strcmp("group", name) || !strcmp("query", name)) { auto p_data = reinterpret_cast(res); // convert from boundaries to size + int *field_data_ = INTEGER(field_data); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (out_len >= 1024) for (int i = 0; i < out_len - 1; ++i) { - INTEGER(field_data)[i] = p_data[i + 1] - p_data[i]; + field_data_[i] = p_data[i + 1] - p_data[i]; } } else if (!strcmp("init_score", name)) { auto p_data = reinterpret_cast(res); + double *field_data_ = REAL(field_data); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (out_len >= 1024) for (int i = 0; i < out_len; ++i) { - REAL(field_data)[i] = p_data[i]; + field_data_[i] = p_data[i]; } } else { auto p_data = reinterpret_cast(res); + double *field_data_ = REAL(field_data); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (out_len >= 1024) for (int i = 0; i < out_len; ++i) { - REAL(field_data)[i] = p_data[i]; + field_data_[i] = p_data[i]; } } UNPROTECT(1); @@ -611,10 +617,12 @@ SEXP LGBM_BoosterUpdateOneIterCustom_R(SEXP handle, int is_finished = 0; int int_len = Rf_asInteger(len); std::vector tgrad(int_len), thess(int_len); + const double *grad_ = REAL(grad); + const double *hess_ = REAL(hess); #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (int_len >= 1024) for (int j = 0; j < int_len; ++j) { - tgrad[j] = static_cast(REAL(grad)[j]); - thess[j] = static_cast(REAL(hess)[j]); + tgrad[j] = static_cast(grad_[j]); + thess[j] = static_cast(hess_[j]); } 
CHECK_CALL(LGBM_BoosterUpdateOneIterCustom(R_ExternalPtrAddr(handle), tgrad.data(), thess.data(), &is_finished)); return R_NilValue; @@ -1204,6 +1212,23 @@ SEXP LGBM_BoosterGetLoadedParam_R(SEXP handle) { R_API_END(); } +SEXP LGBM_GetMaxThreads_R(SEXP out) { + R_API_BEGIN(); + int num_threads; + CHECK_CALL(LGBM_GetMaxThreads(&num_threads)); + INTEGER(out)[0] = num_threads; + return R_NilValue; + R_API_END(); +} + +SEXP LGBM_SetMaxThreads_R(SEXP num_threads) { + R_API_BEGIN(); + int new_num_threads = Rf_asInteger(num_threads); + CHECK_CALL(LGBM_SetMaxThreads(new_num_threads)); + return R_NilValue; + R_API_END(); +} + // .Call() calls static const R_CallMethodDef CallEntries[] = { {"LGBM_HandleIsNull_R" , (DL_FUNC) &LGBM_HandleIsNull_R , 1}, @@ -1260,6 +1285,8 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterDumpModel_R" , (DL_FUNC) &LGBM_BoosterDumpModel_R , 3}, {"LGBM_NullBoosterHandleError_R" , (DL_FUNC) &LGBM_NullBoosterHandleError_R , 0}, {"LGBM_DumpParamAliases_R" , (DL_FUNC) &LGBM_DumpParamAliases_R , 0}, + {"LGBM_GetMaxThreads_R" , (DL_FUNC) &LGBM_GetMaxThreads_R , 1}, + {"LGBM_SetMaxThreads_R" , (DL_FUNC) &LGBM_SetMaxThreads_R , 1}, {NULL, NULL, 0} }; diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 7141a06a207c..4f0407e8f2ec 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -850,4 +850,23 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterDumpModel_R( */ LIGHTGBM_C_EXPORT SEXP LGBM_DumpParamAliases_R(); +/*! +* \brief Get current maximum number of threads used by LightGBM routines in this process. +* \param[out] out current maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_GetMaxThreads_R( + SEXP out +); + + +/*! +* \brief Set maximum number of threads used by LightGBM routines in this process. +* \param num_threads maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_SetMaxThreads_R( + SEXP num_threads +); + #endif // LIGHTGBM_R_H_ diff --git a/R-package/tests/testthat/helper.R b/R-package/tests/testthat/helper.R index 9c928c1f71d1..45edf40efbeb 100644 --- a/R-package/tests/testthat/helper.R +++ b/R-package/tests/testthat/helper.R @@ -11,6 +11,11 @@ # the check farm is a shared resource and will typically be running many checks simultaneously. # .LGB_MAX_THREADS <- 2L +setLGBMthreads(.LGB_MAX_THREADS) + +# control data.table parallelism +# ref: https://github.com/Rdatatable/data.table/issues/5658 +data.table::setDTthreads(1L) # by default, how much should results in tests be allowed to differ from hard-coded expected numbers? 
.LGB_NUMERIC_TOLERANCE <- 1e-6 diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index c1fc02630c13..e6b0e8abda64 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -1252,45 +1252,6 @@ test_that("lgb.cv() correctly handles passing through params to the model file", }) -test_that("params (including dataset params) should be stored in .rds file for Booster", { - data(agaricus.train, package = "lightgbm") - dtrain <- lgb.Dataset( - agaricus.train$data - , label = agaricus.train$label - , params = list( - max_bin = 17L - ) - ) - params <- list( - objective = "binary" - , max_depth = 4L - , bagging_fraction = 0.8 - , verbose = .LGB_VERBOSITY - , num_threads = .LGB_MAX_THREADS - ) - bst <- Booster$new( - params = params - , train_set = dtrain - ) - bst_file <- tempfile(fileext = ".rds") - expect_warning(saveRDS.lgb.Booster(bst, file = bst_file)) - - expect_warning({ - bst_from_file <- readRDS.lgb.Booster(file = bst_file) - }) - expect_identical( - bst_from_file$params - , list( - objective = "binary" - , max_depth = 4L - , bagging_fraction = 0.8 - , verbose = .LGB_VERBOSITY - , num_threads = .LGB_MAX_THREADS - , max_bin = 17L - ) - ) -}) - test_that("params (including dataset params) should be stored in .rds file for Booster", { data(agaricus.train, package = "lightgbm") dtrain <- lgb.Dataset( @@ -1350,46 +1311,6 @@ test_that("Handle is automatically restored when calling predict", { expect_equal(pred_before, pred_after) }) -test_that("boosters with linear models at leaves work with saveRDS.lgb.Booster and readRDS.lgb.Booster", { - X <- matrix(rnorm(100L), ncol = 1L) - labels <- 2L * X + runif(nrow(X), 0L, 0.1) - dtrain <- lgb.Dataset( - data = X - , label = labels - ) - - params <- list( - objective = "regression" - , verbose = .LGB_VERBOSITY - , metric = "mse" - , seed = 0L - , num_leaves = 2L - , num_threads = .LGB_MAX_THREADS - ) - - bst <- lgb.train( - data = dtrain - , nrounds = 10L - , params = params - ) - expect_true(.is_Booster(bst)) - - # save predictions, then write the model to a file and destroy it in R - preds <- predict(bst, X) - model_file <- tempfile(fileext = ".rds") - expect_warning(saveRDS.lgb.Booster(bst, file = model_file)) - bst$finalize() - expect_null(bst$.__enclos_env__$private$handle) - rm(bst) - - # load the booster and make predictions...should be the same - expect_warning({ - bst2 <- readRDS.lgb.Booster(file = model_file) - }) - preds2 <- predict(bst2, X) - expect_identical(preds, preds2) -}) - test_that("boosters with linear models at leaves can be written to RDS and re-loaded successfully", { X <- matrix(rnorm(100L), ncol = 1L) labels <- 2L * X + runif(nrow(X), 0L, 0.1) diff --git a/R-package/tests/testthat/test_multithreading.R b/R-package/tests/testthat/test_multithreading.R new file mode 100644 index 000000000000..e2f3169627a2 --- /dev/null +++ b/R-package/tests/testthat/test_multithreading.R @@ -0,0 +1,16 @@ +test_that("getLGBMthreads() and setLGBMthreads() work as expected", { + # works with integer input + ret <- setLGBMthreads(2L) + expect_null(ret) + expect_equal(getLGBMthreads(), 2L) + + # works with float input + ret <- setLGBMthreads(1.0) + expect_null(ret) + expect_equal(getLGBMthreads(), 1L) + + # setting to any negative number sets max threads to -1 + ret <- setLGBMthreads(-312L) + expect_null(ret) + expect_equal(getLGBMthreads(), -1L) +}) diff --git a/R-package/vignettes/basic_walkthrough.Rmd b/R-package/vignettes/basic_walkthrough.Rmd 
index d7aaf676f386..82bd6957640c 100644
--- a/R-package/vignettes/basic_walkthrough.Rmd
+++ b/R-package/vignettes/basic_walkthrough.Rmd
@@ -27,6 +27,12 @@ Welcome to the world of [LightGBM](https://lightgbm.readthedocs.io/en/latest/),
library(lightgbm)
```
+```{r, include=FALSE}
+# limit number of threads used, to be respectful of CRAN's resources when it checks this vignette
+data.table::setDTthreads(1L)
+setLGBMthreads(2L)
+```
+
This vignette will guide you through its basic usage. It will show how to build a simple binary classification model based on a subset of the `bank` dataset (Moro, Cortez, and Rita 2014). You will use the two input features "age" and "balance" to predict whether a client has subscribed a term deposit.
## The dataset
diff --git a/README.md b/README.md
index f6f4e8c570e0..f3f63404b399 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,6 @@ Next you may want to read:
- [**Features**](https://github.com/microsoft/LightGBM/blob/master/docs/Features.rst) and algorithms supported by LightGBM.
- [**Parameters**](https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst) is an exhaustive list of customization you can make.
- [**Distributed Learning**](https://github.com/microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst) and [**GPU Learning**](https://github.com/microsoft/LightGBM/blob/master/docs/GPU-Tutorial.rst) can speed up computation.
-- [**Laurae++ interactive documentation**](https://sites.google.com/view/lauraepp/parameters) is a detailed guide for hyperparameters.
- [**FLAML**](https://www.microsoft.com/en-us/research/project/fast-and-lightweight-automl-for-large-scale-data/articles/flaml-a-fast-and-lightweight-automl-library/) provides automated tuning for LightGBM ([code examples](https://microsoft.github.io/FLAML/docs/Examples/AutoML-for-LightGBM/)).
- [**Optuna Hyperparameter Tuner**](https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258) provides automated tuning for LightGBM hyperparameters ([code examples](https://github.com/optuna/optuna-examples/blob/main/lightgbm/lightgbm_tuner_simple.py)).
- [**Understanding LightGBM Parameters (and How to Tune Them using Neptune)**](https://neptune.ai/blog/lightgbm-parameters-guide).
diff --git a/VERSION.txt b/VERSION.txt
index 1f06da0058c9..67dae3929837 100644
--- a/VERSION.txt
+++ b/VERSION.txt
@@ -1 +1 @@
-4.1.0.99
+4.2.0.99
diff --git a/build-cran-package.sh b/build-cran-package.sh
index 1c8a5dfbdc48..9fa0c5877085 100755
--- a/build-cran-package.sh
+++ b/build-cran-package.sh
@@ -227,6 +227,7 @@ if ${BUILD_VIGNETTES} ; then
    rm -f ./lightgbm/src/network/*.o
    rm -f ./lightgbm/src/objective/*.o
    rm -f ./lightgbm/src/treelearner/*.o
+    rm -f ./lightgbm/src/utils/*.o
    echo "re-tarring ${TARBALL_NAME}"
    tar \
diff --git a/docs/FAQ.rst b/docs/FAQ.rst
index 2e0002cb6bc1..43999931ca07 100644
--- a/docs/FAQ.rst
+++ b/docs/FAQ.rst
@@ -11,44 +11,15 @@ LightGBM FAQ
------
-Critical Issues
-===============
+Please post questions, feature requests, and bug reports at https://github.com/microsoft/LightGBM/issues.
-A **critical issue** could be a *crash*, *prediction error*, *nonsense output*, or something else requiring immediate attention.
+This project is mostly maintained by volunteers, so please be patient.
+If your request is time-sensitive or more than a month goes by without a response, please tag the maintainers below for help.
-Please post such an issue in the `Microsoft/LightGBM repository <https://github.com/microsoft/LightGBM/issues>`__.
- -You may also ping a member of the core team according to the relevant area of expertise by mentioning them with the arabase (@) symbol: - -- `@guolinke `__ **Guolin Ke** (C++ code / R-package / Python-package) -- `@chivee `__ **Qiwei Ye** (C++ code / Python-package) -- `@shiyu1994 `__ **Yu Shi** (C++ code / Python-package) -- `@tongwu-msft` **Tong Wu** (C++ code / Python-package) -- `@hzy46 `__ **Zhiyuan He** (C++ code / Python-package) -- `@btrotta `__ **Belinda Trotta** (C++ code) -- `@Laurae2 `__ **Damien Soukhavong** (R-package) -- `@jameslamb `__ **James Lamb** (R-package / Dask-package) -- `@jmoralez `__ **José Morales** (Dask-package) -- `@wxchan `__ **Wenxuan Chen** (Python-package) -- `@henry0312 `__ **Tsukasa Omoto** (Python-package) -- `@StrikerRUS `__ **Nikita Titov** (Python-package) -- `@huanzhang12 `__ **Huan Zhang** (GPU support) - -Please include as much of the following information as possible when submitting a critical issue: - -- Is it reproducible on CLI (command line interface), R, and/or Python? - -- Is it specific to a wrapper? (R or Python?) - -- Is it specific to the compiler? (gcc or Clang version? MinGW or Visual Studio version?) - -- Is it specific to your Operating System? (Windows? Linux? macOS?) - -- Are you able to reproduce this issue with a simple case? - -- Does the issue persist after removing all optimization flags and compiling LightGBM in debug mode? - -When submitting issues, please keep in mind that this is largely a volunteer effort, and we may not be available 24/7 to provide support. +- `@guolinke `__ **Guolin Ke** +- `@shiyu1994 `__ **Yu Shi** +- `@jameslamb `__ **James Lamb** +- `@jmoralez `__ **José Morales** -------------- @@ -62,7 +33,7 @@ General LightGBM Questions 1. Where do I find more details about LightGBM parameters? ---------------------------------------------------------- -Take a look at `Parameters <./Parameters.rst>`__ and the `Laurae++/Parameters `__ website. +Take a look at `Parameters <./Parameters.rst>`__. 2. On datasets with millions of features, training does not start (or starts after a very long time). ----------------------------------------------------------------------------------------------------- @@ -263,7 +234,7 @@ As of LightGBM v4.0.0, ``setinfo()`` has been replaced by a new method, ``set_fi 3. ``error in data.table::data.table()...argument 2 is NULL`` ------------------------------------------------------------- -If you are experiencing this error when running ``lightgbm``, you may be facing the same issue reported in `#2715 `_ and later in `#2989 `_. We have seen that some in some situations, using ``data.table`` 1.11.x results in this error. To get around this, you can upgrade your version of ``data.table`` to at least version 1.12.0. +If you are experiencing this error when running ``lightgbm``, you may be facing the same issue reported in `#2715 `_ and later in `#2989 `_. We have seen that in some situations, using ``data.table`` 1.11.x results in this error. To get around this, you can upgrade your version of ``data.table`` to at least version 1.12.0. ------ diff --git a/docs/GPU-Targets.rst b/docs/GPU-Targets.rst index 9c3cac7c814a..ab024847d82d 100644 --- a/docs/GPU-Targets.rst +++ b/docs/GPU-Targets.rst @@ -4,7 +4,7 @@ GPU SDK Correspondence and Device Targeting Table GPU Targets Table ================= -OpenCL is a universal massively parallel programming framework that targets to multiple backends (GPU, CPU, FPGA, etc). 
+OpenCL is a universal massively parallel programming framework that targets multiple backends (GPU, CPU, FPGA, etc).
 Basically, to use a device from a vendor, you have to install drivers from that specific vendor.
 Intel's and AMD's OpenCL runtime also include x86 CPU target support. NVIDIA's OpenCL runtime only supports NVIDIA GPU (no CPU support).
diff --git a/docs/GPU-Tutorial.rst b/docs/GPU-Tutorial.rst
index 836ab1add378..ee1d3173e556 100644
--- a/docs/GPU-Tutorial.rst
+++ b/docs/GPU-Tutorial.rst
@@ -33,7 +33,7 @@ After installing the drivers you need to restart the server.
 
 After about 30 seconds, the server should be up again.
 
-If you are using an AMD GPU, you should download and install the `AMDGPU-Pro`_ driver and also install package ``ocl-icd-libopencl1`` and ``ocl-icd-opencl-dev``.
+If you are using an AMD GPU, you should download and install the `AMDGPU-Pro`_ driver and also install packages ``ocl-icd-libopencl1`` and ``ocl-icd-opencl-dev``.
 
 Build LightGBM
 --------------
diff --git a/docs/GPU-Windows.rst b/docs/GPU-Windows.rst
index 36e657e5801b..7603997d2d97 100644
--- a/docs/GPU-Windows.rst
+++ b/docs/GPU-Windows.rst
@@ -152,7 +152,7 @@ Download `Prebuilt Boost x86_64`_ or `Prebuilt Boost i686`_ and unpack them wit
 Boost Compilation
 -----------------
 
-Installing Boost requires to download Boost and to install it.
+Installing Boost requires downloading it and then building it.
 It takes about 10 minutes to several hours depending on your CPU speed and network speed.
 
 We will assume an installation in ``C:\boost`` and a general installation (like in Unix variants: without versioning and without type tags).
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 329f9c38656e..341cdd487c71 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -14,10 +14,6 @@ This page contains descriptions of all parameters in LightGBM.
 
 - `Parameters Tuning <./Parameters-Tuning.rst>`__
 
-**External Links**
-
-- `Laurae++ Interactive Documentation`_
-
 Parameters Format
 -----------------
 
@@ -1380,5 +1376,3 @@ If the name of data file is ``train.txt``, the query file should be named as ``t
 In this case, LightGBM will load the query file automatically if it exists.
 
 Also, you can include query/group id column in your data file. Please refer to the ``group_column`` `parameter <#group_column>`__ in above.
-
-.. _Laurae++ Interactive Documentation: https://sites.google.com/view/lauraepp/parameters
diff --git a/docs/Quick-Start.rst b/docs/Quick-Start.rst
index 30b0b3c228a2..4a372db6736e 100644
--- a/docs/Quick-Start.rst
+++ b/docs/Quick-Start.rst
@@ -50,7 +50,7 @@ The parameters format is ``key1=value1 key2=value2 ...``.
 Parameters can be set both in config file and command line.
 If one parameter appears in both command line and config file, LightGBM will use the parameter from the command line.
 
-The most important parameters which new users should take a look to are located into `Core Parameters <./Parameters.rst#core-parameters>`__
+The most important parameters which new users should take a look at are located in the `Core Parameters <./Parameters.rst#core-parameters>`__
 and the top of `Learning Control Parameters <./Parameters.rst#learning-control-parameters>`__
 sections of the full detailed list of `LightGBM's parameters <./Parameters.rst>`__.
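As a concrete illustration of the core parameters that the Quick-Start passage above points new users to, here is a minimal hedged sketch in Python (the values are arbitrary placeholders, not tuned recommendations; in the Python package the same `key=value` parameters are supplied as a dict):

```python
import numpy as np
import lightgbm as lgb

# toy data; shapes are illustrative only
rng = np.random.default_rng(42)
X = rng.random((500, 5))
y = rng.random(500)

# a few of the "Core Parameters" referenced above
params = {
    "objective": "regression",  # the learning task
    "num_leaves": 31,           # main model-complexity control
    "learning_rate": 0.1,       # shrinkage rate
    "num_threads": 2,           # degree of parallelism
    "verbosity": -1,
}
booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=10)
```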
diff --git a/include/LightGBM/arrow.h b/include/LightGBM/arrow.h
index 3d1c74713bd3..767da12a9809 100644
--- a/include/LightGBM/arrow.h
+++ b/include/LightGBM/arrow.h
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 
 /* -------------------------------------- C DATA INTERFACE ------------------------------------- */
 // The C data interface is taken from
@@ -117,6 +118,7 @@ class ArrowChunkedArray {
                     const struct ArrowSchema* schema) {
     chunks_.reserve(n_chunks);
     for (auto k = 0; k < n_chunks; ++k) {
+      if (chunks[k].length == 0) continue;
       chunks_.push_back(&chunks[k]);
     }
     schema_ = schema;
@@ -220,6 +222,7 @@ class ArrowTable {
       std::vector<const ArrowArray*> children_chunks;
       children_chunks.reserve(n_chunks);
       for (int64_t k = 0; k < n_chunks; ++k) {
+        if (chunks[k].length == 0) continue;
        children_chunks.push_back(chunks[k].children[j]);
       }
       columns_.emplace_back(children_chunks, schema->children[j]);
diff --git a/include/LightGBM/arrow.tpp b/include/LightGBM/arrow.tpp
index 67b481c9497e..8d1ce4f4c0c1 100644
--- a/include/LightGBM/arrow.tpp
+++ b/include/LightGBM/arrow.tpp
@@ -144,7 +144,7 @@ struct ArrayIndexAccessor {
     // - The structure of validity bitmasks is taken from here:
     //   https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
     // - If the bitmask is NULL, all indices are valid
-    if (validity == nullptr || !(validity[buffer_idx / 8] & (1 << (buffer_idx % 8)))) {
+    if (validity == nullptr || (validity[buffer_idx / 8] & (1 << (buffer_idx % 8)))) {
       // In case the index is valid, we take it from the data buffer
       auto data = static_cast<const T*>(array->buffers[1]);
       return static_cast<double>(data[buffer_idx]);
diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h
index fd337cbc7cbe..b43f096c31ee 100644
--- a/include/LightGBM/c_api.h
+++ b/include/LightGBM/c_api.h
@@ -558,9 +558,11 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle,
 /*!
  * \brief Set vector to a content in info.
  * \note
- * - \a label and \a weight convert input datatype into ``float32``.
+ * - \a group converts input datatype into ``int32``;
+ * - \a label and \a weight convert input datatype into ``float32``;
+ * - \a init_score converts input datatype into ``float64``.
  * \param handle Handle of dataset
- * \param field_name Field name, can be \a label, \a weight
+ * \param field_name Field name, can be \a label, \a weight, \a init_score, \a group
  * \param n_chunks The number of Arrow arrays passed to this function
  * \param chunks Pointer to the list of Arrow arrays
  * \param schema Pointer to the schema of all Arrow arrays
@@ -1415,6 +1417,40 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle,
                                                  int64_t* out_len,
                                                  double* out_result);
 
+/*!
+ * \brief Make prediction for a new dataset.
+ * \note
+ * You should pre-allocate memory for ``out_result``:
+ *   - for normal and raw score, its length is equal to ``num_class * num_data``;
+ *   - for leaf index, its length is equal to ``num_class * num_data * num_iteration``;
+ *   - for feature contributions, its length is equal to ``num_class * num_data * (num_feature + 1)``.
+ * \param handle Handle of booster
+ * \param n_chunks The number of Arrow arrays passed to this function
+ * \param chunks Pointer to the list of Arrow arrays
+ * \param schema Pointer to the schema of all Arrow arrays
+ * \param predict_type What should be predicted
+ *   - ``C_API_PREDICT_NORMAL``: normal prediction, with transform (if needed);
+ *   - ``C_API_PREDICT_RAW_SCORE``: raw score;
+ *   - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
+ *   - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param start_iteration Start index of the iteration to predict
+ * \param num_iteration Number of iterations for prediction, <= 0 means no limit
+ * \param parameter Other parameters for prediction, e.g. early stopping for prediction
+ * \param[out] out_len Length of output result
+ * \param[out] out_result Pointer to array with predictions
+ * \return 0 when succeed, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForArrow(BoosterHandle handle,
+                                                  int64_t n_chunks,
+                                                  const ArrowArray* chunks,
+                                                  const ArrowSchema* schema,
+                                                  int predict_type,
+                                                  int start_iteration,
+                                                  int num_iteration,
+                                                  const char* parameter,
+                                                  int64_t* out_len,
+                                                  double* out_result);
+
 /*!
  * \brief Save model into file.
  * \param handle Handle of booster
@@ -1559,6 +1595,20 @@ LIGHTGBM_C_EXPORT int LGBM_NetworkInitWithFunctions(int num_machines,
                                                     void* reduce_scatter_ext_fun,
                                                     void* allgather_ext_fun);
 
+/*!
+ * \brief Set maximum number of threads used by LightGBM routines in this process.
+ * \param num_threads maximum number of threads used by LightGBM. -1 means defaulting to omp_get_max_threads().
+ * \return 0 when succeed, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_SetMaxThreads(int num_threads);
+
+/*!
+ * \brief Get current maximum number of threads used by LightGBM routines in this process.
+ * \param[out] out current maximum number of threads used by LightGBM. -1 means defaulting to omp_get_max_threads().
+ * \return 0 when succeed, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_GetMaxThreads(int* out);
+
 #if !defined(__cplusplus) && (!defined(__STDC__) || (__STDC_VERSION__ < 199901L))
 /*! \brief Inline specifier no-op in C using standards before C99. */
 #define INLINE_FUNCTION
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 48c1bee804d7..220a1f9f009c 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -116,6 +116,7 @@ class Metadata {
   void SetWeights(const ArrowChunkedArray& array);
 
   void SetQuery(const data_size_t* query, data_size_t len);
+  void SetQuery(const ArrowChunkedArray& array);
 
   void SetPosition(const data_size_t* position, data_size_t len);
 
@@ -124,6 +125,7 @@ class Metadata {
   * \param init_score Initial scores, this class will manage memory for init_score.
   */
   void SetInitScore(const double* init_score, data_size_t len);
+  void SetInitScore(const ArrowChunkedArray& array);
 
   /*!
@@ -346,8 +348,14 @@ class Metadata {
   void SetWeightsFromIterator(It first, It last);
   /*! \brief Insert initial scores at the given index */
   void InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size);
+  /*! \brief Set init scores from pointers to the first element and the end of an iterator. */
+  template <typename It>
+  void SetInitScoresFromIterator(It first, It last);
   /*! \brief Insert queries at the given index */
   void InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len);
+  /*! \brief Set queries from pointers to the first element and the end of an iterator. */
+  template <typename It>
+  void SetQueriesFromIterator(It first, It last);
   /*! \brief Filename of current data */
   std::string data_filename_;
   /*! \brief Number of data */
diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h
index a337fc353b75..b9a8ea2982fc 100644
--- a/include/LightGBM/utils/openmp_wrapper.h
+++ b/include/LightGBM/utils/openmp_wrapper.h
@@ -5,6 +5,15 @@
 #ifndef LIGHTGBM_OPENMP_WRAPPER_H_
 #define LIGHTGBM_OPENMP_WRAPPER_H_
 
+#include <LightGBM/export.h>
+
+// this can only be changed by LGBM_SetMaxThreads()
+LIGHTGBM_EXTERN_C int LGBM_MAX_NUM_THREADS;
+
+// this is modified by OMP_SET_NUM_THREADS(), for example
+// by passing num_thread through params
+LIGHTGBM_EXTERN_C int LGBM_DEFAULT_NUM_THREADS;
+
 #ifdef _OPENMP
 
 #include <omp.h>
 
@@ -17,22 +26,25 @@
 #include <stdexcept>
 #include <vector>
 
-inline int OMP_NUM_THREADS() {
-  int ret = 1;
-#pragma omp parallel
-#pragma omp master
-  { ret = omp_get_num_threads(); }
-  return ret;
-}
-
-inline void OMP_SET_NUM_THREADS(int num_threads) {
-  static const int default_omp_num_threads = OMP_NUM_THREADS();
-  if (num_threads > 0) {
-    omp_set_num_threads(num_threads);
-  } else {
-    omp_set_num_threads(default_omp_num_threads);
-  }
-}
+/*
+  Get number of threads to use in OpenMP parallel regions.
+
+  By default, this will return the result of omp_get_max_threads(),
+  which is OpenMP-implementation dependent but generally can be controlled
+  by environment variable OMP_NUM_THREADS.
+
+  ref:
+    - https://www.openmp.org/spec-html/5.0/openmpsu112.html
+    - https://gcc.gnu.org/onlinedocs/libgomp/omp_005fget_005fmax_005fthreads.html
+*/
+LIGHTGBM_EXTERN_C int OMP_NUM_THREADS();
+
+/*
+  Update the default number of threads that'll be used in OpenMP parallel
+  regions for LightGBM routines where the number of threads isn't directly
+  supplied.
+*/
+LIGHTGBM_EXTERN_C void OMP_SET_NUM_THREADS(int num_threads);
 
 class ThreadExceptionHelper {
  public:
@@ -102,10 +114,7 @@ class ThreadExceptionHelper {
   /** Fall here if no OPENMP support, so just simulate a single thread running.
All #pragma omp should be ignored by the compiler **/ - inline void omp_set_num_threads(int) __GOMP_NOTHROW {} // NOLINT (no cast done here) inline void OMP_SET_NUM_THREADS(int) __GOMP_NOTHROW {} - inline int omp_get_num_threads() __GOMP_NOTHROW {return 1;} - inline int omp_get_max_threads() __GOMP_NOTHROW {return 1;} inline int omp_get_thread_num() __GOMP_NOTHROW {return 0;} inline int OMP_NUM_THREADS() __GOMP_NOTHROW { return 1; } #ifdef __cplusplus diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 939842df3389..560a9a438872 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -19,8 +19,8 @@ import scipy.sparse from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat, - dt_DataTable, pa_Array, pa_ChunkedArray, pa_compute, pa_Table, pd_CategoricalDtype, pd_DataFrame, - pd_Series) + dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table, + pd_CategoricalDtype, pd_DataFrame, pd_Series) from .libpath import find_lib_path if TYPE_CHECKING: @@ -70,7 +70,9 @@ List[float], List[int], np.ndarray, - pd_Series + pd_Series, + pa_Array, + pa_ChunkedArray, ] _LGBM_PositionType = Union[ np.ndarray, @@ -82,6 +84,9 @@ np.ndarray, pd_Series, pd_DataFrame, + pa_Table, + pa_Array, + pa_ChunkedArray, ] _LGBM_TrainDataType = Union[ str, @@ -110,7 +115,8 @@ np.ndarray, pd_DataFrame, dt_DataTable, - scipy.sparse.spmatrix + scipy.sparse.spmatrix, + pa_Table, ] _LGBM_WeightType = Union[ List[float], @@ -758,6 +764,23 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: f'Fields with bad pandas dtypes: {", ".join(bad_pandas_dtypes)}') +def _pandas_to_numpy( + data: pd_DataFrame, + target_dtype: "np.typing.DTypeLike" +) -> np.ndarray: + _check_for_bad_pandas_dtypes(data.dtypes) + try: + # most common case (no nullable dtypes) + return data.to_numpy(dtype=target_dtype, copy=False) + except TypeError: + # 1.0 <= pd version < 1.1 and nullable dtypes, least common case + # raises error because array is casted to type(pd.NA) and there's no na_value argument + return data.astype(target_dtype, copy=False).values + except ValueError: + # data has nullable dtypes, but we can specify na_value argument and copy will be made + return data.to_numpy(dtype=target_dtype, na_value=np.nan) + + def _data_from_pandas( data: pd_DataFrame, feature_name: _LGBM_FeatureNameConfiguration, @@ -767,6 +790,10 @@ def _data_from_pandas( if len(data.shape) != 2 or data.shape[0] < 1: raise ValueError('Input data must be 2 dimensional and non empty.') + # take shallow copy in case we modify categorical columns + # whole column modifications don't change the original df + data = data.copy(deep=False) + # determine feature names if feature_name == 'auto': feature_name = [str(col) for col in data.columns] @@ -783,29 +810,23 @@ def _data_from_pandas( if list(data[col].cat.categories) != list(category): data[col] = data[col].cat.set_categories(category) if len(cat_cols): # cat_cols is list - data = data.copy(deep=False) # not alter origin DataFrame data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) if categorical_feature == 'auto': # use cat cols from DataFrame categorical_feature = cat_cols_not_ordered else: # use cat cols specified by user categorical_feature = list(categorical_feature) # type: ignore[assignment] - # get numpy representation of the data - _check_for_bad_pandas_dtypes(data.dtypes) df_dtypes = [dtype.type for dtype in 
data.dtypes] - df_dtypes.append(np.float32) # so that the target dtype considers floats + # so that the target dtype considers floats + df_dtypes.append(np.float32) target_dtype = np.result_type(*df_dtypes) - try: - # most common case (no nullable dtypes) - data = data.to_numpy(dtype=target_dtype, copy=False) - except TypeError: - # 1.0 <= pd version < 1.1 and nullable dtypes, least common case - # raises error because array is casted to type(pd.NA) and there's no na_value argument - data = data.astype(target_dtype, copy=False).values - except ValueError: - # data has nullable dtypes, but we can specify na_value argument and copy will be made - data = data.to_numpy(dtype=target_dtype, na_value=np.nan) - return data, feature_name, categorical_feature, pandas_categorical + + return ( + _pandas_to_numpy(data, target_dtype=target_dtype), + feature_name, + categorical_feature, + pandas_categorical + ) def _dump_pandas_categorical( @@ -1049,7 +1070,7 @@ def predict( Parameters ---------- - data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse + data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame or scipy.sparse Data source for prediction. If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). start_iteration : int, optional (default=0) @@ -1141,6 +1162,13 @@ def predict( num_iteration=num_iteration, predict_type=predict_type ) + elif _is_pyarrow_table(data): + preds, nrow = self.__pred_for_pyarrow_table( + table=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) elif isinstance(data, list): try: data = np.array(data) @@ -1594,6 +1622,48 @@ def __pred_for_csc( if n_preds != out_num_preds.value: raise ValueError("Wrong length for predict results") return preds, nrow + + def __pred_for_pyarrow_table( + self, + table: pa_Table, + start_iteration: int, + num_iteration: int, + predict_type: int + ) -> Tuple[np.ndarray, int]: + """Predict for a PyArrow table.""" + if not PYARROW_INSTALLED: + raise LightGBMError("Cannot predict from Arrow without `pyarrow` installed.") + + # Check that the input is valid: we only handle numbers (for now) + if not all(arrow_is_integer(t) or arrow_is_floating(t) for t in table.schema.types): + raise ValueError("Arrow table may only have integer or floating point datatypes") + + # Prepare prediction output array + n_preds = self.__get_num_preds( + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=table.num_rows, + predict_type=predict_type + ) + preds = np.empty(n_preds, dtype=np.float64) + out_num_preds = ctypes.c_int64(0) + + # Export Arrow table to C and run prediction + c_array = _export_arrow_to_c(table) + _safe_call(_LIB.LGBM_BoosterPredictForArrow( + self._handle, + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + if n_preds != out_num_preds.value: + raise ValueError("Wrong length for predict results") + return preds, table.num_rows def current_iteration(self) -> int: """Get the index of the current iteration. @@ -1640,13 +1710,13 @@ def __init__( If this is Dataset for validation, training data should be used as reference. 
weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Weight for each instance. Weights should be non-negative. - group : list, numpy 1-D array, pandas Series or None, optional (default=None) + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for Dataset. feature_name : list of str, or 'auto', optional (default="auto") Feature names. @@ -2420,13 +2490,13 @@ def create_valid( Label of the data. weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Weight for each instance. Weights should be non-negative. - group : list, numpy 1-D array, pandas Series or None, optional (default=None) + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for Dataset. params : dict or None, optional (default=None) Other parameters for validation Dataset. @@ -2533,7 +2603,7 @@ def _reverse_update_params(self) -> "Dataset": def set_field( self, field_name: str, - data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Array, pa_ChunkedArray]] + data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Table, pa_Array, pa_ChunkedArray]] ) -> "Dataset": """Set property into the Dataset. @@ -2562,7 +2632,16 @@ def set_field( return self # If the data is a arrow data, we can just pass it to C - if _is_pyarrow_array(data): + if _is_pyarrow_array(data) or _is_pyarrow_table(data): + # If a table is being passed, we concatenate the columns. This is only valid for + # 'init_score'. 
+ if _is_pyarrow_table(data): + if field_name != "init_score": + raise ValueError(f"pyarrow tables are not supported for field '{field_name}'") + data = pa_chunked_array([ + chunk for array in data.columns for chunk in array.chunks # type: ignore + ]) + c_array = _export_arrow_to_c(data) _safe_call(_LIB.LGBM_DatasetSetFieldFromArrow( self._handle, @@ -2805,18 +2884,7 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset": if isinstance(label, pd_DataFrame): if len(label.columns) > 1: raise ValueError('DataFrame for label cannot have multiple columns') - _check_for_bad_pandas_dtypes(label.dtypes) - try: - # most common case (no nullable dtypes) - label = label.to_numpy(dtype=np.float32, copy=False) - except TypeError: - # 1.0 <= pd version < 1.1 and nullable dtypes, least common case - # raises error because array is casted to type(pd.NA) and there's no na_value argument - label = label.astype(np.float32, copy=False).values - except ValueError: - # data has nullable dtypes, but we can specify na_value argument and copy will be made - label = label.to_numpy(dtype=np.float32, na_value=np.nan) - label_array = np.ravel(label) + label_array = np.ravel(_pandas_to_numpy(label, target_dtype=np.float32)) elif _is_pyarrow_array(label): label_array = label else: @@ -2866,7 +2934,7 @@ def set_init_score( Parameters ---------- - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None Init score for Booster. Returns @@ -2888,7 +2956,7 @@ def set_group( Parameters ---------- - group : list, numpy 1-D array, pandas Series or None + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. 
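To make the new `set_field` code path above concrete, here is a hedged sketch (toy shapes and column names of my own choosing, assuming `pyarrow` is installed) of passing a pyarrow Table as `init_score` for a multi-class Dataset; as the branch above shows, the table's columns are concatenated into a single chunked array before being handed to the C library:

```python
import numpy as np
import pyarrow as pa
import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.random((100, 3))
y = rng.integers(0, 3, size=100)

# one init-score column per class; set_field("init_score", ...) flattens
# the table column by column into one chunked array
init_scores = pa.Table.from_arrays(
    [pa.array(np.zeros(100)) for _ in range(3)],
    names=["class_0", "class_1", "class_2"],
)

ds = lgb.Dataset(X, label=y, init_score=init_scores)
ds.construct()
print(ds.get_init_score().shape)  # expected: (100, 3)
```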
@@ -2902,7 +2970,8 @@ def set_group( """ self.group = group if self._handle is not None and group is not None: - group = _list_to_1d_numpy(group, dtype=np.int32, name='group') + if not _is_pyarrow_array(group): + group = _list_to_1d_numpy(group, dtype=np.int32, name='group') self.set_field('group', group) # original values can be modified at cpp side constructed_group = self.get_field('group') @@ -2947,7 +3016,7 @@ def get_feature_name(self) -> List[str]: reserved_string_buffer_size = 255 required_string_buffer_size = ctypes.c_size_t(0) string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_DatasetGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -2961,7 +3030,7 @@ def get_feature_name(self) -> List[str]: # if buffer length is not long enough, reallocate buffers if reserved_string_buffer_size < actual_string_buffer_size: string_buffers = [ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_DatasetGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -4331,7 +4400,7 @@ def predict( Parameters ---------- - data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse + data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame or scipy.sparse Data source for prediction. If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). start_iteration : int, optional (default=0) @@ -4430,7 +4499,7 @@ def refit( .. versionadded:: 4.0.0 - group : list, numpy 1-D array, pandas Series or None, optional (default=None) + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Group/query size for ``data``. Only used in the learning-to-rank task. sum(group) = n_samples. @@ -4439,7 +4508,7 @@ def refit( .. versionadded:: 4.0.0 - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for ``data``. .. 
versionadded:: 4.0.0 @@ -4627,7 +4696,7 @@ def feature_name(self) -> List[str]: reserved_string_buffer_size = 255 required_string_buffer_size = ctypes.c_size_t(0) string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -4641,7 +4710,7 @@ def feature_name(self) -> List[str]: # if buffer length is not long enough, reallocate buffers if reserved_string_buffer_size < actual_string_buffer_size: string_buffers = [ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -4851,7 +4920,7 @@ def __get_eval_info(self) -> None: string_buffers = [ ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(self.__num_inner_eval) ] - ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetEvalNames( self._handle, ctypes.c_int(self.__num_inner_eval), @@ -4867,7 +4936,7 @@ def __get_eval_info(self) -> None: string_buffers = [ ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(self.__num_inner_eval) ] - ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetEvalNames( self._handle, ctypes.c_int(self.__num_inner_eval), diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index dc48dbf792cf..bd1b29a1e802 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -201,6 +201,7 @@ def __init__(self, *args, **kwargs): from pyarrow import Array as pa_Array from pyarrow import ChunkedArray as pa_ChunkedArray from pyarrow import Table as pa_Table + from pyarrow import chunked_array as pa_chunked_array from pyarrow.cffi import ffi as arrow_cffi from pyarrow.types import is_floating as arrow_is_floating from pyarrow.types import is_integer as arrow_is_integer @@ -243,6 +244,7 @@ class pa_compute: # type: ignore all = None equal = None + pa_chunked_array = None arrow_is_integer = None arrow_is_floating = None diff --git a/python-package/lightgbm/libpath.py b/python-package/lightgbm/libpath.py index c096a6f1b5e2..21222228b0c2 100644 --- a/python-package/lightgbm/libpath.py +++ b/python-package/lightgbm/libpath.py @@ -16,8 +16,7 @@ def find_lib_path() -> List[str]: List of all found library paths to LightGBM. 
""" curr_path = Path(__file__).absolute() - dll_path = [curr_path, - curr_path.parents[1], + dll_path = [curr_path.parents[1], curr_path.parents[0] / 'bin', curr_path.parents[0] / 'lib'] if system() in ('Windows', 'Microsoft'): diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 83520c5248cd..f3545165698f 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -30,7 +30,7 @@ maintainers = [ name = "lightgbm" readme = "README.rst" requires-python = ">=3.6" -version = "4.1.0.99" +version = "4.2.0.99" [project.optional-dependencies] arrow = [ diff --git a/src/c_api.cpp b/src/c_api.cpp index baf934db42b1..67b18003588a 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -2568,6 +2568,57 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle, API_END(); } +int LGBM_BoosterPredictForArrow(BoosterHandle handle, + int64_t n_chunks, + const ArrowArray* chunks, + const ArrowSchema* schema, + int predict_type, + int start_iteration, + int num_iteration, + const char* parameter, + int64_t* out_len, + double* out_result) { + API_BEGIN(); + + // Apply the configuration + auto param = Config::Str2Map(parameter); + Config config; + config.Set(param); + OMP_SET_NUM_THREADS(config.num_threads); + + // Set up chunked array and iterators for all columns + ArrowTable table(n_chunks, chunks, schema); + std::vector> its; + its.reserve(table.get_num_columns()); + for (int64_t j = 0; j < table.get_num_columns(); ++j) { + its.emplace_back(table.get_column(j).begin()); + } + + // Build row function + auto num_columns = table.get_num_columns(); + auto row_fn = [num_columns, &its] (int row_idx) { + std::vector> result; + result.reserve(num_columns); + for (int64_t j = 0; j < num_columns; ++j) { + result.emplace_back(static_cast(j), its[j][row_idx]); + } + return result; + }; + + // Run prediction + Booster* ref_booster = reinterpret_cast(handle); + ref_booster->Predict(start_iteration, + num_iteration, + predict_type, + static_cast(table.get_num_rows()), + static_cast(table.get_num_columns()), + row_fn, + config, + out_result, + out_len); + API_END(); +} + int LGBM_BoosterSaveModel(BoosterHandle handle, int start_iteration, int num_iteration, @@ -2699,6 +2750,23 @@ int LGBM_NetworkInitWithFunctions(int num_machines, int rank, API_END(); } +int LGBM_SetMaxThreads(int num_threads) { + API_BEGIN(); + if (num_threads <= 0) { + LGBM_MAX_NUM_THREADS = -1; + } else { + LGBM_MAX_NUM_THREADS = num_threads; + } + API_END(); +} + +int LGBM_GetMaxThreads(int* out) { + API_BEGIN(); + *out = LGBM_MAX_NUM_THREADS; + API_END(); +} + + // ---- start of some help functions diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 01eb41b71367..058d7bd328ad 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -904,6 +904,10 @@ bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray metadata_.SetLabel(ca); } else if (name == std::string("weight") || name == std::string("weights")) { metadata_.SetWeights(ca); + } else if (name == std::string("init_score")) { + metadata_.SetInitScore(ca); + } else if (name == std::string("query") || name == std::string("group")) { + metadata_.SetQuery(ca); } else { return false; } diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index ed4fb135e62a..55440649f55e 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -355,32 +355,44 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector +void Metadata::SetInitScoresFromIterator(It first, It last) { std::lock_guard lock(mutex_); - // save 
to nullptr - if (init_score == nullptr || len == 0) { + // Clear init scores on empty input + if (last - first == 0) { init_score_.clear(); num_init_score_ = 0; return; } - if ((len % num_data_) != 0) { + if (((last - first) % num_data_) != 0) { Log::Fatal("Initial score size doesn't match data size"); } - if (init_score_.empty()) { init_score_.resize(len); } - num_init_score_ = len; + if (init_score_.empty()) { + init_score_.resize(last - first); + } + num_init_score_ = last - first; #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_init_score_ >= 1024) for (int64_t i = 0; i < num_init_score_; ++i) { - init_score_[i] = Common::AvoidInf(init_score[i]); + init_score_[i] = Common::AvoidInf(first[i]); } init_score_load_from_file_ = false; + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { - cuda_metadata_->SetInitScore(init_score_.data(), len); + cuda_metadata_->SetInitScore(init_score_.data(), init_score_.size()); } #endif // USE_CUDA } +void Metadata::SetInitScore(const double* init_score, data_size_t len) { + SetInitScoresFromIterator(init_score, init_score + len); +} + +void Metadata::SetInitScore(const ArrowChunkedArray& array) { + SetInitScoresFromIterator(array.begin(), array.end()); +} + void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) { if (num_init_score_ <= 0) { Log::Fatal("Inserting initial score data into dataset with no initial scores"); @@ -507,30 +519,34 @@ void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, da // CUDA is handled after all insertions are complete } -void Metadata::SetQuery(const data_size_t* query, data_size_t len) { +template +void Metadata::SetQueriesFromIterator(It first, It last) { std::lock_guard lock(mutex_); - // save to nullptr - if (query == nullptr || len == 0) { + // Clear query boundaries on empty input + if (last - first == 0) { query_boundaries_.clear(); num_queries_ = 0; return; } + data_size_t sum = 0; #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum) - for (data_size_t i = 0; i < len; ++i) { - sum += query[i]; + for (data_size_t i = 0; i < last - first; ++i) { + sum += first[i]; } if (num_data_ != sum) { - Log::Fatal("Sum of query counts is not same with #data"); + Log::Fatal("Sum of query counts (%i) differs from the length of #data (%i)", num_data_, sum); } - num_queries_ = len; + num_queries_ = last - first; + query_boundaries_.resize(num_queries_ + 1); query_boundaries_[0] = 0; for (data_size_t i = 0; i < num_queries_; ++i) { - query_boundaries_[i + 1] = query_boundaries_[i] + query[i]; + query_boundaries_[i + 1] = query_boundaries_[i] + first[i]; } CalculateQueryWeights(); query_load_from_file_ = false; + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { if (query_weights_.size() > 0) { @@ -543,6 +559,14 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { #endif // USE_CUDA } +void Metadata::SetQuery(const data_size_t* query, data_size_t len) { + SetQueriesFromIterator(query, query + len); +} + +void Metadata::SetQuery(const ArrowChunkedArray& array) { + SetQueriesFromIterator(array.begin(), array.end()); +} + void Metadata::SetPosition(const data_size_t* positions, data_size_t len) { std::lock_guard lock(mutex_); // save to nullptr diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index fa782ebaad25..a1ea79efa1a1 100644 --- 
+++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp
@@ -155,7 +155,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {
   #pragma warning(disable : 4702)
   explicit CUDASingleGPUTreeLearner(const Config* tree_config, const bool /*boosting_on_cuda*/) : SerialTreeLearner(tree_config) {
     Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
-               "Please recompile with CMake option -DUSE_CUDAP=1");
+               "Please recompile with CMake option -DUSE_CUDA=1");
   }
 };
 
diff --git a/src/utils/openmp_wrapper.cpp b/src/utils/openmp_wrapper.cpp
new file mode 100644
index 000000000000..fb6e661eb67c
--- /dev/null
+++ b/src/utils/openmp_wrapper.cpp
@@ -0,0 +1,44 @@
+/*!
+ * Copyright (c) 2023 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ */
+#include <LightGBM/utils/openmp_wrapper.h>
+
+int LGBM_MAX_NUM_THREADS = -1;
+
+int LGBM_DEFAULT_NUM_THREADS = -1;
+
+#ifdef _OPENMP
+
+#include <omp.h>
+
+int OMP_NUM_THREADS() {
+  int default_num_threads = 1;
+
+  if (LGBM_DEFAULT_NUM_THREADS > 0) {
+    // if LightGBM-specific default has been set, ignore OpenMP-global config
+    default_num_threads = LGBM_DEFAULT_NUM_THREADS;
+  } else {
+    // otherwise, default to OpenMP-global config
+    #pragma omp single
+    { default_num_threads = omp_get_max_threads(); }
+  }
+
+  // ensure that if LGBM_SetMaxThreads() was ever called, LightGBM doesn't
+  // use more than that many threads
+  if (LGBM_MAX_NUM_THREADS > 0 && default_num_threads > LGBM_MAX_NUM_THREADS) {
+    return LGBM_MAX_NUM_THREADS;
+  }
+
+  return default_num_threads;
+}
+
+void OMP_SET_NUM_THREADS(int num_threads) {
+  if (num_threads <= 0) {
+    LGBM_DEFAULT_NUM_THREADS = -1;
+  } else {
+    LGBM_DEFAULT_NUM_THREADS = num_threads;
+  }
+}
+
+#endif  // _OPENMP
diff --git a/tests/c_api_test/test_.py b/tests/c_api_test/test_.py
index 4bb76e4aba19..6cfec1c445fc 100644
--- a/tests/c_api_test/test_.py
+++ b/tests/c_api_test/test_.py
@@ -247,3 +247,36 @@ def test_booster():
               c_str(''),
               c_str('preb.txt'))
     LIB.LGBM_BoosterFree(booster2)
+
+
+def test_max_thread_control():
+    # at initialization, should be -1
+    num_threads = ctypes.c_int(0)
+    ret = LIB.LGBM_GetMaxThreads(
+        ctypes.byref(num_threads)
+    )
+    assert ret == 0
+    assert num_threads.value == -1
+
+    # updating that value through the C API should work
+    ret = LIB.LGBM_SetMaxThreads(
+        ctypes.c_int(6)
+    )
+    assert ret == 0
+
+    ret = LIB.LGBM_GetMaxThreads(
+        ctypes.byref(num_threads)
+    )
+    assert ret == 0
+    assert num_threads.value == 6
+
+    # resetting to any negative number should set it to -1
+    ret = LIB.LGBM_SetMaxThreads(
+        ctypes.c_int(-123)
+    )
+    assert ret == 0
+    ret = LIB.LGBM_GetMaxThreads(
+        ctypes.byref(num_threads)
+    )
+    assert ret == 0
+    assert num_threads.value == -1
diff --git a/tests/cpp_tests/test_arrow.cpp b/tests/cpp_tests/test_arrow.cpp
index 7e3c57c401f4..e975b6ba374b 100644
--- a/tests/cpp_tests/test_arrow.cpp
+++ b/tests/cpp_tests/test_arrow.cpp
@@ -41,10 +41,12 @@ class ArrowChunkedArrayTest : public testing::Test {
     // 1) Create validity bitmap
     char* validity = nullptr;
     if (!null_indices.empty()) {
-      validity = static_cast<char*>(calloc(values.size() + sizeof(char) - 1, sizeof(char)));
+      auto num_bytes = (values.size() + 7) / 8;
+      validity = static_cast<char*>(calloc(num_bytes, sizeof(char)));
+      memset(validity, 0xff, num_bytes * sizeof(char));
       for (size_t i = 0; i < values.size(); ++i) {
         if (std::find(null_indices.begin(), null_indices.end(), i) != null_indices.end()) {
-          validity[i / 8] |= (1 << (i %
8)); + validity[i / 8] &= ~(1 << (i % 8)); } } } diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py index 40482a904a62..593c03d8c7ef 100644 --- a/tests/python_package_test/test_arrow.py +++ b/tests/python_package_test/test_arrow.py @@ -1,7 +1,6 @@ # coding: utf-8 import filecmp -from pathlib import Path -from typing import Any, Callable, Dict +from typing import Any, Dict, Optional import numpy as np import pyarrow as pa @@ -15,19 +14,45 @@ # UTILITIES # # ----------------------------------------------------------------------------------------------- # +_INTEGER_TYPES = [ + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64(), + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64(), +] +_FLOAT_TYPES = [ + pa.float32(), + pa.float64(), +] + + +def generate_simple_arrow_table(empty_chunks: bool = False) -> pa.Table: + c: list[list[int]] = [[]] if empty_chunks else [] + columns = [ + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint8()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int8()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint16()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int16()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint32()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int32()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint64()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int64()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.float32()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.float64()), + ] + return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))]) -def generate_simple_arrow_table() -> pa.Table: + +def generate_nullable_arrow_table() -> pa.Table: columns = [ - pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint8()), - pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int8()), - pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint16()), - pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int16()), - pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint32()), - pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int32()), - pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.uint64()), - pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.int64()), - pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.float32()), - pa.chunked_array([[1, 2, 3, 4, 5]], type=pa.float64()), + pa.chunked_array([[1, None, 3, 4, 5]], type=pa.float32()), + pa.chunked_array([[None, 2, 3, 4, 5]], type=pa.float32()), + pa.chunked_array([[1, 2, 3, 4, None]], type=pa.float32()), + pa.chunked_array([[None, None, None, None, None]], type=pa.float32()), ] return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))]) @@ -38,19 +63,40 @@ def generate_dummy_arrow_table() -> pa.Table: return pa.Table.from_arrays([col1, col2], names=["a", "b"]) -def generate_random_arrow_table(num_columns: int, num_datapoints: int, seed: int) -> pa.Table: - columns = [generate_random_arrow_array(num_datapoints, seed + i) for i in range(num_columns)] +def generate_random_arrow_table( + num_columns: int, + num_datapoints: int, + seed: int, + generate_nulls: bool = True, + values: Optional[np.ndarray] = None, +) -> pa.Table: + columns = [ + generate_random_arrow_array( + num_datapoints, seed + i, generate_nulls=generate_nulls, values=values + ) + for i in range(num_columns) + ] names = [f"col_{i}" for i in range(num_columns)] return pa.Table.from_arrays(columns, names=names) -def 
generate_random_arrow_array(num_datapoints: int, seed: int) -> pa.ChunkedArray: +def generate_random_arrow_array( + num_datapoints: int, + seed: int, + generate_nulls: bool = True, + values: Optional[np.ndarray] = None, +) -> pa.ChunkedArray: generator = np.random.default_rng(seed) - data = generator.standard_normal(num_datapoints) + data = ( + generator.standard_normal(num_datapoints) + if values is None + else generator.choice(values, size=num_datapoints, replace=True) + ) # Set random nulls - indices = generator.choice(len(data), size=num_datapoints // 10) - data[indices] = None + if generate_nulls: + indices = generator.choice(len(data), size=num_datapoints // 10) + data[indices] = None # Split data into <=2 random chunks split_points = np.sort(generator.choice(np.arange(1, num_datapoints), 2, replace=False)) @@ -80,14 +126,14 @@ def dummy_dataset_params() -> Dict[str, Any]: ("arrow_table_fn", "dataset_params"), [ # Use lambda functions here to minimize memory consumption (lambda: generate_simple_arrow_table(), dummy_dataset_params()), + (lambda: generate_simple_arrow_table(empty_chunks=True), dummy_dataset_params()), (lambda: generate_dummy_arrow_table(), dummy_dataset_params()), + (lambda: generate_nullable_arrow_table(), dummy_dataset_params()), (lambda: generate_random_arrow_table(3, 1000, 42), {}), (lambda: generate_random_arrow_table(100, 10000, 43), {}), ], ) -def test_dataset_construct_fuzzy( - tmp_path: Path, arrow_table_fn: Callable[[], pa.Table], dataset_params: Dict[str, Any] -): +def test_dataset_construct_fuzzy(tmp_path, arrow_table_fn, dataset_params): arrow_table = arrow_table_fn() arrow_dataset = lgb.Dataset(arrow_table, params=dataset_params) @@ -106,19 +152,25 @@ def test_dataset_construct_fuzzy( def test_dataset_construct_fields_fuzzy(): arrow_table = generate_random_arrow_table(3, 1000, 42) - arrow_labels = generate_random_arrow_array(1000, 42) - arrow_weights = generate_random_arrow_array(1000, 42) + arrow_labels = generate_random_arrow_array(1000, 42, generate_nulls=False) + arrow_weights = generate_random_arrow_array(1000, 42, generate_nulls=False) + arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32()) - arrow_dataset = lgb.Dataset(arrow_table, label=arrow_labels, weight=arrow_weights) + arrow_dataset = lgb.Dataset( + arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups + ) arrow_dataset.construct() pandas_dataset = lgb.Dataset( - arrow_table.to_pandas(), label=arrow_labels.to_numpy(), weight=arrow_weights.to_numpy() + arrow_table.to_pandas(), + label=arrow_labels.to_numpy(), + weight=arrow_weights.to_numpy(), + group=arrow_groups.to_numpy(), ) pandas_dataset.construct() # Check for equality - for field in ("label", "weight"): + for field in ("label", "weight", "group"): np_assert_array_equal( arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True ) @@ -131,24 +183,15 @@ def test_dataset_construct_fields_fuzzy(): @pytest.mark.parametrize( ["array_type", "label_data"], - [(pa.array, [0, 1, 0, 0, 1]), (pa.chunked_array, [[0], [1, 0, 0, 1]])], -) -@pytest.mark.parametrize( - "arrow_type", [ - pa.int8(), - pa.int16(), - pa.int32(), - pa.int64(), - pa.uint8(), - pa.uint16(), - pa.uint32(), - pa.uint64(), - pa.float32(), - pa.float64(), + (pa.array, [0, 1, 0, 0, 1]), + (pa.chunked_array, [[0], [1, 0, 0, 1]]), + (pa.chunked_array, [[], [0], [1, 0, 0, 1]]), + (pa.chunked_array, [[0], [], [1, 0], [], [], [0, 1], []]), ], ) -def test_dataset_construct_labels(array_type: Any, label_data: Any, arrow_type: Any): 
+@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) +def test_dataset_construct_labels(array_type, label_data, arrow_type): data = generate_dummy_arrow_table() labels = array_type(label_data, type=arrow_type) dataset = lgb.Dataset(data, label=labels, params=dummy_dataset_params()) @@ -172,10 +215,15 @@ def test_dataset_construct_weights_none(): @pytest.mark.parametrize( ["array_type", "weight_data"], - [(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]])], + [ + (pa.array, [3, 0.7, 1.5, 0.5, 0.1]), + (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]]), + (pa.chunked_array, [[], [3], [0.7, 1.5, 0.5, 0.1]]), + (pa.chunked_array, [[3], [0.7], [], [], [1.5, 0.5, 0.1], []]), + ], ) -@pytest.mark.parametrize("arrow_type", [pa.float32(), pa.float64()]) -def test_dataset_construct_weights(array_type: Any, weight_data: Any, arrow_type: Any): +@pytest.mark.parametrize("arrow_type", _FLOAT_TYPES) +def test_dataset_construct_weights(array_type, weight_data, arrow_type): data = generate_dummy_arrow_table() weights = array_type(weight_data, type=arrow_type) dataset = lgb.Dataset(data, weight=weights, params=dummy_dataset_params()) @@ -183,3 +231,157 @@ def test_dataset_construct_weights(array_type: Any, weight_data: Any, arrow_type expected = np.array([3, 0.7, 1.5, 0.5, 0.1], dtype=np.float32) np_assert_array_equal(expected, dataset.get_weight(), strict=True) + + +# -------------------------------------------- GROUPS ------------------------------------------- # + + +@pytest.mark.parametrize( + ["array_type", "group_data"], + [ + (pa.array, [2, 3]), + (pa.chunked_array, [[2], [3]]), + (pa.chunked_array, [[], [2, 3]]), + (pa.chunked_array, [[2], [], [3], []]), + ], +) +@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES) +def test_dataset_construct_groups(array_type, group_data, arrow_type): + data = generate_dummy_arrow_table() + groups = array_type(group_data, type=arrow_type) + dataset = lgb.Dataset(data, group=groups, params=dummy_dataset_params()) + dataset.construct() + + expected = np.array([0, 2, 5], dtype=np.int32) + np_assert_array_equal(expected, dataset.get_field("group"), strict=True) + + +# ----------------------------------------- INIT SCORES ----------------------------------------- # + + +@pytest.mark.parametrize( + ["array_type", "init_score_data"], + [ + (pa.array, [0, 1, 2, 3, 3]), + (pa.chunked_array, [[0, 1, 2], [3, 3]]), + (pa.chunked_array, [[], [0, 1, 2], [3, 3]]), + (pa.chunked_array, [[0, 1], [], [], [2], [3, 3], []]), + ], +) +@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) +def test_dataset_construct_init_scores_array( + array_type: Any, init_score_data: Any, arrow_type: Any +): + data = generate_dummy_arrow_table() + init_scores = array_type(init_score_data, type=arrow_type) + dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) + dataset.construct() + + expected = np.array([0, 1, 2, 3, 3], dtype=np.float64) + np_assert_array_equal(expected, dataset.get_init_score(), strict=True) + + +def test_dataset_construct_init_scores_table(): + data = generate_dummy_arrow_table() + init_scores = pa.Table.from_arrays( + [ + generate_random_arrow_array(5, seed=1, generate_nulls=False), + generate_random_arrow_array(5, seed=2, generate_nulls=False), + generate_random_arrow_array(5, seed=3, generate_nulls=False), + ], + names=["a", "b", "c"], + ) + dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) + dataset.construct() + + actual = dataset.get_init_score() 
+ expected = init_scores.to_pandas().to_numpy().astype(np.float64) + np_assert_array_equal(expected, actual, strict=True) + + +# ------------------------------------------ PREDICTION ----------------------------------------- # + + +def assert_equal_predict_arrow_pandas(booster: lgb.Booster, data: pa.Table): + p_arrow = booster.predict(data) + p_pandas = booster.predict(data.to_pandas()) + np_assert_array_equal(p_arrow, p_pandas, strict=True) + + p_raw_arrow = booster.predict(data, raw_score=True) + p_raw_pandas = booster.predict(data.to_pandas(), raw_score=True) + np_assert_array_equal(p_raw_arrow, p_raw_pandas, strict=True) + + p_leaf_arrow = booster.predict(data, pred_leaf=True) + p_leaf_pandas = booster.predict(data.to_pandas(), pred_leaf=True) + np_assert_array_equal(p_leaf_arrow, p_leaf_pandas, strict=True) + + p_pred_contrib_arrow = booster.predict(data, pred_contrib=True) + p_pred_contrib_pandas = booster.predict(data.to_pandas(), pred_contrib=True) + np_assert_array_equal(p_pred_contrib_arrow, p_pred_contrib_pandas, strict=True) + + p_first_iter_arrow = booster.predict(data, start_iteration=0, num_iteration=1, raw_score=True) + p_first_iter_pandas = booster.predict( + data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True + ) + np_assert_array_equal(p_first_iter_arrow, p_first_iter_pandas, strict=True) + + +def test_predict_regression(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "regression", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_binary_classification(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(2)), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "binary", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_multiclass_classification(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(5)), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "multiclass", "num_leaves": 7, "num_class": 5}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_ranking(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(4)), + group=np.array([1000, 2000, 3000, 4000]), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "lambdarank", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 2f6b07e7a77f..b8ef43e41397 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -822,21 +822,34 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): @pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto']) -def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): +@pytest.mark.parametrize('categories', ['seen', 
'unseen']) +def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories): pd = pytest.importorskip('pandas') X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) column_name = 'a' if feature_name == 'auto' else feature_name[0] df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') + if categories == 'seen': + pandas_categorical = [['a', 'b']] + else: + pandas_categorical = [['a']] data = lgb.basic._data_from_pandas( data=df, feature_name=feature_name, categorical_feature="auto", - pandas_categorical=None + pandas_categorical=pandas_categorical, )[0] # check that the original data wasn't modified np.testing.assert_equal(df[column_name], X[:, 0]) # check that the built data has the codes - np.testing.assert_equal(df[column_name].cat.codes, data[:, 0]) + if categories == 'seen': + # if all categories were seen during training we just take the codes + codes = df[column_name].cat.codes + else: + # if we only saw 'a' during training we just replace its code + # and leave the rest as nan + a_code = df[column_name].cat.categories.get_loc('a') + codes = np.where(df[column_name] == 'a', a_code, np.nan) + np.testing.assert_equal(codes, data[:, 0]) @pytest.mark.parametrize('min_data_in_bin', [2, 10])
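As a worked illustration of what this test asserts, here is a small pandas-only sketch (toy data; the "seen during training" category set is hypothetical): categories known at training time keep their integer codes, while anything else is mapped to NaN, mirroring the `cat.codes` conversion in `_data_from_pandas`:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": pd.Categorical(["a", "b", "a", "b"])})

# pretend only 'a' was seen during training
col = df["a"].cat.set_categories(["a"])

# take the codes and map the sentinel -1 (unseen or missing) to NaN,
# the same replacement _data_from_pandas applies
codes = col.cat.codes.astype("float64").replace({-1: np.nan})
print(codes.tolist())  # [0.0, nan, 0.0, nan]
```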