From fc45eb5908a89af607e212bb3cbc619bcc713a0d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 19 Aug 2021 22:09:12 -0500 Subject: [PATCH 1/6] documentation changes --- R-package/R/lgb.Booster.R | 3 ++- R-package/R/lgb.Dataset.R | 7 +++++-- R-package/man/lgb.Dataset.Rd | 4 +++- R-package/man/lgb.Dataset.create.valid.Rd | 3 ++- R-package/man/predict.lgb.Booster.Rd | 3 ++- python-package/lightgbm/basic.py | 10 +++++----- 6 files changed, 19 insertions(+), 11 deletions(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index d9e0186f97b1..507e5e01085b 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -682,7 +682,8 @@ Booster <- R6::R6Class( #' @title Predict method for LightGBM model #' @description Predicted values based on class \code{lgb.Booster} #' @param object Object of class \code{lgb.Booster} -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename +#' @param data a \code{matrix} object, a \code{dgCMatrix} object or +#' a character representing a path to a text file (CSV, TSV, or LibSVM) #' @param start_iteration int or None, optional (default=None) #' Start index of the iteration to predict. #' If None or <= 0, starts from the first iteration. diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index def2d2ebecf1..de4fa3466a95 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -710,7 +710,9 @@ Dataset <- R6::R6Class( #' @title Construct \code{lgb.Dataset} object #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix #' or local file (that was created previously by saving an \code{lgb.Dataset}). 
-#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename +#' @param data a \code{matrix} object, a \code{dgCMatrix} object or +#' a character representing path to a text file (CSV, TSV, or LibSVM) +#' or a LightGBM Dataset binary file #' @param params a list of parameters. See #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{ #' The "Dataset Parameters" section of the documentation} for a list of parameters @@ -774,7 +776,8 @@ lgb.Dataset <- function(data, #' @title Construct validation data #' @description Construct validation data according to training data #' @param dataset \code{lgb.Dataset} object, training data -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename +#' @param data a \code{matrix} object, a \code{dgCMatrix} object or +#' a character representing a path to a text file (CSV, TSV, or LibSVM) #' @param info a list of information of the \code{lgb.Dataset} object #' @param ... other information to pass to \code{info}. #' diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index 4a5abcf78f2c..4c0280af6004 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -16,7 +16,9 @@ lgb.Dataset( ) } \arguments{ -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} +\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or +a character representing path to a text file (CSV, TSV, or LibSVM) +or a LightGBM Dataset binary file} \item{params}{a list of parameters. 
See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{ diff --git a/R-package/man/lgb.Dataset.create.valid.Rd b/R-package/man/lgb.Dataset.create.valid.Rd index ce34908e1828..a5ee89489aa9 100644 --- a/R-package/man/lgb.Dataset.create.valid.Rd +++ b/R-package/man/lgb.Dataset.create.valid.Rd @@ -9,7 +9,8 @@ lgb.Dataset.create.valid(dataset, data, info = list(), ...) \arguments{ \item{dataset}{\code{lgb.Dataset} object, training data} -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} +\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or +a character representing a path to a text file (CSV, TSV, or LibSVM)} \item{info}{a list of information of the \code{lgb.Dataset} object} diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index c1c4cfb0cc77..359eb1c80a0a 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -20,7 +20,8 @@ \arguments{ \item{object}{Object of class \code{lgb.Booster}} -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} +\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or +a character representing a path to a text file (CSV, TSV, or LibSVM)} \item{start_iteration}{int or None, optional (default=None) Start index of the iteration to predict. diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index de408a4be4db..85d556dd83de 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -744,7 +744,7 @@ def predict(self, data, start_iteration=0, num_iteration=-1, ---------- data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse Data source for prediction. - When data type is string or pathlib.Path, it represents the path of txt file. 
+ When data type is string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). start_iteration : int, optional (default=0) Start index of the iteration to predict. num_iteration : int, optional (default=-1) @@ -1132,7 +1132,7 @@ def __init__(self, data, label=None, reference=None, ---------- data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequences or list of numpy arrays Data source of Dataset. - If string or pathlib.Path, it represents the path to txt file. + If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file. label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None) Label of the data. reference : Dataset or None, optional (default=None) @@ -1776,7 +1776,7 @@ def create_valid(self, data, label=None, weight=None, group=None, ---------- data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequences or list of numpy arrays Data source of Dataset. - If string or pathlib.Path, it represents the path to txt file. + If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file. label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None) Label of the data. weight : list, numpy 1-D array, pandas Series or None, optional (default=None) @@ -3405,7 +3405,7 @@ def predict(self, data, start_iteration=0, num_iteration=None, ---------- data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse Data source for prediction. - If string or pathlib.Path, it represents the path to txt file. + If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). start_iteration : int, optional (default=0) Start index of the iteration to predict. 
If <= 0, starts from the first iteration. @@ -3460,7 +3460,7 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): ---------- data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse Data source for refit. - If string or pathlib.Path, it represents the path to txt file. + If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file. label : list, numpy 1-D array or pandas Series / one-column DataFrame Label for refit. decay_rate : float, optional (default=0.9) From 9a0cc2334214d4d5a8212b8fac608b132f50e7d3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 21 Aug 2021 22:07:20 -0500 Subject: [PATCH 2/6] add list of supported formats to error message --- src/io/parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/parser.cpp b/src/io/parser.cpp index 550c4e13d5c0..a71d59ed57e5 100644 --- a/src/io/parser.cpp +++ b/src/io/parser.cpp @@ -235,7 +235,7 @@ Parser* Parser::CreateParser(const char* filename, bool header, int num_features int num_col = 0; DataType type = GetDataType(filename, header, lines, &num_col); if (type == DataType::INVALID) { - Log::Fatal("Unknown format of training data."); + Log::Fatal("Unknown format of training data. 
Only CSV, TSV, and LibSVM formats are supported."); } std::unique_ptr<Parser> ret; int output_label_index = -1; From 582362ae194a5f4ce6b9252765bc7d00db5948f0 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 21 Aug 2021 22:31:11 -0500 Subject: [PATCH 3/6] add unit tests --- R-package/R/lgb.Dataset.R | 2 +- R-package/man/lgb.Dataset.Rd | 2 +- R-package/tests/testthat/test_lgb.Booster.R | 50 +++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index de4fa3466a95..c2105c798173 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -711,7 +711,7 @@ Dataset <- R6::R6Class( #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix #' or local file (that was created previously by saving an \code{lgb.Dataset}). #' @param data a \code{matrix} object, a \code{dgCMatrix} object or -#' a character representing path to a text file (CSV, TSV, or LibSVM) +#' a character representing a path to a text file (CSV, TSV, or LibSVM) #' or a LightGBM Dataset binary file #' @param params a list of parameters. See #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{ diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index 4c0280af6004..04e72205b361 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -17,7 +17,7 @@ lgb.Dataset( } \arguments{ \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or -a character representing path to a text file (CSV, TSV, or LibSVM) +a character representing a path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file} \item{params}{a list of parameters. 
See diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 735f2fef9b66..76d3a41c9b5b 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -1,5 +1,7 @@ context("Booster") +TOLERANCE <- 1e-6 + test_that("Booster$finalize() should not fail", { X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L) y <- iris[["Sepal.Length"]] @@ -419,6 +421,54 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w expect_equal(bst_from_ds$current_iter(), nrounds) }) +test_that("Booster$eval() should work on a Dataset stored in a binary file", { + set.seed(708L) + data(agaricus.train, package = "lightgbm") + train <- agaricus.train + dtrain <- lgb.Dataset(train$data, label = train$label) + + bst <- lgb.train( + params = list( + objective = "regression" + , metric = "l2" + , num_leaves = 4L + ) + , data = dtrain + , nrounds = 2L + ) + + data(agaricus.test, package = "lightgbm") + test <- agaricus.test + dtest <- lgb.Dataset.create.valid( + dataset = dtrain + , data = test$data + , label = test$label + ) + dtest$construct() + + eval_in_mem <- bst$eval( + data = dtest + , name = "test" + ) + + test_file <- tempfile(pattern = "lgb.Dataset_") + lgb.Dataset.save( + dataset = dtest + , fname = test_file + ) + rm(dtest) + + eval_from_file <- bst$eval( + data = lgb.Dataset( + data = test_file + )$construct() + , name = "test" + ) + + expect_true(abs(eval_in_mem[[1L]][["value"]] - 0.1744423) < TOLERANCE) + expect_identical(eval_in_mem, eval_from_file) +}) + test_that("Booster$rollback_one_iter() should work as expected", { set.seed(708L) data(agaricus.train, package = "lightgbm") From 4dcca6a8084dcb0ef9c18f8bd17309ba31181416 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 22 Aug 2021 15:53:47 +0100 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: Nikita Titov --- R-package/R/lgb.Dataset.R | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index c2105c798173..dfd35fa36dde 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -710,7 +710,7 @@ Dataset <- R6::R6Class( #' @title Construct \code{lgb.Dataset} object #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix #' or local file (that was created previously by saving an \code{lgb.Dataset}). -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or +#' @param data a \code{matrix} object, a \code{dgCMatrix} object, #' a character representing a path to a text file (CSV, TSV, or LibSVM) #' or a LightGBM Dataset binary file #' @param params a list of parameters. See From 33448b6d2a396447e6515b8357d69fa1cfc7c351 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 22 Aug 2021 15:25:58 -0500 Subject: [PATCH 5/6] update per review comments --- R-package/R/lgb.Dataset.R | 5 +++-- R-package/man/lgb.Dataset.Rd | 2 +- R-package/man/lgb.Dataset.create.valid.Rd | 5 +++-- docs/Python-Intro.rst | 2 +- python-package/lightgbm/basic.py | 2 +- src/io/parser.cpp | 2 +- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index dfd35fa36dde..cdbbfd907484 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -776,8 +776,9 @@ lgb.Dataset <- function(data, #' @title Construct validation data #' @description Construct validation data according to training data #' @param dataset \code{lgb.Dataset} object, training data -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or -#' a character representing a path to a text file (CSV, TSV, or LibSVM) +#' @param data a \code{matrix} object, a \code{dgCMatrix} object, +#' a character representing a path to a text file (CSV, TSV, or LibSVM), +#' or a character representing a path to a binary \code{Dataset} file #' @param info a list of information of the \code{lgb.Dataset} object #' @param ... 
other information to pass to \code{info}. #' diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index 04e72205b361..966503a8420d 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -16,7 +16,7 @@ lgb.Dataset( ) } \arguments{ -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or +\item{data}{a \code{matrix} object, a \code{dgCMatrix} object, a character representing a path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file} diff --git a/R-package/man/lgb.Dataset.create.valid.Rd b/R-package/man/lgb.Dataset.create.valid.Rd index a5ee89489aa9..d0fe428d6b18 100644 --- a/R-package/man/lgb.Dataset.create.valid.Rd +++ b/R-package/man/lgb.Dataset.create.valid.Rd @@ -9,8 +9,9 @@ lgb.Dataset.create.valid(dataset, data, info = list(), ...) \arguments{ \item{dataset}{\code{lgb.Dataset} object, training data} -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or -a character representing a path to a text file (CSV, TSV, or LibSVM)} +\item{data}{a \code{matrix} object, a \code{dgCMatrix} object, +a character representing a path to a text file (CSV, TSV, or LibSVM), +or a character representing a path to a binary \code{Dataset} file} \item{info}{a list of information of the \code{lgb.Dataset} object} diff --git a/docs/Python-Intro.rst b/docs/Python-Intro.rst index 063dbf172445..090bbc1c3b54 100644 --- a/docs/Python-Intro.rst +++ b/docs/Python-Intro.rst @@ -33,7 +33,7 @@ Data Interface The LightGBM Python module can load data from: -- LibSVM (zero-based) / TSV / CSV / TXT format file +- LibSVM (zero-based) / TSV / CSV format text file - NumPy 2D array(s), pandas DataFrame, H2O DataTable's Frame, SciPy sparse matrix diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 70569c966c9b..483a72d6acaa 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3460,7 +3460,7 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): 
---------- data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse Data source for refit. - If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file. + If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). label : list, numpy 1-D array or pandas Series / one-column DataFrame Label for refit. decay_rate : float, optional (default=0.9) diff --git a/src/io/parser.cpp b/src/io/parser.cpp index 8c4e88b07b17..58f2d5b94467 100644 --- a/src/io/parser.cpp +++ b/src/io/parser.cpp @@ -236,7 +236,7 @@ Parser* Parser::CreateParser(const char* filename, bool header, int num_features int num_col = 0; DataType type = GetDataType(filename, header, lines, &num_col); if (type == DataType::INVALID) { - Log::Fatal("Unknown format of training data. Only CSV, TSV, and LibSVM formats are supported."); + Log::Fatal("Unknown format of training data. Only CSV, TSV, and LibSVM (zero-based) formatted text files are supported."); } std::unique_ptr<Parser> ret; int output_label_index = -1; From de189a677a0eb5a69536dfe409744871b95055be Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 24 Aug 2021 17:22:58 -0500 Subject: [PATCH 6/6] make references consistent --- R-package/R/lgb.Dataset.R | 4 ++-- R-package/man/lgb.Dataset.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index cdbbfd907484..e3081e7de0d6 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -711,8 +711,8 @@ Dataset <- R6::R6Class( #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix #' or local file (that was created previously by saving an \code{lgb.Dataset}). 
#' @param data a \code{matrix} object, a \code{dgCMatrix} object, -#' a character representing a path to a text file (CSV, TSV, or LibSVM) -#' or a LightGBM Dataset binary file +#' a character representing a path to a text file (CSV, TSV, or LibSVM), +#' or a character representing a path to a binary \code{lgb.Dataset} file #' @param params a list of parameters. See #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{ #' The "Dataset Parameters" section of the documentation} for a list of parameters diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index 966503a8420d..cb71120142d3 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -17,8 +17,8 @@ lgb.Dataset( } \arguments{ \item{data}{a \code{matrix} object, a \code{dgCMatrix} object, -a character representing a path to a text file (CSV, TSV, or LibSVM) -or a LightGBM Dataset binary file} +a character representing a path to a text file (CSV, TSV, or LibSVM), +or a character representing a path to a binary \code{lgb.Dataset} file} \item{params}{a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{