From fc45eb5908a89af607e212bb3cbc619bcc713a0d Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Thu, 19 Aug 2021 22:09:12 -0500
Subject: [PATCH 1/6] documentation changes

---
 R-package/R/lgb.Booster.R                 |  3 ++-
 R-package/R/lgb.Dataset.R                 |  7 +++++--
 R-package/man/lgb.Dataset.Rd              |  4 +++-
 R-package/man/lgb.Dataset.create.valid.Rd |  3 ++-
 R-package/man/predict.lgb.Booster.Rd      |  3 ++-
 python-package/lightgbm/basic.py          | 10 +++++-----
 6 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R
index d9e0186f97b1..507e5e01085b 100644
--- a/R-package/R/lgb.Booster.R
+++ b/R-package/R/lgb.Booster.R
@@ -682,7 +682,8 @@ Booster <- R6::R6Class(
 #' @title Predict method for LightGBM model
 #' @description Predicted values based on class \code{lgb.Booster}
 #' @param object Object of class \code{lgb.Booster}
-#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
+#' @param data a \code{matrix} object, a \code{dgCMatrix} object or
+#'             a character representing a path to a text file (CSV, TSV, or LibSVM)
 #' @param start_iteration int or None, optional (default=None)
 #'                        Start index of the iteration to predict.
 #'                        If None or <= 0, starts from the first iteration.
diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R
index def2d2ebecf1..de4fa3466a95 100644
--- a/R-package/R/lgb.Dataset.R
+++ b/R-package/R/lgb.Dataset.R
@@ -710,7 +710,9 @@ Dataset <- R6::R6Class(
 #' @title Construct \code{lgb.Dataset} object
 #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
 #'              or local file (that was created previously by saving an \code{lgb.Dataset}).
-#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
+#' @param data a \code{matrix} object, a \code{dgCMatrix} object or
+#'             a character representing path to a text file (CSV, TSV, or LibSVM)
+#'             or a LightGBM Dataset binary file
 #' @param params a list of parameters. See
 #'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
 #'               The "Dataset Parameters" section of the documentation} for a list of parameters
@@ -774,7 +776,8 @@ lgb.Dataset <- function(data,
 #' @title Construct validation data
 #' @description Construct validation data according to training data
 #' @param dataset \code{lgb.Dataset} object, training data
-#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
+#' @param data a \code{matrix} object, a \code{dgCMatrix} object or
+#'             a character representing a path to a text file (CSV, TSV, or LibSVM)
 #' @param info a list of information of the \code{lgb.Dataset} object
 #' @param ... other information to pass to \code{info}.
 #'
diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd
index 4a5abcf78f2c..4c0280af6004 100644
--- a/R-package/man/lgb.Dataset.Rd
+++ b/R-package/man/lgb.Dataset.Rd
@@ -16,7 +16,9 @@ lgb.Dataset(
 )
 }
 \arguments{
-\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
+\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or
+a character representing path to a text file (CSV, TSV, or LibSVM)
+or a LightGBM Dataset binary file}
 
 \item{params}{a list of parameters. See
 \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
diff --git a/R-package/man/lgb.Dataset.create.valid.Rd b/R-package/man/lgb.Dataset.create.valid.Rd
index ce34908e1828..a5ee89489aa9 100644
--- a/R-package/man/lgb.Dataset.create.valid.Rd
+++ b/R-package/man/lgb.Dataset.create.valid.Rd
@@ -9,7 +9,8 @@ lgb.Dataset.create.valid(dataset, data, info = list(), ...)
 \arguments{
 \item{dataset}{\code{lgb.Dataset} object, training data}
 
-\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
+\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or
+a character representing a path to a text file (CSV, TSV, or LibSVM)}
 
 \item{info}{a list of information of the \code{lgb.Dataset} object}
 
diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd
index c1c4cfb0cc77..359eb1c80a0a 100644
--- a/R-package/man/predict.lgb.Booster.Rd
+++ b/R-package/man/predict.lgb.Booster.Rd
@@ -20,7 +20,8 @@
 \arguments{
 \item{object}{Object of class \code{lgb.Booster}}
 
-\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
+\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or
+a character representing a path to a text file (CSV, TSV, or LibSVM)}
 
 \item{start_iteration}{int or None, optional (default=None)
 Start index of the iteration to predict.
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index de408a4be4db..85d556dd83de 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -744,7 +744,7 @@ def predict(self, data, start_iteration=0, num_iteration=-1,
         ----------
         data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
             Data source for prediction.
-            When data type is string or pathlib.Path, it represents the path of txt file.
+            When data type is string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
         start_iteration : int, optional (default=0)
             Start index of the iteration to predict.
         num_iteration : int, optional (default=-1)
@@ -1132,7 +1132,7 @@ def __init__(self, data, label=None, reference=None,
         ----------
         data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequences or list of numpy arrays
             Data source of Dataset.
-            If string or pathlib.Path, it represents the path to txt file.
+            If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
         label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
             Label of the data.
         reference : Dataset or None, optional (default=None)
@@ -1776,7 +1776,7 @@ def create_valid(self, data, label=None, weight=None, group=None,
         ----------
         data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequences or list of numpy arrays
             Data source of Dataset.
-            If string or pathlib.Path, it represents the path to txt file.
+            If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
         label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
             Label of the data.
         weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
@@ -3405,7 +3405,7 @@ def predict(self, data, start_iteration=0, num_iteration=None,
         ----------
         data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
             Data source for prediction.
-            If string or pathlib.Path, it represents the path to txt file.
+            If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
         start_iteration : int, optional (default=0)
             Start index of the iteration to predict.
             If <= 0, starts from the first iteration.
@@ -3460,7 +3460,7 @@ def refit(self, data, label, decay_rate=0.9, **kwargs):
         ----------
         data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
             Data source for refit.
-            If string or pathlib.Path, it represents the path to txt file.
+            If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
         label : list, numpy 1-D array or pandas Series / one-column DataFrame
             Label for refit.
         decay_rate : float, optional (default=0.9)

From 9a0cc2334214d4d5a8212b8fac608b132f50e7d3 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 21 Aug 2021 22:07:20 -0500
Subject: [PATCH 2/6] add list of supported formats to error message

---
 src/io/parser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/io/parser.cpp b/src/io/parser.cpp
index 550c4e13d5c0..a71d59ed57e5 100644
--- a/src/io/parser.cpp
+++ b/src/io/parser.cpp
@@ -235,7 +235,7 @@ Parser* Parser::CreateParser(const char* filename, bool header, int num_features
   int num_col = 0;
   DataType type = GetDataType(filename, header, lines, &num_col);
   if (type == DataType::INVALID) {
-    Log::Fatal("Unknown format of training data.");
+    Log::Fatal("Unknown format of training data. Only CSV, TSV, and LibSVM formats are supported.");
   }
   std::unique_ptr<Parser> ret;
   int output_label_index = -1;

From 582362ae194a5f4ce6b9252765bc7d00db5948f0 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 21 Aug 2021 22:31:11 -0500
Subject: [PATCH 3/6] add unit tests

---
 R-package/R/lgb.Dataset.R                   |  2 +-
 R-package/man/lgb.Dataset.Rd                |  2 +-
 R-package/tests/testthat/test_lgb.Booster.R | 50 +++++++++++++++++++++
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R
index de4fa3466a95..c2105c798173 100644
--- a/R-package/R/lgb.Dataset.R
+++ b/R-package/R/lgb.Dataset.R
@@ -711,7 +711,7 @@ Dataset <- R6::R6Class(
 #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
 #'              or local file (that was created previously by saving an \code{lgb.Dataset}).
 #' @param data a \code{matrix} object, a \code{dgCMatrix} object or
-#'             a character representing path to a text file (CSV, TSV, or LibSVM)
+#'             a character representing a path to a text file (CSV, TSV, or LibSVM)
 #'             or a LightGBM Dataset binary file
 #' @param params a list of parameters. See
 #'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd
index 4c0280af6004..04e72205b361 100644
--- a/R-package/man/lgb.Dataset.Rd
+++ b/R-package/man/lgb.Dataset.Rd
@@ -17,7 +17,7 @@ lgb.Dataset(
 }
 \arguments{
 \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or
-a character representing path to a text file (CSV, TSV, or LibSVM)
+a character representing a path to a text file (CSV, TSV, or LibSVM)
 or a LightGBM Dataset binary file}
 
 \item{params}{a list of parameters. See
diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R
index 735f2fef9b66..76d3a41c9b5b 100644
--- a/R-package/tests/testthat/test_lgb.Booster.R
+++ b/R-package/tests/testthat/test_lgb.Booster.R
@@ -1,5 +1,7 @@
 context("Booster")
 
+TOLERANCE <- 1e-6
+
 test_that("Booster$finalize() should not fail", {
     X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L)
     y <- iris[["Sepal.Length"]]
@@ -419,6 +421,54 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w
     expect_equal(bst_from_ds$current_iter(), nrounds)
 })
 
+test_that("Booster$eval() should work on a Dataset stored in a binary file", {
+    set.seed(708L)
+    data(agaricus.train, package = "lightgbm")
+    train <- agaricus.train
+    dtrain <- lgb.Dataset(train$data, label = train$label)
+
+    bst <- lgb.train(
+        params = list(
+            objective = "regression"
+            , metric = "l2"
+            , num_leaves = 4L
+        )
+        , data = dtrain
+        , nrounds = 2L
+    )
+
+    data(agaricus.test, package = "lightgbm")
+    test <- agaricus.test
+    dtest <- lgb.Dataset.create.valid(
+        dataset = dtrain
+        , data = test$data
+        , label = test$label
+    )
+    dtest$construct()
+
+    eval_in_mem <- bst$eval(
+        data = dtest
+        , name = "test"
+    )
+
+    test_file <- tempfile(pattern = "lgb.Dataset_")
+    lgb.Dataset.save(
+        dataset = dtest
+        , fname = test_file
+    )
+    rm(dtest)
+
+    eval_from_file <- bst$eval(
+        data = lgb.Dataset(
+            data = test_file
+        )$construct()
+        , name = "test"
+    )
+
+    expect_true(abs(eval_in_mem[[1L]][["value"]] - 0.1744423) < TOLERANCE)
+    expect_identical(eval_in_mem, eval_from_file)
+})
+
 test_that("Booster$rollback_one_iter() should work as expected", {
     set.seed(708L)
     data(agaricus.train, package = "lightgbm")

From 4dcca6a8084dcb0ef9c18f8bd17309ba31181416 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sun, 22 Aug 2021 15:53:47 +0100
Subject: [PATCH 4/6] Apply suggestions from code review

Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
---
 R-package/R/lgb.Dataset.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R
index c2105c798173..dfd35fa36dde 100644
--- a/R-package/R/lgb.Dataset.R
+++ b/R-package/R/lgb.Dataset.R
@@ -710,7 +710,7 @@ Dataset <- R6::R6Class(
 #' @title Construct \code{lgb.Dataset} object
 #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
 #'              or local file (that was created previously by saving an \code{lgb.Dataset}).
-#' @param data a \code{matrix} object, a \code{dgCMatrix} object or
+#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
 #'             a character representing a path to a text file (CSV, TSV, or LibSVM)
 #'             or a LightGBM Dataset binary file
 #' @param params a list of parameters. See

From 33448b6d2a396447e6515b8357d69fa1cfc7c351 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sun, 22 Aug 2021 15:25:58 -0500
Subject: [PATCH 5/6] update per review comments

---
 R-package/R/lgb.Dataset.R                 | 5 +++--
 R-package/man/lgb.Dataset.Rd              | 2 +-
 R-package/man/lgb.Dataset.create.valid.Rd | 5 +++--
 docs/Python-Intro.rst                     | 2 +-
 python-package/lightgbm/basic.py          | 2 +-
 src/io/parser.cpp                         | 2 +-
 6 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R
index dfd35fa36dde..cdbbfd907484 100644
--- a/R-package/R/lgb.Dataset.R
+++ b/R-package/R/lgb.Dataset.R
@@ -776,8 +776,9 @@ lgb.Dataset <- function(data,
 #' @title Construct validation data
 #' @description Construct validation data according to training data
 #' @param dataset \code{lgb.Dataset} object, training data
-#' @param data a \code{matrix} object, a \code{dgCMatrix} object or
-#'             a character representing a path to a text file (CSV, TSV, or LibSVM)
+#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
+#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
+#'             or a character representing a path to a binary \code{lgb.Dataset} file
 #' @param info a list of information of the \code{lgb.Dataset} object
 #' @param ... other information to pass to \code{info}.
 #'
diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd
index 04e72205b361..966503a8420d 100644
--- a/R-package/man/lgb.Dataset.Rd
+++ b/R-package/man/lgb.Dataset.Rd
@@ -16,7 +16,7 @@ lgb.Dataset(
 )
 }
 \arguments{
-\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or
+\item{data}{a \code{matrix} object, a \code{dgCMatrix} object,
 a character representing a path to a text file (CSV, TSV, or LibSVM)
 or a LightGBM Dataset binary file}
 
diff --git a/R-package/man/lgb.Dataset.create.valid.Rd b/R-package/man/lgb.Dataset.create.valid.Rd
index a5ee89489aa9..d0fe428d6b18 100644
--- a/R-package/man/lgb.Dataset.create.valid.Rd
+++ b/R-package/man/lgb.Dataset.create.valid.Rd
@@ -9,8 +9,9 @@ lgb.Dataset.create.valid(dataset, data, info = list(), ...)
 \arguments{
 \item{dataset}{\code{lgb.Dataset} object, training data}
 
-\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or
-a character representing a path to a text file (CSV, TSV, or LibSVM)}
+\item{data}{a \code{matrix} object, a \code{dgCMatrix} object,
+a character representing a path to a text file (CSV, TSV, or LibSVM),
+or a character representing a path to a binary \code{lgb.Dataset} file}
 
 \item{info}{a list of information of the \code{lgb.Dataset} object}
 
diff --git a/docs/Python-Intro.rst b/docs/Python-Intro.rst
index 063dbf172445..090bbc1c3b54 100644
--- a/docs/Python-Intro.rst
+++ b/docs/Python-Intro.rst
@@ -33,7 +33,7 @@ Data Interface
 
 The LightGBM Python module can load data from:
 
--  LibSVM (zero-based) / TSV / CSV / TXT format file
+-  LibSVM (zero-based) / TSV / CSV format text file
 
 -  NumPy 2D array(s), pandas DataFrame, H2O DataTable's Frame, SciPy sparse matrix
 
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index 70569c966c9b..483a72d6acaa 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -3460,7 +3460,7 @@ def refit(self, data, label, decay_rate=0.9, **kwargs):
         ----------
         data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
             Data source for refit.
-            If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
+            If string or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
         label : list, numpy 1-D array or pandas Series / one-column DataFrame
             Label for refit.
         decay_rate : float, optional (default=0.9)
diff --git a/src/io/parser.cpp b/src/io/parser.cpp
index 8c4e88b07b17..58f2d5b94467 100644
--- a/src/io/parser.cpp
+++ b/src/io/parser.cpp
@@ -236,7 +236,7 @@ Parser* Parser::CreateParser(const char* filename, bool header, int num_features
   int num_col = 0;
   DataType type = GetDataType(filename, header, lines, &num_col);
   if (type == DataType::INVALID) {
-    Log::Fatal("Unknown format of training data. Only CSV, TSV, and LibSVM formats are supported.");
+    Log::Fatal("Unknown format of training data. Only CSV, TSV, and LibSVM (zero-based) formatted text files are supported.");
   }
   std::unique_ptr<Parser> ret;
   int output_label_index = -1;

From de189a677a0eb5a69536dfe409744871b95055be Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Tue, 24 Aug 2021 17:22:58 -0500
Subject: [PATCH 6/6] make references consistent

---
 R-package/R/lgb.Dataset.R    | 4 ++--
 R-package/man/lgb.Dataset.Rd | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R
index cdbbfd907484..e3081e7de0d6 100644
--- a/R-package/R/lgb.Dataset.R
+++ b/R-package/R/lgb.Dataset.R
@@ -711,8 +711,8 @@ Dataset <- R6::R6Class(
 #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
 #'              or local file (that was created previously by saving an \code{lgb.Dataset}).
 #' @param data a \code{matrix} object, a \code{dgCMatrix} object,
-#'             a character representing a path to a text file (CSV, TSV, or LibSVM)
-#'             or a LightGBM Dataset binary file
+#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
+#'             or a character representing a path to a binary \code{lgb.Dataset} file
 #' @param params a list of parameters. See
 #'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
 #'               The "Dataset Parameters" section of the documentation} for a list of parameters
diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd
index 966503a8420d..cb71120142d3 100644
--- a/R-package/man/lgb.Dataset.Rd
+++ b/R-package/man/lgb.Dataset.Rd
@@ -17,8 +17,8 @@ lgb.Dataset(
 }
 \arguments{
 \item{data}{a \code{matrix} object, a \code{dgCMatrix} object,
-a character representing a path to a text file (CSV, TSV, or LibSVM)
-or a LightGBM Dataset binary file}
+a character representing a path to a text file (CSV, TSV, or LibSVM),
+or a character representing a path to a binary \code{lgb.Dataset} file}
 
 \item{params}{a list of parameters. See
 \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{