diff --git a/.appveyor.yml b/.appveyor.yml index 8733301fbfe9..4cff03d571a1 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -version: 4.1.0.{build} +version: 4.1.0.99.{build} image: Visual Studio 2015 platform: x64 diff --git a/CMakeLists.txt b/CMakeLists.txt index 5087d6a8fddb..6705ef130052 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -326,6 +326,13 @@ if(UNIX OR MINGW OR CYGWIN) CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type" ) + if(MINGW) + # ignore this warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95353 + set( + CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow" + ) + endif() if(USE_DEBUG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0") else() diff --git a/R-package/configure b/R-package/configure index 5f441f942e63..39a18d669833 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for lightgbm 4.1.0. +# Generated by GNU Autoconf 2.71 for lightgbm 4.1.0.99. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -607,8 +607,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='lightgbm' PACKAGE_TARNAME='lightgbm' -PACKAGE_VERSION='4.1.0' -PACKAGE_STRING='lightgbm 4.1.0' +PACKAGE_VERSION='4.1.0.99' +PACKAGE_STRING='lightgbm 4.1.0.99' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1211,7 +1211,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures lightgbm 4.1.0 to adapt to many kinds of systems. +\`configure' configures lightgbm 4.1.0.99 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1273,7 +1273,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of lightgbm 4.1.0:";; + short | recursive ) echo "Configuration of lightgbm 4.1.0.99:";; esac cat <<\_ACEOF @@ -1341,7 +1341,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -lightgbm configure 4.1.0 +lightgbm configure 4.1.0.99 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1378,7 +1378,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by lightgbm $as_me 4.1.0, which was +It was created by lightgbm $as_me 4.1.0.99, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -2454,7 +2454,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by lightgbm $as_me 4.1.0, which was +This file was extended by lightgbm $as_me 4.1.0.99, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -2509,7 +2509,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -lightgbm config.status 4.1.0 +lightgbm config.status 4.1.0.99 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/R-package/pkgdown/_pkgdown.yml b/R-package/pkgdown/_pkgdown.yml index ca4a84a5d045..233a31f0ead9 100644 --- a/R-package/pkgdown/_pkgdown.yml +++ b/R-package/pkgdown/_pkgdown.yml @@ -14,7 +14,7 @@ repo: user: https://github.com/ development: - mode: release + mode: unreleased authors: Yu Shi: diff --git a/R-package/tests/testthat/helper.R b/R-package/tests/testthat/helper.R index 9da2f9bd7167..9c928c1f71d1 100644 --- a/R-package/tests/testthat/helper.R +++ b/R-package/tests/testthat/helper.R @@ -29,3 +29,20 @@ .LGB_VERBOSITY <- as.integer( Sys.getenv("LIGHTGBM_TEST_VERBOSITY", "-1") ) + +# [description] +# test that every element of 'x' is in 'y' +# +# testthat::expect_in() is not available in version of {testthat} +# built for R 3.6, this is here to support a similar interface on R 3.6 +.expect_in <- function(x, y) { + if (exists("expect_in")) { + expect_in(x, y) + } else { + missing_items <- x[!(x %in% y)] + if (length(missing_items) != 0L) { + error_msg <- paste0("Some expected items not found: ", toString(missing_items)) + stop(error_msg) + } + } +} diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 1ff038598db1..5f398f1c081d 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -799,37 +799,166 @@ test_that("all parameters are stored correctly with save_model_to_string()", { data = matrix(rnorm(500L), nrow = 100L) , label = rnorm(100L) ) - nrounds <- 4L bst <- lgb.train( params = list( - objective = "regression" - , metric = "l2" + objective = "mape" + , metric = c("l2", "mae") , num_threads = .LGB_MAX_THREADS + , seed = 708L + , data_sample_strategy = "bagging" + , sub_row = 0.8234 ) , data = dtrain - , nrounds = nrounds + , nrounds = 3L , verbose = .LGB_VERBOSITY ) - model_str <- bst$save_model_to_string() - params_in_file <- .params_from_model_string(model_str = model_str) + # entries whose values should reflect params passed to lgb.train() + non_default_param_entries <- c( + "[objective: mape]" + # 'l1' was passed in with alias 'mae' + , "[metric: l2,l1]" + , "[data_sample_strategy: bagging]" + , "[seed: 708]" + # this was passed in with alias 'sub_row' + , "[bagging_fraction: 0.8234]" + , "[num_iterations: 3]" + ) + + # entries with default values of params + default_param_entries <- c( + "[boosting: gbdt]" + , "[tree_learner: serial]" + , "[device_type: cpu]" + , "[data: ]" + , "[valid: ]" + , "[learning_rate: 0.1]" + , "[num_leaves: 31]" + , sprintf("[num_threads: %i]", .LGB_MAX_THREADS) + , "[deterministic: 0]" + , "[histogram_pool_size: -1]" + , "[max_depth: -1]" + , "[min_data_in_leaf: 20]" + , "[min_sum_hessian_in_leaf: 0.001]" + , "[pos_bagging_fraction: 1]" + , "[neg_bagging_fraction: 1]" + , "[bagging_freq: 0]" + , "[bagging_seed: 15415]" + , "[feature_fraction: 1]" + , "[feature_fraction_bynode: 1]" + , "[feature_fraction_seed: 32671]" + , "[extra_trees: 0]" + , "[extra_seed: 6642]" + , "[early_stopping_round: 0]" + , "[first_metric_only: 0]" + , "[max_delta_step: 0]" + , "[lambda_l1: 0]" + , "[lambda_l2: 0]" + , "[linear_lambda: 0]" + , 
"[min_gain_to_split: 0]" + , "[drop_rate: 0.1]" + , "[max_drop: 50]" + , "[skip_drop: 0.5]" + , "[xgboost_dart_mode: 0]" + , "[uniform_drop: 0]" + , "[drop_seed: 20623]" + , "[top_rate: 0.2]" + , "[other_rate: 0.1]" + , "[min_data_per_group: 100]" + , "[max_cat_threshold: 32]" + , "[cat_l2: 10]" + , "[cat_smooth: 10]" + , "[max_cat_to_onehot: 4]" + , "[top_k: 20]" + , "[monotone_constraints: ]" + , "[monotone_constraints_method: basic]" + , "[monotone_penalty: 0]" + , "[feature_contri: ]" + , "[forcedsplits_filename: ]" + , "[force_col_wise: 0]" + , "[force_row_wise: 0]" + , "[refit_decay_rate: 0.9]" + , "[cegb_tradeoff: 1]" + , "[cegb_penalty_split: 0]" + , "[cegb_penalty_feature_lazy: ]" + , "[cegb_penalty_feature_coupled: ]" + , "[path_smooth: 0]" + , "[interaction_constraints: ]" + , sprintf("[verbosity: %i]", .LGB_VERBOSITY) + , "[saved_feature_importance_type: 0]" + , "[use_quantized_grad: 0]" + , "[num_grad_quant_bins: 4]" + , "[quant_train_renew_leaf: 0]" + , "[stochastic_rounding: 1]" + , "[linear_tree: 0]" + , "[max_bin: 255]" + , "[max_bin_by_feature: ]" + , "[min_data_in_bin: 3]" + , "[bin_construct_sample_cnt: 200000]" + , "[data_random_seed: 2350]" + , "[is_enable_sparse: 1]" + , "[enable_bundle: 1]" + , "[use_missing: 1]" + , "[zero_as_missing: 0]" + , "[feature_pre_filter: 1]" + , "[pre_partition: 0]" + , "[two_round: 0]" + , "[header: 0]" + , "[label_column: ]" + , "[weight_column: ]" + , "[group_column: ]" + , "[ignore_column: ]" + , "[categorical_feature: ]" + , "[forcedbins_filename: ]" + , "[precise_float_parser: 0]" + , "[parser_config_file: ]" + , "[objective_seed: 4309]" + , "[num_class: 1]" + , "[is_unbalance: 0]" + , "[scale_pos_weight: 1]" + , "[sigmoid: 1]" + , "[boost_from_average: 1]" + , "[reg_sqrt: 0]" + , "[alpha: 0.9]" + , "[fair_c: 1]" + , "[poisson_max_delta_step: 0.7]" + , "[tweedie_variance_power: 1.5]" + , "[lambdarank_truncation_level: 30]" + , "[lambdarank_norm: 1]" + , "[label_gain: ]" + , "[lambdarank_position_bias_regularization: 0]" + , "[eval_at: ]" + , "[multi_error_top_k: 1]" + , "[auc_mu_weights: ]" + , "[num_machines: 1]" + , "[local_listen_port: 12400]" + , "[time_out: 120]" + , "[machine_list_filename: ]" + , "[machines: ]" + , "[gpu_platform_id: -1]" + , "[gpu_device_id: -1]" + , "[gpu_use_dp: 0]" + , "[num_gpu: 1]" + ) + all_param_entries <- c(non_default_param_entries, default_param_entries) # parameters should match what was passed from the R package - expect_equal(sum(startsWith(params_in_file, "[metric:")), 1L) - expect_equal(sum(params_in_file == "[metric: l2]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[num_iterations:")), 1L) - expect_equal(sum(params_in_file == "[num_iterations: 4]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[objective:")), 1L) - expect_equal(sum(params_in_file == "[objective: regression]"), 1L) - - expect_equal(sum(startsWith(params_in_file, "[verbosity:")), 1L) - expect_equal(sum(params_in_file == sprintf("[verbosity: %i]", .LGB_VERBOSITY)), 1L) + model_str <- bst$save_model_to_string() + params_in_file <- .params_from_model_string(model_str = model_str) + .expect_in(all_param_entries, params_in_file) # early stopping should be off by default expect_equal(sum(startsWith(params_in_file, "[early_stopping_round:")), 1L) expect_equal(sum(params_in_file == "[early_stopping_round: 0]"), 1L) + + # since save_model_to_string() is used when serializing with saveRDS(), check that parameters all + # roundtrip saveRDS()/loadRDS() successfully + rds_file <- tempfile() + saveRDS(bst, rds_file) + 
bst_rds <- readRDS(rds_file) + model_str <- bst_rds$save_model_to_string() + params_in_file <- .params_from_model_string(model_str = model_str) + .expect_in(all_param_entries, params_in_file) }) test_that("early_stopping, num_iterations are stored correctly in model string even with aliases", { diff --git a/VERSION.txt b/VERSION.txt index ee74734aa225..1f06da0058c9 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -4.1.0 +4.1.0.99 diff --git a/external_libs/fast_double_parser b/external_libs/fast_double_parser index ace60646c02d..efec03532ef6 160000 --- a/external_libs/fast_double_parser +++ b/external_libs/fast_double_parser @@ -1 +1 @@ -Subproject commit ace60646c02dc54c57f19d644e49a61e7e7758ec +Subproject commit efec03532ef65984786e5e32dbc81f6e6a55a115 diff --git a/external_libs/fmt b/external_libs/fmt index b6f4ceaed0a0..f5e54359df4c 160000 --- a/external_libs/fmt +++ b/external_libs/fmt @@ -1 +1 @@ -Subproject commit b6f4ceaed0a0a24ccf575fab6c56dd50ccf6f1a9 +Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index 407f2c73e1e3..a554ee60b6c9 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -330,7 +330,7 @@ def gen_parameter_code( str_to_write += ' std::string tmp_str = "";\n' for x in infos: for y in x: - if "[doc-only]" in y: + if "[no-automatically-extract]" in y: continue param_type = y["inner_type"][0] name = y["name"][0] @@ -345,7 +345,7 @@ def gen_parameter_code( str_to_write += " std::stringstream str_buf;\n" for x in infos: for y in x: - if "[doc-only]" in y or "[no-save]" in y: + if "[no-save]" in y: continue param_type = y["inner_type"][0] name = y["name"][0] diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 187043cc2053..6d61bc764924 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -5,8 +5,13 @@ * \note * - desc and descl2 fields must be written in reStructuredText format; * - nested sections can be placed only at the bottom of parent's section; - * - [doc-only] tag indicates that only documentation for this param should be generated and all other actions are performed manually; - * - [no-save] tag indicates that this param should not be saved into a model text representation. + * - [no-automatically-extract] + * - do not automatically extract this parameter into a Config property with the same name in Config::GetMembersFromString(). Use if: + * - specialized extraction logic for this param exists in Config::GetMembersFromString() + * - [no-save] + * - this param should not be saved into a model text representation via Config::SaveMembersToString(). Use if: + * - param is only used by the CLI (especially the "predict" and "convert_model" tasks) + * - param is related to LightGBM writing files (e.g. 
"output_model", "save_binary") */ #ifndef LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_ @@ -97,15 +102,15 @@ struct Config { #pragma region Core Parameters #endif // __NVCC__ + // [no-automatically-extract] // [no-save] - // [doc-only] // alias = config_file // desc = path of config file // desc = **Note**: can be used only in CLI version std::string config = ""; + // [no-automatically-extract] // [no-save] - // [doc-only] // type = enum // default = train // options = train, predict, convert_model, refit @@ -118,7 +123,8 @@ struct Config { // desc = **Note**: can be used only in CLI version; for language-specific packages you can use the correspondent functions TaskType task = TaskType::kTrain; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda, lambdarank, rank_xendcg // alias = objective_type, app, application, loss @@ -150,7 +156,8 @@ struct Config { // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) std::string objective = "regression"; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // alias = boosting_type, boost // options = gbdt, rf, dart @@ -160,7 +167,7 @@ struct Config { // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations std::string boosting = "gbdt"; - // [doc-only] + // [no-automatically-extract] // type = enum // options = bagging, goss // desc = ``bagging``, Randomly Bagging Sampling @@ -200,7 +207,8 @@ struct Config { // desc = max number of leaves in one tree int num_leaves = kDefaultNumLeaves; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = serial, feature, data, voting // alias = tree, tree_type, tree_learner_type @@ -222,7 +230,8 @@ struct Config { // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors int num_threads = 0; - // [doc-only] + // [no-automatically-extract] + // [no-save] // type = enum // options = cpu, gpu, cuda // alias = device @@ -235,7 +244,7 @@ struct Config { // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support std::string device_type = "cpu"; - // [doc-only] + // [no-automatically-extract] // alias = random_seed, random_state // default = None // desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc. 
@@ -593,7 +602,6 @@ struct Config { // desc = **Note**: can be used only in CLI version int snapshot_freq = -1; - // [no-save] // desc = whether to use gradient quantization when training // desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins`` // desc = with quantized training, most arithmetics in the training process will be integer operations @@ -602,21 +610,18 @@ struct Config { // desc = *New in version 4.0.0* bool use_quantized_grad = false; - // [no-save] // desc = number of bins to quantization gradients and hessians // desc = with more bins, the quantized training will be closer to full precision training // desc = **Note**: can be used only with ``device_type = cpu`` // desc = *New in 4.0.0* int num_grad_quant_bins = 4; - // [no-save] // desc = whether to renew the leaf values with original gradients when quantized training // desc = renewing is very helpful for good quantized training accuracy for ranking objectives // desc = **Note**: can be used only with ``device_type = cpu`` // desc = *New in 4.0.0* bool quant_train_renew_leaf = false; - // [no-save] // desc = whether to use stochastic rounding in gradient quantization // desc = *New in 4.0.0* bool stochastic_rounding = true; @@ -976,7 +981,8 @@ struct Config { #pragma region Metric Parameters #endif // __NVCC__ - // [doc-only] + // [no-automatically-extract] + // [no-save] // alias = metrics, metric_types // default = "" // type = multi-enum diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 5815bc602bde..0dc5b75cfdf2 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -6,7 +6,7 @@ from pathlib import Path from .basic import Booster, Dataset, Sequence, register_logger -from .callback import early_stopping, log_evaluation, record_evaluation, reset_parameter +from .callback import EarlyStopException, early_stopping, log_evaluation, record_evaluation, reset_parameter from .engine import CVBooster, cv, train try: @@ -32,5 +32,5 @@ 'train', 'cv', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker', - 'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', + 'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'EarlyStopException', 'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph'] diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index cb27b4e1af39..84f5ec02bcb4 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -55,6 +55,7 @@ _LGBM_EvalFunctionResultType = Tuple[str, float, bool] _LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]] _LGBM_BoosterEvalMethodResultType = Tuple[str, str, float, bool] +_LGBM_BoosterEvalMethodResultWithStandardDeviationType = Tuple[str, str, float, bool, float] _LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], "Literal['auto']"] _LGBM_FeatureNameConfiguration = Union[List[str], "Literal['auto']"] _LGBM_GroupType = Union[ diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index ccf0059faf84..7db3d400ecd6 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -3,14 +3,16 @@ from collections import OrderedDict from dataclasses import dataclass from functools import partial -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, 
Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union -from .basic import Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType, _log_info, _log_warning +from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType, + _LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning) if TYPE_CHECKING: from .engine import CVBooster __all__ = [ + 'EarlyStopException', 'early_stopping', 'log_evaluation', 'record_evaluation', @@ -20,16 +22,20 @@ _EvalResultDict = Dict[str, Dict[str, List[Any]]] _EvalResultTuple = Union[ _LGBM_BoosterEvalMethodResultType, - Tuple[str, str, float, bool, float] + _LGBM_BoosterEvalMethodResultWithStandardDeviationType ] _ListOfEvalResultTuples = Union[ List[_LGBM_BoosterEvalMethodResultType], - List[Tuple[str, str, float, bool, float]] + List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType] ] class EarlyStopException(Exception): - """Exception of early stopping.""" + """Exception of early stopping. + + Raise this from a callback passed in via keyword argument ``callbacks`` + in ``cv()`` or ``train()`` to trigger early stopping. + """ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> None: """Create early stopping exception. @@ -38,6 +44,7 @@ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> ---------- best_iteration : int The best iteration stopped. + 0-based... pass ``best_iteration=2`` to indicate that the third iteration was the best one. best_score : list of (eval_name, metric_name, eval_result, is_higher_better) tuple or (eval_name, metric_name, eval_result, is_higher_better, stdv) tuple Scores for each metric, on each validation set, as of the best iteration. """ @@ -54,7 +61,7 @@ class CallbackEnv: iteration: int begin_iteration: int end_iteration: int - evaluation_result_list: Optional[List[_LGBM_BoosterEvalMethodResultType]] + evaluation_result_list: Optional[_ListOfEvalResultTuples] def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str: @@ -124,6 +131,11 @@ def __init__(self, eval_result: _EvalResultDict) -> None: self.eval_result = eval_result def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) self.eval_result.clear() for item in env.evaluation_result_list: if len(item) == 4: # regular train @@ -140,6 +152,11 @@ def _init(self, env: CallbackEnv) -> None: def __call__(self, env: CallbackEnv) -> None: if env.iteration == env.begin_iteration: self._init(env) + if env.evaluation_result_list is None: + raise RuntimeError( + "record_evaluation() callback enabled but no evaluation results found. This is probably a bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) for item in env.evaluation_result_list: if len(item) == 4: data_name, eval_name, result = item[:3] @@ -278,6 +295,10 @@ def _is_train_set(self, ds_name: str, eval_name: str, train_name: str) -> bool: return (ds_name == "cv_agg" and eval_name == "train") or ds_name == train_name def _init(self, env: CallbackEnv) -> None: + if env.evaluation_result_list is None or env.evaluation_result_list == []: + raise ValueError( + "For early stopping, at least one dataset and eval metric is required for evaluation" + ) is_dart = any(env.params.get(alias, "") == 'dart' for alias in _ConfigAliases.get("boosting")) only_train_set = ( len(env.evaluation_result_list) == 1 @@ -293,9 +314,6 @@ def _init(self, env: CallbackEnv) -> None: elif only_train_set: _log_warning('Only training set found, disabling early stopping.') return - if not env.evaluation_result_list: - raise ValueError('For early stopping, ' - 'at least one dataset and eval metric is required for evaluation') if self.stopping_rounds <= 0: raise ValueError("stopping_rounds should be greater than zero.") @@ -357,6 +375,11 @@ def __call__(self, env: CallbackEnv) -> None: self._init(env) if not self.enabled: return + if env.evaluation_result_list is None: + raise RuntimeError( + "early_stopping() callback enabled but no evaluation results found. This is probably a bug in LightGBM. " + "Please report it at https://github.com/microsoft/LightGBM/issues" + ) # self.best_score_list is initialized to an empty list first_time_updating_best_score_list = (self.best_score_list == []) for i in range(len(env.evaluation_result_list)):
diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index daa6e16b6a9a..822aa3b35017 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -11,9 +11,9 @@ from . 
import callback from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, - _LGBM_BoosterEvalMethodResultType, _LGBM_CategoricalFeatureConfiguration, - _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration, - _log_warning) + _LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType, + _LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, + _LGBM_FeatureNameConfiguration, _log_warning) from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold __all__ = [ @@ -519,8 +519,8 @@ def _make_n_folds( def _agg_cv_result( - raw_results: List[List[Tuple[str, str, float, bool]]] -) -> List[Tuple[str, str, float, bool, float]]: + raw_results: List[List[_LGBM_BoosterEvalMethodResultType]] +) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]: """Aggregate cross-validation results.""" cvmap: Dict[str, List[float]] = OrderedDict() metric_type: Dict[str, bool] = {} @@ -530,7 +530,7 @@ def _agg_cv_result( metric_type[key] = one_line[3] cvmap.setdefault(key, []) cvmap[key].append(one_line[2]) - return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()] + return [('cv_agg', k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()] def cv( diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 7e909342c01f..c71c233df908 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -1103,6 +1103,8 @@ def fit( # type: ignore[override] self._classes = self._le.classes_ self._n_classes = len(self._classes) # type: ignore[arg-type] + if self.objective is None: + self._objective = None # adjust eval metrics to match whether binary or multiclass # classification is being performed diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index f05b6fc22ddd..83520c5248cd 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -30,7 +30,7 @@ maintainers = [ name = "lightgbm" readme = "README.rst" requires-python = ">=3.6" -version = "4.1.0" +version = "4.1.0.99" [project.optional-dependencies] arrow = [ diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 8182c9b52b93..394614af3f33 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -664,12 +664,14 @@ void Config::GetMembersFromString(const std::unordered_map 0) { - if (USE_INDICES) { - if (USE_HESSIAN) { -#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + if (USE_QUANT_GRAD) { + int16_t* ordered_gradients_and_hessians = reinterpret_cast(ordered_gradients); + const int16_t* gradients_and_hessians = reinterpret_cast(gradients); + if (USE_INDICES) { + #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; - ordered_hessians[i] = hessians[data_indices[i]]; + ordered_gradients_and_hessians[i] = gradients_and_hessians[data_indices[i]]; } - ptr_ordered_grad = ordered_gradients; - ptr_ordered_hess = ordered_hessians; - } else { -#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) - for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; + ptr_ordered_grad = reinterpret_cast(ordered_gradients); + ptr_ordered_hess = nullptr; + } + } else { + if (USE_INDICES) { + if (USE_HESSIAN) { + 
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + ordered_hessians[i] = hessians[data_indices[i]]; + } + ptr_ordered_grad = ordered_gradients; + ptr_ordered_hess = ordered_hessians; + } else { + #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + } + ptr_ordered_grad = ordered_gradients; } - ptr_ordered_grad = ordered_gradients; } } OMP_INIT_EX(); diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 163bfc4df9ca..fdf55693a0e9 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -53,6 +53,25 @@ class LeafSplits { weight_ = weight; } + /*! + * \brief Init split on current leaf on partial data. + * \param leaf Index of current leaf + * \param data_partition current data partition + * \param sum_gradients + * \param sum_hessians + * \param sum_gradients_and_hessians + * \param weight + */ + void Init(int leaf, const DataPartition* data_partition, double sum_gradients, + double sum_hessians, int64_t sum_gradients_and_hessians, double weight) { + leaf_index_ = leaf; + data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); + sum_gradients_ = sum_gradients; + sum_hessians_ = sum_hessians; + int_sum_gradients_and_hessians_ = sum_gradients_and_hessians; + weight_ = weight; + } + /*! * \brief Init split on current leaf on partial data. * \param leaf Index of current leaf diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index c322c1a796c2..37d9a2a50713 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -841,32 +841,65 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, #endif // init the leaves that used on next iteration - if (best_split_info.left_count < best_split_info.right_count) { - CHECK_GT(best_split_info.left_count, 0); - smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), - best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian, - best_split_info.left_output); - larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), - best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian, - best_split_info.right_output); + if (!config_->use_quantized_grad) { + if (best_split_info.left_count < best_split_info.right_count) { + CHECK_GT(best_split_info.left_count, 0); + smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_output); + larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_output); + } else { + CHECK_GT(best_split_info.right_count, 0); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_output); + } } else { - CHECK_GT(best_split_info.right_count, 0); - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), - best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian, - best_split_info.right_output); - 
larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), - best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian, - best_split_info.left_output); + if (best_split_info.left_count < best_split_info.right_count) { + CHECK_GT(best_split_info.left_count, 0); + smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_sum_gradient_and_hessian, + best_split_info.left_output); + larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_sum_gradient_and_hessian, + best_split_info.right_output); + } else { + CHECK_GT(best_split_info.right_count, 0); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, + best_split_info.right_sum_gradient_and_hessian, + best_split_info.right_output); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, + best_split_info.left_sum_gradient_and_hessian, + best_split_info.left_output); + } } if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) { gradient_discretizer_->SetNumBitsInHistogramBin(*left_leaf, *right_leaf, data_partition_->leaf_count(*left_leaf), data_partition_->leaf_count(*right_leaf)); } + + #ifdef DEBUG + CheckSplit(best_split_info, *left_leaf, *right_leaf); + #endif + auto leaves_need_update = constraints_->Update( is_numerical_split, *left_leaf, *right_leaf, best_split_info.monotone_type, best_split_info.right_output, @@ -1024,4 +1057,48 @@ std::vector node_used_features = col_sampler_.GetByNode(tree, leaf); *split = bests[best_idx]; } +#ifdef DEBUG +void SerialTreeLearner::CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index) { + data_size_t num_data_in_left = 0; + data_size_t num_data_in_right = 0; + const data_size_t* data_indices_in_left = data_partition_->GetIndexOnLeaf(left_leaf_index, &num_data_in_left); + const data_size_t* data_indices_in_right = data_partition_->GetIndexOnLeaf(right_leaf_index, &num_data_in_right); + if (config_->use_quantized_grad) { + int32_t sum_left_gradient = 0; + int32_t sum_left_hessian = 0; + int32_t sum_right_gradient = 0; + int32_t sum_right_hessian = 0; + const int8_t* discretized_grad_and_hess = gradient_discretizer_->discretized_gradients_and_hessians(); + for (data_size_t i = 0; i < num_data_in_left; ++i) { + const data_size_t index = data_indices_in_left[i]; + sum_left_gradient += discretized_grad_and_hess[2 * index + 1]; + sum_left_hessian += discretized_grad_and_hess[2 * index]; + } + for (data_size_t i = 0; i < num_data_in_right; ++i) { + const data_size_t index = data_indices_in_right[i]; + sum_right_gradient += discretized_grad_and_hess[2 * index + 1]; + sum_right_hessian += discretized_grad_and_hess[2 * index]; + } + Log::Warning("============================ start leaf split info ============================"); + Log::Warning("left_leaf_index = %d, right_leaf_index = %d", left_leaf_index, right_leaf_index); + Log::Warning("num_data_in_left = %d, num_data_in_right = %d", num_data_in_left, num_data_in_right); + Log::Warning("sum_left_gradient = %d, best_split_info->left_sum_gradient_and_hessian.gradient = %d", sum_left_gradient, + static_cast(best_split_info.left_sum_gradient_and_hessian >> 32)); + Log::Warning("sum_left_hessian = %d, 
best_split_info->left_sum_gradient_and_hessian.hessian = %d", sum_left_hessian, + static_cast(best_split_info.left_sum_gradient_and_hessian & 0x00000000ffffffff)); + Log::Warning("sum_right_gradient = %d, best_split_info->right_sum_gradient_and_hessian.gradient = %d", sum_right_gradient, + static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); + Log::Warning("sum_right_hessian = %d, best_split_info->right_sum_gradient_and_hessian.hessian = %d", sum_right_hessian, + static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); + CHECK_EQ(num_data_in_left, best_split_info.left_count); + CHECK_EQ(num_data_in_right, best_split_info.right_count); + CHECK_EQ(sum_left_gradient, static_cast(best_split_info.left_sum_gradient_and_hessian >> 32)) + CHECK_EQ(sum_left_hessian, static_cast(best_split_info.left_sum_gradient_and_hessian & 0x00000000ffffffff)); + CHECK_EQ(sum_right_gradient, static_cast(best_split_info.right_sum_gradient_and_hessian >> 32)); + CHECK_EQ(sum_right_hessian, static_cast(best_split_info.right_sum_gradient_and_hessian & 0x00000000ffffffff)); + Log::Warning("============================ end leaf split info ============================"); + } +} +#endif + } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index d815d265c0d2..93e0787a90cf 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -171,7 +171,9 @@ class SerialTreeLearner: public TreeLearner { std::set FindAllForceFeatures(Json force_split_leaf_setting); + #ifdef DEBUG void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index); + #endif /*! * \brief Get the number of data in a leaf diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index cb69440b3cde..9da50945385c 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1838,7 +1838,6 @@ def test_distributed_quantized_training(cluster): 'num_grad_quant_bins': 30, 'quant_train_renew_leaf': True, 'verbose': -1, - 'force_row_wise': True, } quant_dask_classifier = lgb.DaskLGBMRegressor( diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 25413d7ea072..b46526bcfaf6 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1092,6 +1092,33 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better): assert np.greater_equal(last_score, best_score - min_delta).any() +def test_early_stopping_can_be_triggered_via_custom_callback(): + X, y = make_synthetic_regression() + + def _early_stop_after_seventh_iteration(env): + if env.iteration == 6: + exc = lgb.EarlyStopException( + best_iteration=6, + best_score=[("some_validation_set", "some_metric", 0.708, True)] + ) + raise exc + + bst = lgb.train( + params={ + "objective": "regression", + "verbose": -1, + "num_leaves": 2 + }, + train_set=lgb.Dataset(X, label=y), + num_boost_round=23, + callbacks=[_early_stop_after_seventh_iteration] + ) + assert bst.num_trees() == 7 + assert bst.best_score["some_validation_set"]["some_metric"] == 0.708 + assert bst.best_iteration == 7 + assert bst.current_iteration() == 7 + + def test_continue_train(): X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -1507,6 +1534,203 @@ def train_and_predict(init_model=None, return_model=False): assert 
ret_origin == pytest.approx(ret) +def test_all_expected_params_are_written_out_to_model_text(tmp_path): + X, y = make_synthetic_regression() + params = { + 'objective': 'mape', + 'metric': ['l2', 'mae'], + 'seed': 708, + 'data_sample_strategy': 'bagging', + 'sub_row': 0.8234, + 'verbose': -1 + } + dtrain = lgb.Dataset(data=X, label=y) + gbm = lgb.train( + params=params, + train_set=dtrain, + num_boost_round=3 + ) + + model_txt_from_memory = gbm.model_to_string() + model_file = tmp_path / "out.model" + gbm.save_model(filename=model_file) + with open(model_file, "r") as f: + model_txt_from_file = f.read() + + assert model_txt_from_memory == model_txt_from_file + + # entries whose values should reflect params passed to lgb.train() + non_default_param_entries = [ + "[objective: mape]", + # 'l1' was passed in with alias 'mae' + "[metric: l2,l1]", + "[data_sample_strategy: bagging]", + "[seed: 708]", + # NOTE: this was passed in with alias 'sub_row' + "[bagging_fraction: 0.8234]", + "[num_iterations: 3]", + ] + + # entries with default values of params + default_param_entries = [ + "[boosting: gbdt]", + "[tree_learner: serial]", + "[data: ]", + "[valid: ]", + "[learning_rate: 0.1]", + "[num_leaves: 31]", + "[num_threads: 0]", + "[deterministic: 0]", + "[histogram_pool_size: -1]", + "[max_depth: -1]", + "[min_data_in_leaf: 20]", + "[min_sum_hessian_in_leaf: 0.001]", + "[pos_bagging_fraction: 1]", + "[neg_bagging_fraction: 1]", + "[bagging_freq: 0]", + "[bagging_seed: 15415]", + "[feature_fraction: 1]", + "[feature_fraction_bynode: 1]", + "[feature_fraction_seed: 32671]", + "[extra_trees: 0]", + "[extra_seed: 6642]", + "[early_stopping_round: 0]", + "[first_metric_only: 0]", + "[max_delta_step: 0]", + "[lambda_l1: 0]", + "[lambda_l2: 0]", + "[linear_lambda: 0]", + "[min_gain_to_split: 0]", + "[drop_rate: 0.1]", + "[max_drop: 50]", + "[skip_drop: 0.5]", + "[xgboost_dart_mode: 0]", + "[uniform_drop: 0]", + "[drop_seed: 20623]", + "[top_rate: 0.2]", + "[other_rate: 0.1]", + "[min_data_per_group: 100]", + "[max_cat_threshold: 32]", + "[cat_l2: 10]", + "[cat_smooth: 10]", + "[max_cat_to_onehot: 4]", + "[top_k: 20]", + "[monotone_constraints: ]", + "[monotone_constraints_method: basic]", + "[monotone_penalty: 0]", + "[feature_contri: ]", + "[forcedsplits_filename: ]", + "[refit_decay_rate: 0.9]", + "[cegb_tradeoff: 1]", + "[cegb_penalty_split: 0]", + "[cegb_penalty_feature_lazy: ]", + "[cegb_penalty_feature_coupled: ]", + "[path_smooth: 0]", + "[interaction_constraints: ]", + "[verbosity: -1]", + "[saved_feature_importance_type: 0]", + "[use_quantized_grad: 0]", + "[num_grad_quant_bins: 4]", + "[quant_train_renew_leaf: 0]", + "[stochastic_rounding: 1]", + "[linear_tree: 0]", + "[max_bin: 255]", + "[max_bin_by_feature: ]", + "[min_data_in_bin: 3]", + "[bin_construct_sample_cnt: 200000]", + "[data_random_seed: 2350]", + "[is_enable_sparse: 1]", + "[enable_bundle: 1]", + "[use_missing: 1]", + "[zero_as_missing: 0]", + "[feature_pre_filter: 1]", + "[pre_partition: 0]", + "[two_round: 0]", + "[header: 0]", + "[label_column: ]", + "[weight_column: ]", + "[group_column: ]", + "[ignore_column: ]", + "[categorical_feature: ]", + "[forcedbins_filename: ]", + "[precise_float_parser: 0]", + "[parser_config_file: ]", + "[objective_seed: 4309]", + "[num_class: 1]", + "[is_unbalance: 0]", + "[scale_pos_weight: 1]", + "[sigmoid: 1]", + "[boost_from_average: 1]", + "[reg_sqrt: 0]", + "[alpha: 0.9]", + "[fair_c: 1]", + "[poisson_max_delta_step: 0.7]", + "[tweedie_variance_power: 1.5]", + "[lambdarank_truncation_level: 
30]", + "[lambdarank_norm: 1]", + "[label_gain: ]", + "[lambdarank_position_bias_regularization: 0]", + "[eval_at: ]", + "[multi_error_top_k: 1]", + "[auc_mu_weights: ]", + "[num_machines: 1]", + "[local_listen_port: 12400]", + "[time_out: 120]", + "[machine_list_filename: ]", + "[machines: ]", + "[gpu_platform_id: -1]", + "[gpu_device_id: -1]", + "[num_gpu: 1]", + ] + all_param_entries = non_default_param_entries + default_param_entries + + # add device-specific entries + # + # passed-in force_col_wise / force_row_wise parameters are ignored on CUDA and GPU builds... + # https://github.com/microsoft/LightGBM/blob/1d7ee63686272bceffd522284127573b511df6be/src/io/config.cpp#L375-L377 + if getenv('TASK', '') == 'cuda': + device_entries = [ + "[force_col_wise: 0]", + "[force_row_wise: 1]", + "[device_type: cuda]", + "[gpu_use_dp: 1]" + ] + elif getenv('TASK', '') == 'gpu': + device_entries = [ + "[force_col_wise: 1]", + "[force_row_wise: 0]", + "[device_type: gpu]", + "[gpu_use_dp: 0]" + ] + else: + device_entries = [ + "[force_col_wise: 0]", + "[force_row_wise: 0]", + "[device_type: cpu]", + "[gpu_use_dp: 0]" + ] + + all_param_entries += device_entries + + # check that model text has all expected param entries + for param_str in all_param_entries: + assert param_str in model_txt_from_file + assert param_str in model_txt_from_memory + + # since Booster.model_to_string() is used when pickling, check that parameters all + # roundtrip pickling successfully too + gbm_pkl = pickle_and_unpickle_object(gbm, serializer="joblib") + model_txt_from_memory = gbm_pkl.model_to_string() + model_file = tmp_path / "out-pkl.model" + gbm_pkl.save_model(filename=model_file) + with open(model_file, "r") as f: + model_txt_from_file = f.read() + + for param_str in all_param_entries: + assert param_str in model_txt_from_file + assert param_str in model_txt_from_memory + + def test_pandas_categorical(): pd = pytest.importorskip("pandas") np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index e41719845c0a..2247c9a512d2 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1561,3 +1561,20 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type ) preds = model.predict(X) assert spearmanr(preds, y).correlation >= 0.99 + + +def test_classifier_fit_detects_classes_every_time(): + rng = np.random.default_rng(seed=123) + nrows = 1000 + ncols = 20 + + X = rng.standard_normal(size=(nrows, ncols)) + y_bin = (rng.random(size=nrows) <= .3).astype(np.float64) + y_multi = rng.integers(4, size=nrows) + + model = lgb.LGBMClassifier(verbose=-1) + for _ in range(2): + model.fit(X, y_multi) + assert model.objective_ == "multiclass" + model.fit(X, y_bin) + assert model.objective_ == "binary"
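
Editor's note: the callback.py and __init__.py changes above newly export EarlyStopException so user-written callbacks can stop training early. Below is a minimal sketch of that usage, closely following the new test_early_stopping_can_be_triggered_via_custom_callback test; the synthetic data and parameter values are illustrative only and are not part of the patch.

import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(seed=123)
X = rng.standard_normal(size=(100, 5))
y = rng.standard_normal(size=100)

def _early_stop_after_seventh_iteration(env):
    # env.iteration is 0-based, so iteration 6 is the seventh boosting round
    if env.iteration == 6:
        raise lgb.EarlyStopException(
            best_iteration=6,  # also 0-based, per the updated docstring
            best_score=[("some_validation_set", "some_metric", 0.708, True)]
        )

bst = lgb.train(
    params={"objective": "regression", "verbose": -1, "num_leaves": 2},
    train_set=lgb.Dataset(X, label=y),
    num_boost_round=23,
    callbacks=[_early_stop_after_seventh_iteration],
)
assert bst.num_trees() == 7      # training stopped after the seventh iteration
assert bst.best_iteration == 7   # Booster.best_iteration is reported 1-based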
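
Editor's note: for the engine.py typing change above, here is a standalone sketch (not the library function itself) of what _agg_cv_result() produces after this patch: per-fold (dataset_name, metric_name, value, is_higher_better) tuples are reduced to ('cv_agg', key, mean, is_higher_better, std) 5-tuples matching _LGBM_BoosterEvalMethodResultWithStandardDeviationType, with numpy scalars cast to plain Python floats. The key format and the fold values below are illustrative assumptions.

from collections import OrderedDict
from typing import Dict, List, Tuple
import numpy as np

def _agg_cv_result_sketch(
    raw_results: List[List[Tuple[str, str, float, bool]]]
) -> List[Tuple[str, str, float, bool, float]]:
    cvmap: Dict[str, List[float]] = OrderedDict()
    metric_type: Dict[str, bool] = {}
    for one_result in raw_results:
        for one_line in one_result:
            key = f"{one_line[0]} {one_line[1]}"  # assumed key format, e.g. "valid l2"
            metric_type[key] = one_line[3]
            cvmap.setdefault(key, []).append(one_line[2])
    # float(...) ensures plain Python floats rather than numpy scalars
    return [("cv_agg", k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()]

# two folds, one metric each
folds = [
    [("valid", "l2", 10.2, False)],
    [("valid", "l2", 9.8, False)],
]
print(_agg_cv_result_sketch(folds))
# [('cv_agg', 'valid l2', 10.0, False, 0.2)]  (values approximate)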