Merge branch 'master' into ci/r-4.3

microsoft · Sep 12, 2023 · 32269fc · 32269fc
2 parents 36a8aef + 921479b
commit 32269fc
Show file tree

Hide file tree

Showing 36 changed files with 900 additions and 135 deletions.
diff --git a/.appveyor.yml b/.appveyor.yml
@@ -1,4 +1,4 @@
-version: 4.0.0.99.{build}
+version: 4.1.0.99.{build}
 
 image: Visual Studio 2015
 platform: x64

diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh
@@ -21,9 +21,9 @@ if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
     export R_LINUX_VERSION="3.6.3-1bionic"
     export R_APT_REPO="bionic-cran35/"
 elif [[ "${R_MAJOR_VERSION}" == "4" ]]; then
-    export R_MAC_VERSION=4.2.2
-    export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/base/R-${R_MAC_VERSION}.pkg
-    export R_LINUX_VERSION="4.2.2-1.2204.0"
+    export R_MAC_VERSION=4.3.1
+    export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/big-sur-x86_64/base/R-${R_MAC_VERSION}-x86_64.pkg
+    export R_LINUX_VERSION="4.3.1-1.2204.0"
     export R_APT_REPO="jammy-cran40/"
 else
     echo "Unrecognized R version: ${R_VERSION}"
@@ -56,6 +56,7 @@ if [[ $OS_NAME == "linux" ]]; then
             texlive-latex-recommended \
             texlive-fonts-recommended \
             texlive-fonts-extra \
+            tidy \
             qpdf \
             || exit -1
 

diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml
@@ -48,7 +48,7 @@ jobs:
           - os: ubuntu-latest
             task: r-package
             compiler: gcc
-            r_version: 4.2
+            r_version: 4.3
             build_type: cmake
             container: 'ubuntu:22.04'
           - os: ubuntu-latest
@@ -60,19 +60,19 @@ jobs:
           - os: ubuntu-latest
             task: r-package
             compiler: clang
-            r_version: 4.2
+            r_version: 4.3
             build_type: cmake
             container: 'ubuntu:22.04'
           - os: macOS-latest
             task: r-package
             compiler: gcc
-            r_version: 4.2
+            r_version: 4.3
             build_type: cmake
             container: null
           - os: macOS-latest
             task: r-package
             compiler: clang
-            r_version: 4.2
+            r_version: 4.3
             build_type: cmake
             container: null
           - os: windows-latest
@@ -125,13 +125,13 @@ jobs:
           - os: ubuntu-latest
             task: r-package
             compiler: gcc
-            r_version: 4.2
+            r_version: 4.3
             build_type: cran
             container: 'ubuntu:22.04'
           - os: macOS-latest
             task: r-package
             compiler: clang
-            r_version: 4.2
+            r_version: 4.3
             build_type: cran
             container: null
           ################
@@ -140,7 +140,7 @@ jobs:
           - os: ubuntu-latest
             task: r-rchk
             compiler: gcc
-            r_version: 4.2
+            r_version: 4.3
             build_type: cran
             container: 'ubuntu:22.04'
     steps:

diff --git a/.gitignore b/.gitignore
@@ -139,8 +139,6 @@ publish/
 # Publish Web Output
 *.[Pp]ublish.xml
 *.azurePubxml
-# TODO: Comment the next line if you want to checkin your web deploy settings 
-# but database connection strings (with potential passwords) will be unencrypted
 *.pubxml
 *.publishproj
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -326,6 +326,13 @@ if(UNIX OR MINGW OR CYGWIN)
     CMAKE_CXX_FLAGS
     "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type"
   )
+  if(MINGW)
+    # ignore this warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95353
+    set(
+      CMAKE_CXX_FLAGS
+      "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow"
+    )
+  endif()
   if(USE_DEBUG)
       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
   else()

diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R
@@ -116,7 +116,7 @@ NULL
 #'                  \item If passing a factor with more than two variables, will use objective \code{"multiclass"}
 #'                  (note that parameter \code{num_class} in this case will also be determined automatically from
 #'                  \code{label}).
-#'                  \item Otherwise, will use objective \code{"regression"}.
+#'                  \item Otherwise (or if passing \code{lgb.Dataset} as input), will use objective \code{"regression"}.
 #'                  }
 #'
 #'                  \emph{New in version 4.0.0}
@@ -211,6 +211,9 @@ lightgbm <- function(data,
     rm(temp)
   } else {
     data_processor <- NULL
+    if (objective == "auto") {
+      objective <- "regression"
+    }
   }
 
   # Set data to a temporary variable

diff --git a/R-package/configure b/R-package/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for lightgbm 4.0.0.99.
+# Generated by GNU Autoconf 2.71 for lightgbm 4.1.0.99.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -607,8 +607,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='lightgbm'
 PACKAGE_TARNAME='lightgbm'
-PACKAGE_VERSION='4.0.0.99'
-PACKAGE_STRING='lightgbm 4.0.0.99'
+PACKAGE_VERSION='4.1.0.99'
+PACKAGE_STRING='lightgbm 4.1.0.99'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -1211,7 +1211,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures lightgbm 4.0.0.99 to adapt to many kinds of systems.
+\`configure' configures lightgbm 4.1.0.99 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1273,7 +1273,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of lightgbm 4.0.0.99:";;
+     short | recursive ) echo "Configuration of lightgbm 4.1.0.99:";;
    esac
   cat <<\_ACEOF
 
@@ -1341,7 +1341,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-lightgbm configure 4.0.0.99
+lightgbm configure 4.1.0.99
 generated by GNU Autoconf 2.71
 
 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1378,7 +1378,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by lightgbm $as_me 4.0.0.99, which was
+It was created by lightgbm $as_me 4.1.0.99, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
 
   $ $0$ac_configure_args_raw
@@ -2454,7 +2454,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by lightgbm $as_me 4.0.0.99, which was
+This file was extended by lightgbm $as_me 4.1.0.99, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -2509,7 +2509,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-lightgbm config.status 4.0.0.99
+lightgbm config.status 4.1.0.99
 configured by $0, generated by GNU Autoconf 2.71,
   with options \\"\$ac_cs_config\\"
 

diff --git a/R-package/cran-comments.md b/R-package/cran-comments.md
@@ -1,5 +1,66 @@
 # CRAN Submission History
 
+## v4.1.0 - not submitted
+
+v4.1.0 was not submitted to CRAN, because https://github.com/microsoft/LightGBM/issues/5987 had not been resolved.
+
+## v4.0.0 - Submission 2 - (July 19, 2023)
+
+### CRAN response
+
+> Dear maintainer,
+> package lightgbm_4.0.0.tar.gz does not pass the incoming checks automatically.
+
+The logs linked from those messages showed one issue remaining on Debian (0 on Windows).
+
+```text
+* checking examples ... [7s/4s] NOTE
+Examples with CPU time > 2.5 times elapsed time
+                    user system elapsed  ratio
+lgb.restore_handle 1.206  0.085   0.128 10.08
+```
+
+### Maintainer Notes
+
+Chose to document the issue and need for a fix in https://github.com/microsoft/LightGBM/issues/5987, but not resubmit,
+to avoid annoying CRAN maintainers.
+
+## v4.0.0 - Submission 1 - (July 16, 2023)
+
+### CRAN response
+
+> Dear maintainer,
+> package lightgbm_4.0.0.tar.gz does not pass the incoming checks automatically.
+
+The logs linked from those messages showed the following issues from `R CMD check`.
+
+```text
+* checking S3 generic/method consistency ... NOTE
+Mismatches for apparent methods not registered:
+merge:
+  function(x, y, ...)
+merge.eval.string:
+  function(env)
+
+format:
+  function(x, ...)
+format.eval.string:
+  function(eval_res, eval_err)
+See section 'Registering S3 methods' in the 'Writing R Extensions'
+manual.
+```
+
+```text
+* checking examples ... [8s/4s] NOTE
+Examples with CPU time > 2.5 times elapsed time
+                    user system elapsed ratio
+lgb.restore_handle 1.819  0.128   0.165  11.8
+```
+
+### Maintainer Notes
+
+Attempted to fix these with https://github.com/microsoft/LightGBM/pull/5988 and resubmitted.
+
 ## v3.3.5 - Submission 2 - (January 16, 2023)
 
 ### CRAN response

diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
@@ -3790,3 +3790,18 @@ test_that("lightgbm() accepts named categorical_features", {
   )
   expect_true(length(model$params$categorical_feature) > 0L)
 })
+
+test_that("lightgbm() correctly sets objective when passing lgb.Dataset as input", {
+  data(mtcars)
+  y <- mtcars$mpg
+  x <- as.matrix(mtcars[, -1L])
+  ds <- lgb.Dataset(x, label = y)
+  model <- lightgbm(
+    ds
+    , objective = "auto"
+    , verbose = .LGB_VERBOSITY
+    , nrounds = 5L
+    , num_threads = .LGB_MAX_THREADS
+  )
+  expect_equal(model$params$objective, "regression")
+})
diff --git a/README.md b/README.md
@@ -126,6 +126,8 @@ lightgbm-transform (feature transformation binding): https://github.com/microsof
 
 `postgresml` (LightGBM training and prediction in SQL, via a Postgres extension): https://github.com/postgresml/postgresml
 
+`vaex-ml` (Python DataFrame library with its own interface to LightGBM): https://github.com/vaexio/vaex
+
 Support
 -------
 

diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-4.0.0.99
+4.1.0.99
diff --git a/docs/.linkcheckerrc b/docs/.linkcheckerrc
@@ -9,6 +9,7 @@ threads=1
 ignore=
   pythonapi/lightgbm\..*\.html.*
   http.*amd.com/.*
+  https.*dl.acm.org/doi/.*
   https.*tandfonline.com/.*
 ignorewarnings=http-robots-denied,https-certificate-error
 checkextern=1

diff --git a/docs/Advanced-Topics.rst b/docs/Advanced-Topics.rst
@@ -77,3 +77,44 @@ Recommendations for gcc Users (MinGW, \*nix)
 --------------------------------------------
 
 -  Refer to `gcc Tips <./gcc-Tips.rst>`__.
+
+Support for Position Bias Treatment
+------------------------------------
+
+Often the relevance labels provided in Learning-to-Rank tasks are derived from implicit user feedback (e.g., clicks) and may therefore be biased by the position/location at which each item was presented to the user on the screen.
+LightGBM can make use of positional data.
+
+For example, consider the case where you expect that the first 3 results from a search engine will be visible in users' browsers without scrolling, and all other results for a query would require scrolling.
+
+LightGBM could be told to account for the position bias from results being "above the fold" by providing a ``positions`` array encoded as follows:
+
+::
+
+    0
+    0
+    0
+    1
+    1
+    0
+    0
+    0
+    1
+    ...
+
+Where ``0 = "above the fold"`` and ``1 = "requires scrolling"``.
+The specific values are not important, as long as they are consistent across all observations in the training data.
+An encoding like ``100 = "above the fold"`` and ``17 = "requires scrolling"`` would result in exactly the same trained model.
+
+In that way, ``positions`` in LightGBM's API are similar to a categorical feature.
+Just as with non-ordinal categorical features, an integer representation is just used for memory and computational efficiency... LightGBM does not care about the absolute or relative magnitude of the values.
+
+Unlike a categorical feature, however, ``positions`` are used to adjust the target to reduce the bias in predictions made by the trained model.
+
+The position file corresponds to the training data file line by line, with one position per line. If the training data file is named ``train.txt``, the position file should be named ``train.txt.position`` and placed in the same folder as the data file.
+In this case, LightGBM will load the position file automatically if it exists. Positions can also be specified through the ``Dataset`` constructor when using the Python API. If positions are specified in both ways, the ``.position`` file will be ignored.
+
+The current implementation models position bias using an idea from Generalized Additive Models (`GAM <https://en.wikipedia.org/wiki/Generalized_additive_model>`_): the document score ``s`` is linearly decomposed into the sum of a relevance component ``f`` and a positional component ``g``, i.e. ``s(x, pos) = f(x) + g(pos)``, where the former component depends on the original query-document features and the latter depends on the position of an item.
+During the training, the compound scoring function ``s(x, pos)`` is fit with a standard ranking algorithm (e.g., LambdaMART) which boils down to jointly learning the relevance component ``f(x)`` (it is later returned as an unbiased model) and the position factors ``g(pos)`` that help better explain the observed (biased) labels. 
+Similar score decomposition ideas have previously been applied for classification & pointwise ranking tasks with assumptions of binary labels and binary relevance (a.k.a. "two-tower" models, refer to the papers: `Towards Disentangling Relevance and Bias in Unbiased Learning to Rank <https://arxiv.org/abs/2212.13937>`_, `PAL: a position-bias aware learning framework for CTR prediction in live recommender systems <https://dl.acm.org/doi/10.1145/3298689.3347033>`_, `A General Framework for Debiasing in CTR Prediction <https://arxiv.org/abs/2112.02767>`_). 
+In LightGBM, we adapt this idea to general pairwise Learning-to-Rank with arbitrary ordinal relevance labels.
+Besides, GAMs have been used in the context of explainable ML (`Accurate Intelligible Models with Pairwise Interactions <https://www.cs.cornell.edu/~yinlou/papers/lou-kdd13.pdf>`_) to linearly decompose the contribution of each feature (and possibly their pairwise interactions) to the overall score, for subsequent analysis and interpretation of their effects in the trained models.
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
@@ -1137,6 +1137,12 @@ Objective Parameters
 
    -  separate by ``,``
 
+-  ``lambdarank_position_bias_regularization`` :raw-html:`<a id="lambdarank_position_bias_regularization" title="Permalink to this parameter" href="#lambdarank_position_bias_regularization">&#x1F517;&#xFE0E;</a>`, default = ``0.0``, type = double, constraints: ``lambdarank_position_bias_regularization >= 0.0``
+
+   -  used only in ``lambdarank`` application when positional information is provided and position bias is modeled. Larger values reduce the inferred position bias factors.
+
+   -  *New in version 4.1.0*
+
 Metric Parameters
 -----------------
 

diff --git a/external_libs/fast_double_parser b/external_libs/fast_double_parser
diff --git a/external_libs/fmt b/external_libs/fmt
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
@@ -965,6 +965,11 @@ struct Config {
   // desc = separate by ``,``
   std::vector<double> label_gain;
 
+  // check = >=0.0
+  // desc = used only in ``lambdarank`` application when positional information is provided and position bias is modeled. Larger values reduce the inferred position bias factors.
+  // desc = *New in version 4.1.0*
+  double lambdarank_position_bias_regularization = 0.0;
+
   #ifndef __NVCC__
   #pragma endregion
-Original file line number
+Diff line change
@@ Expand Up @@
     `postgresml` (LightGBM training and prediction in SQL, via a Postgres extension): https://github.com/postgresml/postgresml
+    `vaex-ml` (Python DataFrame library with its own interface to LightGBM): https://github.com/vaexio/vaex
     Support
     -------
@@ Expand Down @@
+3 −3		.appveyor.yml
+41 −0		.github/workflows/codeql.yml
+0 −49		.github/workflows/mingw-ci.yml
+0 −49		.github/workflows/mingw64-ci.yml
+44 −0		.github/workflows/msys2-clang.yml
+1 −1		.github/workflows/msys2.yml
+1 −1		.github/workflows/rhub.yml
+2 −2		.github/workflows/ubuntu18.yml
+2 −2		.github/workflows/ubuntu20.yml
+0 −24		.github/workflows/vs16-ci.yml
+0 −25		.github/workflows/vs16-clang-ci.yml
+1 −1		.github/workflows/vs16-ninja-ci.yml
+37 −0		.github/workflows/vs17-ci.yml
+37 −0		.github/workflows/vs17-clang-ci.yml
+2 −2		CMakeLists.txt
+26 −12		README.md
+43 −31		include/fast_double_parser.h
+31 −0		script/table_generation.py
+56 −2		tests/unit.cpp
+8 −0		.github/dependabot.yml
+6 −0		.github/issue_template.md
+3 −2		.github/pull_request_template.md
+30 −0		.github/workflows/cifuzz.yml
+12 −1		.github/workflows/doc.yml
+42 −10		.github/workflows/linux.yml
+20 −2		.github/workflows/macos.yml
+65 −0		.github/workflows/scorecard.yml
+61 −21		.github/workflows/windows.yml
+99 −70		CMakeLists.txt
+1,187 −3		ChangeLog.rst
+1 −1		LICENSE.rst
+55 −38		README.rst
+11 −2		doc/CMakeLists.txt
+270 −155		doc/api.rst
+9 −6		doc/build.py
+2 −2		doc/index.rst
+163 −10		doc/syntax.rst
+5 −5		include/fmt/args.h
+482 −341		include/fmt/chrono.h
+116 −122		include/fmt/color.h
+62 −170		include/fmt/compile.h
+976 −1,290		include/fmt/core.h
+820 −1,801		include/fmt/format-inl.h
+2,253 −847		include/fmt/format.h
+0 −2		include/fmt/locale.h
+70 −146		include/fmt/os.h
+129 −55		include/fmt/ostream.h
+202 −192		include/fmt/printf.h
+367 −425		include/fmt/ranges.h
+465 −0		include/fmt/std.h
+86 −64		include/fmt/xchar.h
+40 −29		src/fmt.cc
+15 −96		src/format.cc
+117 −80		src/os.cc
+3 −3		support/Vagrantfile
+0 −43		support/appveyor-build.py
+0 −31		support/appveyor.yml
+0 −1		support/bazel/.bazelrc
+1 −1		support/bazel/.bazelversion
+1 −2		support/bazel/BUILD.bazel
+5 −4		support/bazel/README.md
+1 −1		support/build.gradle
+0 −70		support/cmake/cxx14.cmake
+4 −1		support/cmake/fmt-config.cmake.in
+6 −0		support/manage.py
+1 −1		support/printable.py
+7 −0		support/rst2md.py
+30 −17		test/CMakeLists.txt
+1 −1		test/add-subdirectory-test/CMakeLists.txt
+1 −1		test/args-test.cc
+396 −18		test/chrono-test.cc
+6 −0		test/color-test.cc
+42 −4		test/compile-error-test/CMakeLists.txt
+2 −1		test/compile-fp-test.cc
+37 −39		test/compile-test.cc
+167 −232		test/core-test.cc
+18 −0		test/detect-stdfs.cc
+2 −0		test/enforce-checks-test.cc
+1 −1		test/find-package-test/CMakeLists.txt
+257 −151		test/format-impl-test.cc
+465 −388		test/format-test.cc
+1 −1		test/fuzzing/CMakeLists.txt
+2 −2		test/fuzzing/one-arg.cc
+2 −2		test/fuzzing/two-args.cc
+3 −1		test/gtest-extra-test.cc
+1 −1		test/gtest-extra.cc
+2 −7		test/gtest-extra.h
+1 −7		test/gtest/CMakeLists.txt
+2 −2		test/gtest/gmock-gtest-all.cc
+2 −2		test/mock-allocator.h
+36 −96		test/module-test.cc
+24 −69		test/os-test.cc
+69 −79		test/ostream-test.cc
+3 −90		test/posix-mock-test.cc
+0 −2		test/posix-mock.h
+14 −42		test/printf-test.cc
+198 −34		test/ranges-test.cc
+1 −1		test/scan-test.cc
+17 −14		test/scan.h
+1 −1		test/static-export-test/CMakeLists.txt
+257 −0		test/std-test.cc
+2 −3		test/test-main.cc
+4 −4		test/unicode-test.cc
+2 −6		test/util.h
+177 −95		test/xchar-test.cc