Skip to content

Commit

Permalink
Merge branch 'master' into ci/r-4.3
Browse files Browse the repository at this point in the history
  • Loading branch information
jameslamb authored Sep 12, 2023
2 parents 36a8aef + 921479b commit 32269fc
Show file tree
Hide file tree
Showing 36 changed files with 900 additions and 135 deletions.
2 changes: 1 addition & 1 deletion .appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version: 4.0.0.99.{build}
version: 4.1.0.99.{build}

image: Visual Studio 2015
platform: x64
Expand Down
7 changes: 4 additions & 3 deletions .ci/test_r_package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
export R_LINUX_VERSION="3.6.3-1bionic"
export R_APT_REPO="bionic-cran35/"
elif [[ "${R_MAJOR_VERSION}" == "4" ]]; then
export R_MAC_VERSION=4.2.2
export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/base/R-${R_MAC_VERSION}.pkg
export R_LINUX_VERSION="4.2.2-1.2204.0"
export R_MAC_VERSION=4.3.1
export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/big-sur-x86_64/base/R-${R_MAC_VERSION}-x86_64.pkg
export R_LINUX_VERSION="4.3.1-1.2204.0"
export R_APT_REPO="jammy-cran40/"
else
echo "Unrecognized R version: ${R_VERSION}"
Expand Down Expand Up @@ -56,6 +56,7 @@ if [[ $OS_NAME == "linux" ]]; then
texlive-latex-recommended \
texlive-fonts-recommended \
texlive-fonts-extra \
tidy \
qpdf \
|| exit -1

Expand Down
14 changes: 7 additions & 7 deletions .github/workflows/r_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
- os: ubuntu-latest
task: r-package
compiler: gcc
r_version: 4.2
r_version: 4.3
build_type: cmake
container: 'ubuntu:22.04'
- os: ubuntu-latest
Expand All @@ -60,19 +60,19 @@ jobs:
- os: ubuntu-latest
task: r-package
compiler: clang
r_version: 4.2
r_version: 4.3
build_type: cmake
container: 'ubuntu:22.04'
- os: macOS-latest
task: r-package
compiler: gcc
r_version: 4.2
r_version: 4.3
build_type: cmake
container: null
- os: macOS-latest
task: r-package
compiler: clang
r_version: 4.2
r_version: 4.3
build_type: cmake
container: null
- os: windows-latest
Expand Down Expand Up @@ -125,13 +125,13 @@ jobs:
- os: ubuntu-latest
task: r-package
compiler: gcc
r_version: 4.2
r_version: 4.3
build_type: cran
container: 'ubuntu:22.04'
- os: macOS-latest
task: r-package
compiler: clang
r_version: 4.2
r_version: 4.3
build_type: cran
container: null
################
Expand All @@ -140,7 +140,7 @@ jobs:
- os: ubuntu-latest
task: r-rchk
compiler: gcc
r_version: 4.2
r_version: 4.3
build_type: cran
container: 'ubuntu:22.04'
steps:
Expand Down
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,6 @@ publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

Expand Down
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,13 @@ if(UNIX OR MINGW OR CYGWIN)
CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type"
)
if(MINGW)
  # Suppress -Wstringop-overflow when building with MinGW.
  # Background on the warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95353
  string(APPEND CMAKE_CXX_FLAGS " -Wno-stringop-overflow")
endif()
if(USE_DEBUG)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
else()
Expand Down
5 changes: 4 additions & 1 deletion R-package/R/lightgbm.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ NULL
#' \item If passing a factor with more than two variables, will use objective \code{"multiclass"}
#' (note that parameter \code{num_class} in this case will also be determined automatically from
#' \code{label}).
#' \item Otherwise, will use objective \code{"regression"}.
#' \item Otherwise (or if passing \code{lgb.Dataset} as input), will use objective \code{"regression"}.
#' }
#'
#' \emph{New in version 4.0.0}
Expand Down Expand Up @@ -211,6 +211,9 @@ lightgbm <- function(data,
rm(temp)
} else {
data_processor <- NULL
if (objective == "auto") {
objective <- "regression"
}
}

# Set data to a temporary variable
Expand Down
18 changes: 9 additions & 9 deletions R-package/configure
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for lightgbm 4.0.0.99.
# Generated by GNU Autoconf 2.71 for lightgbm 4.1.0.99.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
Expand Down Expand Up @@ -607,8 +607,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='lightgbm'
PACKAGE_TARNAME='lightgbm'
PACKAGE_VERSION='4.0.0.99'
PACKAGE_STRING='lightgbm 4.0.0.99'
PACKAGE_VERSION='4.1.0.99'
PACKAGE_STRING='lightgbm 4.1.0.99'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''

Expand Down Expand Up @@ -1211,7 +1211,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures lightgbm 4.0.0.99 to adapt to many kinds of systems.
\`configure' configures lightgbm 4.1.0.99 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
Expand Down Expand Up @@ -1273,7 +1273,7 @@ fi

if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of lightgbm 4.0.0.99:";;
short | recursive ) echo "Configuration of lightgbm 4.1.0.99:";;
esac
cat <<\_ACEOF
Expand Down Expand Up @@ -1341,7 +1341,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
lightgbm configure 4.0.0.99
lightgbm configure 4.1.0.99
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
Expand Down Expand Up @@ -1378,7 +1378,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by lightgbm $as_me 4.0.0.99, which was
It was created by lightgbm $as_me 4.1.0.99, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
Expand Down Expand Up @@ -2454,7 +2454,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by lightgbm $as_me 4.0.0.99, which was
This file was extended by lightgbm $as_me 4.1.0.99, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
Expand Down Expand Up @@ -2509,7 +2509,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
lightgbm config.status 4.0.0.99
lightgbm config.status 4.1.0.99
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"
Expand Down
61 changes: 61 additions & 0 deletions R-package/cran-comments.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,66 @@
# CRAN Submission History

## v4.1.0 - not submitted

v4.1.0 was not submitted to CRAN, because https://github.com/microsoft/LightGBM/issues/5987 had not been resolved.

## v4.0.0 - Submission 2 - (July 19, 2023)

### CRAN response

> Dear maintainer,
> package lightgbm_4.0.0.tar.gz does not pass the incoming checks automatically.
The logs linked from those messages showed one issue remaining on Debian (0 on Windows).

```text
* checking examples ... [7s/4s] NOTE
Examples with CPU time > 2.5 times elapsed time
user system elapsed ratio
lgb.restore_handle 1.206 0.085 0.128 10.08
```

### Maintainer Notes

Chose to document the issue and need for a fix in https://github.com/microsoft/LightGBM/issues/5987, but not resubmit,
to avoid annoying CRAN maintainers.

## v4.0.0 - Submission 1 - (July 16, 2023)

### CRAN response

> Dear maintainer,
> package lightgbm_4.0.0.tar.gz does not pass the incoming checks automatically.
The logs linked from those messages showed the following issues from `R CMD check`.

```text
* checking S3 generic/method consistency ... NOTE
Mismatches for apparent methods not registered:
merge:
function(x, y, ...)
merge.eval.string:
function(env)
format:
function(x, ...)
format.eval.string:
function(eval_res, eval_err)
See section 'Registering S3 methods' in the 'Writing R Extensions'
manual.
```

```text
* checking examples ... [8s/4s] NOTE
Examples with CPU time > 2.5 times elapsed time
user system elapsed ratio
lgb.restore_handle 1.819 0.128 0.165 11.8
```

### Maintainer Notes

Attempted to fix these with https://github.com/microsoft/LightGBM/pull/5988 and resubmitted.

## v3.3.5 - Submission 2 - (January 16, 2023)

### CRAN response
Expand Down
2 changes: 1 addition & 1 deletion R-package/man/lightgbm.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions R-package/tests/testthat/test_basic.R
Original file line number Diff line number Diff line change
Expand Up @@ -3790,3 +3790,18 @@ test_that("lightgbm() accepts named categorical_features", {
)
expect_true(length(model$params$categorical_feature) > 0L)
})

test_that("lightgbm() correctly sets objective when passing lgb.Dataset as input", {
  # Build an lgb.Dataset from mtcars: label is mpg, features are every column except the first.
  data(mtcars)
  features <- as.matrix(mtcars[, -1L])
  target <- mtcars$mpg
  dtrain <- lgb.Dataset(features, label = target)
  # Train with objective = "auto" so that lightgbm() has to pick the objective itself.
  bst <- lightgbm(
    dtrain
    , objective = "auto"
    , verbose = .LGB_VERBOSITY
    , nrounds = 5L
    , num_threads = .LGB_MAX_THREADS
  )
  # With an lgb.Dataset as input, "auto" is expected to resolve to "regression".
  expect_equal(bst$params$objective, "regression")
})
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ lightgbm-transform (feature transformation binding): https://github.com/microsof

`postgresml` (LightGBM training and prediction in SQL, via a Postgres extension): https://github.com/postgresml/postgresml

`vaex-ml` (Python DataFrame library with its own interface to LightGBM): https://github.com/vaexio/vaex

Support
-------

Expand Down
2 changes: 1 addition & 1 deletion VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
4.0.0.99
4.1.0.99
1 change: 1 addition & 0 deletions docs/.linkcheckerrc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ threads=1
ignore=
pythonapi/lightgbm\..*\.html.*
http.*amd.com/.*
https.*dl.acm.org/doi/.*
https.*tandfonline.com/.*
ignorewarnings=http-robots-denied,https-certificate-error
checkextern=1
Expand Down
41 changes: 41 additions & 0 deletions docs/Advanced-Topics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,44 @@ Recommendations for gcc Users (MinGW, \*nix)
--------------------------------------------

- Refer to `gcc Tips <./gcc-Tips.rst>`__.

Support for Position Bias Treatment
------------------------------------

Often the relevance labels provided in Learning-to-Rank tasks might be derived from implicit user feedback (e.g., clicks) and therefore might be biased due to their position/location on the screen when they were presented to a user.
LightGBM can make use of positional data.

For example, consider the case where you expect that the first 3 results from a search engine will be visible in users' browsers without scrolling, and all other results for a query would require scrolling.

LightGBM could be told to account for the position bias from results being "above the fold" by providing a ``positions`` array encoded as follows:

::

0
0
0
1
1
0
0
0
1
...

Where ``0 = "above the fold"`` and ``1 = "requires scrolling"``.
The specific values are not important, as long as they are consistent across all observations in the training data.
An encoding like ``100 = "above the fold"`` and ``17 = "requires scrolling"`` would result in exactly the same trained model.

In that way, ``positions`` in LightGBM's API are similar to a categorical feature.
Just as with non-ordinal categorical features, an integer representation is just used for memory and computational efficiency... LightGBM does not care about the absolute or relative magnitude of the values.

Unlike a categorical feature, however, ``positions`` are used to adjust the target to reduce the bias in predictions made by the trained model.

The position file corresponds with the training data file line by line, and has one position per line. If the training data file is named ``train.txt``, the position file should be named ``train.txt.position`` and placed in the same folder as the data file.
In this case, LightGBM will load the position file automatically if it exists. The positions can also be specified through the ``Dataset`` constructor when using the Python API. If the positions are specified in both approaches, the ``.position`` file will be ignored.

The currently implemented approach models position bias using the idea of Generalized Additive Models (`GAM <https://en.wikipedia.org/wiki/Generalized_additive_model>`_) to linearly decompose the document score ``s`` into the sum of a relevance component ``f`` and a positional component ``g``: ``s(x, pos) = f(x) + g(pos)`` where the former component depends on the original query-document features and the latter depends on the position of an item.
During the training, the compound scoring function ``s(x, pos)`` is fit with a standard ranking algorithm (e.g., LambdaMART) which boils down to jointly learning the relevance component ``f(x)`` (it is later returned as an unbiased model) and the position factors ``g(pos)`` that help better explain the observed (biased) labels.
Similar score decomposition ideas have previously been applied for classification & pointwise ranking tasks with assumptions of binary labels and binary relevance (a.k.a. "two-tower" models, refer to the papers: `Towards Disentangling Relevance and Bias in Unbiased Learning to Rank <https://arxiv.org/abs/2212.13937>`_, `PAL: a position-bias aware learning framework for CTR prediction in live recommender systems <https://dl.acm.org/doi/10.1145/3298689.3347033>`_, `A General Framework for Debiasing in CTR Prediction <https://arxiv.org/abs/2112.02767>`_).
In LightGBM, we adapt this idea to general pairwise Learning-to-Rank with arbitrary ordinal relevance labels.
Besides, GAMs have been used in the context of explainable ML (`Accurate Intelligible Models with Pairwise Interactions <https://www.cs.cornell.edu/~yinlou/papers/lou-kdd13.pdf>`_) to linearly decompose the contribution of each feature (and possibly their pairwise interactions) to the overall score, for subsequent analysis and interpretation of their effects in the trained models.
6 changes: 6 additions & 0 deletions docs/Parameters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1137,6 +1137,12 @@ Objective Parameters

- separate by ``,``

- ``lambdarank_position_bias_regularization`` :raw-html:`<a id="lambdarank_position_bias_regularization" title="Permalink to this parameter" href="#lambdarank_position_bias_regularization">&#x1F517;&#xFE0E;</a>`, default = ``0.0``, type = double, constraints: ``lambdarank_position_bias_regularization >= 0.0``

- used only in ``lambdarank`` application when positional information is provided and position bias is modeled. Larger values reduce the inferred position bias factors.

- *New in version 4.1.0*

Metric Parameters
-----------------

Expand Down
2 changes: 1 addition & 1 deletion external_libs/fmt
Submodule fmt updated 86 files
+8 −0 .github/dependabot.yml
+6 −0 .github/issue_template.md
+3 −2 .github/pull_request_template.md
+30 −0 .github/workflows/cifuzz.yml
+12 −1 .github/workflows/doc.yml
+42 −10 .github/workflows/linux.yml
+20 −2 .github/workflows/macos.yml
+65 −0 .github/workflows/scorecard.yml
+61 −21 .github/workflows/windows.yml
+99 −70 CMakeLists.txt
+1,187 −3 ChangeLog.rst
+1 −1 LICENSE.rst
+55 −38 README.rst
+11 −2 doc/CMakeLists.txt
+270 −155 doc/api.rst
+9 −6 doc/build.py
+2 −2 doc/index.rst
+163 −10 doc/syntax.rst
+5 −5 include/fmt/args.h
+482 −341 include/fmt/chrono.h
+116 −122 include/fmt/color.h
+62 −170 include/fmt/compile.h
+976 −1,290 include/fmt/core.h
+820 −1,801 include/fmt/format-inl.h
+2,253 −847 include/fmt/format.h
+0 −2 include/fmt/locale.h
+70 −146 include/fmt/os.h
+129 −55 include/fmt/ostream.h
+202 −192 include/fmt/printf.h
+367 −425 include/fmt/ranges.h
+465 −0 include/fmt/std.h
+86 −64 include/fmt/xchar.h
+40 −29 src/fmt.cc
+15 −96 src/format.cc
+117 −80 src/os.cc
+3 −3 support/Vagrantfile
+0 −43 support/appveyor-build.py
+0 −31 support/appveyor.yml
+0 −1 support/bazel/.bazelrc
+1 −1 support/bazel/.bazelversion
+1 −2 support/bazel/BUILD.bazel
+5 −4 support/bazel/README.md
+1 −1 support/build.gradle
+0 −70 support/cmake/cxx14.cmake
+4 −1 support/cmake/fmt-config.cmake.in
+6 −0 support/manage.py
+1 −1 support/printable.py
+7 −0 support/rst2md.py
+30 −17 test/CMakeLists.txt
+1 −1 test/add-subdirectory-test/CMakeLists.txt
+1 −1 test/args-test.cc
+396 −18 test/chrono-test.cc
+6 −0 test/color-test.cc
+42 −4 test/compile-error-test/CMakeLists.txt
+2 −1 test/compile-fp-test.cc
+37 −39 test/compile-test.cc
+167 −232 test/core-test.cc
+18 −0 test/detect-stdfs.cc
+2 −0 test/enforce-checks-test.cc
+1 −1 test/find-package-test/CMakeLists.txt
+257 −151 test/format-impl-test.cc
+465 −388 test/format-test.cc
+1 −1 test/fuzzing/CMakeLists.txt
+2 −2 test/fuzzing/one-arg.cc
+2 −2 test/fuzzing/two-args.cc
+3 −1 test/gtest-extra-test.cc
+1 −1 test/gtest-extra.cc
+2 −7 test/gtest-extra.h
+1 −7 test/gtest/CMakeLists.txt
+2 −2 test/gtest/gmock-gtest-all.cc
+2 −2 test/mock-allocator.h
+36 −96 test/module-test.cc
+24 −69 test/os-test.cc
+69 −79 test/ostream-test.cc
+3 −90 test/posix-mock-test.cc
+0 −2 test/posix-mock.h
+14 −42 test/printf-test.cc
+198 −34 test/ranges-test.cc
+1 −1 test/scan-test.cc
+17 −14 test/scan.h
+1 −1 test/static-export-test/CMakeLists.txt
+257 −0 test/std-test.cc
+2 −3 test/test-main.cc
+4 −4 test/unicode-test.cc
+2 −6 test/util.h
+177 −95 test/xchar-test.cc
5 changes: 5 additions & 0 deletions include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -965,6 +965,11 @@ struct Config {
// desc = separate by ``,``
std::vector<double> label_gain;

// check = >=0.0
// desc = used only in ``lambdarank`` application when positional information is provided and position bias is modeled. Larger values reduce the inferred position bias factors.
// desc = *New in version 4.1.0*
double lambdarank_position_bias_regularization = 0.0;

#ifndef __NVCC__
#pragma endregion

Expand Down
Loading

0 comments on commit 32269fc

Please sign in to comment.