From 5fa887bb79d0e4d1961c8adafce9832448535e71 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 17 Nov 2021 22:15:32 -0600 Subject: [PATCH] [R-package] [docs] add intro vignette (#3946) (#4775) * [R-package] [docs] add intro vignette (#3946) * add 10 test vignettes * Revert "add 10 test vignettes" This reverts commit 40fb2e2f1982402798776ee44e4ec82fc4644d3d. * Apply suggestions from code review Co-authored-by: Nikita Titov Co-authored-by: Michael Mayer Co-authored-by: Nikita Titov --- .ci/lint_r_code.R | 2 +- .ci/test_r_package.sh | 6 +- .ci/test_r_package_solaris.sh | 4 +- .ci/test_r_package_valgrind.sh | 2 +- .ci/test_r_package_windows.ps1 | 10 +- .github/workflows/r_package.yml | 4 +- .github/workflows/static_analysis.yml | 2 +- .vsts-ci.yml | 3 +- R-package/DESCRIPTION | 3 + R-package/README.md | 8 +- R-package/pkgdown/_pkgdown.yml | 2 + R-package/vignettes/basic_walkthrough.Rmd | 115 ++++++++++++++++++++++ build-cran-package.sh | 61 +++++++++++- build_r.R | 10 +- docs/conf.py | 2 + 15 files changed, 217 insertions(+), 17 deletions(-) create mode 100644 R-package/vignettes/basic_walkthrough.Rmd diff --git a/.ci/lint_r_code.R b/.ci/lint_r_code.R index cc83cb7c1cc9..d477a1a70b9c 100755 --- a/.ci/lint_r_code.R +++ b/.ci/lint_r_code.R @@ -8,7 +8,7 @@ SOURCE_DIR <- args[[1L]] FILES_TO_LINT <- list.files( path = SOURCE_DIR - , pattern = "\\.r$" + , pattern = "\\.r$|\\.rmd$" , all.files = TRUE , ignore.case = TRUE , full.names = TRUE diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index 522e622b51f0..e7d36e59ceeb 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -92,13 +92,13 @@ if [[ $OS_NAME == "macos" ]]; then fi fi -# Manually install Depends and Imports libraries + 'testthat' +# Manually install Depends and Imports libraries + 'knitr', 'rmarkdown', 'testthat' # to avoid a CI-time dependency on devtools (for devtools::install_deps()) # NOTE: testthat is not required when running rchk if [[ "${TASK}" == "r-rchk" ]]; then - 
packages="c('data.table', 'jsonlite', 'Matrix', 'R6')" + packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown')" else - packages="c('data.table', 'jsonlite', 'Matrix', 'R6', 'testthat')" + packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown', 'testthat')" fi compile_from_source="both" if [[ $OS_NAME == "macos" ]]; then diff --git a/.ci/test_r_package_solaris.sh b/.ci/test_r_package_solaris.sh index 1daabf08d91a..18ed6cb2f7ad 100755 --- a/.ci/test_r_package_solaris.sh +++ b/.ci/test_r_package_solaris.sh @@ -5,7 +5,9 @@ apt-get install --no-install-recommends -y \ libxml2-dev \ libssl-dev -Rscript -e "install.packages('rhub', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 +# installation of dependencies needs to happen before building the package, +# since `R CMD build` needs to install the package to build vignettes +Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 sh build-cran-package.sh || exit -1 diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh index fc27689f9090..e7a6cb027d2d 100755 --- a/.ci/test_r_package_valgrind.sh +++ b/.ci/test_r_package_valgrind.sh @@ -1,6 +1,6 @@ #!/bin/bash -RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 +RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 sh build-cran-package.sh \ --r-executable=RDvalgrind \ || exit -1 diff --git a/.ci/test_r_package_windows.ps1 b/.ci/test_r_package_windows.ps1 index 
b9845148e7b7..62562b471345 100644 --- a/.ci/test_r_package_windows.ps1 +++ b/.ci/test_r_package_windows.ps1 @@ -122,7 +122,7 @@ Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT Write-Output "Done installing Rtools" Write-Output "Installing dependencies" -$packages = "c('data.table', 'jsonlite', 'Matrix', 'processx', 'R6', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')" +$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')" Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH', Ncpus = parallel::detectCores())" ; Check-Output $? # MiKTeX and pandoc can be skipped on non-MinGW builds, since we don't @@ -165,7 +165,15 @@ if ($env:COMPILER -ne "MSVC") { } Run-R-Code-Redirect-Stderr "commandArgs <- function(...){$env:BUILD_R_FLAGS}; source('build_r.R')"; Check-Output $? } elseif ($env:R_BUILD_TYPE -eq "cran") { + # NOTE: gzip and tar are needed to create a CRAN package on Windows, but + # some flavors of tar.exe can fail in some settings on Windows. + # Putting the msys64 utilities at the beginning of PATH temporarily to be + # sure they're used for that purpose. + if ($env:R_MAJOR_VERSION -eq "3") { + $env:PATH = "C:\msys64\usr\bin;" + $env:PATH + } Run-R-Code-Redirect-Stderr "result <- processx::run(command = 'sh', args = 'build-cran-package.sh', echo = TRUE, windows_verbatim_args = FALSE, error_on_status = TRUE)" ; Check-Output $? + Remove-From-Path ".*msys64.*" # Test CRAN source .tar.gz in a directory that is not this repo or below it. # When people install.packages('lightgbm'), they won't have the LightGBM # git repo around. 
This is to protect against the use of relative paths diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index 8746297ba159..bff33548ce1d 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -194,7 +194,7 @@ jobs: - name: Install packages shell: bash run: | - RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" + RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }} RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1 - name: Run tests with sanitizers @@ -225,7 +225,7 @@ jobs: shell: bash run: | export PATH=/opt/R-devel/bin/:${PATH} - Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" + Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" sh build-cran-package.sh R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1 if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then diff --git a/.github/workflows/static_analysis.yml b/.github/workflows/static_analysis.yml index 77be3c89eae4..123a93e2462b 100644 --- a/.github/workflows/static_analysis.yml +++ b/.github/workflows/static_analysis.yml @@ -57,7 +57,7 @@ jobs: - name: Install packages shell: bash run: | - Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'roxygen2', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" + Rscript -e "install.packages(c('R6', 'data.table', 
'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'roxygen2', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" sh build-cran-package.sh || exit -1 R CMD INSTALL --with-keep.source lightgbm_*.tar.gz || exit -1 - name: Test documentation diff --git a/.vsts-ci.yml b/.vsts-ci.yml index 059ebf9d35b4..ccf6245e3dc4 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -19,7 +19,7 @@ resources: image: 'ubuntu:latest' options: "--name ci-container -v /usr/bin/docker:/tmp/docker:ro" - container: rbase - image: rocker/r-base + image: wch1/r-debug jobs: ########################################### - job: Linux @@ -300,6 +300,7 @@ jobs: steps: - script: | LGB_VER=$(head -n 1 VERSION.txt | sed "s/rc/-/g") + Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 sh build-cran-package.sh || exit -1 mv lightgbm_${LGB_VER}.tar.gz $(Build.ArtifactStagingDirectory)/lightgbm-${LGB_VER}-r-cran.tar.gz displayName: 'Build CRAN R-package' diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 490ca15b306a..e7993850f7c6 100755 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -45,8 +45,11 @@ URL: https://github.com/Microsoft/LightGBM BugReports: https://github.com/Microsoft/LightGBM/issues NeedsCompilation: yes Biarch: true +VignetteBuilder: knitr Suggests: + knitr, processx, + rmarkdown, testthat Depends: R (>= 3.5), diff --git a/R-package/README.md b/R-package/README.md index 73659d2b48bf..a32ab91c1576 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -152,7 +152,8 @@ Rscript build_r.R The `build_r.R` script builds the package in a temporary directory called `lightgbm_r`. It will destroy and recreate that directory each time you run the script. 
That script supports the following command-line options: -- `-j[jobs]`: number of threads to use when compiling LightGBM. E.g., `-j4` will try to compile 4 objects at a time. +- `--no-build-vignettes`: Skip building vignettes. +- `-j[jobs]`: Number of threads to use when compiling LightGBM. E.g., `-j4` will try to compile 4 objects at a time. - by default, this script uses single-thread compilation - for best results, set `-j` to the number of physical CPUs - `--skip-install`: Build the package tarball, but do not install it. @@ -269,6 +270,11 @@ sh build-cran-package.sh This will create a file `lightgbm_${VERSION}.tar.gz`, where `VERSION` is the version of `LightGBM`. +That script supports the following command-line options: + +- `--no-build-vignettes`: Skip building vignettes. +- `--r-executable=[path-to-executable]`: Use an alternative build of R. + Also, CRAN package is generated with every commit to any repo's branch and can be found in "Artifacts" section of the associated Azure Pipelines run. ### Standard Installation from CRAN Package diff --git a/R-package/pkgdown/_pkgdown.yml b/R-package/pkgdown/_pkgdown.yml index b89ab96f3cd9..9e105d2c6bb5 100644 --- a/R-package/pkgdown/_pkgdown.yml +++ b/R-package/pkgdown/_pkgdown.yml @@ -41,6 +41,8 @@ navbar: href: ../ - icon: fa-home fa-lg href: index.html + - text: Articles + href: articles/index.html - text: Reference href: reference/index.html right: diff --git a/R-package/vignettes/basic_walkthrough.Rmd b/R-package/vignettes/basic_walkthrough.Rmd new file mode 100644 index 000000000000..bfdabde7f90e --- /dev/null +++ b/R-package/vignettes/basic_walkthrough.Rmd @@ -0,0 +1,115 @@ +--- +title: + "Basic Walkthrough" +description: > + This vignette describes how to train a LightGBM model for binary classification. 
+output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Basic Walkthrough} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE + , comment = "#>" + , warning = FALSE + , message = FALSE +) +``` + +## Introduction + +Welcome to the world of [LightGBM](https://lightgbm.readthedocs.io/en/latest/), a highly efficient gradient boosting implementation (Ke et al. 2017). + +```{r setup} +library(lightgbm) +``` + +This vignette will guide you through its basic usage. It will show how to build a simple binary classification model based on a subset of the `bank` dataset (Moro, Cortez, and Rita 2014). You will use the two input features "age" and "balance" to predict whether a client has subscribed to a term deposit. + +## The dataset + +The dataset looks as follows. + +```{r} +data(bank, package = "lightgbm") + +bank[1L:5L, c("y", "age", "balance")] + +# Distribution of the response +table(bank$y) +``` + +## Training the model + +The R package of LightGBM offers two functions to train a model: + +- `lgb.train()`: This is the main training logic. It offers full flexibility but requires a `Dataset` object created by the `lgb.Dataset()` function. +- `lightgbm()`: Simpler, but less flexible. Data can be passed without having to bother with `lgb.Dataset()`. + +### Using the `lightgbm()` function + +As a first step, you need to convert data to numeric. Afterwards, you are ready to fit the model with the `lightgbm()` function. + +```{r} +# Numeric response and feature matrix +y <- as.numeric(bank$y == "yes") +X <- data.matrix(bank[, c("age", "balance")]) + +# Train +fit <- lightgbm( + data = X + , label = y + , num_leaves = 4L + , learning_rate = 1.0 + , nrounds = 10L + , objective = "binary" + , verbose = -1L +) + +# Result +summary(predict(fit, X)) +``` + +It seems to have worked! And the predictions are indeed probabilities between 0 and 1. 
+ +### Using the `lgb.train()` function + +Alternatively, you can go for the more flexible interface `lgb.train()`. Here, as an additional step, you need to prepare `y` and `X` by the data API `lgb.Dataset()` of LightGBM. Parameters are passed to `lgb.train()` as a named list. + +```{r} +# Data interface +dtrain <- lgb.Dataset(X, label = y) + +# Parameters +params <- list( + objective = "binary" + , num_leaves = 4L + , learning_rate = 1.0 +) + +# Train +fit <- lgb.train( + params + , data = dtrain + , nrounds = 10L + , verbose = -1L +) +``` + +Try it out! If stuck, visit LightGBM's [documentation](https://lightgbm.readthedocs.io/en/latest/R/index.html) for more details. + +```{r, echo = FALSE, results = "hide"} +# Cleanup +if (file.exists("lightgbm.model")) { + file.remove("lightgbm.model") +} +``` + +## References + +Ke, Guolin, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. "LightGBM: A Highly Efficient Gradient Boosting Decision Tree." In Advances in Neural Information Processing Systems 30 (NIPS 2017). + +Moro, Sérgio, Paulo Cortez, and Paulo Rita. 2014. "A Data-Driven Approach to Predict the Success of Bank Telemarketing." Decision Support Systems 62: 22–31. diff --git a/build-cran-package.sh b/build-cran-package.sh index e4def4a14657..704edfbcc63e 100755 --- a/build-cran-package.sh +++ b/build-cran-package.sh @@ -11,6 +11,11 @@ # non-standard builds of R, such as those provided in # https://github.com/wch/r-debug. # +# --no-build-vignettes Pass this flag to skip creating vignettes. +# You might want to do this to avoid installing +# vignette-only dependencies, or to avoid +# portability issues. 
+# # [usage] # # # default usage @@ -18,9 +23,14 @@ # # # custom R build # sh build-cran-package.sh --r-executable=RDvalgrind +# +# # skip vignette building +# sh build-cran-package.sh --no-build-vignettes set -e +# Default values of arguments +BUILD_VIGNETTES=true LGB_R_EXECUTABLE=R while [ $# -gt 0 ]; do @@ -28,6 +38,9 @@ while [ $# -gt 0 ]; do --r-executable=*) LGB_R_EXECUTABLE="${1#*=}" ;; + --no-build-vignettes*) + BUILD_VIGNETTES=false + ;; *) echo "invalid argument '${1}'" exit -1 @@ -57,6 +70,10 @@ cp -R R-package/* "${TEMP_R_DIR}" cp -R include "${TEMP_R_DIR}/src/" cp -R src/* "${TEMP_R_DIR}/src/" +if ${BUILD_VIGNETTES} ; then + cp docs/logo/LightGBM_logo_black_text.svg "${TEMP_R_DIR}/vignettes/" +fi + cp \ external_libs/fast_double_parser/include/fast_double_parser.h \ "${TEMP_R_DIR}/src/include/LightGBM" @@ -169,8 +186,46 @@ cd "${TEMP_R_DIR}" cd "${ORIG_WD}" -"${LGB_R_EXECUTABLE}" CMD build \ - --keep-empty-dirs \ - lightgbm_r +if ${BUILD_VIGNETTES} ; then + "${LGB_R_EXECUTABLE}" CMD build \ + --keep-empty-dirs \ + lightgbm_r + + echo "removing object files created by vignettes" + rm -rf ./_tmp + mkdir _tmp + TARBALL_NAME="lightgbm_${LGB_VERSION}.tar.gz" + mv "${TARBALL_NAME}" _tmp/ + + echo "untarring ${TARBALL_NAME}" + cd _tmp + tar -xvf "${TARBALL_NAME}" > /dev/null 2>&1 + rm -rf "${TARBALL_NAME}" + cd .. 
+ echo "done untarring ${TARBALL_NAME}" + + echo "re-tarring ${TARBALL_NAME}" + tar \ + -czv \ + -C ./_tmp \ + --exclude=*.a \ + --exclude=*.dll \ + --exclude=*.o \ + --exclude=*.so \ + --exclude=*.tar.gz \ + --exclude=**/conftest.c \ + --exclude=**/conftest.exe \ + -f "${TARBALL_NAME}" \ + lightgbm \ + > /dev/null 2>&1 + echo "Done creating ${TARBALL_NAME}" + + rm -rf ./_tmp +else + "${LGB_R_EXECUTABLE}" CMD build \ + --keep-empty-dirs \ + --no-build-vignettes \ + lightgbm_r +fi echo "Done building R package" diff --git a/build_r.R b/build_r.R index b4c610197692..dbf225e25f70 100644 --- a/build_r.R +++ b/build_r.R @@ -39,6 +39,7 @@ TEMP_SOURCE_DIR <- file.path(TEMP_R_DIR, "src") } parsed_args <- .parse_args(args) +SKIP_VIGNETTES <- "--no-build-vignettes" %in% parsed_args[["flags"]] USING_GPU <- "--use-gpu" %in% parsed_args[["flags"]] USING_MINGW <- "--use-mingw" %in% parsed_args[["flags"]] USING_MSYS2 <- "--use-msys2" %in% parsed_args[["flags"]] @@ -54,7 +55,8 @@ ARGS_TO_DEFINES <- c( ) recognized_args <- c( - "--skip-install" + "--no-build-vignettes" + , "--skip-install" , "--use-gpu" , "--use-mingw" , "--use-msys2" @@ -424,7 +426,11 @@ writeLines(namespace_contents, NAMESPACE_FILE) # NOTE: --keep-empty-dirs is necessary to keep the deep paths expected # by CMake while also meeting the CRAN req to create object files # on demand -.run_shell_command("R", c("CMD", "build", TEMP_R_DIR, "--keep-empty-dirs")) +r_build_args <- c("CMD", "build", TEMP_R_DIR, "--keep-empty-dirs") +if (isTRUE(SKIP_VIGNETTES)) { + r_build_args <- c(r_build_args, "--no-build-vignettes") +} +.run_shell_command("R", r_build_args) # Install the package version <- gsub( diff --git a/docs/conf.py b/docs/conf.py index b48ddce16c10..13751145d422 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -273,8 +273,10 @@ def generate_r_docs(app: Sphinx) -> None: r-base=4.1.0=hb67fd72_2 \ r-data.table=1.14.0=r41hcfec24a_0 \ r-jsonlite=1.7.2=r41hcfec24a_0 \ + r-knitr=1.35=r41hc72bb7e_0 \ 
r-matrix=1.3_4=r41he454529_0 \ r-pkgdown=1.6.1=r41hc72bb7e_0 \ + r-rmarkdown=2.11=r41hc72bb7e_0 \ r-roxygen2=7.1.1=r41h03ef668_0 source /home/docs/.conda/bin/activate r_env export TAR=/bin/tar