Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[R-package] [docs] add intro vignette (#3946) #4775

Merged
merged 9 commits into from
Nov 18, 2021
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/lint_r_code.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ SOURCE_DIR <- args[[1L]]

FILES_TO_LINT <- list.files(
path = SOURCE_DIR
, pattern = "\\.r$"
, pattern = "\\.r$|\\.rmd$"
, all.files = TRUE
, ignore.case = TRUE
, full.names = TRUE
Expand Down
6 changes: 3 additions & 3 deletions .ci/test_r_package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,13 @@ if [[ $OS_NAME == "macos" ]]; then
fi
fi

# Manually install Depends and Imports libraries + 'testthat'
# Manually install Depends and Imports libraries + 'knitr', 'rmarkdown', 'testthat'
# to avoid a CI-time dependency on devtools (for devtools::install_deps())
# NOTE: testthat is not required when running rchk
if [[ "${TASK}" == "r-rchk" ]]; then
packages="c('data.table', 'jsonlite', 'Matrix', 'R6')"
packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown')"
else
packages="c('data.table', 'jsonlite', 'Matrix', 'R6', 'testthat')"
packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown', 'testthat')"
fi
compile_from_source="both"
if [[ $OS_NAME == "macos" ]]; then
Expand Down
4 changes: 3 additions & 1 deletion .ci/test_r_package_solaris.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ apt-get install --no-install-recommends -y \
libxml2-dev \
libssl-dev

Rscript -e "install.packages('rhub', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
# installation of dependencies needs to happen before building the package,
# since `R CMD build` needs to install the package to build vignettes
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1

sh build-cran-package.sh || exit -1

Expand Down
2 changes: 1 addition & 1 deletion .ci/test_r_package_valgrind.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh \
--r-executable=RDvalgrind \
|| exit -1
Expand Down
10 changes: 9 additions & 1 deletion .ci/test_r_package_windows.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT
Write-Output "Done installing Rtools"

Write-Output "Installing dependencies"
$packages = "c('data.table', 'jsonlite', 'Matrix', 'processx', 'R6', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')"
$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')"
Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH', Ncpus = parallel::detectCores())" ; Check-Output $?

# MiKTeX and pandoc can be skipped on non-MinGW builds, since we don't
Expand Down Expand Up @@ -165,7 +165,15 @@ if ($env:COMPILER -ne "MSVC") {
}
Run-R-Code-Redirect-Stderr "commandArgs <- function(...){$env:BUILD_R_FLAGS}; source('build_r.R')"; Check-Output $?
} elseif ($env:R_BUILD_TYPE -eq "cran") {
# NOTE: gzip and tar are needed to create a CRAN package on Windows, but
# some flavors of tar.exe can fail in some settings on Windows.
# Putting the msys64 utilities at the beginning of PATH temporarily to be
# sure they're used for that purpose.
if ($env:R_MAJOR_VERSION -eq "3") {
$env:PATH = "C:\msys64\usr\bin;" + $env:PATH
}
Run-R-Code-Redirect-Stderr "result <- processx::run(command = 'sh', args = 'build-cran-package.sh', echo = TRUE, windows_verbatim_args = FALSE, error_on_status = TRUE)" ; Check-Output $?
Remove-From-Path ".*msys64.*"
# Test CRAN source .tar.gz in a directory that is not this repo or below it.
# When people install.packages('lightgbm'), they won't have the LightGBM
# git repo around. This is to protect against the use of relative paths
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/r_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ jobs:
- name: Install packages
shell: bash
run: |
RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }}
RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1
- name: Run tests with sanitizers
Expand Down Expand Up @@ -225,7 +225,7 @@ jobs:
shell: bash
run: |
export PATH=/opt/R-devel/bin/:${PATH}
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
sh build-cran-package.sh
R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1
if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/static_analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ jobs:
- name: Install packages
shell: bash
run: |
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'roxygen2', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'roxygen2', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
sh build-cran-package.sh || exit -1
R CMD INSTALL --with-keep.source lightgbm_*.tar.gz || exit -1
- name: Test documentation
Expand Down
3 changes: 2 additions & 1 deletion .vsts-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ resources:
image: 'ubuntu:latest'
options: "--name ci-container -v /usr/bin/docker:/tmp/docker:ro"
- container: rbase
image: rocker/r-base
image: wch1/r-debug
jobs:
###########################################
- job: Linux
Expand Down Expand Up @@ -300,6 +300,7 @@ jobs:
steps:
- script: |
LGB_VER=$(head -n 1 VERSION.txt | sed "s/rc/-/g")
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh || exit -1
mv lightgbm_${LGB_VER}.tar.gz $(Build.ArtifactStagingDirectory)/lightgbm-${LGB_VER}-r-cran.tar.gz
displayName: 'Build CRAN R-package'
Expand Down
3 changes: 3 additions & 0 deletions R-package/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,11 @@ URL: https://github.com/Microsoft/LightGBM
BugReports: https://github.com/Microsoft/LightGBM/issues
NeedsCompilation: yes
Biarch: true
VignetteBuilder: knitr
Suggests:
knitr,
processx,
rmarkdown,
testthat
Depends:
R (>= 3.5),
Expand Down
8 changes: 7 additions & 1 deletion R-package/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ Rscript build_r.R

The `build_r.R` script builds the package in a temporary directory called `lightgbm_r`. It will destroy and recreate that directory each time you run the script. That script supports the following command-line options:

- `-j[jobs]`: number of threads to use when compiling LightGBM. E.g., `-j4` will try to compile 4 objects at a time.
- `--no-build-vignettes`: Skip building vignettes.
- `-j[jobs]`: Number of threads to use when compiling LightGBM. E.g., `-j4` will try to compile 4 objects at a time.
- by default, this script uses single-thread compilation
- for best results, set `-j` to the number of physical CPUs
- `--skip-install`: Build the package tarball, but do not install it.
Expand Down Expand Up @@ -269,6 +270,11 @@ sh build-cran-package.sh

This will create a file `lightgbm_${VERSION}.tar.gz`, where `VERSION` is the version of `LightGBM`.

That script supports the following command-line options:

- `--no-build-vignettes`: Skip building vignettes.
- `--r-executable=[path-to-executable]`: Use an alternative build of R.

Also, CRAN package is generated with every commit to any repo's branch and can be found in "Artifacts" section of the associated Azure Pipelines run.

### Standard Installation from CRAN Package
Expand Down
2 changes: 2 additions & 0 deletions R-package/pkgdown/_pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ navbar:
href: ../
- icon: fa-home fa-lg
href: index.html
- text: Articles
href: articles/index.html
- text: Reference
href: reference/index.html
right:
Expand Down
115 changes: 115 additions & 0 deletions R-package/vignettes/basic_walkthrough.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
---
title:
"Basic Walkthrough"
description: >
This vignette describes how to train a LightGBM model for binary classification.
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Basic Walkthrough}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE
, comment = "#>"
, warning = FALSE
, message = FALSE
)
```

## Introduction

Welcome to the world of [LightGBM](https://lightgbm.readthedocs.io/en/latest/), a highly efficient gradient boosting implementation (Ke et al. 2017).

```{r setup}
library(lightgbm)
```

This vignette will guide you through its basic usage. It will show how to build a simple binary classification model based on a subset of the `bank` dataset (Moro, Cortez, and Rita 2014). You will use the two input features "age" and "balance" to predict whether a client has subscribed a term deposit.

## The dataset

The dataset looks as follows.

```{r}
data(bank, package = "lightgbm")
bank[1L:5L, c("y", "age", "balance")]
# Distribution of the response
table(bank$y)
```

## Training the model

The R package of LightGBM offers two functions to train a model:

- `lgb.train()`: This is the main training logic. It offers full flexibility but requires a `Dataset` object created by the `lgb.Dataset()` function.
- `lightgbm()`: Simpler, but less flexible. Data can be passed without having to bother with `lgb.Dataset()`.

### Using the `lightgbm()` function

In a first step, you need to convert data to numeric. Afterwards, you are ready to fit the model by the `lightgbm()` function.

```{r}
# Numeric response and feature matrix
y <- as.numeric(bank$y == "yes")
X <- data.matrix(bank[, c("age", "balance")])
# Train
fit <- lightgbm(
data = X
, label = y
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 10L
, objective = "binary"
, verbose = -1L
)
# Result
summary(predict(fit, X))
```

It seems to have worked! And the predictions are indeed probabilities between 0 and 1.

### Using the `lgb.train()` function

Alternatively, you can go for the more flexible interface `lgb.train()`. Here, as an additional step, you need to prepare `y` and `X` by the data API `lgb.Dataset()` of LightGBM. Parameters are passed to `lgb.train()` as a named list.

```{r}
# Data interface
dtrain <- lgb.Dataset(X, label = y)
# Parameters
params <- list(
objective = "binary"
, num_leaves = 4L
, learning_rate = 1.0
)
# Train
fit <- lgb.train(
params
, data = dtrain
, nrounds = 10L
, verbose = -1L
)
```

Try it out! If stuck, visit LightGBM's [documentation](https://lightgbm.readthedocs.io/en/latest/R/index.html) for more details.

```{r, echo = FALSE, results = "hide"}
# Cleanup
if (file.exists("lightgbm.model")) {
file.remove("lightgbm.model")
}
```

## References

Ke, Guolin, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. "LightGBM: A Highly Efficient Gradient Boosting Decision Tree." In Advances in Neural Information Processing Systems 30 (NIPS 2017).

Moro, Sérgio, Paulo Cortez, and Paulo Rita. 2014. "A Data-Driven Approach to Predict the Success of Bank Telemarketing." Decision Support Systems 62: 22–31.
61 changes: 58 additions & 3 deletions build-cran-package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,36 @@
# non-standard builds of R, such as those provided in
# https://github.com/wch/r-debug.
#
# --no-build-vignettes Pass this flag to skip creating vignettes.
# You might want to do this to avoid installing
# vignette-only dependencies, or to avoid
# portability issues.
#
# [usage]
#
# # default usage
# sh build-cran-package.sh
#
# # custom R build
# sh build-cran-package.sh --r-executable=RDvalgrind
#
# # skip vignette building
# sh build-cran-package.sh --no-build-vignettes

set -e

# Default values of arguments
BUILD_VIGNETTES=true
LGB_R_EXECUTABLE=R

while [ $# -gt 0 ]; do
case "$1" in
--r-executable=*)
LGB_R_EXECUTABLE="${1#*=}"
;;
--no-build-vignettes=*)
BUILD_VIGNETTES=false
;;
*)
echo "invalid argument '${1}'"
exit -1
Expand Down Expand Up @@ -57,6 +70,10 @@ cp -R R-package/* "${TEMP_R_DIR}"
cp -R include "${TEMP_R_DIR}/src/"
cp -R src/* "${TEMP_R_DIR}/src/"

if ${BUILD_VIGNETTES} ; then
cp docs/logo/LightGBM_logo_black_text.svg "${TEMP_R_DIR}/vignettes/"
fi

cp \
external_libs/fast_double_parser/include/fast_double_parser.h \
"${TEMP_R_DIR}/src/include/LightGBM"
Expand Down Expand Up @@ -169,8 +186,46 @@ cd "${TEMP_R_DIR}"

cd "${ORIG_WD}"

"${LGB_R_EXECUTABLE}" CMD build \
--keep-empty-dirs \
lightgbm_r
if ${BUILD_VIGNETTES} ; then
"${LGB_R_EXECUTABLE}" CMD build \
--keep-empty-dirs \
lightgbm_r

echo "removing object files created by vignettes"
rm -rf ./_tmp
mkdir _tmp
TARBALL_NAME="lightgbm_${LGB_VERSION}.tar.gz"
mv "${TARBALL_NAME}" _tmp/

echo "untarring ${TARBALL_NAME}"
cd _tmp
tar -xvf "${TARBALL_NAME}" > /dev/null 2>&1
rm -rf "${TARBALL_NAME}"
cd ..
echo "done untarring ${TARBALL_NAME}"

echo "re-tarring ${TARBALL_NAME}"
tar \
-czv \
-C ./_tmp \
--exclude=*.a \
--exclude=*.dll \
--exclude=*.o \
--exclude=*.so \
--exclude=*.tar.gz \
--exclude=**/conftest.c \
--exclude=**/conftest.exe \
-f "${TARBALL_NAME}" \
lightgbm \
> /dev/null 2>&1
echo "Done creating ${TARBALL_NAME}"

rm -rf ./_tmp
else
"${LGB_R_EXECUTABLE}" CMD build \
--keep-empty-dirs \
--no-build-vignettes \
lightgbm_r
fi

echo "Done building R package"
Loading