From b4bb38d9267ccc966f0bb8cdd6e2eece2b4c08e3 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 23 Oct 2019 19:25:03 -0700
Subject: [PATCH] [R-package] Added unit tests (#2498)

---
 R-package/man/lgb.interprete.Rd               |   2 +-
 R-package/man/slice.Rd                        |   2 +-
 R-package/tests/testthat/test_basic.R         |   1 +
 .../tests/testthat/test_lgb.importance.R      |  39 ++++++
 .../tests/testthat/test_lgb.interprete.R      | 113 ++++++++++++++++++
 .../testthat/test_lgb.plot.interpretation.R   |  97 +++++++++++++++
 6 files changed, 252 insertions(+), 2 deletions(-)
 create mode 100644 R-package/tests/testthat/test_lgb.importance.R
 create mode 100644 R-package/tests/testthat/test_lgb.interprete.R
 create mode 100644 R-package/tests/testthat/test_lgb.plot.interpretation.R

diff --git a/R-package/man/lgb.interprete.Rd b/R-package/man/lgb.interprete.Rd
index 798f39c2eef7..62f5c58caa93 100644
--- a/R-package/man/lgb.interprete.Rd
+++ b/R-package/man/lgb.interprete.Rd
@@ -11,7 +11,7 @@ lgb.interprete(model, data, idxset, num_iteration = NULL)
 
 \item{data}{a matrix object or a dgCMatrix object.}
 
-\item{idxset}{a integer vector of indices of rows needed.}
+\item{idxset}{an integer vector of indices of rows needed.}
 
 \item{num_iteration}{number of iteration want to predict with, NULL or <= 0 means use best iteration.}
 }
diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd
index 38a311a38acc..10040d11a2bc 100644
--- a/R-package/man/slice.Rd
+++ b/R-package/man/slice.Rd
@@ -14,7 +14,7 @@ slice(dataset, ...)
 
 \item{...}{other parameters (currently not used)}
 
-\item{idxset}{a integer vector of indices of rows needed}
+\item{idxset}{an integer vector of indices of rows needed}
 }
 \value{
 constructed sub dataset
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index 412dccfa7937..11cd0a51ef4a 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -51,6 +51,7 @@ test_that("use of multiple eval metrics works", {
 })
 
 test_that("training continuation works", {
+  testthat::skip("This test is currently broken. See issue #2468 for details.")
See issue #2468 for details.") dtrain <- lgb.Dataset(train$data, label = train$label, free_raw_data=FALSE) watchlist = list(train=dtrain) param <- list(objective = "binary", metric="binary_logloss", num_leaves = 5, learning_rate = 1) diff --git a/R-package/tests/testthat/test_lgb.importance.R b/R-package/tests/testthat/test_lgb.importance.R new file mode 100644 index 000000000000..9a0d1e9c3e32 --- /dev/null +++ b/R-package/tests/testthat/test_lgb.importance.R @@ -0,0 +1,39 @@ +context("lgb.importance") + +test_that("lgb.importance() should reject bad inputs", { + bad_inputs <- list( + .Machine$integer.max + , Inf + , -Inf + , NA + , NA_real_ + , -10L:10L + , list(c("a", "b", "c")) + , data.frame( + x = rnorm(20) + , y = sample( + x = c(1, 2) + , size = 20 + , replace = TRUE + ) + ) + , data.table::data.table( + x = rnorm(20) + , y = sample( + x = c(1, 2) + , size = 20 + , replace = TRUE + ) + ) + , lgb.Dataset( + data = matrix(rnorm(100), ncol = 2) + , label = matrix(sample(c(0, 1), 50, replace = TRUE)) + ) + , "lightgbm.model" + ) + for (input in bad_inputs){ + expect_error({ + lgb.importance(input) + }, regexp = "'model' has to be an object of class lgb\\.Booster") + } +}) diff --git a/R-package/tests/testthat/test_lgb.interprete.R b/R-package/tests/testthat/test_lgb.interprete.R new file mode 100644 index 000000000000..e4656b9879b7 --- /dev/null +++ b/R-package/tests/testthat/test_lgb.interprete.R @@ -0,0 +1,113 @@ +context("lgb.interpete") + +.sigmoid <- function(x){ + 1 / (1 + exp(-x)) +} +.logit <- function(x){ + log(x / (1 - x)) +} + +test_that("lgb.intereprete works as expected for binary classification", { + data(agaricus.train, package = "lightgbm") + train <- agaricus.train + dtrain <- lgb.Dataset(train$data, label = train$label) + setinfo( + dataset = dtrain + , "init_score" + , rep( + .logit(mean(train$label)) + , length(train$label) + ) + ) + data(agaricus.test, package = "lightgbm") + test <- agaricus.test + params <- list( + objective = "binary" + , learning_rate = 0.01 + , num_leaves = 63 + , max_depth = -1 + , min_data_in_leaf = 1 + , min_sum_hessian_in_leaf = 1 + ) + model <- lgb.train( + params = params + , data = dtrain + , nrounds = 10 + ) + num_trees <- 5 + tree_interpretation <- lgb.interprete( + model = model + , data = test$data + , idxset = 1:num_trees + ) + expect_true(methods::is(tree_interpretation, "list")) + expect_true(length(tree_interpretation) == num_trees) + expect_null(names(tree_interpretation)) + expect_true(all( + sapply( + X = tree_interpretation + , FUN = function(treeDT){ + checks <- c( + data.table::is.data.table(treeDT) + , identical(names(treeDT), c("Feature", "Contribution")) + , is.character(treeDT[, Feature]) + , is.numeric(treeDT[, Contribution]) + ) + return(all(checks)) + } + ) + )) +}) + +test_that("lgb.intereprete works as expected for multiclass classification", { + data(iris) + + # We must convert factors to numeric + # They must be starting from number 0 to use multiclass + # For instance: 0, 1, 2, 3, 4, 5... 
+    iris$Species <- as.numeric(as.factor(iris$Species)) - 1
+
+    # Create imbalanced training data (20, 30, 40 examples for classes 0, 1, 2)
+    train <- as.matrix(iris[c(1:20, 51:80, 101:140), ])
+    # The last 10 samples of each class are for validation
+    test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
+    dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
+    dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
+    params <- list(
+        objective = "multiclass"
+        , metric = "multi_logloss"
+        , num_class = 3
+        , learning_rate = 0.00001
+    )
+    model <- lgb.train(
+        params = params
+        , data = dtrain
+        , nrounds = 10
+        , min_data = 1
+    )
+    num_trees <- 5
+    tree_interpretation <- lgb.interprete(
+        model = model
+        , data = test[, 1:4]
+        , idxset = 1:num_trees
+    )
+    expect_true(methods::is(tree_interpretation, "list"))
+    expect_true(length(tree_interpretation) == num_trees)
+    expect_null(names(tree_interpretation))
+    expect_true(all(
+        sapply(
+            X = tree_interpretation
+            , FUN = function(treeDT){
+                checks <- c(
+                    data.table::is.data.table(treeDT)
+                    , identical(names(treeDT), c("Feature", "Class 0", "Class 1", "Class 2"))
+                    , is.character(treeDT[, Feature])
+                    , is.numeric(treeDT[, `Class 0`])
+                    , is.numeric(treeDT[, `Class 1`])
+                    , is.numeric(treeDT[, `Class 2`])
+                )
+                return(all(checks))
+            }
+        )
+    ))
+})
diff --git a/R-package/tests/testthat/test_lgb.plot.interpretation.R b/R-package/tests/testthat/test_lgb.plot.interpretation.R
new file mode 100644
index 000000000000..9332c01a07bd
--- /dev/null
+++ b/R-package/tests/testthat/test_lgb.plot.interpretation.R
@@ -0,0 +1,97 @@
+context("lgb.plot.interpretation")
+
+.sigmoid <- function(x){
+    1 / (1 + exp(-x))
+}
+.logit <- function(x){
+    log(x / (1 - x))
+}
+
+test_that("lgb.plot.interpretation works as expected for binary classification", {
+    data(agaricus.train, package = "lightgbm")
+    train <- agaricus.train
+    dtrain <- lgb.Dataset(train$data, label = train$label)
+    setinfo(
+        dataset = dtrain
+        , "init_score"
+        , rep(
+            .logit(mean(train$label))
+            , length(train$label)
+        )
+    )
+    data(agaricus.test, package = "lightgbm")
+    test <- agaricus.test
+    params <- list(
+        objective = "binary"
+        , learning_rate = 0.01
+        , num_leaves = 63
+        , max_depth = -1
+        , min_data_in_leaf = 1
+        , min_sum_hessian_in_leaf = 1
+    )
+    model <- lgb.train(
+        params = params
+        , data = dtrain
+        , nrounds = 10
+    )
+    num_trees <- 5
+    tree_interpretation <- lgb.interprete(
+        model = model
+        , data = test$data
+        , idxset = 1:num_trees
+    )
+    expect_true({
+        lgb.plot.interpretation(
+            tree_interpretation_dt = tree_interpretation[[1]]
+            , top_n = 5
+        )
+        TRUE
+    })
+
+    # should also work when you explicitly pass cex
+    plot_res <- lgb.plot.interpretation(
+        tree_interpretation_dt = tree_interpretation[[1]]
+        , top_n = 5
+        , cex = 0.95
+    )
+    expect_null(plot_res)
+})
+
+test_that("lgb.plot.interpretation works as expected for multiclass classification", {
+    data(iris)
+
+    # We must convert factors to numeric
+    # Classes must start from 0 to use multiclass
+    # For instance: 0, 1, 2, 3, 4, 5...
+    iris$Species <- as.numeric(as.factor(iris$Species)) - 1
+
+    # Create imbalanced training data (20, 30, 40 examples for classes 0, 1, 2)
+    train <- as.matrix(iris[c(1:20, 51:80, 101:140), ])
+    # The last 10 samples of each class are for validation
+    test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
+    dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
+    dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
+    params <- list(
+        objective = "multiclass"
+        , metric = "multi_logloss"
+        , num_class = 3
+        , learning_rate = 0.00001
+    )
+    model <- lgb.train(
+        params = params
+        , data = dtrain
+        , nrounds = 10
+        , min_data = 1
+    )
+    num_trees <- 5
+    tree_interpretation <- lgb.interprete(
+        model = model
+        , data = test[, 1:4]
+        , idxset = 1:num_trees
+    )
+    plot_res <- lgb.plot.interpretation(
+        tree_interpretation_dt = tree_interpretation[[1]]
+        , top_n = 5
+    )
+    expect_null(plot_res)
+})
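
To exercise just the test files added by this patch, a minimal sketch (assuming
the lightgbm R package and testthat are already installed, and that the working
directory is the repository root; testthat::test_file() runs a single test file):

    library(lightgbm)
    library(testthat)

    # Paths of the new test files, relative to the repository root
    new_test_files <- c(
        "R-package/tests/testthat/test_lgb.importance.R"
        , "R-package/tests/testthat/test_lgb.interprete.R"
        , "R-package/tests/testthat/test_lgb.plot.interpretation.R"
    )
    # Run each new test file and report the results
    for (test_file in new_test_files) {
        testthat::test_file(test_file)
    }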