From 2a06f0e2d60eed329dddcd5c7e43ad9ce74cb74c Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Mon, 25 May 2020 03:24:38 +0100
Subject: [PATCH] [R-package] fix issue where early stopping thinks higher
 MAPE is desirable (fixes #3099) (#3101)

* [R-package] fix issue where early stopping thinks higher MAPE is desirable (fixes #3099)

* fix linting

* only use main metrics

* fix tests
---
 R-package/R/callback.R                  |   6 +-
 R-package/R/lgb.Booster.R               |   6 +-
 R-package/R/metrics.R                   |  34 ++++++
 R-package/tests/testthat/test_basic.R   | 133 +++++++++++++++++++++++-
 R-package/tests/testthat/test_metrics.R |  12 +++
 5 files changed, 186 insertions(+), 5 deletions(-)
 create mode 100644 R-package/R/metrics.R
 create mode 100644 R-package/tests/testthat/test_metrics.R

diff --git a/R-package/R/callback.R b/R-package/R/callback.R
index d5495c9d56cd..4edb2d0e19b1 100644
--- a/R-package/R/callback.R
+++ b/R-package/R/callback.R
@@ -293,7 +293,7 @@ cb.early.stop <- function(stopping_rounds, verbose = TRUE) {
             cat("Will train until there is no improvement in ", stopping_rounds, " rounds.\n\n", sep = "")
         }
 
-        # Maximization or minimization task
+        # Internally treat everything as a maximization task
         factor_to_bigger_better <<- rep.int(1.0, eval_len)
         best_iter <<- rep.int(-1L, eval_len)
         best_score <<- rep.int(-Inf, eval_len)
@@ -305,8 +305,8 @@ cb.early.stop <- function(stopping_rounds, verbose = TRUE) {
             # Prepend message
             best_msg <<- c(best_msg, "")
 
-            # Check if maximization or minimization
-            if (!env$eval_list[[i]]$higher_better) {
+            # Internally treat everything as a maximization task
+            if (!isTRUE(env$eval_list[[i]]$higher_better)) {
                 factor_to_bigger_better[i] <<- -1.0
             }
 
diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R
index c9eb81f4d3c7..101cad0b998e 100644
--- a/R-package/R/lgb.Booster.R
+++ b/R-package/R/lgb.Booster.R
@@ -596,7 +596,11 @@ Booster <- R6::R6Class(
         # Parse and store privately names
         names <- strsplit(names, "\t")[[1L]]
         private$eval_names <- names
-        private$higher_better_inner_eval <- grepl("^ndcg|^map|^auc", names)
+
+        # eval names don't always map cleanly to metric names; for example, "ndcg@1" is just the
+        # ndcg metric evaluated at the first "query result" in learning-to-rank
+        metric_names <- gsub("@.*", "", names)
+        private$higher_better_inner_eval <- .METRICS_HIGHER_BETTER()[metric_names]
 
       }
 
diff --git a/R-package/R/metrics.R b/R-package/R/metrics.R
new file mode 100644
index 000000000000..d36d39190070
--- /dev/null
+++ b/R-package/R/metrics.R
@@ -0,0 +1,34 @@
+# [description] List of metrics known to LightGBM. The most up-to-date list can be found
+# at https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric-parameters
+#
+# [return] A named logical vector, where each key is a metric name and each value is a boolean.
+#          TRUE if higher values of the metric are desirable, FALSE if lower values are desirable.
+#          Note that only the 'main' metrics are stored here, not aliases, since only the 'main' metrics
+#          are returned from the C++ side. For example, if you use `metric = "mse"` in your code, the metric name `"l2"` will be returned.
+
+.METRICS_HIGHER_BETTER <- function() {
+    return(c(
+        "l1" = FALSE
+        , "l2" = FALSE
+        , "mape" = FALSE
+        , "rmse" = FALSE
+        , "quantile" = FALSE
+        , "huber" = FALSE
+        , "fair" = FALSE
+        , "poisson" = FALSE
+        , "gamma" = FALSE
+        , "gamma_deviance" = FALSE
+        , "tweedie" = FALSE
+        , "ndcg" = TRUE
+        , "map" = TRUE
+        , "auc" = TRUE
+        , "binary_logloss" = FALSE
+        , "binary_error" = FALSE
+        , "auc_mu" = TRUE
+        , "multi_logloss" = FALSE
+        , "multi_error" = FALSE
+        , "cross_entropy" = FALSE
+        , "cross_entropy_lambda" = FALSE
+        , "kullback_leibler" = FALSE
+    ))
+}
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index ecaf539bd47e..1b22a894a04e 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -139,7 +139,10 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide
         , num_leaves = 5L
         , nrounds = nrounds
         , objective = "binary"
-        , metric = "binary_error"
+        , metric = c(
+            "binary_error"
+            , "auc"
+        )
         , valids = list(
             "valid1" = dvalid1
             , "valid2" = dvalid2
@@ -534,6 +537,73 @@
 })
 
+test_that("lgb.train() works with early stopping for classification with a metric that should be maximized", {
+    set.seed(708L)
+    dtrain <- lgb.Dataset(
+        data = train$data
+        , label = train$label
+    )
+    dvalid <- lgb.Dataset(
+        data = test$data
+        , label = test$label
+    )
+    nrounds <- 10L
+
+    #############################
+    # train with early stopping #
+    #############################
+    early_stopping_rounds <- 5L
+    # the harsh max_depth guarantees that AUC improves over at least the first few iterations
+    bst_auc <- lgb.train(
+        params = list(
+            objective = "binary"
+            , metric = "auc"
+            , max_depth = 3L
+            , early_stopping_rounds = early_stopping_rounds
+        )
+        , data = dtrain
+        , nrounds = nrounds
+        , valids = list(
+            "valid1" = dvalid
+        )
+    )
+    bst_binary_error <- lgb.train(
+        params = list(
+            objective = "binary"
+            , metric = "binary_error"
+            , max_depth = 3L
+            , early_stopping_rounds = early_stopping_rounds
+        )
+        , data = dtrain
+        , nrounds = nrounds
+        , valids = list(
+            "valid1" = dvalid
+        )
+    )
+
+    # early stopping should have been hit for binary_error (higher_better = FALSE)
+    eval_info <- bst_binary_error$.__enclos_env__$private$get_eval_info()
+    expect_identical(eval_info, "binary_error")
+    expect_identical(
+        unname(bst_binary_error$.__enclos_env__$private$higher_better_inner_eval)
+        , FALSE
+    )
+    expect_identical(bst_binary_error$best_iter, 1L)
+    expect_identical(bst_binary_error$current_iter(), early_stopping_rounds + 1L)
+    expect_true(abs(bst_binary_error$best_score - 0.01613904) < TOLERANCE)
+
+    # early stopping should not have been hit for AUC (higher_better = TRUE)
+    eval_info <- bst_auc$.__enclos_env__$private$get_eval_info()
+    expect_identical(eval_info, "auc")
+    expect_identical(
+        unname(bst_auc$.__enclos_env__$private$higher_better_inner_eval)
+        , TRUE
+    )
+    expect_identical(bst_auc$best_iter, 9L)
+    expect_identical(bst_auc$current_iter(), nrounds)
+    expect_true(abs(bst_auc$best_score - 0.9999969) < TOLERANCE)
+})
+
 test_that("lgb.train() works with early stopping for regression", {
     set.seed(708L)
     trainDF <- data.frame(
         "feat1" = rep(c(10.0, 100.0), 500L)
@@ -604,6 +674,67 @@ test_that("lgb.train() works with early stopping for regression", {
     )
 })
 
+test_that("lgb.train() works with early stopping for regression with a metric that should be minimized", {
+    set.seed(708L)
+    trainDF <- data.frame(
+        "feat1" = rep(c(10.0, 100.0), 500L)
+        , "target" = rep(c(-50.0, 50.0), 500L)
+    )
+    validDF <- data.frame(
+        "feat1" = rep(50.0, 4L)
+        , "target" = rep(50.0, 4L)
+    )
+    dtrain <- lgb.Dataset(
+        data = as.matrix(trainDF[["feat1"]], drop = FALSE)
+        , label = trainDF[["target"]]
+    )
+    dvalid <- lgb.Dataset(
+        data = as.matrix(validDF[["feat1"]], drop = FALSE)
+        , label = validDF[["target"]]
+    )
+    nrounds <- 10L
+
+    #############################
+    # train with early stopping #
+    #############################
+    early_stopping_rounds <- 5L
+    bst <- lgb.train(
+        params = list(
+            objective = "regression"
+            , metric = c(
+                "mape"
+                , "rmse"
+                , "mae"
+            )
+            , min_data_in_bin = 5L
+            , early_stopping_rounds = early_stopping_rounds
+        )
+        , data = dtrain
+        , nrounds = nrounds
+        , valids = list(
+            "valid1" = dvalid
+        )
+    )
+
+    # the best model should be from the first iteration, and only 6 rounds
+    # should have happened (1 with improvement, 5 consecutive with no improvement)
+    expect_equal(bst$best_score, 1.1)
+    expect_equal(bst$best_iter, 1L)
+    expect_equal(
+        length(bst$record_evals[["valid1"]][["mape"]][["eval"]])
+        , early_stopping_rounds + 1L
+    )
+
+    # Booster should understand that all three of these metrics should be minimized
+    eval_info <- bst$.__enclos_env__$private$get_eval_info()
+    expect_identical(eval_info, c("mape", "rmse", "l1"))
+    expect_identical(
+        unname(bst$.__enclos_env__$private$higher_better_inner_eval)
+        , rep(FALSE, 3L)
+    )
+})
+
+
 test_that("lgb.train() supports non-ASCII feature names", {
     testthat::skip("UTF-8 feature names are not fully supported in the R package")
     dtrain <- lgb.Dataset(
diff --git a/R-package/tests/testthat/test_metrics.R b/R-package/tests/testthat/test_metrics.R
new file mode 100644
index 000000000000..636a00b36bdc
--- /dev/null
+++ b/R-package/tests/testthat/test_metrics.R
@@ -0,0 +1,12 @@
+context(".METRICS_HIGHER_BETTER()")
+
+test_that(".METRICS_HIGHER_BETTER() should be well formed", {
+    metrics <- .METRICS_HIGHER_BETTER()
+    metric_names <- names(metrics)
+    # should be a logical vector
+    expect_true(is.logical(metrics))
+    # no metrics should be repeated
+    expect_true(length(unique(metric_names)) == length(metrics))
+    # should not be any NAs
+    expect_false(any(is.na(metrics)))
+})
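
A minimal sketch of the behavior this patch fixes, for readers who want to reproduce it locally. This is not part of the patch itself: the synthetic data, parameter values, and round counts are illustrative assumptions, and it assumes an R package built from a commit that includes this change.

library(lightgbm)

# regression data where validation MAPE keeps improving for many rounds
set.seed(708L)
X <- matrix(rnorm(2000L), ncol = 2L)
y <- 2.0 * X[, 1L] + rnorm(1000L, sd = 0.1) + 10.0
dtrain <- lgb.Dataset(data = X[1L:800L, ], label = y[1L:800L])
dvalid <- lgb.Dataset(data = X[801L:1000L, ], label = y[801L:1000L])

bst <- lgb.train(
    params = list(
        objective = "regression"
        , metric = "mape"
        , early_stopping_rounds = 5L
    )
    , data = dtrain
    , nrounds = 50L
    , valids = list("valid1" = dvalid)
)

# Before this fix, grepl("^ndcg|^map|^auc", names) matched "mape" against "^map",
# so MAPE was treated as higher-is-better: iteration 1 always looked "best" and
# training stopped after early_stopping_rounds + 1 rounds even while validation
# MAPE was still falling. With the lookup against .METRICS_HIGHER_BETTER(), MAPE
# is minimized and best_iter should move well past 1 on data like this.
print(bst$best_iter)
print(bst$best_score)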