From 9f52282d0bb1e9f441afd7f44002632306771e71 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Mon, 20 Jul 2020 07:31:20 -0500
Subject: [PATCH] [R-package] factor out {ggplot2} (#3224)

* more changes
* factor out ggplot2
* update CI
* remove library()
* linting
* reduce NOTEs on Windows
---
 .ci/test_r_package.sh           |   2 +-
 .ci/test_r_package_windows.ps1  |   2 +-
 R-package/DESCRIPTION           |   1 -
 R-package/demo/leaf_stability.R | 140 +++++++++++++++++++-------------
 4 files changed, 84 insertions(+), 61 deletions(-)

diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh
index 8fbff64a24a6..1b98a98224bf 100755
--- a/.ci/test_r_package.sh
+++ b/.ci/test_r_package.sh
@@ -148,7 +148,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then
     exit -1
 fi
 
-ALLOWED_CHECK_NOTES=3
+ALLOWED_CHECK_NOTES=2
 NUM_CHECK_NOTES=$(
     cat ${LOG_FILE_NAME} \
         | grep -e '^Status: .* NOTE.*' \
diff --git a/.ci/test_r_package_windows.ps1 b/.ci/test_r_package_windows.ps1
index 2f8e40e78dcf..1a0d0b35b106 100644
--- a/.ci/test_r_package_windows.ps1
+++ b/.ci/test_r_package_windows.ps1
@@ -157,7 +157,7 @@ if ($env:COMPILER -ne "MSVC") {
   $note_str = Get-Content -Path "${LOG_FILE_NAME}" | Select-String -Pattern '.*Status.* NOTE' | Out-String ; Check-Output $?
   $relevant_line = $note_str -match '(\d+) NOTE'
   $NUM_CHECK_NOTES = $matches[1]
-  $ALLOWED_CHECK_NOTES = 3
+  $ALLOWED_CHECK_NOTES = 2
   if ([int]$NUM_CHECK_NOTES -gt $ALLOWED_CHECK_NOTES) {
     Write-Output "Found ${NUM_CHECK_NOTES} NOTEs from R CMD check. Only ${ALLOWED_CHECK_NOTES} are allowed"
     Check-Output $False
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index d7c7ab94f413..48ee660cf40e 100755
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -25,7 +25,6 @@ BugReports: https://github.com/Microsoft/LightGBM/issues
 NeedsCompilation: yes
 Biarch: false
 Suggests:
-    ggplot2 (>= 1.0.1),
     processx,
     testthat
 Depends:
diff --git a/R-package/demo/leaf_stability.R b/R-package/demo/leaf_stability.R
index dfa11c252c0e..bad2e83107b1 100755
--- a/R-package/demo/leaf_stability.R
+++ b/R-package/demo/leaf_stability.R
@@ -2,11 +2,81 @@
 # Obviously, we are in a controlled environment, without issues (real rules).
 # Do not do this in a real scenario.
 
-# First, we load our libraries
 library(lightgbm)
-library(ggplot2)
 
-# Second, we load our data
+# define helper functions for creating plots
+
+# output of `RColorBrewer::brewer.pal(10, "RdYlGn")`, hardcoded here to avoid a dependency
+.diverging_palette <- c(
+  "#A50026", "#D73027", "#F46D43", "#FDAE61", "#FEE08B"
+  , "#D9EF8B", "#A6D96A", "#66BD63", "#1A9850", "#006837"
+)
+
+.prediction_depth_plot <- function(df) {
+  plot(
+    x = df$X
+    , y = df$Y
+    , type = "p"
+    , main = "Prediction Depth"
+    , xlab = "Leaf Bin"
+    , ylab = "Prediction Probability"
+    , pch = 19L
+    , col = .diverging_palette[df$binned + 1L]
+  )
+  legend(
+    "topright"
+    , title = "bin"
+    , legend = sort(unique(df$binned))
+    , pch = 19L
+    , col = .diverging_palette[sort(unique(df$binned + 1L))]
+    , cex = 0.7
+  )
+}
+
+.prediction_depth_spread_plot <- function(df) {
+  plot(
+    x = df$binned
+    , xlim = c(0L, 9L)
+    , y = df$Z
+    , type = "p"
+    , main = "Prediction Depth Spread"
+    , xlab = "Leaf Bin"
+    , ylab = "Logloss"
+    , pch = 19L
+    , col = .diverging_palette[df$binned + 1L]
+  )
+  legend(
+    "topright"
+    , title = "bin"
+    , legend = sort(unique(df$binned))
+    , pch = 19L
+    , col = .diverging_palette[sort(unique(df$binned + 1L))]
+    , cex = 0.7
+  )
+}
+
+.depth_density_plot <- function(df) {
+  plot(
+    x = density(df$Y)
+    , xlim = c(min(df$Y), max(df$Y))
+    , type = "p"
+    , main = "Depth Density"
+    , xlab = "Prediction Probability"
+    , ylab = "Bin Density"
+    , pch = 19L
+    , col = .diverging_palette[df$binned + 1L]
+  )
+  legend(
+    "topright"
+    , title = "bin"
+    , legend = sort(unique(df$binned))
+    , pch = 19L
+    , col = .diverging_palette[sort(unique(df$binned + 1L))]
+    , cex = 0.7
+  )
+}
+
+# load some data
 data(agaricus.train, package = "lightgbm")
 train <- agaricus.train
 dtrain <- lgb.Dataset(train$data, label = train$label)
@@ -14,7 +84,7 @@ data(agaricus.test, package = "lightgbm")
 test <- agaricus.test
 dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
 
-# Third, we setup parameters and we train a model
+# set up parameters and train a model
 params <- list(objective = "regression", metric = "l2")
 valids <- list(test = dtest)
 model <- lgb.train(
@@ -59,7 +129,6 @@ new_data$binned <- .bincode(
     , include.lowest = TRUE
 )
 new_data$binned[is.na(new_data$binned)] <- 0L
-new_data$binned <- as.factor(new_data$binned)
 
 # We can check the binned content
 table(new_data$binned)
@@ -67,25 +136,9 @@ table(new_data$binned)
 # We can plot the binned content
 # On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
 # On the third plot, it is smooth!
-ggplot(
-  data = new_data
-  , mapping = aes(x = X, y = Y, color = binned)
-) + geom_point() +
-  theme_bw() +
-  labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
-ggplot(
-  data = new_data
-  , mapping = aes(x = binned, y = Z, fill = binned, group = binned)
-) + geom_boxplot() +
-  theme_bw() +
-  labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
-ggplot(
-  data = new_data
-  , mapping = aes(x = Y, y = ..count.., fill = binned)
-) + geom_density(position = "fill") +
-  theme_bw() +
-  labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
-
+.prediction_depth_plot(df = new_data)
+.prediction_depth_spread_plot(df = new_data)
+.depth_density_plot(df = new_data)
 
 # Now, let's show with other parameters
 model2 <- lgb.train(
@@ -126,7 +179,6 @@ new_data2$binned <- .bincode(
     , include.lowest = TRUE
 )
 new_data2$binned[is.na(new_data2$binned)] <- 0L
-new_data2$binned <- as.factor(new_data2$binned)
 
 # We can check the binned content
 table(new_data2$binned)
@@ -136,25 +188,9 @@ table(new_data2$binned)
 # We can plot the binned content
 # On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are
 # real thus it is not an issue
 # However, if the rules were not true, the loss would explode.
-ggplot(
-  data = new_data2
-  , mapping = aes(x = X, y = Y, color = binned)
-) + geom_point() +
-  theme_bw() +
-  labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
-ggplot(
-  data = new_data2
-  , mapping = aes(x = binned, y = Z, fill = binned, group = binned)
-) + geom_boxplot() +
-  theme_bw() +
-  labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
-ggplot(
-  data = new_data2
-  , mapping = aes(x = Y, y = ..count.., fill = binned)
-) + geom_density(position = "fill") +
-  theme_bw() +
-  labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
-
+.prediction_depth_plot(df = new_data2)
+.prediction_depth_spread_plot(df = new_data2)
+.depth_density_plot(df = new_data2)
 
 # Now, try with very severe overfitting
 model3 <- lgb.train(
@@ -195,7 +231,6 @@ new_data3$binned <- .bincode(
     , include.lowest = TRUE
 )
 new_data3$binned[is.na(new_data3$binned)] <- 0L
-new_data3$binned <- as.factor(new_data3$binned)
 
 # We can check the binned content
 table(new_data3$binned)
@@ -204,18 +239,7 @@ table(new_data3$binned)
 # On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules
 # are real thus it is not an issue.
 # However, if the rules were not true, the loss would explode. See the sudden spikes?
-ggplot(
-  data = new_data3
-  , mapping = aes(x = Y, y = ..count.., fill = binned)
-) +
-  geom_density(position = "fill") +
-  theme_bw() +
-  labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
+.depth_density_plot(df = new_data3)
 
 # Compare with our second model, the difference is severe. This is smooth.
-ggplot(
-  data = new_data2
-  , mapping = aes(x = Y, y = ..count.., fill = binned)
-) + geom_density(position = "fill") +
-  theme_bw() +
-  labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
+.depth_density_plot(df = new_data2)
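
Below is a small illustrative sketch, outside the patch itself, of how the new base-graphics helpers are meant to be called. It re-declares `.diverging_palette` and `.prediction_depth_plot()` exactly as added above, then builds a made-up `toy` data frame with the column layout the helpers assume: `X` on the x-axis (the demo derives it from predicted leaf indices), `Y` as a prediction probability, and an integer `binned` column that indexes the 10-colour palette via `binned + 1L`. The `toy` object, its values, and the decile-based `.bincode()` call are assumptions for illustration only; the demo's exact break points are not shown in the hunks above. Note that `binned` stays an integer rather than a factor (the `as.factor()` lines are removed by this patch) because base graphics use it directly as a palette index, whereas the old ggplot2 code wanted a factor for its discrete colour and fill scales.

# illustrative only: re-declare the palette and one helper exactly as in the patch
.diverging_palette <- c(
  "#A50026", "#D73027", "#F46D43", "#FDAE61", "#FEE08B"
  , "#D9EF8B", "#A6D96A", "#66BD63", "#1A9850", "#006837"
)

.prediction_depth_plot <- function(df) {
  plot(
    x = df$X
    , y = df$Y
    , type = "p"
    , main = "Prediction Depth"
    , xlab = "Leaf Bin"
    , ylab = "Prediction Probability"
    , pch = 19L
    , col = .diverging_palette[df$binned + 1L]
  )
  legend(
    "topright"
    , title = "bin"
    , legend = sort(unique(df$binned))
    , pch = 19L
    , col = .diverging_palette[sort(unique(df$binned + 1L))]
    , cex = 0.7
  )
}

# fabricated stand-in for the demo's model output (NOT real predictions)
set.seed(708L)
toy <- data.frame(
  X = runif(200L, min = 1.0, max = 60.0)  # stand-in for a mean predicted leaf index
  , Y = runif(200L)                       # stand-in prediction probabilities
)

# one plausible binning into roughly ten groups; the demo's exact breaks are not shown above
toy$binned <- .bincode(
  x = toy$Y
  , breaks = quantile(x = toy$Y, probs = (1L:9L) / 10L)
  , right = TRUE
  , include.lowest = TRUE
)
toy$binned[is.na(toy$binned)] <- 0L  # values outside the quantile breaks fall into bin 0

# binned is kept as an integer so it can index .diverging_palette directly
.prediction_depth_plot(df = toy)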