diff --git a/.ci/lint_r_code.R b/.ci/lint_r_code.R new file mode 100755 index 000000000000..5d1615a82774 --- /dev/null +++ b/.ci/lint_r_code.R @@ -0,0 +1,62 @@ + +library(lintr) + +args <- commandArgs( + trailingOnly = TRUE +) +SOURCE_DIR <- args[[1]] + +FILES_TO_LINT <- list.files( + path = SOURCE_DIR + , pattern = "\\.r$" + , all.files = TRUE + , ignore.case = TRUE + , full.names = TRUE + , recursive = TRUE + , include.dirs = FALSE +) + +# Some linters from the lintr package have not made it to CRAN yet +# We build lintr from source to address that. +LINTERS_TO_USE <- list( + "closed_curly" = lintr::closed_curly_linter + , "infix_spaces" = lintr::infix_spaces_linter + , "long_lines" = lintr::line_length_linter(length = 120) + , "tabs" = lintr::no_tab_linter + , "open_curly" = lintr::open_curly_linter + , "spaces_inside" = lintr::spaces_inside_linter + , "spaces_left_parens" = lintr::spaces_left_parentheses_linter + , "trailing_blank" = lintr::trailing_blank_lines_linter + , "trailing_white" = lintr::trailing_whitespace_linter +) + +cat(sprintf("Found %i R files to lint\n", length(FILES_TO_LINT))) + +results <- c() + +for (r_file in FILES_TO_LINT){ + + this_result <- lintr::lint( + filename = r_file + , linters = LINTERS_TO_USE + , cache = FALSE + ) + + cat(sprintf( + "Found %i linting errors in %s\n" + , length(this_result) + , r_file + )) + + results <- c(results, this_result) + +} + +issues_found <- length(results) + +if (issues_found > 0){ + cat("\n") + print(results) +} + +quit(save = "no", status = issues_found) diff --git a/.ci/test.sh b/.ci/test.sh index 5733c43f2639..7be39b130d06 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -51,10 +51,17 @@ if [[ $TRAVIS == "true" ]] && [[ $TASK == "check-docs" ]]; then fi if [[ $TASK == "lint" ]]; then - conda install -q -y -n $CONDA_ENV pycodestyle pydocstyle + conda install -q -y -n $CONDA_ENV \ + pycodestyle \ + pydocstyle \ + r-lintr pip install --user cpplint + echo "Linting Python code" pycodestyle --ignore=E501,W503 --exclude=./compute,./.nuget . || exit -1 pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^compute|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 + echo "Linting R code" + Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 + echo "Linting C++ code" cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include || exit 0 exit 0 fi diff --git a/R-package/R/callback.R b/R-package/R/callback.R index 6895d11f422f..92bd9c035a97 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -39,7 +39,11 @@ cb.reset.parameters <- function(new_params) { # since changing them would simply wreck some chaos not_allowed <- c("num_class", "metric", "boosting_type") if (any(pnames %in% not_allowed)) { - stop("Parameters ", paste0(pnames[pnames %in% not_allowed], collapse = ", "), " cannot be changed during boosting") + stop( + "Parameters " + , paste0(pnames[pnames %in% not_allowed], collapse = ", ") + , " cannot be changed during boosting" + ) } # Check parameter names @@ -166,7 +170,7 @@ cb.print.evaluation <- function(period = 1) { i <- env$iteration # Check if iteration matches moduo - if ((i - 1) %% period == 0 || is.element(i, c(env$begin_iteration, env$end_iteration ))) { + if ( (i - 1) %% period == 0 || is.element(i, c(env$begin_iteration, env$end_iteration))) { # Merge evaluation string msg <- merge.eval.string(env) @@ -244,8 +248,14 @@ cb.record.evaluation <- function() { name <- eval_res$name # Store evaluation data - env$model$record_evals[[data_name]][[name]]$eval <- c(env$model$record_evals[[data_name]][[name]]$eval, eval_res$value) - env$model$record_evals[[data_name]][[name]]$eval_err <- c(env$model$record_evals[[data_name]][[name]]$eval_err, eval_err) + env$model$record_evals[[data_name]][[name]]$eval <- c( + env$model$record_evals[[data_name]][[name]]$eval + , eval_res$value + ) + env$model$record_evals[[data_name]][[name]]$eval_err <- c( + env$model$record_evals[[data_name]][[name]]$eval_err + , eval_err + ) } @@ -391,7 +401,9 @@ cb.early.stop <- function(stopping_rounds, verbose = TRUE) { } # Extract callback names from the list of callbacks -callback.names <- function(cb_list) { unlist(lapply(cb_list, attr, "name")) } +callback.names <- function(cb_list) { + unlist(lapply(cb_list, attr, "name")) +} add.cb <- function(cb_list, cb) { diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 9e22fa5e9b64..a23245533b06 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -46,7 +46,12 @@ Booster <- R6::R6Class( } # Store booster handle - handle <- lgb.call("LGBM_BoosterCreate_R", ret = handle, train_set$.__enclos_env__$private$get_handle(), params_str) + handle <- lgb.call( + "LGBM_BoosterCreate_R" + , ret = handle + , train_set$.__enclos_env__$private$get_handle() + , params_str + ) # Create private booster information private$train_set <- train_set @@ -57,10 +62,12 @@ Booster <- R6::R6Class( if (!is.null(private$init_predictor)) { # Merge booster - lgb.call("LGBM_BoosterMerge_R", - ret = NULL, - handle, - private$init_predictor$.__enclos_env__$private$handle) + lgb.call( + "LGBM_BoosterMerge_R" + , ret = NULL + , handle + , private$init_predictor$.__enclos_env__$private$handle + ) } @@ -75,9 +82,11 @@ Booster <- R6::R6Class( } # Create booster from model - handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R", - ret = handle, - lgb.c_str(modelfile)) + handle <- lgb.call( + "LGBM_BoosterCreateFromModelfile_R" + , ret = handle + , lgb.c_str(modelfile) + ) } else if (!is.null(model_str)) { @@ -87,14 +96,19 @@ Booster <- R6::R6Class( } # Create booster from model - handle <- lgb.call("LGBM_BoosterLoadModelFromString_R", - ret = handle, - lgb.c_str(model_str)) + handle <- lgb.call( + "LGBM_BoosterLoadModelFromString_R" + , ret = handle + , lgb.c_str(model_str) + ) } else { # Booster non existent - stop("lgb.Booster: Need at least either training dataset, model file, or model_str to create booster instance") + stop( + "lgb.Booster: Need at least either training dataset, " + , "model file, or model_str to create booster instance" + ) } @@ -111,9 +125,11 @@ Booster <- R6::R6Class( class(handle) <- "lgb.Booster.handle" private$handle <- handle private$num_class <- 1L - private$num_class <- lgb.call("LGBM_BoosterGetNumClasses_R", - ret = private$num_class, - private$handle) + private$num_class <- lgb.call( + "LGBM_BoosterGetNumClasses_R" + , ret = private$num_class + , private$handle + ) } @@ -138,7 +154,10 @@ Booster <- R6::R6Class( # Check if predictors are identical if (!identical(data$.__enclos_env__$private$predictor, private$init_predictor)) { - stop("lgb.Booster.add_valid: Failed to add validation data; you should use the same predictor for these data") + stop( + "lgb.Booster.add_valid: Failed to add validation data; " + , "you should use the same predictor for these data" + ) } # Check if names are character @@ -147,10 +166,12 @@ Booster <- R6::R6Class( } # Add validation data to booster - lgb.call("LGBM_BoosterAddValidData_R", - ret = NULL, - private$handle, - data$.__enclos_env__$private$get_handle()) + lgb.call( + "LGBM_BoosterAddValidData_R" + , ret = NULL + , private$handle + , data$.__enclos_env__$private$get_handle() + ) # Store private information private$valid_sets <- c(private$valid_sets, data) @@ -171,10 +192,12 @@ Booster <- R6::R6Class( params_str <- lgb.params2str(params) # Reset parameters - lgb.call("LGBM_BoosterResetParameter_R", - ret = NULL, - private$handle, - params_str) + lgb.call( + "LGBM_BoosterResetParameter_R" + , ret = NULL + , private$handle + , params_str + ) # Return self return(invisible(self)) @@ -198,10 +221,12 @@ Booster <- R6::R6Class( } # Reset training data on booster - lgb.call("LGBM_BoosterResetTrainingData_R", - ret = NULL, - private$handle, - train_set$.__enclos_env__$private$get_handle()) + lgb.call( + "LGBM_BoosterResetTrainingData_R" + , ret = NULL + , private$handle + , train_set$.__enclos_env__$private$get_handle() + ) # Store private train set private$train_set = train_set @@ -230,18 +255,20 @@ Booster <- R6::R6Class( gpair <- fobj(private$inner_predict(1), private$train_set) # Check for gradient and hessian as list - if(is.null(gpair$grad) || is.null(gpair$hess)){ + if (is.null(gpair$grad) || is.null(gpair$hess)){ stop("lgb.Booster.update: custom objective should return a list with attributes (hess, grad)") } # Return custom boosting gradient/hessian - ret <- lgb.call("LGBM_BoosterUpdateOneIterCustom_R", - ret = NULL, - private$handle, - gpair$grad, - gpair$hess, - length(gpair$grad)) + ret <- lgb.call( + "LGBM_BoosterUpdateOneIterCustom_R" + , ret = NULL + , private$handle + , gpair$grad + , gpair$hess + , length(gpair$grad) + ) } @@ -258,9 +285,11 @@ Booster <- R6::R6Class( rollback_one_iter = function() { # Return one iteration behind - lgb.call("LGBM_BoosterRollbackOneIter_R", - ret = NULL, - private$handle) + lgb.call( + "LGBM_BoosterRollbackOneIter_R" + , ret = NULL + , private$handle + ) # Loop through each iteration for (i in seq_along(private$is_predicted_cur_iter)) { @@ -276,9 +305,11 @@ Booster <- R6::R6Class( current_iter = function() { cur_iter <- 0L - lgb.call("LGBM_BoosterGetCurrentIteration_R", - ret = cur_iter, - private$handle) + lgb.call( + "LGBM_BoosterGetCurrentIteration_R" + , ret = cur_iter + , private$handle + ) }, @@ -349,7 +380,10 @@ Booster <- R6::R6Class( # Loop through each validation set for (i in seq_along(private$valid_sets)) { - ret <- append(ret, private$inner_eval(private$name_valid_sets[[i]], i + 1, feval)) + ret <- append( + x = ret + , values = private$inner_eval(private$name_valid_sets[[i]], i + 1, feval) + ) } # Return ret @@ -366,11 +400,13 @@ Booster <- R6::R6Class( } # Save booster model - lgb.call("LGBM_BoosterSaveModel_R", - ret = NULL, - private$handle, - as.integer(num_iteration), - lgb.c_str(filename)) + lgb.call( + "LGBM_BoosterSaveModel_R" + , ret = NULL + , private$handle + , as.integer(num_iteration) + , lgb.c_str(filename) + ) # Return self return(invisible(self)) @@ -385,9 +421,11 @@ Booster <- R6::R6Class( } # Return model string - return(lgb.call.return.str("LGBM_BoosterSaveModelToString_R", - private$handle, - as.integer(num_iteration))) + return(lgb.call.return.str( + "LGBM_BoosterSaveModelToString_R" + , private$handle + , as.integer(num_iteration) + )) }, @@ -400,9 +438,11 @@ Booster <- R6::R6Class( } # Return dumped model - lgb.call.return.str("LGBM_BoosterDumpModel_R", - private$handle, - as.integer(num_iteration)) + lgb.call.return.str( + "LGBM_BoosterDumpModel_R" + , private$handle + , as.integer(num_iteration) + ) }, @@ -478,10 +518,12 @@ Booster <- R6::R6Class( # Store predictions npred <- 0L - npred <- lgb.call("LGBM_BoosterGetNumPredict_R", - ret = npred, - private$handle, - as.integer(idx - 1)) + npred <- lgb.call( + "LGBM_BoosterGetNumPredict_R" + , ret = npred + , private$handle + , as.integer(idx - 1) + ) private$predict_buffer[[data_name]] <- numeric(npred) } @@ -490,10 +532,12 @@ Booster <- R6::R6Class( if (!private$is_predicted_cur_iter[[idx]]) { # Use buffer - private$predict_buffer[[data_name]] <- lgb.call("LGBM_BoosterGetPredict_R", - ret = private$predict_buffer[[data_name]], - private$handle, - as.integer(idx - 1)) + private$predict_buffer[[data_name]] <- lgb.call( + "LGBM_BoosterGetPredict_R" + , ret = private$predict_buffer[[data_name]] + , private$handle + , as.integer(idx - 1) + ) private$is_predicted_cur_iter[[idx]] <- TRUE } @@ -508,8 +552,10 @@ Booster <- R6::R6Class( if (is.null(private$eval_names)) { # Get evaluation names - names <- lgb.call.return.str("LGBM_BoosterGetEvalNames_R", - private$handle) + names <- lgb.call.return.str( + "LGBM_BoosterGetEvalNames_R" + , private$handle + ) # Check names' length if (nchar(names) > 0) { @@ -547,10 +593,12 @@ Booster <- R6::R6Class( # Create evaluation values tmp_vals <- numeric(length(private$eval_names)) - tmp_vals <- lgb.call("LGBM_BoosterGetEval_R", - ret = tmp_vals, - private$handle, - as.integer(data_idx - 1)) + tmp_vals <- lgb.call( + "LGBM_BoosterGetEval_R" + , ret = tmp_vals + , private$handle + , as.integer(data_idx - 1) + ) # Loop through all evaluation names for (i in seq_along(private$eval_names)) { @@ -587,7 +635,7 @@ Booster <- R6::R6Class( res <- feval(private$inner_predict(data_idx), data) # Check for name correctness - if(is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) { + if (is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) { stop("lgb.Booster.eval: custom eval function should return a list with attribute (name, value, higher_better)"); } @@ -614,13 +662,13 @@ Booster <- R6::R6Class( #' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename #' @param num_iteration number of iteration want to predict with, NULL or <= 0 means use best iteration #' @param rawscore whether the prediction should be returned in the for of original untransformed -#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for -#' logistic regression would result in predictions for log-odds instead of probabilities. +#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} +#' for logistic regression would result in predictions for log-odds instead of probabilities. #' @param predleaf whether predict leaf index instead. #' @param predcontrib return per-feature contributions for each record. #' @param header only used for prediction for text file. True if text file has header #' @param reshape whether to reshape the vector of predictions to a matrix form when there are several -#' prediction outputs per case. +#' prediction outputs per case. #' @param ... Additional named arguments passed to the \code{predict()} method of #' the \code{lgb.Booster} object passed to \code{object}. #' @return @@ -642,13 +690,15 @@ Booster <- R6::R6Class( #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' params <- list(objective = "regression", metric = "l2") #' valids <- list(test = dtest) -#' model <- lgb.train(params, -#' dtrain, -#' 10, -#' valids, -#' min_data = 1, -#' learning_rate = 1, -#' early_stopping_rounds = 5) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 10 +#' , valids = valids +#' , min_data = 1 +#' , learning_rate = 1 +#' , early_stopping_rounds = 5 +#' ) #' preds <- predict(model, test$data) #' #' @rdname predict.lgb.Booster @@ -669,13 +719,16 @@ predict.lgb.Booster <- function(object, } # Return booster predictions - object$predict(data, - num_iteration, - rawscore, - predleaf, - predcontrib, - header, - reshape, ...) + object$predict( + data + , num_iteration + , rawscore + , predleaf + , predcontrib + , header + , reshape + , ... + ) } #' Load LightGBM model @@ -699,13 +752,15 @@ predict.lgb.Booster <- function(object, #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' params <- list(objective = "regression", metric = "l2") #' valids <- list(test = dtest) -#' model <- lgb.train(params, -#' dtrain, -#' 10, -#' valids, -#' min_data = 1, -#' learning_rate = 1, -#' early_stopping_rounds = 5) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 10 +#' , valids = valids +#' , min_data = 1 +#' , learning_rate = 1 +#' , early_stopping_rounds = 5 +#' ) #' lgb.save(model, "model.txt") #' load_booster <- lgb.load(filename = "model.txt") #' model_string <- model$save_model_to_string(NULL) # saves best iteration @@ -757,13 +812,15 @@ lgb.load <- function(filename = NULL, model_str = NULL){ #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' params <- list(objective = "regression", metric = "l2") #' valids <- list(test = dtest) -#' model <- lgb.train(params, -#' dtrain, -#' 10, -#' valids, -#' min_data = 1, -#' learning_rate = 1, -#' early_stopping_rounds = 5) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 10 +#' , valids = valids +#' , min_data = 1 +#' , learning_rate = 1 +#' , early_stopping_rounds = 5 +#' ) #' lgb.save(model, "model.txt") #' #' @rdname lgb.save @@ -804,13 +861,15 @@ lgb.save <- function(booster, filename, num_iteration = NULL){ #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' params <- list(objective = "regression", metric = "l2") #' valids <- list(test = dtest) -#' model <- lgb.train(params, -#' dtrain, -#' 10, -#' valids, -#' min_data = 1, -#' learning_rate = 1, -#' early_stopping_rounds = 5) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 10 +#' , valids = valids +#' , min_data = 1 +#' , learning_rate = 1 +#' , early_stopping_rounds = 5 +#' ) #' json_model <- lgb.dump(model) #' #' @rdname lgb.dump @@ -848,13 +907,15 @@ lgb.dump <- function(booster, num_iteration = NULL){ #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' params <- list(objective = "regression", metric = "l2") #' valids <- list(test = dtest) -#' model <- lgb.train(params, -#' dtrain, -#' 10, -#' valids, -#' min_data = 1, -#' learning_rate = 1, -#' early_stopping_rounds = 5) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 10 +#' , valids = valids +#' , min_data = 1 +#' , learning_rate = 1 +#' , early_stopping_rounds = 5 +#' ) #' lgb.get.eval.result(model, "test", "l2") #' @rdname lgb.get.eval.result #' @export diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 6eb7e61d8980..e898b7985d97 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -97,16 +97,18 @@ Dataset <- R6::R6Class( ...) { # Create new dataset - ret <- Dataset$new(data, - private$params, - self, - private$colnames, - private$categorical_feature, - private$predictor, - private$free_raw_data, - NULL, - info, - ...) + ret <- Dataset$new( + data = data + , params = private$params + , reference = self + , colnames = private$colnames + , categorical_feature = private$categorical_feature + , predictor = private$predictor + , free_raw_data = private$free_raw_data + , used_indices = NULL + , info = info + , ... + ) # Return ret return(invisible(ret)) @@ -142,14 +144,23 @@ Dataset <- R6::R6Class( # Provided indices, but some indices are not existing? if (sum(is.na(cate_indices)) > 0) { - stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)])) + stop( + "lgb.self.get.handle: supplied an unknown feature in categorical_feature: " + , sQuote(private$categorical_feature[is.na(cate_indices)]) + ) } } else { # Check if more categorical features were output over the feature space if (max(private$categorical_feature) > length(private$colnames)) { - stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features") + stop( + "lgb.self.get.handle: supplied a too large value in categorical_feature: " + , max(private$categorical_feature) + , " but only " + , length(private$colnames) + , " features" + ) } # Store indices as [0, n-1] indexed instead of [1, n] indexed @@ -165,7 +176,9 @@ Dataset <- R6::R6Class( # Check has header or not has_header <- FALSE if (!is.null(private$params$has_header) || !is.null(private$params$header)) { - if (tolower(as.character(private$params$has_header)) == "true" || tolower(as.character(private$params$header)) == "true") { + params_has_header <- tolower(as.character(private$params$has_header)) == "true" + params_header <- tolower(as.character(private$params$header)) == "true" + if (params_has_header || params_header) { has_header <- TRUE } } @@ -186,43 +199,52 @@ Dataset <- R6::R6Class( # Are we using a data file? if (is.character(private$raw_data)) { - handle <- lgb.call("LGBM_DatasetCreateFromFile_R", - ret = handle, - lgb.c_str(private$raw_data), - params_str, - ref_handle) + handle <- lgb.call( + "LGBM_DatasetCreateFromFile_R" + , ret = handle + , lgb.c_str(private$raw_data) + , params_str + , ref_handle + ) } else if (is.matrix(private$raw_data)) { # Are we using a matrix? - handle <- lgb.call("LGBM_DatasetCreateFromMat_R", - ret = handle, - private$raw_data, - nrow(private$raw_data), - ncol(private$raw_data), - params_str, - ref_handle) + handle <- lgb.call( + "LGBM_DatasetCreateFromMat_R" + , ret = handle + , private$raw_data + , nrow(private$raw_data) + , ncol(private$raw_data) + , params_str + , ref_handle + ) } else if (methods::is(private$raw_data, "dgCMatrix")) { if (length(private$raw_data@p) > 2147483647) { stop("Cannot support large CSC matrix") } # Are we using a dgCMatrix (sparsed matrix column compressed) - handle <- lgb.call("LGBM_DatasetCreateFromCSC_R", - ret = handle, - private$raw_data@p, - private$raw_data@i, - private$raw_data@x, - length(private$raw_data@p), - length(private$raw_data@x), - nrow(private$raw_data), - params_str, - ref_handle) + handle <- lgb.call( + "LGBM_DatasetCreateFromCSC_R" + , ret = handle + , private$raw_data@p + , private$raw_data@i + , private$raw_data@x + , length(private$raw_data@p) + , length(private$raw_data@x) + , nrow(private$raw_data) + , params_str + , ref_handle + ) } else { # Unknown data type - stop("lgb.Dataset.construct: does not support constructing from ", sQuote(class(private$raw_data))) + stop( + "lgb.Dataset.construct: does not support constructing from " + , sQuote(class(private$raw_data)) + ) } @@ -234,12 +256,14 @@ Dataset <- R6::R6Class( } # Construct subset - handle <- lgb.call("LGBM_DatasetGetSubset_R", - ret = handle, - ref_handle, - c(private$used_indices), # Adding c() fixes issue in R v3.5 - length(private$used_indices), - params_str) + handle <- lgb.call( + "LGBM_DatasetGetSubset_R" + , ret = handle + , ref_handle + , c(private$used_indices) # Adding c() fixes issue in R v3.5 + , length(private$used_indices) + , params_str + ) } if (lgb.is.null.handle(handle)) { @@ -258,7 +282,11 @@ Dataset <- R6::R6Class( if (!is.null(private$predictor) && is.null(private$used_indices)) { # Setup initial scores - init_score <- private$predictor$predict(private$raw_data, rawscore = TRUE, reshape = TRUE) + init_score <- private$predictor$predict( + private$raw_data + , rawscore = TRUE + , reshape = TRUE + ) # Not needed to transpose, for is col_marjor init_score <- as.vector(init_score) @@ -316,7 +344,10 @@ Dataset <- R6::R6Class( } else { # Trying to work with unknown dimensions is not possible - stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly") + stop( + "dim: cannot get dimensions before dataset has been constructed, " + , "please call lgb.Dataset.construct explicitly" + ) } @@ -341,7 +372,10 @@ Dataset <- R6::R6Class( } else { # Trying to work with unknown dimensions is not possible - stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly") + stop( + "dim: cannot get dimensions before dataset has been constructed, please call " + , "lgb.Dataset.construct explicitly" + ) } @@ -367,10 +401,12 @@ Dataset <- R6::R6Class( # Merge names with tab separation merged_name <- paste0(as.list(private$colnames), collapse = "\t") - lgb.call("LGBM_DatasetSetFeatureNames_R", - ret = NULL, - private$handle, - lgb.c_str(merged_name)) + lgb.call( + "LGBM_DatasetSetFeatureNames_R" + , ret = NULL + , private$handle + , lgb.c_str(merged_name) + ) } @@ -399,10 +435,12 @@ Dataset <- R6::R6Class( # Get field size of info info_len <- 0L - info_len <- lgb.call("LGBM_DatasetGetFieldSize_R", - ret = info_len, - private$handle, - lgb.c_str(name)) + info_len <- lgb.call( + "LGBM_DatasetGetFieldSize_R" + , ret = info_len + , private$handle + , lgb.c_str(name) + ) # Check if info is not empty if (info_len > 0) { @@ -415,10 +453,12 @@ Dataset <- R6::R6Class( numeric(info_len) # Numeric } - ret <- lgb.call("LGBM_DatasetGetField_R", - ret = ret, - private$handle, - lgb.c_str(name)) + ret <- lgb.call( + "LGBM_DatasetGetField_R" + , ret = ret + , private$handle + , lgb.c_str(name) + ) private$info[[name]] <- ret @@ -454,12 +494,14 @@ Dataset <- R6::R6Class( if (length(info) > 0) { - lgb.call("LGBM_DatasetSetField_R", - ret = NULL, - private$handle, - lgb.c_str(name), - info, - length(info)) + lgb.call( + "LGBM_DatasetSetField_R" + , ret = NULL + , private$handle + , lgb.c_str(name) + , info + , length(info) + ) } @@ -474,16 +516,18 @@ Dataset <- R6::R6Class( slice = function(idxset, ...) { # Perform slicing - Dataset$new(NULL, - private$params, - self, - private$colnames, - private$categorical_feature, - private$predictor, - private$free_raw_data, - sort(idxset, decreasing = FALSE), - NULL, - ...) + Dataset$new( + data = NULL + , params = private$params + , reference = self + , colnames = private$colnames + , categorical_feature = private$categorical_feature + , predictor = private$predictor + , free_raw_data = private$free_raw_data + , used_indices = sort(idxset, decreasing = FALSE) + , info = NULL + , ... + ) }, @@ -492,7 +536,12 @@ Dataset <- R6::R6Class( # Parameter updating if (!lgb.is.null.handle(private$handle)) { - lgb.call("LGBM_DatasetUpdateParam_R", ret = NULL, private$handle, lgb.params2str(params)) + lgb.call( + "LGBM_DatasetUpdateParam_R" + , ret = NULL + , private$handle + , lgb.params2str(params) + ) return(invisible(self)) } private$params <- modifyList(private$params, params) @@ -568,10 +617,12 @@ Dataset <- R6::R6Class( # Store binary data self$construct() - lgb.call("LGBM_DatasetSaveBinary_R", - ret = NULL, - private$handle, - lgb.c_str(fname)) + lgb.call( + "LGBM_DatasetSaveBinary_R" + , ret = NULL + , private$handle + , lgb.c_str(fname) + ) return(invisible(self)) } @@ -671,16 +722,18 @@ lgb.Dataset <- function(data, ...) { # Create new dataset - invisible(Dataset$new(data, - params, - reference, - colnames, - categorical_feature, - NULL, - free_raw_data, - NULL, - info, - ...)) + invisible(Dataset$new( + data = data + , params = params + , reference = reference + , colnames = colnames + , categorical_feature = categorical_feature + , predictor = NULL + , free_raw_data = free_raw_data + , used_indices = NULL + , info = info + , ... + )) } @@ -784,7 +837,7 @@ dim.lgb.Dataset <- function(x, ...) { #' #' @param x object of class \code{lgb.Dataset} #' @param value a list of two elements: the first one is ignored -#' and the second one is column names +#' and the second one is column names #' #' @details #' Generic \code{dimnames} methods are used by \code{colnames}. @@ -840,7 +893,13 @@ dimnames.lgb.Dataset <- function(x) { # Check for unmatching column size if (ncol(x) != length(value[[2]])) { - stop("can't assign ", sQuote(length(value[[2]])), " colnames to an lgb.Dataset with ", sQuote(ncol(x)), " columns") + stop( + "can't assign " + , sQuote(length(value[[2]])) + , " colnames to an lgb.Dataset with " + , sQuote(ncol(x)) + , " columns" + ) } # Set column names properly, and return diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index b44ab5f08193..333dd5057a16 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -13,7 +13,11 @@ Predictor <- R6::R6Class( if (private$need_free_handle && !lgb.is.null.handle(private$handle)) { # Freeing up handle - lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle) + lgb.call( + "LGBM_BoosterFree_R" + , ret = NULL + , private$handle + ) private$handle <- NULL } @@ -31,7 +35,11 @@ Predictor <- R6::R6Class( if (is.character(modelfile)) { # Create handle on it - handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R", ret = handle, lgb.c_str(modelfile)) + handle <- lgb.call( + "LGBM_BoosterCreateFromModelfile_R" + , ret = handle + , lgb.c_str(modelfile) + ) private$need_free_handle <- TRUE } else if (methods::is(modelfile, "lgb.Booster.handle")) { @@ -57,7 +65,11 @@ Predictor <- R6::R6Class( current_iter = function() { cur_iter <- 0L - lgb.call("LGBM_BoosterGetCurrentIteration_R", ret = cur_iter, private$handle) + lgb.call( + "LGBM_BoosterGetCurrentIteration_R" + , ret = cur_iter + , private$handle + ) }, @@ -86,14 +98,19 @@ Predictor <- R6::R6Class( on.exit(unlink(tmp_filename), add = TRUE) # Predict from temporary file - lgb.call("LGBM_BoosterPredictForFile_R", ret = NULL, private$handle, data, - as.integer(header), - as.integer(rawscore), - as.integer(predleaf), - as.integer(predcontrib), - as.integer(num_iteration), - private$params, - lgb.c_str(tmp_filename)) + lgb.call( + "LGBM_BoosterPredictForFile_R" + , ret = NULL + , private$handle + , data + , as.integer(header) + , as.integer(rawscore) + , as.integer(predleaf) + , as.integer(predcontrib) + , as.integer(num_iteration) + , private$params + , lgb.c_str(tmp_filename) + ) # Get predictions from file preds <- read.delim(tmp_filename, header = FALSE, sep = "\t") @@ -108,51 +125,57 @@ Predictor <- R6::R6Class( npred <- 0L # Check number of predictions to do - npred <- lgb.call("LGBM_BoosterCalcNumPredict_R", - ret = npred, - private$handle, - as.integer(num_row), - as.integer(rawscore), - as.integer(predleaf), - as.integer(predcontrib), - as.integer(num_iteration)) + npred <- lgb.call( + "LGBM_BoosterCalcNumPredict_R" + , ret = npred + , private$handle + , as.integer(num_row) + , as.integer(rawscore) + , as.integer(predleaf) + , as.integer(predcontrib) + , as.integer(num_iteration) + ) # Pre-allocate empty vector preds <- numeric(npred) # Check if data is a matrix if (is.matrix(data)) { - preds <- lgb.call("LGBM_BoosterPredictForMat_R", - ret = preds, - private$handle, - data, - as.integer(nrow(data)), - as.integer(ncol(data)), - as.integer(rawscore), - as.integer(predleaf), - as.integer(predcontrib), - as.integer(num_iteration), - private$params) + preds <- lgb.call( + "LGBM_BoosterPredictForMat_R" + , ret = preds + , private$handle + , data + , as.integer(nrow(data)) + , as.integer(ncol(data)) + , as.integer(rawscore) + , as.integer(predleaf) + , as.integer(predcontrib) + , as.integer(num_iteration) + , private$params + ) } else if (methods::is(data, "dgCMatrix")) { if (length(data@p) > 2147483647) { stop("Cannot support large CSC matrix") } # Check if data is a dgCMatrix (sparse matrix, column compressed format) - preds <- lgb.call("LGBM_BoosterPredictForCSC_R", - ret = preds, - private$handle, - data@p, - data@i, - data@x, - length(data@p), - length(data@x), - nrow(data), - as.integer(rawscore), - as.integer(predleaf), - as.integer(predcontrib), - as.integer(num_iteration), - private$params) + preds <- lgb.call( + "LGBM_BoosterPredictForCSC_R" + , ret = preds + , private$handle + , data@p + , data@i + , data@x + , length(data@p) + , length(data@x) + , nrow(data) + , as.integer(rawscore) + , as.integer(predleaf) + , as.integer(predcontrib) + , as.integer(num_iteration) + , private$params + ) } else { @@ -165,7 +188,12 @@ Predictor <- R6::R6Class( # Check if number of rows is strange (not a multiple of the dataset rows) if (length(preds) %% num_row != 0) { - stop("predict: prediction length ", sQuote(length(preds))," is not a multiple of nrows(data): ", sQuote(num_row)) + stop( + "predict: prediction length " + , sQuote(length(preds)) + ," is not a multiple of nrows(data): " + , sQuote(num_row) + ) } # Get number of cases per row @@ -192,7 +220,9 @@ Predictor <- R6::R6Class( } ), - private = list(handle = NULL, - need_free_handle = FALSE, - params = "") + private = list( + handle = NULL + , need_free_handle = FALSE + , params = "" + ) ) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index fc58032dbd02..594b323df16d 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -25,23 +25,23 @@ CVBooster <- R6::R6Class( #' @param label vector of response values. Should be provided only when data is an R-matrix. #' @param weight vector of response values. If not NULL, will set to dataset #' @param obj objective function, can be character or custom objective function. Examples include -#' \code{regression}, \code{regression_l1}, \code{huber}, -#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} +#' \code{regression}, \code{regression_l1}, \code{huber}, +#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} #' @param eval evaluation function, can be (list of) character or custom eval function #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param showsd \code{boolean}, whether to show standard deviation of cross validation #' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified -#' by the values of outcome labels. +#' by the values of outcome labels. #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds -#' (each element must be a vector of test fold's indices). When folds are supplied, -#' the \code{nfold} and \code{stratified} parameters are ignored. +#' (each element must be a vector of test fold's indices). When folds are supplied, +#' the \code{nfold} and \code{stratified} parameters are ignored. #' @param colnames feature names, if not null, will use this to overwrite the names in dataset #' @param categorical_feature list of str or int -#' type int represents index, -#' type str represents feature names -#' @param callbacks list of callback functions -#' List of callback functions that are applied at each iteration. -#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets +#' type int represents index, +#' type str represents feature names +#' @param callbacks List of callback functions that are applied at each iteration. +#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model +#' into a predictor model which frees up memory and the original datasets #' @param ... other parameters, see Parameters.rst for more information. A few key parameters: #' \itemize{ #' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}} @@ -61,13 +61,15 @@ CVBooster <- R6::R6Class( #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) #' params <- list(objective = "regression", metric = "l2") -#' model <- lgb.cv(params, -#' dtrain, -#' 10, -#' nfold = 3, -#' min_data = 1, -#' learning_rate = 1, -#' early_stopping_rounds = 5) +#' model <- lgb.cv( +#' params = params +#' , data = dtrain +#' , nrounds = 10 +#' , nfold = 3 +#' , min_data = 1 +#' , learning_rate = 1 +#' , early_stopping_rounds = 5 +#' ) #' @export lgb.cv <- function(params = list(), data, @@ -134,7 +136,17 @@ lgb.cv <- function(params = list(), begin_iteration <- predictor$current_iter() + 1 } # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one - n_trees <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators") + n_trees <- c( + "num_iterations" + , "num_iteration" + , "n_iter" + , "num_tree" + , "num_trees" + , "num_round" + , "num_rounds" + , "num_boost_round" + , "n_estimators" + ) if (any(names(params) %in% n_trees)) { end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1]]] - 1 } else { @@ -192,12 +204,14 @@ lgb.cv <- function(params = list(), } # Create folds - folds <- generate.cv.folds(nfold, - nrow(data), - stratified, - getinfo(data, "label"), - getinfo(data, "group"), - params) + folds <- generate.cv.folds( + nfold + , nrow(data) + , stratified + , getinfo(data, "label") + , getinfo(data, "group") + , params + ) } @@ -215,12 +229,24 @@ lgb.cv <- function(params = list(), early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change") if (any(names(params) %in% early_stop)) { if (params[[which(names(params) %in% early_stop)[1]]] > 0) { - callbacks <- add.cb(callbacks, cb.early.stop(params[[which(names(params) %in% early_stop)[1]]], verbose = verbose)) + callbacks <- add.cb( + callbacks + , cb.early.stop( + params[[which(names(params) %in% early_stop)[1]]] + , verbose = verbose + ) + ) } } else { if (!is.null(early_stopping_rounds)) { if (early_stopping_rounds > 0) { - callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose)) + callbacks <- add.cb( + callbacks + , cb.early.stop( + early_stopping_rounds + , verbose = verbose + ) + ) } } } @@ -292,7 +318,7 @@ lgb.cv <- function(params = list(), env$eval_list <- merged_msg$eval_list # Check for standard deviation requirement - if(showsd) { + if (showsd) { env$eval_err_list <- merged_msg$eval_err_list } @@ -319,9 +345,11 @@ lgb.cv <- function(params = list(), if (reset_data) { lapply(cv_booster$boosters, function(fd) { # Store temporarily model data elsewhere - booster_old <- list(best_iter = fd$booster$best_iter, - best_score = fd$booster$best_score, - record_evals = fd$booster$record_evals) + booster_old <- list( + best_iter = fd$booster$best_iter + , best_score = fd$booster$best_score, + , record_evals = fd$booster$record_evals + ) # Reload model fd$booster <- lgb.load(model_str = fd$booster$save_model_to_string()) fd$booster$best_iter <- booster_old$best_iter @@ -384,8 +412,10 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { # Loop through each fold for (i in seq_len(nfold)) { kstep <- length(rnd_idx) %/% (nfold - i + 1) - folds[[i]] <- list(fold = which(ungrouped %in% rnd_idx[seq_len(kstep)]), - group = rnd_idx[seq_len(kstep)]) + folds[[i]] <- list( + fold = which(ungrouped %in% rnd_idx[seq_len(kstep)]) + , group = rnd_idx[seq_len(kstep)] + ) rnd_idx <- rnd_idx[-seq_len(kstep)] } @@ -413,11 +443,17 @@ lgb.stratified.folds <- function(y, k = 10) { if (is.numeric(y)) { cuts <- length(y) %/% k - if (cuts < 2) { cuts <- 2 } - if (cuts > 5) { cuts <- 5 } - y <- cut(y, - unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts))), - include.lowest = TRUE) + if (cuts < 2) { + cuts <- 2 + } + if (cuts > 5) { + cuts <- 5 + } + y <- cut( + y + , unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts))) + , include.lowest = TRUE + ) } @@ -499,8 +535,10 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) { # Parse standard deviation for (j in seq_len(eval_len)) { - ret_eval_err <- c(ret_eval_err, - sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2)) + ret_eval_err <- c( + ret_eval_err + , sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2) + ) } # Convert to list @@ -509,7 +547,9 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) { } # Return errors - list(eval_list = ret_eval, - eval_err_list = ret_eval_err) + list( + eval_list = ret_eval + , eval_err_list = ret_eval_err + ) } diff --git a/R-package/R/lgb.importance.R b/R-package/R/lgb.importance.R index a395b9cf7a9f..ba9d02476166 100644 --- a/R-package/R/lgb.importance.R +++ b/R-package/R/lgb.importance.R @@ -21,9 +21,14 @@ #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) #' -#' params <- list(objective = "binary", -#' learning_rate = 0.01, num_leaves = 63, max_depth = -1, -#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) +#' params <- list( +#' objective = "binary" +#' , learning_rate = 0.01 +#' , num_leaves = 63 +#' , max_depth = -1 +#' , min_data_in_leaf = 1 +#' , min_sum_hessian_in_leaf = 1 +#' ) #' model <- lgb.train(params, dtrain, 10) #' #' tree_imp1 <- lgb.importance(model, percentage = TRUE) @@ -63,9 +68,11 @@ lgb.importance <- function(model, percentage = TRUE) { # Check if relative values are requested if (percentage) { - tree_imp_dt[, ":="(Gain = Gain / sum(Gain), - Cover = Cover / sum(Cover), - Frequency = Frequency / sum(Frequency))] + tree_imp_dt[, `:=`( + Gain = Gain / sum(Gain) + , Cover = Cover / sum(Cover) + , Frequency = Frequency / sum(Frequency) + )] } # Return importance table diff --git a/R-package/R/lgb.interprete.R b/R-package/R/lgb.interprete.R index c35ffb0ef461..bc147f5adaf0 100644 --- a/R-package/R/lgb.interprete.R +++ b/R-package/R/lgb.interprete.R @@ -69,12 +69,21 @@ lgb.interprete <- function(model, ) # Get list of trees - tree_index_mat_list <- lapply(leaf_index_mat_list, - FUN = function(x) matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE)) + tree_index_mat_list <- lapply( + X = leaf_index_mat_list + , FUN = function(x){ + matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE) + } + ) # Sequence over idxset for (i in seq_along(idxset)) { - tree_interpretation_dt_list[[i]] <- single.row.interprete(tree_dt, num_class, tree_index_mat_list[[i]], leaf_index_mat_list[[i]]) + tree_interpretation_dt_list[[i]] <- single.row.interprete( + tree_dt + , num_class + , tree_index_mat_list[[i]] + , leaf_index_mat_list[[i]] + ) } # Return interpretation list @@ -122,7 +131,10 @@ single.tree.interprete <- function(tree_dt, leaf_to_root(leaf_dt[["leaf_parent"]], leaf_dt[["leaf_value"]]) # Return formatted data.table - data.table::data.table(Feature = feature_seq, Contribution = diff.default(value_seq)) + data.table::data.table( + Feature = feature_seq + , Contribution = diff.default(value_seq) + ) } @@ -198,16 +210,22 @@ single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index } else { # Full interpretation elements - tree_interpretation_dt <- Reduce(f = function(x, y) merge(x, y, by = "Feature", all = TRUE), - x = tree_interpretation) + tree_interpretation_dt <- Reduce( + f = function(x, y){ + merge(x, y, by = "Feature", all = TRUE) + } + , x = tree_interpretation + ) # Loop throughout each tree for (j in 2:ncol(tree_interpretation_dt)) { - data.table::set(tree_interpretation_dt, - i = which(is.na(tree_interpretation_dt[[j]])), - j = j, - value = 0) + data.table::set( + tree_interpretation_dt + , i = which(is.na(tree_interpretation_dt[[j]])) + , j = j + , value = 0 + ) } diff --git a/R-package/R/lgb.model.dt.tree.R b/R-package/R/lgb.model.dt.tree.R index 70824fd993dc..ef493ce7a1f6 100644 --- a/R-package/R/lgb.model.dt.tree.R +++ b/R-package/R/lgb.model.dt.tree.R @@ -35,9 +35,14 @@ #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) #' -#' params <- list(objective = "binary", -#' learning_rate = 0.01, num_leaves = 63, max_depth = -1, -#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) +#' params <- list( +#' objective = "binary" +#' , learning_rate = 0.01 +#' , num_leaves = 63 +#' , max_depth = -1 +#' , min_data_in_leaf = 1 +#' , min_sum_hessian_in_leaf = 1 +#' ) #' model <- lgb.train(params, dtrain, 10) #' #' tree_dt <- lgb.model.dt.tree(model) @@ -51,11 +56,13 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) { json_model <- lgb.dump(model, num_iteration = num_iteration) # Parse json model second - parsed_json_model <- jsonlite::fromJSON(json_model, - simplifyVector = TRUE, - simplifyDataFrame = FALSE, - simplifyMatrix = FALSE, - flatten = FALSE) + parsed_json_model <- jsonlite::fromJSON( + json_model + , simplifyVector = TRUE + , simplifyDataFrame = FALSE + , simplifyMatrix = FALSE + , flatten = FALSE + ) # Parse tree model third tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse) @@ -89,21 +96,23 @@ single.tree.parse <- function(lgb_tree) { if (is.null(env)) { # Setup initial default data.table with default types env <- new.env(parent = emptyenv()) - env$single_tree_dt <- data.table::data.table(tree_index = integer(0), - depth = integer(0), - split_index = integer(0), - split_feature = integer(0), - node_parent = integer(0), - leaf_index = integer(0), - leaf_parent = integer(0), - split_gain = numeric(0), - threshold = numeric(0), - decision_type = character(0), - default_left = character(0), - internal_value = integer(0), - internal_count = integer(0), - leaf_value = integer(0), - leaf_count = integer(0)) + env$single_tree_dt <- data.table::data.table( + tree_index = integer(0) + , depth = integer(0) + , split_index = integer(0) + , split_feature = integer(0) + , node_parent = integer(0) + , leaf_index = integer(0) + , leaf_parent = integer(0) + , split_gain = numeric(0) + , threshold = numeric(0) + , decision_type = character(0) + , default_left = character(0) + , internal_value = integer(0) + , internal_count = integer(0) + , leaf_value = integer(0) + , leaf_count = integer(0) + ) # start tree traversal pre_order_traversal(env, tree_node_leaf, current_depth, parent_index) } else { @@ -127,14 +136,18 @@ single.tree.parse <- function(lgb_tree) { fill = TRUE) # Traverse tree again both left and right - pre_order_traversal(env, - tree_node_leaf$left_child, - current_depth = current_depth + 1L, - parent_index = tree_node_leaf$split_index) - pre_order_traversal(env, - tree_node_leaf$right_child, - current_depth = current_depth + 1L, - parent_index = tree_node_leaf$split_index) + pre_order_traversal( + env + , tree_node_leaf$left_child + , current_depth = current_depth + 1L + , parent_index = tree_node_leaf$split_index + ) + pre_order_traversal( + env + , tree_node_leaf$right_child + , current_depth = current_depth + 1L + , parent_index = tree_node_leaf$split_index + ) } else if (!is.null(tree_node_leaf$leaf_index)) { diff --git a/R-package/R/lgb.plot.importance.R b/R-package/R/lgb.plot.importance.R index 7940acb151cb..a101e439196a 100644 --- a/R-package/R/lgb.plot.importance.R +++ b/R-package/R/lgb.plot.importance.R @@ -43,7 +43,11 @@ lgb.plot.importance <- function(tree_imp, cex = NULL) { # Check for measurement (column names) correctness - measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE) + measure <- match.arg( + measure + , choices = c("Gain", "Cover", "Frequency") + , several.ok = FALSE + ) # Get top N importance (defaults to 10) top_n <- min(top_n, nrow(tree_imp)) @@ -72,14 +76,14 @@ lgb.plot.importance <- function(tree_imp, # Do plot tree_imp[.N:1, graphics::barplot( - height = get(measure), - names.arg = Feature, - horiz = TRUE, - border = NA, - main = "Feature Importance", - xlab = measure, - cex.names = cex, - las = 1 + height = get(measure) + , names.arg = Feature + , horiz = TRUE + , border = NA + , main = "Feature Importance" + , xlab = measure + , cex.names = cex + , las = 1 )] # Return invisibly diff --git a/R-package/R/lgb.plot.interpretation.R b/R-package/R/lgb.plot.interpretation.R index 733b1c1be863..6309a5154cb2 100644 --- a/R-package/R/lgb.plot.interpretation.R +++ b/R-package/R/lgb.plot.interpretation.R @@ -9,8 +9,8 @@ #' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}. #' #' @details -#' The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature. -#' Features are shown ranked in a decreasing contribution order. +#' The graph represents each feature as a horizontal bar of length proportional to the defined +#' contribution of a feature. Features are shown ranked in a decreasing contribution order. #' #' @return #' The \code{lgb.plot.interpretation} function creates a \code{barplot}. @@ -26,9 +26,14 @@ #' data(agaricus.test, package = "lightgbm") #' test <- agaricus.test #' -#' params <- list(objective = "binary", -#' learning_rate = 0.01, num_leaves = 63, max_depth = -1, -#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) +#' params <- list( +#' objective = "binary" +#' , learning_rate = 0.01 +#' , num_leaves = 63 +#' , max_depth = -1 +#' , min_data_in_leaf = 1 +#' , min_sum_hessian_in_leaf = 1 +#' ) #' model <- lgb.train(params, dtrain, 10) #' #' tree_interpretation <- lgb.interprete(model, test$data, 1:5) @@ -67,16 +72,21 @@ lgb.plot.interpretation <- function(tree_interpretation_dt, if (num_class == 1) { # Only one class, plot straight away - multiple.tree.plot.interpretation(tree_interpretation_dt, - top_n = top_n, - title = NULL, - cex = cex) + multiple.tree.plot.interpretation( + tree_interpretation_dt + , top_n = top_n + , title = NULL + , cex = cex + ) } else { # More than one class, shape data first - layout_mat <- matrix(seq.int(to = cols * ceiling(num_class / cols)), - ncol = cols, nrow = ceiling(num_class / cols)) + layout_mat <- matrix( + seq.int(to = cols * ceiling(num_class / cols)) + , ncol = cols + , nrow = ceiling(num_class / cols) + ) # Shape output graphics::par(mfcol = c(nrow(layout_mat), ncol(layout_mat))) @@ -119,14 +129,14 @@ multiple.tree.plot.interpretation <- function(tree_interpretation, # Do plot tree_interpretation[.N:1, graphics::barplot( - height = Contribution, - names.arg = Feature, - horiz = TRUE, - col = ifelse(Contribution > 0, "firebrick", "steelblue"), - border = NA, - main = title, - cex.names = cex, - las = 1 + height = Contribution + , names.arg = Feature + , horiz = TRUE + , col = ifelse(Contribution > 0, "firebrick", "steelblue") + , border = NA + , main = title + , cex.names = cex + , las = 1 )] # Return invisibly diff --git a/R-package/R/lgb.prepare.R b/R-package/R/lgb.prepare.R index 2f6d768c7ac4..432d484512df 100644 --- a/R-package/R/lgb.prepare.R +++ b/R-package/R/lgb.prepare.R @@ -1,10 +1,13 @@ #' Data preparator for LightGBM datasets (numeric) #' -#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric without integers. Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets. +#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. +#' Factors and characters are converted to numeric without integers. Please use +#' \code{lgb.prepare_rules} if you want to apply this transformation to other datasets. #' #' @param data A data.frame or data.table to prepare. #' -#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. +#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) +#' for input in \code{lgb.Dataset}. #' #' @examples #' library(lightgbm) @@ -71,7 +74,11 @@ lgb.prepare <- function(data) { } else { # What do you think you are doing here? Throw error. - stop("lgb.prepare2: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame") + stop( + "lgb.prepare2: you provided " + , paste(class(data), collapse = " & ") + , " but data should have class data.frame" + ) } diff --git a/R-package/R/lgb.prepare2.R b/R-package/R/lgb.prepare2.R index 59321f6912c8..8809936f3e05 100644 --- a/R-package/R/lgb.prepare2.R +++ b/R-package/R/lgb.prepare2.R @@ -1,10 +1,16 @@ #' Data preparator for LightGBM datasets (integer) #' -#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM. +#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. +#' Factors and characters are converted to numeric (specifically: integer). +#' Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. +#' This is useful if you have a specific need for integer dataset instead of numeric dataset. +#' Note that there are programs which do not support integer-only input. Consider this as a half +#' memory technique which is dangerous, especially for LightGBM. #' #' @param data A data.frame or data.table to prepare. #' -#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. +#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) +#' for input in \code{lgb.Dataset}. #' #' @examples #' library(lightgbm) diff --git a/R-package/R/lgb.prepare_rules.R b/R-package/R/lgb.prepare_rules.R index ea4860b493cd..2f51cd0e64e4 100644 --- a/R-package/R/lgb.prepare_rules.R +++ b/R-package/R/lgb.prepare_rules.R @@ -1,11 +1,15 @@ #' Data preparator for LightGBM datasets with rules (numeric) #' -#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter. +#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. +#' Factors and characters are converted to numeric. In addition, keeps rules created +#' so you can convert other datasets using this converter. #' #' @param data A data.frame or data.table to prepare. #' @param rules A set of rules from the data preparator, if already used. #' -#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. +#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). +#' The data must be converted to a matrix format (\code{as.matrix}) for input +#' in \code{lgb.Dataset}. #' #' @examples #' library(lightgbm) @@ -160,7 +164,11 @@ lgb.prepare_rules <- function(data, rules = NULL) { } else { # What do you think you are doing here? Throw error. - stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame") + stop( + "lgb.prepare: you provided " + , paste(class(data), collapse = " & ") + , " but data should have class data.frame" + ) } diff --git a/R-package/R/lgb.prepare_rules2.R b/R-package/R/lgb.prepare_rules2.R index c32b68c805c4..119cd32308ac 100644 --- a/R-package/R/lgb.prepare_rules2.R +++ b/R-package/R/lgb.prepare_rules2.R @@ -1,11 +1,18 @@ #' Data preparator for LightGBM datasets with rules (integer) #' -#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM. +#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. +#' Factors and characters are converted to numeric (specifically: integer). +#' In addition, keeps rules created so you can convert other datasets using this converter. +#' This is useful if you have a specific need for integer dataset instead of numeric dataset. +#' Note that there are programs which do not support integer-only input. +#' Consider this as a half memory technique which is dangerous, especially for LightGBM. #' #' @param data A data.frame or data.table to prepare. #' @param rules A set of rules from the data preparator, if already used. #' -#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. +#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). +#' The data must be converted to a matrix format (\code{as.matrix}) for input in +#' \code{lgb.Dataset}. #' #' @examples #' library(lightgbm) @@ -35,9 +42,13 @@ #' data(iris) # Erase iris dataset #' #' # We remapped values differently -#' personal_rules <- list(Species = c("setosa" = 3L, -#' "versicolor" = 2L, -#' "virginica" = 1L)) +#' personal_rules <- list( +#' Species = c( +#' "setosa" = 3L +#' , "versicolor" = 2L +#' , virginica" = 1L +#' ) +#' ) #' newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules) #' str(newest_iris$data) # SUCCESS! #' @@ -158,7 +169,11 @@ lgb.prepare_rules2 <- function(data, rules = NULL) { } else { # What do you think you are doing here? Throw error. - stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame") + stop( + "lgb.prepare: you provided " + , paste(class(data), collapse = " & ") + , " but data should have class data.frame" + ) } diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 5e5d5d089a6f..79c6b4e4898a 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -4,17 +4,18 @@ #' @inheritParams lgb_shared_params #' @param valids a list of \code{lgb.Dataset} objects, used for validation #' @param obj objective function, can be character or custom objective function. Examples include -#' \code{regression}, \code{regression_l1}, \code{huber}, -#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} +#' \code{regression}, \code{regression_l1}, \code{huber}, +#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} #' @param eval evaluation function, can be (a list of) character or custom eval function #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param colnames feature names, if not null, will use this to overwrite the names in dataset #' @param categorical_feature list of str or int -#' type int represents index, -#' type str represents feature names -#' @param callbacks list of callback functions -#' List of callback functions that are applied at each iteration. -#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets +#' type int represents index, +#' type str represents feature names +#' @param callbacks List of callback functions that are applied at each iteration. +#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the +#' booster model into a predictor model which frees up memory and the +#' original datasets #' @param ... other parameters, see Parameters.rst for more information. A few key parameters: #' \itemize{ #' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}} @@ -37,13 +38,15 @@ #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' params <- list(objective = "regression", metric = "l2") #' valids <- list(test = dtest) -#' model <- lgb.train(params, -#' dtrain, -#' 10, -#' valids, -#' min_data = 1, -#' learning_rate = 1, -#' early_stopping_rounds = 5) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 10 +#' , valids = valids +#' , min_data = 1 +#' , learning_rate = 1 +#' , early_stopping_rounds = 5 +#' ) #' @export lgb.train <- function(params = list(), data, @@ -105,7 +108,17 @@ lgb.train <- function(params = list(), begin_iteration <- predictor$current_iter() + 1 } # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one - n_rounds <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators") + n_rounds <- c( + "num_iterations" + , "num_iteration" + , "n_iter" + , "num_tree" + , "num_trees" + , "num_round" + , "num_rounds" + , "num_boost_round" + , "n_estimators" + ) if (any(names(params) %in% n_rounds)) { end_iteration <- begin_iteration + params[[which(names(params) %in% n_rounds)[1]]] - 1 } else { @@ -198,12 +211,24 @@ lgb.train <- function(params = list(), early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change") if (any(names(params) %in% early_stop)) { if (params[[which(names(params) %in% early_stop)[1]]] > 0) { - callbacks <- add.cb(callbacks, cb.early.stop(params[[which(names(params) %in% early_stop)[1]]], verbose = verbose)) + callbacks <- add.cb( + callbacks + , cb.early.stop( + params[[which(names(params) %in% early_stop)[1]]] + , verbose = verbose + ) + ) } } else { if (!is.null(early_stopping_rounds)) { if (early_stopping_rounds > 0) { - callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose)) + callbacks <- add.cb( + callbacks + , cb.early.stop( + early_stopping_rounds + , verbose = verbose + ) + ) } } } @@ -267,7 +292,8 @@ lgb.train <- function(params = list(), } - # When early stopping is not activated, we compute the best iteration / score ourselves by selecting the first metric and the first dataset + # When early stopping is not activated, we compute the best iteration / score ourselves by + # selecting the first metric and the first dataset if (record && length(valids) > 0 && is.na(env$best_score)) { if (env$eval_list[[1]]$higher_better[1] == TRUE) { booster$best_iter <- unname(which.max(unlist(booster$record_evals[[2]][[1]][[1]]))) @@ -282,9 +308,11 @@ lgb.train <- function(params = list(), if (reset_data) { # Store temporarily model data elsewhere - booster_old <- list(best_iter = booster$best_iter, - best_score = booster$best_score, - record_evals = booster$record_evals) + booster_old <- list( + best_iter = booster$best_iter + , best_score = booster$best_score + , record_evals = booster$record_evals + ) # Reload model booster <- lgb.load(model_str = booster$save_model_to_string()) diff --git a/R-package/R/lgb.unloader.R b/R-package/R/lgb.unloader.R index 35139dffed2e..e4bfc99ad889 100644 --- a/R-package/R/lgb.unloader.R +++ b/R-package/R/lgb.unloader.R @@ -2,9 +2,13 @@ #' #' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object. #' -#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed. -#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them. -#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment. +#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. +#' Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once +#' unloading is performed. +#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global +#' environment. Defaults to \code{FALSE} which means to not remove them. +#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to +#' \code{.GlobalEnv} which is the global environment. #' #' @return NULL invisibly. #' @@ -18,13 +22,15 @@ #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' params <- list(objective = "regression", metric = "l2") #' valids <- list(test = dtest) -#' model <- lgb.train(params, -#' dtrain, -#' 10, -#' valids, -#' min_data = 1, -#' learning_rate = 1, -#' early_stopping_rounds = 5) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 10 +#' , valids = valids +#' , min_data = 1 +#' , learning_rate = 1 +#' , early_stopping_rounds = 5 +#' ) #' #' \dontrun{ #' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) @@ -43,8 +49,18 @@ lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) { # Should we wipe variables? (lgb.Booster, lgb.Dataset) if (wipe) { - boosters <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Booster"), ls(envir = envir)) - datasets <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Dataset"), ls(envir = envir)) + boosters <- Filter( + f = function(x){ + inherits(get(x, envir = envir), "lgb.Booster") + } + , x = ls(envir = envir) + ) + datasets <- Filter( + f = function(x){ + inherits(get(x, envir = envir), "lgb.Dataset") + } + , x = ls(envir = envir) + ) rm(list = c(boosters, datasets), envir = envir) gc(verbose = FALSE) } diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index ce8e932701b9..d93849d28927 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -4,12 +4,10 @@ #' @param callbacks list of callback functions #' List of callback functions that are applied at each iteration. #' @param data a \code{lgb.Dataset} object, used for training -#' @param early_stopping_rounds int -#' Activates early stopping. -#' Requires at least one validation data and one metric -#' If there's more than one, will check all of them except the training data -#' Returns the model with (best_iter + early_stopping_rounds) -#' If early stopping occurs, the model will have 'best_iter' field +#' @param early_stopping_rounds int. Activates early stopping. Requires at least one validation data +#' and one metric. If there's more than one, will check all of them +#' except the training data. Returns the model with (best_iter + early_stopping_rounds). +#' If early stopping occurs, the model will have 'best_iter' field. #' @param eval_freq evaluation output frequency, only effect when verbose > 0 #' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model #' @param nrounds number of training rounds @@ -76,9 +74,18 @@ lightgbm <- function(data, } # Train a model using the regular way - bst <- lgb.train(params, dtrain, nrounds, valids, verbose = verbose, eval_freq = eval_freq, - early_stopping_rounds = early_stopping_rounds, - init_model = init_model, callbacks = callbacks, ...) + bst <- lgb.train( + params = params + , data = dtrain + , nrounds = nrounds + , valids = valids + , verbose = verbose + , eval_freq = eval_freq + , early_stopping_rounds = early_stopping_rounds + , init_model = init_model + , callbacks = callbacks + , ... + ) # Store model under a specific name bst$save_model(save_name) diff --git a/R-package/R/readRDS.lgb.Booster.R b/R-package/R/readRDS.lgb.Booster.R index e654e0578316..0682fd56d52d 100644 --- a/R-package/R/readRDS.lgb.Booster.R +++ b/R-package/R/readRDS.lgb.Booster.R @@ -17,13 +17,15 @@ #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' params <- list(objective = "regression", metric = "l2") #' valids <- list(test = dtest) -#' model <- lgb.train(params, -#' dtrain, -#' 10, -#' valids, -#' min_data = 1, -#' learning_rate = 1, -#' early_stopping_rounds = 5) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 10 +#' , valids = valids +#' , min_data = 1 +#' , learning_rate = 1 +#' , early_stopping_rounds = 5 +#' ) #' saveRDS.lgb.Booster(model, "model.rds") #' new_model <- readRDS.lgb.Booster("model.rds") #' diff --git a/R-package/R/saveRDS.lgb.Booster.R b/R-package/R/saveRDS.lgb.Booster.R index 704a568de7e1..9ec4d6772123 100644 --- a/R-package/R/saveRDS.lgb.Booster.R +++ b/R-package/R/saveRDS.lgb.Booster.R @@ -1,12 +1,18 @@ #' saveRDS for \code{lgb.Booster} models #' -#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not. +#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides +#' whether to save the raw model or not. #' #' @param object R object to serialize. #' @param file a connection or the name of the file where the R object is saved to or read from. -#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save. -#' @param version the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions. -#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection. +#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), +#' a binary one is used. See the comments in the help for save. +#' @param version the workspace format version to use. \code{NULL} specifies the current default +#' version (2). Versions prior to 2 are not supported, so this will only be relevant +#' when there are later versions. +#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, +#' or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of +#' compression to be used. Ignored if file is a connection. #' @param refhook a hook function for handling reference objects. #' @param raw whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}. #' @@ -23,10 +29,10 @@ #' params <- list(objective = "regression", metric = "l2") #' valids <- list(test = dtest) #' model <- lgb.train( -#' params -#' , dtrain -#' , 10 -#' , valids +#' params = params +#' , data = dtrain +#' , nrounds = 10 +#' , valids = valids #' , min_data = 1 #' , learning_rate = 1 #' , early_stopping_rounds = 5 @@ -48,12 +54,14 @@ saveRDS.lgb.Booster <- function(object, object$save() # Save RDS - saveRDS(object, - file = file, - ascii = ascii, - version = version, - compress = compress, - refhook = refhook) + saveRDS( + object + , file = file + , ascii = ascii + , version = version + , compress = compress + , refhook = refhook + ) # Free model from memory object$raw <- NA @@ -61,12 +69,14 @@ saveRDS.lgb.Booster <- function(object, } else { # Save as usual - saveRDS(object, - file = file, - ascii = ascii, - version = version, - compress = compress, - refhook = refhook) + saveRDS( + object + , file = file + , ascii = ascii + , version = version + , compress = compress + , refhook = refhook + ) } diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 0661801acb7c..a22bc992e2a1 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -25,9 +25,20 @@ lgb.call <- function(fun_name, ret, ...) { # Check for a ret call if (!is.null(ret)) { - call_state <- .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm") # Call with ret + call_state <- .Call( + fun_name + , ... + , ret + , call_state + , PACKAGE = "lib_lightgbm" + ) } else { - call_state <- .Call(fun_name, ..., call_state, PACKAGE = "lib_lightgbm") # Call without ret + call_state <- .Call( + fun_name + , ... + , call_state + , PACKAGE = "lib_lightgbm" + ) } call_state <- as.integer(call_state) # Check for call state value post call @@ -37,17 +48,25 @@ lgb.call <- function(fun_name, ret, ...) { buf_len <- 200L act_len <- 0L err_msg <- raw(buf_len) - err_msg <- .Call("LGBM_GetLastError_R", buf_len, act_len, err_msg, PACKAGE = "lib_lightgbm") + err_msg <- .Call( + "LGBM_GetLastError_R" + , buf_len + , act_len + , err_msg + , PACKAGE = "lib_lightgbm" + ) # Check error buffer if (act_len > buf_len) { buf_len <- act_len err_msg <- raw(buf_len) - err_msg <- .Call("LGBM_GetLastError_R", - buf_len, - act_len, - err_msg, - PACKAGE = "lib_lightgbm") + err_msg <- .Call( + "LGBM_GetLastError_R" + , buf_len + , act_len + , err_msg + , PACKAGE = "lib_lightgbm" + ) } # Return error @@ -97,7 +116,13 @@ lgb.params2str <- function(params, ...) { # Check for identical parameters if (length(intersect(names(params), names(dot_params))) > 0) { - stop("Same parameters in ", sQuote("params"), " and in the call are not allowed. Please check your ", sQuote("params"), " list") + stop( + "Same parameters in " + , sQuote("params") + , " and in the call are not allowed. Please check your " + , sQuote("params") + , " list" + ) } # Merge parameters @@ -160,15 +185,43 @@ lgb.check.params <- function(params) { lgb.check.obj <- function(params, obj) { # List known objectives in a vector - OBJECTIVES <- c("regression", "regression_l1", "regression_l2", "mean_squared_error", "mse", "l2_root", "root_mean_squared_error", "rmse", - "mean_absolute_error", "mae", "quantile", - "huber", "fair", "poisson", "binary", "lambdarank", - "multiclass", "softmax", "multiclassova", "multiclass_ova", "ova", "ovr", - "xentropy", "cross_entropy", "xentlambda", "cross_entropy_lambda", "mean_absolute_percentage_error", "mape", - "gamma", "tweedie") + OBJECTIVES <- c( + "regression" + , "regression_l1" + , "regression_l2" + , "mean_squared_error" + , "mse" + , "l2_root" + , "root_mean_squared_error" + , "rmse" + , "mean_absolute_error" + , "mae" + , "quantile" + , "huber" + , "fair" + , "poisson" + , "binary" + , "lambdarank" + , "multiclass" + , "softmax" + , "multiclassova" + , "multiclass_ova" + , "ova" + , "ovr" + , "xentropy" + , "cross_entropy" + , "xentlambda" + , "cross_entropy_lambda" + , "mean_absolute_percentage_error" + , "mape" + , "gamma" + , "tweedie" + ) # Check whether the objective is empty or not, and take it from params if needed - if (!is.null(obj)) { params$objective <- obj } + if (!is.null(obj)) { + params$objective <- obj + } # Check whether the objective is a character if (is.character(params$objective)) { diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R index 4630c59f63f8..35c2621f09b7 100644 --- a/R-package/demo/basic_walkthrough.R +++ b/R-package/demo/basic_walkthrough.R @@ -17,62 +17,82 @@ class(train$data) # Note: we are putting in sparse matrix here, lightgbm naturally handles sparse input # Use sparse matrix when your feature is sparse (e.g. when you are using one-hot encoding vector) print("Training lightgbm with sparseMatrix") -bst <- lightgbm(data = train$data, - label = train$label, - num_leaves = 4, - learning_rate = 1, - nrounds = 2, - objective = "binary") +bst <- lightgbm( + data = train$data + , label = train$label + , num_leaves = 4 + , learning_rate = 1 + , nrounds = 2 + , objective = "binary" +) # Alternatively, you can put in dense matrix, i.e. basic R-matrix print("Training lightgbm with Matrix") -bst <- lightgbm(data = as.matrix(train$data), - label = train$label, - num_leaves = 4, - learning_rate = 1, - nrounds = 2, - objective = "binary") +bst <- lightgbm( + data = as.matrix(train$data) + , label = train$label + , num_leaves = 4 + , learning_rate = 1 + , nrounds = 2 + , objective = "binary" +) # You can also put in lgb.Dataset object, which stores label, data and other meta datas needed for advanced features print("Training lightgbm with lgb.Dataset") -dtrain <- lgb.Dataset(data = train$data, - label = train$label) -bst <- lightgbm(data = dtrain, - num_leaves = 4, - learning_rate = 1, - nrounds = 2, - objective = "binary") +dtrain <- lgb.Dataset( + data = train$data + , label = train$label +) +bst <- lightgbm( + data = dtrain + , num_leaves = 4 + , learning_rate = 1 + , nrounds = 2 + , objective = "binary" +) # Verbose = 0,1,2 print("Train lightgbm with verbose 0, no message") -bst <- lightgbm(data = dtrain, - num_leaves = 4, - learning_rate = 1, - nrounds = 2, - objective = "binary", - verbose = 0) +bst <- lightgbm( + data = dtrain + , num_leaves = 4 + , learning_rate = 1 + , nrounds = 2 + , objective = "binary" + , verbose = 0 +) print("Train lightgbm with verbose 1, print evaluation metric") -bst <- lightgbm(data = dtrain, - num_leaves = 4, - learning_rate = 1, - nrounds = 2, - nthread = 2, - objective = "binary", - verbose = 1) +bst <- lightgbm( + data = dtrain + , num_leaves = 4 + , learning_rate = 1 + , nrounds = 2 + , nthread = 2 + , objective = "binary" + , verbose = 1 +) print("Train lightgbm with verbose 2, also print information about tree") -bst <- lightgbm(data = dtrain, - num_leaves = 4, - learning_rate = 1, - nrounds = 2, - nthread = 2, - objective = "binary", - verbose = 2) +bst <- lightgbm( + data = dtrain + , num_leaves = 4 + , learning_rate = 1 + , nrounds = 2 + , nthread = 2 + , objective = "binary" + , verbose = 2 +) # You can also specify data as file path to a LibSVM/TCV/CSV format input # Since we do not have this file with us, the following line is just for illustration -# bst <- lightgbm(data = "agaricus.train.svm", num_leaves = 4, learning_rate = 1, nrounds = 2,objective = "binary") +# bst <- lightgbm( +# data = "agaricus.train.svm" +# , num_leaves = 4 +# , learning_rate = 1 +# , nrounds = 2 +# , objective = "binary" +# ) #--------------------Basic prediction using lightgbm-------------- # You can do prediction using the following line @@ -104,37 +124,43 @@ valids <- list(train = dtrain, test = dtest) # To train with valids, use lgb.train, which contains more advanced features # valids allows us to monitor the evaluation result on all data in the list print("Train lightgbm using lgb.train with valids") -bst <- lgb.train(data = dtrain, - num_leaves = 4, - learning_rate = 1, - nrounds = 2, - valids = valids, - nthread = 2, - objective = "binary") +bst <- lgb.train( + data = dtrain + , num_leaves = 4 + , learning_rate = 1 + , nrounds = 2 + , valids = valids + , nthread = 2 + , objective = "binary" +) # We can change evaluation metrics, or use multiple evaluation metrics print("Train lightgbm using lgb.train with valids, watch logloss and error") -bst <- lgb.train(data = dtrain, - num_leaves = 4, - learning_rate = 1, - nrounds = 2, - valids = valids, - eval = c("binary_error", "binary_logloss"), - nthread = 2, - objective = "binary") +bst <- lgb.train( + data = dtrain + , num_leaves = 4 + , learning_rate = 1 + , nrounds = 2 + , valids = valids + , eval = c("binary_error", "binary_logloss") + , nthread = 2 + , objective = "binary" +) # lgb.Dataset can also be saved using lgb.Dataset.save lgb.Dataset.save(dtrain, "dtrain.buffer") # To load it in, simply call lgb.Dataset dtrain2 <- lgb.Dataset("dtrain.buffer") -bst <- lgb.train(data = dtrain2, - num_leaves = 4, - learning_rate = 1, - nrounds = 2, - valids = valids, - nthread = 2, - objective = "binary") +bst <- lgb.train( + data = dtrain2 + , num_leaves = 4 + , learning_rate = 1 + , nrounds = 2 + , valids = valids + , nthread = 2 + , objective = "binary" +) # information can be extracted from lgb.Dataset using getinfo label = getinfo(dtest, "label") diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R index 4a2b3d4fea40..391aa24ac0c8 100644 --- a/R-package/demo/boost_from_prediction.R +++ b/R-package/demo/boost_from_prediction.R @@ -13,10 +13,12 @@ valids <- list(eval = dtest, train = dtrain) print("Start running example to start from an initial prediction") # Train lightgbm for 1 round -param <- list(num_leaves = 4, - learning_rate = 1, - nthread = 2, - objective = "binary") +param <- list( + num_leaves = 4 + , learning_rate = 1 + , nthread = 2 + , objective = "binary" +) bst <- lgb.train(param, dtrain, 1, valids = valids) # Note: we need the margin value instead of transformed prediction in set_init_score @@ -29,7 +31,9 @@ setinfo(dtrain, "init_score", ptrain) setinfo(dtest, "init_score", ptest) print("This is result of boost from initial prediction") -bst <- lgb.train(params = param, - data = dtrain, - nrounds = 5, - valids = valids) +bst <- lgb.train( + params = param + , data = dtrain + , nrounds = 5 + , valids = valids +) diff --git a/R-package/demo/categorical_features_prepare.R b/R-package/demo/categorical_features_prepare.R index 69936913d3ae..a14646f503f0 100644 --- a/R-package/demo/categorical_features_prepare.R +++ b/R-package/demo/categorical_features_prepare.R @@ -60,21 +60,28 @@ my_data <- as.matrix(bank[, 1:16, with = FALSE]) # Creating the LightGBM dataset with categorical features # The categorical features must be indexed like in R (1-indexed, not 0-indexed) -lgb_data <- lgb.Dataset(data = my_data, - label = bank$y, - categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)) +lgb_data <- lgb.Dataset( + data = my_data + , label = bank$y + , categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16) +) # We can now train a model -model <- lgb.train(list(objective = "binary", - metric = "l2", - min_data = 1, - learning_rate = 0.1, - min_data = 0, - min_hessian = 1, - max_depth = 2), - lgb_data, - 100, - valids = list(train = lgb_data)) +params <- list( + objective = "binary" + , metric = "l2" + , min_data = 1 + , learning_rate = 0.1 + , min_data = 0 + , min_hessian = 1 + , max_depth = 2 +) +model <- lgb.train( + params = params + , data = lgb_data + , nrounds = 100 + , valids = list(train = lgb_data) +) # Try to find split_feature: 2 # If you find it, it means it used a categorical feature in the first tree diff --git a/R-package/demo/categorical_features_rules.R b/R-package/demo/categorical_features_rules.R index 27504022842e..26b92c2bc493 100644 --- a/R-package/demo/categorical_features_rules.R +++ b/R-package/demo/categorical_features_rules.R @@ -68,24 +68,33 @@ my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE]) # Creating the LightGBM dataset with categorical features # The categorical features can be passed to lgb.train to not copy and paste a lot -dtrain <- lgb.Dataset(data = my_data_train, - label = bank_train$y, - categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)) -dtest <- lgb.Dataset.create.valid(dtrain, - data = my_data_test, - label = bank_test$y) +dtrain <- lgb.Dataset( + data = my_data_train + , label = bank_train$y + , categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16) +) +dtest <- lgb.Dataset.create.valid( + dtrain + , data = my_data_test + , label = bank_test$y +) # We can now train a model -model <- lgb.train(list(objective = "binary", - metric = "l2", - min_data = 1, - learning_rate = 0.1, - min_data = 0, - min_hessian = 1, - max_depth = 2), - dtrain, - 100, - valids = list(train = dtrain, valid = dtest)) +params <- list( + objective = "binary" + , metric = "l2" + , min_data = 1 + , learning_rate = 0.1 + , min_data = 0 + , min_hessian = 1 + , max_depth = 2 +) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 100 + , valids = list(train = dtrain, valid = dtest) +) # Try to find split_feature: 11 # If you find it, it means it used a categorical feature in the first tree diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R index 008acf4ca1bf..3d79a3b1da70 100644 --- a/R-package/demo/cross_validation.R +++ b/R-package/demo/cross_validation.R @@ -6,30 +6,36 @@ dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = agaricus.test$label) nrounds <- 2 -param <- list(num_leaves = 4, - learning_rate = 1, - objective = "binary") +param <- list( + num_leaves = 4 + , learning_rate = 1 + , objective = "binary" +) print("Running cross validation") # Do cross validation, this will print result out as # [iteration] metric_name:mean_value+std_value # std_value is standard deviation of the metric -lgb.cv(param, - dtrain, - nrounds, - nfold = 5, - eval = "binary_error") +lgb.cv( + param + , dtrain + , nrounds + , nfold = 5 + , eval = "binary_error" +) print("Running cross validation, disable standard deviation display") # do cross validation, this will print result out as # [iteration] metric_name:mean_value+std_value # std_value is standard deviation of the metric -lgb.cv(param, - dtrain, - nrounds, - nfold = 5, - eval = "binary_error", - showsd = FALSE) +lgb.cv( + param + , dtrain + , nrounds + , nfold = 5 + , eval = "binary_error" + , showsd = FALSE +) # You can also do cross validation with cutomized loss function print("Running cross validation, with cutomsized loss function") @@ -48,9 +54,11 @@ evalerror <- function(preds, dtrain) { } # train with customized objective -lgb.cv(params = param, - data = dtrain, - nrounds = nrounds, - obj = logregobj, - eval = evalerror, - nfold = 5) +lgb.cv( + params = param + , data = dtrain + , nrounds = nrounds + , obj = logregobj + , eval = evalerror + , nfold = 5 +) diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R index e37fc0e1a174..92470e7bb739 100644 --- a/R-package/demo/early_stopping.R +++ b/R-package/demo/early_stopping.R @@ -11,8 +11,10 @@ dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = aga # Note: for customized objective function, we leave objective as default # Note: what we are getting is margin value in prediction # You must know what you are doing -param <- list(num_leaves = 4, - learning_rate = 1) +param <- list( + num_leaves = 4 + , learning_rate = 1 +) valids <- list(eval = dtest) num_round <- 20 @@ -39,10 +41,12 @@ evalerror <- function(preds, dtrain) { } print("Start training with early Stopping setting") -bst <- lgb.train(param, - dtrain, - num_round, - valids, - objective = logregobj, - eval = evalerror, - early_stopping_round = 3) +bst <- lgb.train( + param + , dtrain + , num_round + , valids + , objective = logregobj + , eval = evalerror + , early_stopping_round = 3 +) diff --git a/R-package/demo/efficient_many_training.R b/R-package/demo/efficient_many_training.R index 0e11a13b38e0..50c45f3a0a55 100644 --- a/R-package/demo/efficient_many_training.R +++ b/R-package/demo/efficient_many_training.R @@ -26,9 +26,11 @@ gbm <- list() for (i in 1:1000) { print(i) - gbm[[i]] <- lgb.train(params = list(objective = "regression"), - data = data, - 1, - reset_data = TRUE) + gbm[[i]] <- lgb.train( + params = list(objective = "regression") + , data = data + , 1 + , reset_data = TRUE + ) gc(verbose = FALSE) } diff --git a/R-package/demo/leaf_stability.R b/R-package/demo/leaf_stability.R index 8d7982ff4dff..b97fc3e5e241 100644 --- a/R-package/demo/leaf_stability.R +++ b/R-package/demo/leaf_stability.R @@ -17,32 +17,47 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) # Third, we setup parameters and we train a model params <- list(objective = "regression", metric = "l2") valids <- list(test = dtest) -model <- lgb.train(params, - dtrain, - 50, - valids, - min_data = 1, - learning_rate = 0.1, - bagging_fraction = 0.1, - bagging_freq = 1, - bagging_seed = 1) +model <- lgb.train( + params + , dtrain + , 50 + , valids + , min_data = 1 + , learning_rate = 0.1 + , bagging_fraction = 0.1 + , bagging_freq = 1 + , bagging_seed = 1 +) # We create a data.frame with the following structure: # X = average leaf of the observation throughout all trees # Y = prediction probability (clamped to [1e-15, 1-1e-15]) # Z = logloss # binned = binned quantile of average leaf -new_data <- data.frame(X = rowMeans(predict(model, - agaricus.test$data, - predleaf = TRUE)), - Y = pmin(pmax(predict(model, - agaricus.test$data), 1e-15), 1 - 1e-15)) -new_data$Z <- -(agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y)) -new_data$binned <- .bincode(x = new_data$X, - breaks = quantile(x = new_data$X, - probs = (1:9)/10), - right = TRUE, - include.lowest = TRUE) +new_data <- data.frame( + X = rowMeans(predict( + model + , agaricus.test$data + , predleaf = TRUE + )) + , Y = pmin( + pmax( + predict(model, agaricus.test$data) + , 1e-15 + ) + , 1 - 1e-15 + ) +) +new_data$Z <- -1 * (agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y)) +new_data$binned <- .bincode( + x = new_data$X + , breaks = quantile( + x = new_data$X + , probs = (1:9) / 10 + ) + , right = TRUE + , include.lowest = TRUE +) new_data$binned[is.na(new_data$binned)] <- 0 new_data$binned <- as.factor(new_data$binned) @@ -52,31 +67,64 @@ table(new_data$binned) # We can plot the binned content # On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss # On the third plot, it is smooth! -ggplot(data = new_data, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability") -ggplot(data = new_data, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss") -ggplot(data = new_data, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density") +ggplot( + data = new_data + , mapping = aes(x = X, y = Y, color = binned) +) + geom_point() + + theme_bw() + + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability") +ggplot( + data = new_data + , mapping = aes(x = binned, y = Z, fill = binned, group = binned) +) + geom_boxplot() + + theme_bw() + + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss") +ggplot( + data = new_data + , mapping = aes(x = Y, y = ..count.., fill = binned) +) + geom_density(position = "fill") + + theme_bw() + + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density") # Now, let's show with other parameters -model2 <- lgb.train(params, - dtrain, - 100, - valids, - min_data = 1, - learning_rate = 1) +model2 <- lgb.train( + params + , dtrain + , 100 + , valids + , min_data = 1 + , learning_rate = 1 +) # We create the data structure, but for model2 -new_data2 <- data.frame(X = rowMeans(predict(model2, - agaricus.test$data, - predleaf = TRUE)), - Y = pmin(pmax(predict(model2, - agaricus.test$data), 1e-15), 1 - 1e-15)) -new_data2$Z <- -(agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y)) -new_data2$binned <- .bincode(x = new_data2$X, - breaks = quantile(x = new_data2$X, - probs = (1:9)/10), - right = TRUE, - include.lowest = TRUE) +new_data2 <- data.frame( + X = rowMeans(predict( + model2 + , agaricus.test$data + , predleaf = TRUE + )) + , Y = pmin( + pmax( + predict( + model2 + , agaricus.test$data + ) + , 1e-15 + ) + , 1 - 1e-15 + ) +) +new_data2$Z <- -1 * (agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y)) +new_data2$binned <- .bincode( + x = new_data2$X + , breaks = quantile( + x = new_data2$X + , probs = (1:9) / 10 + ) + , right = TRUE + , include.lowest = TRUE +) new_data2$binned[is.na(new_data2$binned)] <- 0 new_data2$binned <- as.factor(new_data2$binned) @@ -87,31 +135,64 @@ table(new_data2$binned) # On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss # On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue # However, if the rules were not true, the loss would explode. -ggplot(data = new_data2, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability") -ggplot(data = new_data2, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss") -ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density") +ggplot( + data = new_data2 + , mapping = aes(x = X, y = Y, color = binned) +) + geom_point() + + theme_bw() + + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability") +ggplot( + data = new_data2 + , mapping = aes(x = binned, y = Z, fill = binned, group = binned) +) + geom_boxplot() + + theme_bw() + + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss") +ggplot( + data = new_data2 + , mapping = aes(x = Y, y = ..count.., fill = binned) +) + geom_density(position = "fill") + + theme_bw() + + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density") # Now, try with very severe overfitting -model3 <- lgb.train(params, - dtrain, - 1000, - valids, - min_data = 1, - learning_rate = 1) +model3 <- lgb.train( + params + , dtrain + , 1000 + , valids + , min_data = 1 + , learning_rate = 1 +) # We create the data structure, but for model3 -new_data3 <- data.frame(X = rowMeans(predict(model3, - agaricus.test$data, - predleaf = TRUE)), - Y = pmin(pmax(predict(model3, - agaricus.test$data), 1e-15), 1 - 1e-15)) -new_data3$Z <- -(agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y)) -new_data3$binned <- .bincode(x = new_data3$X, - breaks = quantile(x = new_data3$X, - probs = (1:9)/10), - right = TRUE, - include.lowest = TRUE) +new_data3 <- data.frame( + X = rowMeans(predict( + model3 + , agaricus.test$data + , predleaf = TRUE + )) + , Y = pmin( + pmax( + predict( + model3 + , agaricus.test$data + ) + , 1e-15 + ) + , 1 - 1e-15 + ) +) +new_data3$Z <- -1 * (agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y)) +new_data3$binned <- .bincode( + x = new_data3$X + , breaks = quantile( + x = new_data3$X + , probs = (1:9) / 10 + ) + , right = TRUE + , include.lowest = TRUE +) new_data3$binned[is.na(new_data3$binned)] <- 0 new_data3$binned <- as.factor(new_data3$binned) @@ -119,9 +200,21 @@ new_data3$binned <- as.factor(new_data3$binned) table(new_data3$binned) # We can plot the binned content -# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue. +# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules +# are real thus it is not an issue. # However, if the rules were not true, the loss would explode. See the sudden spikes? -ggplot(data = new_data3, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density") +ggplot( + data = new_data3 + , mapping = aes(x = Y, y = ..count.., fill = binned) +) + + geom_density(position = "fill") + + theme_bw() + + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density") # Compare with our second model, the difference is severe. This is smooth. -ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density") +ggplot( + data = new_data2 + , mapping = aes(x = Y, y = ..count.., fill = binned) +) + geom_density(position = "fill") + + theme_bw() + + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density") diff --git a/R-package/demo/multiclass.R b/R-package/demo/multiclass.R index f1d14691ab89..2bf54bd54260 100644 --- a/R-package/demo/multiclass.R +++ b/R-package/demo/multiclass.R @@ -19,29 +19,33 @@ valids <- list(test = dtest) # Method 1 of training params <- list(objective = "multiclass", metric = "multi_error", num_class = 3) -model <- lgb.train(params, - dtrain, - 100, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 10) +model <- lgb.train( + params + , dtrain + , 100 + , valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 10 +) # We can predict on test data, outputs a 90-length vector # Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3... my_preds <- predict(model, test[, 1:4]) # Method 2 of training, identical -model <- lgb.train(list(), - dtrain, - 100, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 10, - objective = "multiclass", - metric = "multi_error", - num_class = 3) +model <- lgb.train( + list() + , dtrain + , 100 + , valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 10 + , objective = "multiclass" + , metric = "multi_error" + , num_class = 3 +) # We can predict on test data, identical my_preds <- predict(model, test[, 1:4]) diff --git a/R-package/demo/multiclass_custom_objective.R b/R-package/demo/multiclass_custom_objective.R index ccceca70a868..313fa8364782 100644 --- a/R-package/demo/multiclass_custom_objective.R +++ b/R-package/demo/multiclass_custom_objective.R @@ -20,17 +20,19 @@ valids <- list(train = dtrain, test = dtest) # Method 1 of training with built-in multiclass objective # Note: need to turn off boost from average to match custom objective # (https://github.com/microsoft/LightGBM/issues/1846) -model_builtin <- lgb.train(list(), - dtrain, - boost_from_average = FALSE, - 100, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 10, - objective = "multiclass", - metric = "multi_logloss", - num_class = 3) +model_builtin <- lgb.train( + list() + , dtrain + , boost_from_average = FALSE + , 100 + , valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 10 + , objective = "multiclass" + , metric = "multi_logloss" + , num_class = 3 +) preds_builtin <- predict(model_builtin, test[, 1:4], rawscore = TRUE, reshape = TRUE) probs_builtin <- exp(preds_builtin) / rowSums(exp(preds_builtin)) @@ -65,21 +67,25 @@ custom_multiclass_metric = function(preds, dtrain) { preds = preds - apply(preds, 1, max) prob = exp(preds) / rowSums(exp(preds)) - return(list(name = "error", - value = -mean(log(prob[cbind(1:length(labels), labels + 1)])), - higher_better = FALSE)) + return(list( + name = "error" + , value = -mean(log(prob[cbind(1:length(labels), labels + 1)])) + , higher_better = FALSE + )) } -model_custom <- lgb.train(list(), - dtrain, - 100, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 10, - objective = custom_multiclass_obj, - eval = custom_multiclass_metric, - num_class = 3) +model_custom <- lgb.train( + list() + , dtrain + , 100 + , valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 10 + , objective = custom_multiclass_obj + , eval = custom_multiclass_metric + , num_class = 3 +) preds_custom <- predict(model_custom, test[, 1:4], rawscore = TRUE, reshape = TRUE) probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom)) @@ -87,4 +93,3 @@ probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom)) # compare predictions stopifnot(identical(probs_builtin, probs_custom)) stopifnot(identical(preds_builtin, preds_custom)) - diff --git a/R-package/demo/weight_param.R b/R-package/demo/weight_param.R index c680f71e88dd..8d9b52b80878 100644 --- a/R-package/demo/weight_param.R +++ b/R-package/demo/weight_param.R @@ -11,8 +11,8 @@ library(lightgbm) # - Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning) # Setup small weights -weights1 <- rep(1/100000, 6513) -weights2 <- rep(1/100000, 1611) +weights1 <- rep(1 / 100000, 6513) +weights2 <- rep(1 / 100000, 1611) # Load data and create datasets data(agaricus.train, package = "lightgbm") @@ -26,40 +26,48 @@ valids <- list(test = dtest) # Run 1: sum of weights equal to 0.06513 without adjusted regularization (not learning) # It cannot learn because regularization is too large! # min_sum_hessian alone is bigger than the sum of weights, thus you will never learn anything -params <- list(objective = "regression", - metric = "l2", - device = "cpu", - min_sum_hessian = 10, - num_leaves = 7, - max_depth = 3, - nthread = 1) -model <- lgb.train(params, - dtrain, - 50, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 10) +params <- list( + objective = "regression" + , metric = "l2" + , device = "cpu" + , min_sum_hessian = 10 + , num_leaves = 7 + , max_depth = 3 + , nthread = 1 +) +model <- lgb.train( + params + , dtrain + , 50 + , valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 10 +) weight_loss <- as.numeric(model$record_evals$test$l2$eval) plot(weight_loss) # Shows how poor the learning was: a straight line! # Run 2: sum of weights equal to 0.06513 with adjusted regularization (learning) # Adjusted regularization just consisting in multiplicating results by 1e4 (x10000) # Notice how it learns, there is no issue as we adjusted regularization ourselves -params <- list(objective = "regression", - metric = "l2", - device = "cpu", - min_sum_hessian = 1e-4, - num_leaves = 7, - max_depth = 3, - nthread = 1) -model <- lgb.train(params, - dtrain, - 50, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 10) +params <- list( + objective = "regression" + , metric = "l2" + , device = "cpu" + , min_sum_hessian = 1e-4 + , num_leaves = 7 + , max_depth = 3 + , nthread = 1 +) +model <- lgb.train( + params + , dtrain + , 50 + , valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 10 +) small_weight_loss <- as.numeric(model$record_evals$test$l2$eval) plot(small_weight_loss) # It learns! @@ -78,24 +86,28 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) valids <- list(test = dtest) # Setup parameters and run model... -params <- list(objective = "regression", - metric = "l2", - device = "cpu", - min_sum_hessian = 10, - num_leaves = 7, - max_depth = 3, - nthread = 1) -model <- lgb.train(params, - dtrain, - 50, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 10) +params <- list( + objective = "regression" + , metric = "l2" + , device = "cpu" + , min_sum_hessian = 10 + , num_leaves = 7 + , max_depth = 3 + , nthread = 1 +) +model <- lgb.train( + params + , dtrain + , 50 + , valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 10 +) large_weight_loss <- as.numeric(model$record_evals$test$l2$eval) plot(large_weight_loss) # It learns! # Do you want to compare the learning? They both converge. plot(small_weight_loss, large_weight_loss) -curve(1*x, from = 0, to = 0.02, add = TRUE) +curve(1 * x, from = 0, to = 0.02, add = TRUE) diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index b5b8112bd2d3..8bdca02ca940 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -12,9 +12,9 @@ getinfo(dataset, ...) \arguments{ \item{dataset}{Object of class \code{lgb.Dataset}} -\item{...}{other parameters} - \item{name}{the name of the information field to get (see details)} + +\item{...}{other parameters} } \value{ info data diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index dd74fdf6626f..1aa7197f8237 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -4,9 +4,16 @@ \alias{lgb.Dataset} \title{Construct \code{lgb.Dataset} object} \usage{ -lgb.Dataset(data, params = list(), reference = NULL, colnames = NULL, - categorical_feature = NULL, free_raw_data = TRUE, info = list(), - ...) +lgb.Dataset( + data, + params = list(), + reference = NULL, + colnames = NULL, + categorical_feature = NULL, + free_raw_data = TRUE, + info = list(), + ... +) } \arguments{ \item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index 5be785fa18d2..a419122592ff 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -4,13 +4,29 @@ \alias{lgb.cv} \title{Main CV logic for LightGBM} \usage{ -lgb.cv(params = list(), data, nrounds = 10, nfold = 3, - label = NULL, weight = NULL, obj = NULL, eval = NULL, - verbose = 1, record = TRUE, eval_freq = 1L, showsd = TRUE, - stratified = TRUE, folds = NULL, init_model = NULL, - colnames = NULL, categorical_feature = NULL, - early_stopping_rounds = NULL, callbacks = list(), - reset_data = FALSE, ...) +lgb.cv( + params = list(), + data, + nrounds = 10, + nfold = 3, + label = NULL, + weight = NULL, + obj = NULL, + eval = NULL, + verbose = 1, + record = TRUE, + eval_freq = 1L, + showsd = TRUE, + stratified = TRUE, + folds = NULL, + init_model = NULL, + colnames = NULL, + categorical_feature = NULL, + early_stopping_rounds = NULL, + callbacks = list(), + reset_data = FALSE, + ... +) } \arguments{ \item{params}{List of parameters} @@ -27,7 +43,7 @@ lgb.cv(params = list(), data, nrounds = 10, nfold = 3, \item{obj}{objective function, can be character or custom objective function. Examples include \code{regression}, \code{regression_l1}, \code{huber}, -\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} + \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} \item{eval}{evaluation function, can be (list of) character or custom eval function} @@ -54,17 +70,15 @@ the \code{nfold} and \code{stratified} parameters are ignored.} type int represents index, type str represents feature names} -\item{early_stopping_rounds}{int -Activates early stopping. -Requires at least one validation data and one metric -If there's more than one, will check all of them except the training data -Returns the model with (best_iter + early_stopping_rounds) -If early stopping occurs, the model will have 'best_iter' field} +\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data +and one metric. If there's more than one, will check all of them +except the training data. Returns the model with (best_iter + early_stopping_rounds). +If early stopping occurs, the model will have 'best_iter' field.} -\item{callbacks}{list of callback functions -List of callback functions that are applied at each iteration.} +\item{callbacks}{List of callback functions that are applied at each iteration.} -\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets} +\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model +into a predictor model which frees up memory and the original datasets} \item{...}{other parameters, see Parameters.rst for more information. A few key parameters: \itemize{ @@ -89,11 +103,13 @@ data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", metric = "l2") -model <- lgb.cv(params, - dtrain, - 10, - nfold = 3, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 5) +model <- lgb.cv( + params = params + , data = dtrain + , nrounds = 10 + , nfold = 3 + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 5 +) } diff --git a/R-package/man/lgb.dump.Rd b/R-package/man/lgb.dump.Rd index 73226a36b1b2..c03dcdd7fc6b 100644 --- a/R-package/man/lgb.dump.Rd +++ b/R-package/man/lgb.dump.Rd @@ -27,13 +27,15 @@ test <- agaricus.test dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) params <- list(objective = "regression", metric = "l2") valids <- list(test = dtest) -model <- lgb.train(params, - dtrain, - 10, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 5) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 10 + , valids = valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 5 +) json_model <- lgb.dump(model) } diff --git a/R-package/man/lgb.get.eval.result.Rd b/R-package/man/lgb.get.eval.result.Rd index b13c151c9b57..0b124ffb62cb 100644 --- a/R-package/man/lgb.get.eval.result.Rd +++ b/R-package/man/lgb.get.eval.result.Rd @@ -4,8 +4,13 @@ \alias{lgb.get.eval.result} \title{Get record evaluation result from booster} \usage{ -lgb.get.eval.result(booster, data_name, eval_name, iters = NULL, - is_err = FALSE) +lgb.get.eval.result( + booster, + data_name, + eval_name, + iters = NULL, + is_err = FALSE +) } \arguments{ \item{booster}{Object of class \code{lgb.Booster}} @@ -34,12 +39,14 @@ test <- agaricus.test dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) params <- list(objective = "regression", metric = "l2") valids <- list(test = dtest) -model <- lgb.train(params, - dtrain, - 10, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 5) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 10 + , valids = valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 5 +) lgb.get.eval.result(model, "test", "l2") } diff --git a/R-package/man/lgb.importance.Rd b/R-package/man/lgb.importance.Rd index 6955fad34129..2a75e1c14eb3 100644 --- a/R-package/man/lgb.importance.Rd +++ b/R-package/man/lgb.importance.Rd @@ -29,9 +29,14 @@ data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) -params <- list(objective = "binary", - learning_rate = 0.01, num_leaves = 63, max_depth = -1, - min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) +params <- list( + objective = "binary" + , learning_rate = 0.01 + , num_leaves = 63 + , max_depth = -1 + , min_data_in_leaf = 1 + , min_sum_hessian_in_leaf = 1 +) model <- lgb.train(params, dtrain, 10) tree_imp1 <- lgb.importance(model, percentage = TRUE) diff --git a/R-package/man/lgb.interprete.Rd b/R-package/man/lgb.interprete.Rd index 62f5c58caa93..798f39c2eef7 100644 --- a/R-package/man/lgb.interprete.Rd +++ b/R-package/man/lgb.interprete.Rd @@ -11,7 +11,7 @@ lgb.interprete(model, data, idxset, num_iteration = NULL) \item{data}{a matrix object or a dgCMatrix object.} -\item{idxset}{a integer vector of indices of rows needed.} +\item{idxset}{an integer vector of indices of rows needed.} \item{num_iteration}{number of iteration want to predict with, NULL or <= 0 means use best iteration.} } diff --git a/R-package/man/lgb.load.Rd b/R-package/man/lgb.load.Rd index 943eb81ee2e3..ebe14d0e39bc 100644 --- a/R-package/man/lgb.load.Rd +++ b/R-package/man/lgb.load.Rd @@ -29,13 +29,15 @@ test <- agaricus.test dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) params <- list(objective = "regression", metric = "l2") valids <- list(test = dtest) -model <- lgb.train(params, - dtrain, - 10, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 5) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 10 + , valids = valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 5 +) lgb.save(model, "model.txt") load_booster <- lgb.load(filename = "model.txt") model_string <- model$save_model_to_string(NULL) # saves best iteration diff --git a/R-package/man/lgb.model.dt.tree.Rd b/R-package/man/lgb.model.dt.tree.Rd index c25d139fa12c..fc85105eee50 100644 --- a/R-package/man/lgb.model.dt.tree.Rd +++ b/R-package/man/lgb.model.dt.tree.Rd @@ -44,9 +44,14 @@ data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) -params <- list(objective = "binary", - learning_rate = 0.01, num_leaves = 63, max_depth = -1, - min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) +params <- list( + objective = "binary" + , learning_rate = 0.01 + , num_leaves = 63 + , max_depth = -1 + , min_data_in_leaf = 1 + , min_sum_hessian_in_leaf = 1 +) model <- lgb.train(params, dtrain, 10) tree_dt <- lgb.model.dt.tree(model) diff --git a/R-package/man/lgb.plot.importance.Rd b/R-package/man/lgb.plot.importance.Rd index 84493b1e9e6c..cc419ab0ae8d 100644 --- a/R-package/man/lgb.plot.importance.Rd +++ b/R-package/man/lgb.plot.importance.Rd @@ -4,8 +4,13 @@ \alias{lgb.plot.importance} \title{Plot feature importance as a bar graph} \usage{ -lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain", - left_margin = 10, cex = NULL) +lgb.plot.importance( + tree_imp, + top_n = 10, + measure = "Gain", + left_margin = 10, + cex = NULL +) } \arguments{ \item{tree_imp}{a \code{data.table} returned by \code{\link{lgb.importance}}.} diff --git a/R-package/man/lgb.plot.interpretation.Rd b/R-package/man/lgb.plot.interpretation.Rd index 6d3637e9ea41..a026d619c8f9 100644 --- a/R-package/man/lgb.plot.interpretation.Rd +++ b/R-package/man/lgb.plot.interpretation.Rd @@ -4,8 +4,13 @@ \alias{lgb.plot.interpretation} \title{Plot feature contribution as a bar graph} \usage{ -lgb.plot.interpretation(tree_interpretation_dt, top_n = 10, cols = 1, - left_margin = 10, cex = NULL) +lgb.plot.interpretation( + tree_interpretation_dt, + top_n = 10, + cols = 1, + left_margin = 10, + cex = NULL +) } \arguments{ \item{tree_interpretation_dt}{a \code{data.table} returned by \code{\link{lgb.interprete}}.} @@ -25,8 +30,8 @@ The \code{lgb.plot.interpretation} function creates a \code{barplot}. Plot previously calculated feature contribution as a bar graph. } \details{ -The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature. -Features are shown ranked in a decreasing contribution order. +The graph represents each feature as a horizontal bar of length proportional to the defined +contribution of a feature. Features are shown ranked in a decreasing contribution order. } \examples{ library(lightgbm) @@ -39,9 +44,14 @@ setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label)) data(agaricus.test, package = "lightgbm") test <- agaricus.test -params <- list(objective = "binary", - learning_rate = 0.01, num_leaves = 63, max_depth = -1, - min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) +params <- list( + objective = "binary" + , learning_rate = 0.01 + , num_leaves = 63 + , max_depth = -1 + , min_data_in_leaf = 1 + , min_sum_hessian_in_leaf = 1 +) model <- lgb.train(params, dtrain, 10) tree_interpretation <- lgb.interprete(model, test$data, 1:5) diff --git a/R-package/man/lgb.prepare.Rd b/R-package/man/lgb.prepare.Rd index fd38d5046eb6..262a52aa1f21 100644 --- a/R-package/man/lgb.prepare.Rd +++ b/R-package/man/lgb.prepare.Rd @@ -10,10 +10,13 @@ lgb.prepare(data) \item{data}{A data.frame or data.table to prepare.} } \value{ -The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. +The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) + for input in \code{lgb.Dataset}. } \description{ -Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric without integers. Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets. +Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. +Factors and characters are converted to numeric without integers. Please use +\code{lgb.prepare_rules} if you want to apply this transformation to other datasets. } \examples{ library(lightgbm) diff --git a/R-package/man/lgb.prepare2.Rd b/R-package/man/lgb.prepare2.Rd index 106ab21d587f..1d8400220b64 100644 --- a/R-package/man/lgb.prepare2.Rd +++ b/R-package/man/lgb.prepare2.Rd @@ -10,10 +10,16 @@ lgb.prepare2(data) \item{data}{A data.frame or data.table to prepare.} } \value{ -The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. +The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) + for input in \code{lgb.Dataset}. } \description{ -Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM. +Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. +Factors and characters are converted to numeric (specifically: integer). +Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. +This is useful if you have a specific need for integer dataset instead of numeric dataset. +Note that there are programs which do not support integer-only input. Consider this as a half +memory technique which is dangerous, especially for LightGBM. } \examples{ library(lightgbm) diff --git a/R-package/man/lgb.prepare_rules.Rd b/R-package/man/lgb.prepare_rules.Rd index 939cb1fd58ef..d8caa74475aa 100644 --- a/R-package/man/lgb.prepare_rules.Rd +++ b/R-package/man/lgb.prepare_rules.Rd @@ -12,10 +12,14 @@ lgb.prepare_rules(data, rules = NULL) \item{rules}{A set of rules from the data preparator, if already used.} } \value{ -A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. +A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). + The data must be converted to a matrix format (\code{as.matrix}) for input + in \code{lgb.Dataset}. } \description{ -Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter. +Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. +Factors and characters are converted to numeric. In addition, keeps rules created +so you can convert other datasets using this converter. } \examples{ library(lightgbm) diff --git a/R-package/man/lgb.prepare_rules2.Rd b/R-package/man/lgb.prepare_rules2.Rd index 5e2d909e9d6f..64ccd2041e1e 100644 --- a/R-package/man/lgb.prepare_rules2.Rd +++ b/R-package/man/lgb.prepare_rules2.Rd @@ -12,43 +12,15 @@ lgb.prepare_rules2(data, rules = NULL) \item{rules}{A set of rules from the data preparator, if already used.} } \value{ -A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. +A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). + The data must be converted to a matrix format (\code{as.matrix}) for input in + \code{lgb.Dataset}. } \description{ -Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM. -} -\examples{ -library(lightgbm) -data(iris) - -str(iris) - -new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter -str(new_iris$data) - -data(iris) # Erase iris dataset -iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA) - -# Use conversion using known rules -# Unknown factors become 0, excellent for sparse datasets -newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules) - -# Unknown factor is now zero, perfect for sparse datasets -newer_iris$data[1, ] # Species became 0 as it is an unknown factor - -newer_iris$data[1, 5] <- 1 # Put back real initial value - -# Is the newly created dataset equal? YES! -all.equal(new_iris$data, newer_iris$data) - -# Can we test our own rules? -data(iris) # Erase iris dataset - -# We remapped values differently -personal_rules <- list(Species = c("setosa" = 3L, - "versicolor" = 2L, - "virginica" = 1L)) -newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules) -str(newest_iris$data) # SUCCESS! - +Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. +Factors and characters are converted to numeric (specifically: integer). +In addition, keeps rules created so you can convert other datasets using this converter. +This is useful if you have a specific need for integer dataset instead of numeric dataset. +Note that there are programs which do not support integer-only input. +Consider this as a half memory technique which is dangerous, especially for LightGBM. } diff --git a/R-package/man/lgb.save.Rd b/R-package/man/lgb.save.Rd index f0ab4c593b42..df483f6513bb 100644 --- a/R-package/man/lgb.save.Rd +++ b/R-package/man/lgb.save.Rd @@ -29,13 +29,15 @@ test <- agaricus.test dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) params <- list(objective = "regression", metric = "l2") valids <- list(test = dtest) -model <- lgb.train(params, - dtrain, - 10, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 5) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 10 + , valids = valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 5 +) lgb.save(model, "model.txt") } diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 79b16ef88910..f7096389d428 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -4,11 +4,24 @@ \alias{lgb.train} \title{Main training logic for LightGBM} \usage{ -lgb.train(params = list(), data, nrounds = 10, valids = list(), - obj = NULL, eval = NULL, verbose = 1, record = TRUE, - eval_freq = 1L, init_model = NULL, colnames = NULL, - categorical_feature = NULL, early_stopping_rounds = NULL, - callbacks = list(), reset_data = FALSE, ...) +lgb.train( + params = list(), + data, + nrounds = 10, + valids = list(), + obj = NULL, + eval = NULL, + verbose = 1, + record = TRUE, + eval_freq = 1L, + init_model = NULL, + colnames = NULL, + categorical_feature = NULL, + early_stopping_rounds = NULL, + callbacks = list(), + reset_data = FALSE, + ... +) } \arguments{ \item{params}{List of parameters} @@ -39,17 +52,16 @@ lgb.train(params = list(), data, nrounds = 10, valids = list(), type int represents index, type str represents feature names} -\item{early_stopping_rounds}{int -Activates early stopping. -Requires at least one validation data and one metric -If there's more than one, will check all of them except the training data -Returns the model with (best_iter + early_stopping_rounds) -If early stopping occurs, the model will have 'best_iter' field} +\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data +and one metric. If there's more than one, will check all of them +except the training data. Returns the model with (best_iter + early_stopping_rounds). +If early stopping occurs, the model will have 'best_iter' field.} -\item{callbacks}{list of callback functions -List of callback functions that are applied at each iteration.} +\item{callbacks}{List of callback functions that are applied at each iteration.} -\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets} +\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the +booster model into a predictor model which frees up memory and the +original datasets} \item{...}{other parameters, see Parameters.rst for more information. A few key parameters: \itemize{ @@ -78,11 +90,13 @@ test <- agaricus.test dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) params <- list(objective = "regression", metric = "l2") valids <- list(test = dtest) -model <- lgb.train(params, - dtrain, - 10, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 5) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 10 + , valids = valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 5 +) } diff --git a/R-package/man/lgb.unloader.Rd b/R-package/man/lgb.unloader.Rd index ab12dfe2cf23..b77046cf5644 100644 --- a/R-package/man/lgb.unloader.Rd +++ b/R-package/man/lgb.unloader.Rd @@ -7,11 +7,15 @@ lgb.unloader(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) } \arguments{ -\item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed.} +\item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R. +Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once +unloading is performed.} -\item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.} +\item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global +environment. Defaults to \code{FALSE} which means to not remove them.} -\item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.} +\item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to +\code{.GlobalEnv} which is the global environment.} } \value{ NULL invisibly. @@ -29,13 +33,15 @@ test <- agaricus.test dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) params <- list(objective = "regression", metric = "l2") valids <- list(test = dtest) -model <- lgb.train(params, - dtrain, - 10, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 5) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 10 + , valids = valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 5 +) \dontrun{ lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) diff --git a/R-package/man/lgb_shared_params.Rd b/R-package/man/lgb_shared_params.Rd index 0a40c2ae31ed..3dcad6f47544 100644 --- a/R-package/man/lgb_shared_params.Rd +++ b/R-package/man/lgb_shared_params.Rd @@ -9,12 +9,10 @@ List of callback functions that are applied at each iteration.} \item{data}{a \code{lgb.Dataset} object, used for training} -\item{early_stopping_rounds}{int -Activates early stopping. -Requires at least one validation data and one metric -If there's more than one, will check all of them except the training data -Returns the model with (best_iter + early_stopping_rounds) -If early stopping occurs, the model will have 'best_iter' field} +\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data +and one metric. If there's more than one, will check all of them +except the training data. Returns the model with (best_iter + early_stopping_rounds). +If early stopping occurs, the model will have 'best_iter' field.} \item{eval_freq}{evaluation output frequency, only effect when verbose > 0} diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 2e660ac8e16e..d85a922e67c3 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -4,10 +4,20 @@ \alias{lightgbm} \title{Train a LightGBM model} \usage{ -lightgbm(data, label = NULL, weight = NULL, params = list(), - nrounds = 10, verbose = 1, eval_freq = 1L, - early_stopping_rounds = NULL, save_name = "lightgbm.model", - init_model = NULL, callbacks = list(), ...) +lightgbm( + data, + label = NULL, + weight = NULL, + params = list(), + nrounds = 10, + verbose = 1, + eval_freq = 1L, + early_stopping_rounds = NULL, + save_name = "lightgbm.model", + init_model = NULL, + callbacks = list(), + ... +) } \arguments{ \item{data}{a \code{lgb.Dataset} object, used for training} @@ -24,12 +34,10 @@ lightgbm(data, label = NULL, weight = NULL, params = list(), \item{eval_freq}{evaluation output frequency, only effect when verbose > 0} -\item{early_stopping_rounds}{int -Activates early stopping. -Requires at least one validation data and one metric -If there's more than one, will check all of them except the training data -Returns the model with (best_iter + early_stopping_rounds) -If early stopping occurs, the model will have 'best_iter' field} +\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data +and one metric. If there's more than one, will check all of them +except the training data. Returns the model with (best_iter + early_stopping_rounds). +If early stopping occurs, the model will have 'best_iter' field.} \item{save_name}{File name to use when writing the trained model to disk. Should end in ".model".} diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index 480311889c76..e2359c225390 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -4,9 +4,17 @@ \alias{predict.lgb.Booster} \title{Predict method for LightGBM model} \usage{ -\method{predict}{lgb.Booster}(object, data, num_iteration = NULL, - rawscore = FALSE, predleaf = FALSE, predcontrib = FALSE, - header = FALSE, reshape = FALSE, ...) +\method{predict}{lgb.Booster}( + object, + data, + num_iteration = NULL, + rawscore = FALSE, + predleaf = FALSE, + predcontrib = FALSE, + header = FALSE, + reshape = FALSE, + ... +) } \arguments{ \item{object}{Object of class \code{lgb.Booster}} @@ -16,8 +24,8 @@ \item{num_iteration}{number of iteration want to predict with, NULL or <= 0 means use best iteration} \item{rawscore}{whether the prediction should be returned in the for of original untransformed -sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for -logistic regression would result in predictions for log-odds instead of probabilities.} +sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} +for logistic regression would result in predictions for log-odds instead of probabilities.} \item{predleaf}{whether predict leaf index instead.} @@ -53,13 +61,15 @@ test <- agaricus.test dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) params <- list(objective = "regression", metric = "l2") valids <- list(test = dtest) -model <- lgb.train(params, - dtrain, - 10, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 5) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 10 + , valids = valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 5 +) preds <- predict(model, test$data) } diff --git a/R-package/man/readRDS.lgb.Booster.Rd b/R-package/man/readRDS.lgb.Booster.Rd index cef1679ce53d..a8fc219ffd95 100644 --- a/R-package/man/readRDS.lgb.Booster.Rd +++ b/R-package/man/readRDS.lgb.Booster.Rd @@ -27,13 +27,15 @@ test <- agaricus.test dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) params <- list(objective = "regression", metric = "l2") valids <- list(test = dtest) -model <- lgb.train(params, - dtrain, - 10, - valids, - min_data = 1, - learning_rate = 1, - early_stopping_rounds = 5) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 10 + , valids = valids + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 5 +) saveRDS.lgb.Booster(model, "model.rds") new_model <- readRDS.lgb.Booster("model.rds") diff --git a/R-package/man/saveRDS.lgb.Booster.Rd b/R-package/man/saveRDS.lgb.Booster.Rd index 54fd1d07867c..afa639d4245c 100644 --- a/R-package/man/saveRDS.lgb.Booster.Rd +++ b/R-package/man/saveRDS.lgb.Booster.Rd @@ -4,19 +4,31 @@ \alias{saveRDS.lgb.Booster} \title{saveRDS for \code{lgb.Booster} models} \usage{ -saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL, - compress = TRUE, refhook = NULL, raw = TRUE) +saveRDS.lgb.Booster( + object, + file = "", + ascii = FALSE, + version = NULL, + compress = TRUE, + refhook = NULL, + raw = TRUE +) } \arguments{ \item{object}{R object to serialize.} \item{file}{a connection or the name of the file where the R object is saved to or read from.} -\item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.} +\item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), +a binary one is used. See the comments in the help for save.} -\item{version}{the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions.} +\item{version}{the workspace format version to use. \code{NULL} specifies the current default +version (2). Versions prior to 2 are not supported, so this will only be relevant +when there are later versions.} -\item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.} +\item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression, +or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of +compression to be used. Ignored if file is a connection.} \item{refhook}{a hook function for handling reference objects.} @@ -26,7 +38,8 @@ saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL, NULL invisibly. } \description{ -Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not. +Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides +whether to save the raw model or not. } \examples{ library(lightgbm) @@ -39,10 +52,10 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) params <- list(objective = "regression", metric = "l2") valids <- list(test = dtest) model <- lgb.train( - params - , dtrain - , 10 - , valids + params = params + , data = dtrain + , nrounds = 10 + , valids = valids , min_data = 1 , learning_rate = 1 , early_stopping_rounds = 5 diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd index 861e5f0219ef..5b0a23f6992b 100644 --- a/R-package/man/setinfo.Rd +++ b/R-package/man/setinfo.Rd @@ -12,11 +12,11 @@ setinfo(dataset, ...) \arguments{ \item{dataset}{Object of class \code{lgb.Dataset}} -\item{...}{other parameters} - \item{name}{the name of the field to get} \item{info}{the specific field of information to set} + +\item{...}{other parameters} } \value{ passed object diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 10040d11a2bc..e13aacaf4034 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -12,9 +12,9 @@ slice(dataset, ...) \arguments{ \item{dataset}{Object of class \code{lgb.Dataset}} -\item{...}{other parameters (currently not used)} +\item{idxset}{an integer vector of indices of rows needed} -\item{idxset}{a integer vector of indices of rows needed} +\item{...}{other parameters (currently not used)} } \value{ constructed sub dataset diff --git a/R-package/src/install.libs.R b/R-package/src/install.libs.R index 6226575b19b2..2ab2a6a77af0 100644 --- a/R-package/src/install.libs.R +++ b/R-package/src/install.libs.R @@ -8,7 +8,7 @@ if (.Machine$sizeof.pointer != 8){ } R_int_UUID <- .Internal(internalsID()) -R_ver <- as.double(R.Version()$major) + as.double(R.Version()$minor)/10 +R_ver <- as.double(R.Version()$major) + as.double(R.Version()$minor) / 10 if (!(R_int_UUID == "0310d4b8-ccb1-4bb8-ba94-d36a55f60262" || R_int_UUID == "2fdf6c18-697a-4ba7-b8ef-11c0d92f1327")){ @@ -74,7 +74,7 @@ if (!use_precompile) { try_vs <- 0 local_vs_def <- "" vs_versions <- c("Visual Studio 16 2019", "Visual Studio 15 2017", "Visual Studio 14 2015") - for(vs in vs_versions){ + for (vs in vs_versions){ vs_def <- paste0(" -G \"", vs, "\" -A x64") tmp_cmake_cmd <- paste0(cmake_cmd, vs_def) try_vs <- system(paste0(tmp_cmake_cmd, " ..")) @@ -106,14 +106,29 @@ if (!use_precompile) { # Has precompiled package lib_folder <- file.path(R_PACKAGE_SOURCE, "../", fsep = "/") - if (file.exists(file.path(lib_folder, paste0("lib_lightgbm", SHLIB_EXT), fsep = "/"))) { - src <- file.path(lib_folder, paste0("lib_lightgbm", SHLIB_EXT), fsep = "/") - } else if (file.exists(file.path(lib_folder, paste0("Release/lib_lightgbm", SHLIB_EXT), fsep = "/"))) { - src <- file.path(lib_folder, paste0("Release/lib_lightgbm", SHLIB_EXT), fsep = "/") + shared_object_file <- file.path( + lib_folder + , paste0("lib_lightgbm", SHLIB_EXT) + , fsep = "/" + ) + release_file <- file.path( + lib_folder + , paste0("Release/lib_lightgbm", SHLIB_EXT) + , fsep = "/" + ) + windows_shared_object_file <- file.path( + lib_folder + , paste0("/windows/x64/DLL/lib_lightgbm", SHLIB_EXT) + , fsep = "/" + ) + if (file.exists(shared_object_file)) { + src <- shared_object_file + } else if (file.exists(release_file)) { + src <- release_file } else { - src <- file.path(lib_folder, paste0("/windows/x64/DLL/lib_lightgbm", SHLIB_EXT), fsep = "/") # Expected result: installation will fail if it is not here or any other + # Expected result: installation will fail if it is not here or any other + src <- windows_shared_object_file } - } # Check installation correctness diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 11cd0a51ef4a..90925196a5f1 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1,7 +1,7 @@ context("basic functions") -data(agaricus.train, package='lightgbm') -data(agaricus.test, package='lightgbm') +data(agaricus.train, package = 'lightgbm') +data(agaricus.test, package = 'lightgbm') train <- agaricus.train test <- agaricus.test @@ -9,8 +9,14 @@ windows_flag = grepl('Windows', Sys.info()[['sysname']]) test_that("train and predict binary classification", { nrounds = 10 - bst <- lightgbm(data = train$data, label = train$label, num_leaves = 5, - nrounds = nrounds, objective = "binary", metric="binary_error") + bst <- lightgbm( + data = train$data + , label = train$label + , num_leaves = 5 + , nrounds = nrounds + , objective = "binary" + , metric = "binary_error" + ) expect_false(is.null(bst$record_evals)) record_results <- lgb.get.eval.result(bst, "train", "binary_error") expect_lt(min(record_results), 0.02) @@ -20,7 +26,7 @@ test_that("train and predict binary classification", { pred1 <- predict(bst, train$data, num_iteration = 1) expect_equal(length(pred1), 6513) - err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label) + err_pred1 <- sum( (pred1 > 0.5) != train$label) / length(train$label) err_log <- record_results[1] expect_lt(abs(err_pred1 - err_log), 10e-6) }) @@ -29,9 +35,18 @@ test_that("train and predict binary classification", { test_that("train and predict softmax", { lb <- as.numeric(iris$Species) - 1 - bst <- lightgbm(data = as.matrix(iris[, -5]), label = lb, - num_leaves = 4, learning_rate = 0.1, nrounds = 20, min_data=20, min_hess=20, - objective = "multiclass", metric="multi_error", num_class=3) + bst <- lightgbm( + data = as.matrix(iris[, -5]) + , label = lb + , num_leaves = 4 + , learning_rate = 0.1 + , nrounds = 20 + , min_data = 20 + , min_hess = 20 + , objective = "multiclass" + , metric = "multi_error" + , num_class = 3 + ) expect_false(is.null(bst$record_evals)) record_results <- lgb.get.eval.result(bst, "train", "multi_error") @@ -43,18 +58,33 @@ test_that("train and predict softmax", { test_that("use of multiple eval metrics works", { - bst <- lightgbm(data = train$data, label = train$label, num_leaves = 4, - learning_rate=1, nrounds = 10, objective = "binary", - metric = list("binary_error","auc","binary_logloss") ) + bst <- lightgbm( + data = train$data + , label = train$label + , num_leaves = 4 + , learning_rate = 1 + , nrounds = 10 + , objective = "binary" + , metric = list("binary_error","auc","binary_logloss") + ) expect_false(is.null(bst$record_evals)) }) test_that("training continuation works", { testthat::skip("This test is currently broken. See issue #2468 for details.") - dtrain <- lgb.Dataset(train$data, label = train$label, free_raw_data=FALSE) - watchlist = list(train=dtrain) - param <- list(objective = "binary", metric="binary_logloss", num_leaves = 5, learning_rate = 1) + dtrain <- lgb.Dataset( + train$data + , label = train$label + , free_raw_data = FALSE + ) + watchlist = list(train = dtrain) + param <- list( + objective = "binary" + , metric = "binary_logloss" + , num_leaves = 5 + , learning_rate = 1 + ) # for the reference, use 10 iterations at once: bst <- lgb.train(param, dtrain, nrounds = 10, watchlist) @@ -75,8 +105,16 @@ test_that("training continuation works", { test_that("cv works", { - dtrain <- lgb.Dataset(train$data, label=train$label) - params <- list(objective="regression", metric="l2,l1") - bst <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10) + dtrain <- lgb.Dataset(train$data, label = train$label) + params <- list(objective = "regression", metric = "l2,l1") + bst <- lgb.cv( + params + , dtrain + , 10 + , nfold = 5 + , min_data = 1 + , learning_rate = 1 + , early_stopping_rounds = 10 + ) expect_false(is.null(bst$record_evals)) }) diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index 9b5e512b1d2c..4d0d52d73251 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -1,7 +1,7 @@ context('Test models with custom objective') -data(agaricus.train, package='lightgbm') -data(agaricus.test, package='lightgbm') +data(agaricus.train, package = 'lightgbm') +data(agaricus.test, package = 'lightgbm') dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label) dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label) watchlist <- list(eval = dtest, train = dtrain) @@ -17,11 +17,19 @@ logregobj <- function(preds, dtrain) { evalerror <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") err <- as.numeric(sum(labels != (preds > 0))) / length(labels) - return(list(name = "error", value = err, higher_better=FALSE)) + return(list( + name = "error" + , value = err + , higher_better = FALSE + )) } -param <- list(num_leaves=8, learning_rate=1, - objective=logregobj, metric="auc") +param <- list( + num_leaves = 8 + , learning_rate = 1 + , objective = logregobj + , metric = "auc" +) num_round <- 10 test_that("custom objective works", { diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index ffd2d60eb801..6bad414bc010 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -3,15 +3,15 @@ require(Matrix) context("testing lgb.Dataset functionality") -data(agaricus.test, package='lightgbm') +data(agaricus.test, package = 'lightgbm') test_data <- agaricus.test$data[1:100,] test_label <- agaricus.test$label[1:100] test_that("lgb.Dataset: basic construction, saving, loading", { # from sparse matrix - dtest1 <- lgb.Dataset(test_data, label=test_label) + dtest1 <- lgb.Dataset(test_data, label = test_label) # from dense matrix - dtest2 <- lgb.Dataset(as.matrix(test_data), label=test_label) + dtest2 <- lgb.Dataset(as.matrix(test_data), label = test_label) expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label')) # save to a local file @@ -40,7 +40,7 @@ test_that("lgb.Dataset: getinfo & setinfo", { }) test_that("lgb.Dataset: slice, dim", { - dtest <- lgb.Dataset(test_data, label=test_label) + dtest <- lgb.Dataset(test_data, label = test_label) lgb.Dataset.construct(dtest) expect_equal(dim(dtest), dim(test_data)) dsub1 <- slice(dtest, 1:42) @@ -50,7 +50,7 @@ test_that("lgb.Dataset: slice, dim", { }) test_that("lgb.Dataset: colnames", { - dtest <- lgb.Dataset(test_data, label=test_label) + dtest <- lgb.Dataset(test_data, label = test_label) expect_equal(colnames(dtest), colnames(test_data)) lgb.Dataset.construct(dtest) expect_equal(colnames(dtest), colnames(test_data)) @@ -62,7 +62,7 @@ test_that("lgb.Dataset: colnames", { test_that("lgb.Dataset: nrow is correct for a very sparse matrix", { nr <- 1000 - x <- Matrix::rsparsematrix(nr, 100, density=0.0005) + x <- Matrix::rsparsematrix(nr, 100, density = 0.0005) # we want it very sparse, so that last rows are empty expect_lt(max(x@i), nr) dtest <- lgb.Dataset(x) @@ -70,15 +70,17 @@ test_that("lgb.Dataset: nrow is correct for a very sparse matrix", { }) test_that("lgb.Dataset: Dataset should be able to construct from matrix and return non-null handle", { - rawData <- matrix(runif(1000),ncol=10) + rawData <- matrix(runif(1000), ncol = 10) handle <- NA_real_ ref_handle <- NULL - handle <- lightgbm:::lgb.call("LGBM_DatasetCreateFromMat_R" - , ret = handle - , rawData - , nrow(rawData) - , ncol(rawData) - , lightgbm:::lgb.params2str(params=list()) - , ref_handle) + handle <- lightgbm:::lgb.call( + "LGBM_DatasetCreateFromMat_R" + , ret = handle + , rawData + , nrow(rawData) + , ncol(rawData) + , lightgbm:::lgb.params2str(params = list()) + , ref_handle + ) expect_false(is.na(handle)) }) diff --git a/R-package/tests/testthat/test_parameters.R b/R-package/tests/testthat/test_parameters.R index 8100deff59bb..60a762de2e59 100644 --- a/R-package/tests/testthat/test_parameters.R +++ b/R-package/tests/testthat/test_parameters.R @@ -1,5 +1,8 @@ -data(agaricus.train, package='lightgbm') -data(agaricus.test, package='lightgbm') + +context("feature penalties") + +data(agaricus.train, package = 'lightgbm') +data(agaricus.test, package = 'lightgbm') train <- agaricus.train test <- agaricus.test @@ -12,15 +15,15 @@ test_that("Feature penalties work properly", { feature_penalties <- rep(1, ncol(train$data)) feature_penalties[var_index] <- x lightgbm( - data = train$data, - label = train$label, - num_leaves = 5, - learning_rate = 0.05, - nrounds = 20, - objective = "binary", - feature_penalty = paste0(feature_penalties, collapse = ","), - metric="binary_error", - verbose = -1 + data = train$data + , label = train$label + , num_leaves = 5 + , learning_rate = 0.05 + , nrounds = 20 + , objective = "binary" + , feature_penalty = paste0(feature_penalties, collapse = ",") + , metric = "binary_error" + , verbose = -1 ) })