diff --git a/R-package/.Rhistory b/R-package/.Rhistory new file mode 100644 index 000000000000..d67b85b7ca3e --- /dev/null +++ b/R-package/.Rhistory @@ -0,0 +1,512 @@ +backgroundSize = "100% 90%", +backgroundRepeat = "no-repeat", +backgroundPosition = "center") %>% +formatStyle("Number of Projects", +background = styleColorBar(c(0, 7), color = "lightblue"), +backgroundSize = "100% 90%", +backgroundRepeat = "no-repeat", +backgroundPosition = "center") %>% +formatRound("Average Weekly Hours", digits = 2) %>% +formatStyle("Average Weekly Hours", +background = styleColorBar(c(0, 72), color = "lightblue"), +backgroundSize = "100% 90%", +backgroundRepeat = "no-repeat", +backgroundPosition = "center") %>% +formatStyle("Years in Company", +background = styleColorBar(c(0, 10), color = "lightblue"), +backgroundSize = "100% 90%", +backgroundRepeat = "no-repeat", +backgroundPosition = "center") %>% +formatStyle(c("Work Accident"), +backgroundColor = styleEqual(c("No Accident", "Accident"), c("lightgrey", "pink"))) %>% +formatStyle("Has Left", +backgroundColor = styleEqual(c("Not Left", "Left"), c("lightgrey", "pink"))) %>% +formatStyle("Promotion in Last 5 Years", +backgroundColor = styleEqual(c("No Promotion", "Promotion"), c("lightgrey", "orange"))) %>% +formatStyle("Salary", +backgroundColor = styleEqual(c("Low", "Medium", "High"), c("orange", "yellow", "lightblue"))) +}) +# Add Correlation plot +output$corrplot <- renderPlotly({ +plot_data <- dist_data() +plot_data[upper.tri(plot_data)] <- NA +plot_data <- melt(plot_data, na.rm = TRUE) +plot_data$value <- plot_data$value / nrow(better_data()) +plot_data$value <- -(plot_data$value - 0.5) * 2 +colnames(plot_data) <- c("Variable_1", "Variable_2", "Agreement") +if (input$check_corr == TRUE) { +plot_data$Text <- sprintf("%0.2f", round(plot_data$Agreement, digits = 2)) +return(ggplotly(ggplot(data = plot_data, aes_string(x = "Variable_1", y = "Variable_2", fill = "Agreement")) + geom_tile(color = "white") + geom_text(aes_string(x = "Variable_1", y = "Variable_2", label = "Text")) + scale_fill_gradient2(low = "red", high = "green", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Agreement
Strength") + theme_bw() + theme(axis.text.x = element_text(angle = 45)) + labs(x = "Variable 1", y = "Variable 2"), autosize = TRUE, margin = list(l = 20, r = 20, b = 250, t = 20, p = 4))) +} else { +return(ggplotly(ggplot(data = plot_data, aes_string(x = "Variable_1", y = "Variable_2", fill = "Agreement")) + geom_tile(color = "white") + scale_fill_gradient2(low = "green", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name = "Agreement
Strength") + theme_bw() + theme(axis.text.x = element_text(angle = 45)) + labs(x = "Variable 1", y = "Variable 2"), autosize = TRUE, margin = list(l = 20, r = 20, b = 250, t = 20, p = 4))) +} +}) +# Add Graph plot +output$graphplot <- renderPlot({ +plot_data <- dist_data() +plot_data <- plot_data / nrow(better_data()) +plot_data <- -(plot_data - 0.5) * 2 +features_name <- c("Satisfaction Level", "Last Evaluation Score", "Number of Projects", "Average Weekly Hours", "Years in Company", "Work Accident", "Has Left", "Promotion in Last 5 Years", "Department: Accounting", "Department: Human Resources", "Department: IT", "Department: Management", "Department: Marketing", "Department: Product Management", "Department: Research and Development", "Department: Sales", "Department: Support", "Department: Technical", "Salary: Low", "Salary: Medium", "Salary: High") +features_selected <- which(features_name %in% input$feat_corr) +colnames(plot_data) <- c("A1", "A2", "B", "C", "D", "E", "F", "G", "H01", "H02", "H03", "H04", "H05", "H06", "H07", "H08", "H09", "H10", "I1", "I2", "I3")[features_selected] +features_name <- c("A1: Satisfaction Level", "A2: Last Evaluation Score", "B: Number of Projects", "C: Average Weekly Hours", "D: Years in Company", "E: Work Accident", "F: Has Left", "G: Promotion in Last 5 Years", "H1 Department: Accounting", "H2 Department: Human Resources", "H3 Department: IT", "H4 Department: Management", "H5 Department: Marketing", "H6 Department: Product Management", "H7 Department: Research and Development", "H8 Department: Sales", "H9 Department: Support", "H10 Department: Technical", "I1 Salary: Low", "I2 Salary: Medium", "I3 Salary: High") +qgraph(plot_data, layout = "spring", groups = features_name[features_selected], palette = "pastel", theme = "classic", shape = "ellipse", borders = FALSE, vTrans = 100, vsize = 12, title = paste0("Agreement: [", paste(sprintf("%.03f", range(plot_data)), collapse = ", "), "]"), edge.labels = TRUE, XKCD = TRUE) +}) +# Plot tree +output$tree <- renderPlot({ +tree_data <- copy(better_data()) +levels(tree_data$`Department`) <- c("Accounting", "HR", "IT", "Mgmt", "Marketing", "Product Mgmt", "R&D", "Sales", "Support", "Tech") +tree_label <- copy(tree_data[[input$label]]) +#tree_data[[input$label]] <- NULL +tree_data <- tree_data[, unique(c(input$ban, input$label)), with = FALSE] +#tree_data <- tree_data[, input$ban[which(!input$ban %in% input$label)], with = FALSE] +formula <- reformulate(termlabels = paste0("`", input$ban[which(!input$ban %in% input$label)], "`"), response = input$label) +temp_model <- rpart(formula = formula, +data = tree_data, +method = ifelse(input$label %in% c("Satisfaction Level", "Last Evaluation Score", "Average Weekly Hours"), "anova", ifelse(input$label %in% c("Number of Projects", "Years in Company"), "poisson", "class")), +control = rpart.control(minsplit = input$min_split, +minbucket = input$min_bucket, +cp = input$min_improve, +maxcompete = 0, +maxsurrogate = input$surrogate_search, +usesurrogate = input$surrogate_type, +xval = 3, +surrogatestyle = input$surrogate_style, +maxdepth = input$max_depth)) +# temp_model <- Laurae::FeatureLookup(data = tree_data, +# label = tree_label, +# ban = NULL, +# antiban = FALSE, +# type = ifelse(input$label %in% c("Satisfaction Level", "Last Evaluation Score", "Average Weekly Hours"), "anova", ifelse(input$label %in% c("Number of Projects", "Years in Company"), "poisson", "class")), +# split = "information", +# folds = 3, +# seed = input$seed, +# verbose = FALSE, +# plots = FALSE, +# max_depth = input$max_depth, +# min_split = input$min_split, +# min_bucket = input$min_bucket, +# min_improve = input$min_improve, +# competing_splits = 0, +# surrogate_search = input$surrogate_search, +# surrogate_type = input$surrogate_type, +# surrogate_style = input$surrogate_style) +rpart.plot(temp_model, main = "Decision Tree", tweak = input$size/100) +}) +# Need to stop using a button? +observeEvent(input$done, { +stopApp(TRUE) +}) +} +runGadget(shinyApp(ui, server), viewer = paneViewer()) +library(lightgbm) +data(agaricus.train, package='lightgbm') +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label=train$label) +data(agaricus.test, package='lightgbm') +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label=test$label) +params <- list(objective="regression", metric="l2") +valids <- list(test=dtest) +model <- lgb.train(params, dtrain, 100, valids, min_data=1, learning_rate=1, early_stopping_rounds=10) +getwd() +saveRDS("D:/model.rds") +saveRDS(model, "D:/model.rds") +model$raw +model$raw() +unlockBinding("raw", model) +model$raw() +unlockBinding(model) +bindingIsLocked("raw", model) +model$whatever <- "yes" +?makeActiveBinding +f <- local( { +x <- 1 +function(v) { +if (missing(v)) +cat("get\n") +else { +cat("set\n") +x <<- v +} +x +} +}) +makeActiveBinding("fred", f, .GlobalEnv) +bindingIsActive("fred", .GlobalEnv) +fred +fred <- 2 +fred +makeActiveBinding("model", whatever, .GlobalEnv) +makeActiveBinding("whatever", model, .GlobalEnv) +makeActiveBinding("raw", model, .GlobalEnv) +makeActiveBinding("save", model, .GlobalEnv) +makeActiveBinding("save", raw, model) +with(model, save <- raw) +model[["save"]] <- model$raw +model["save"] <- model$raw +?lockEnvironment +environmentIsLocked(model) +model$best_iter +model$best_iter <- 1 +model$best_iter +install_github("Laurae2/LightGBM/R-package@patch-10") +devtools::install_github("Laurae2/LightGBM/R-package@patch-10") +?lightgbm::lgb.train +library(lightgbm) +data(agaricus.train, package='lightgbm') +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label=train$label) +data(agaricus.test, package='lightgbm') +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label=test$label) +params <- list(objective="regression", metric="l2") +valids <- list(test=dtest) +model <- lgb.train(params, dtrain, 100, valids, min_data=1, learning_rate=1, early_stopping_rounds=10) +saveRDS(model, "D:/model.rds") +model$raw +model$save() +model$raw +Booster <- R6Class( +"lgb.Booster", +cloneable = FALSE, +public = list( +best_iter = -1, +record_evals = list(), +finalize = function() { +if (!lgb.is.null.handle(private$handle)) { +lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle) +private$handle <- NULL +} +}, +initialize = function(params = list(), +train_set = NULL, +modelfile = NULL, +...) { +params <- append(params, list(...)) +params_str <- lgb.params2str(params) +handle <- lgb.new.handle() +if (!is.null(train_set)) { +if (!lgb.check.r6.class(train_set, "lgb.Dataset")) { +stop("lgb.Booster: Can only use lgb.Dataset as training data") +} +handle <- +lgb.call("LGBM_BoosterCreate_R", ret = handle, train_set$.__enclos_env__$private$get_handle(), params_str) +private$train_set <- train_set +private$num_dataset <- 1 +private$init_predictor <- train_set$.__enclos_env__$private$predictor +if (!is.null(private$init_predictor)) { +lgb.call("LGBM_BoosterMerge_R", ret = NULL, +handle, +private$init_predictor$.__enclos_env__$private$handle) +} +private$is_predicted_cur_iter <- c(private$is_predicted_cur_iter, FALSE) +} else if (!is.null(modelfile)) { +if (!is.character(modelfile)) { +stop("lgb.Booster: Can only use a string as model file path") +} +handle <- +lgb.call("LGBM_BoosterCreateFromModelfile_R", +ret = handle, +lgb.c_str(modelfile)) +} else { +stop( +"lgb.Booster: Need at least either training dataset or model file to create booster instance" +) +} +class(handle) <- "lgb.Booster.handle" +private$handle <- handle +private$num_class <- 1L +private$num_class <- +lgb.call("LGBM_BoosterGetNumClasses_R", ret = private$num_class, private$handle) +}, +set_train_data_name = function(name) { +private$name_train_set <- name +self +}, +add_valid = function(data, name) { +if (!lgb.check.r6.class(data, "lgb.Dataset")) { +stop("lgb.Booster.add_valid: Can only use lgb.Dataset as validation data") +} +if (!identical(data$.__enclos_env__$private$predictor, private$init_predictor)) { +stop( +"lgb.Booster.add_valid: Failed to add validation data; you should use the same predictor for these data" +) +} +if (!is.character(name)) { +stop("lgb.Booster.add_valid: Can only use characters as data name") +} +lgb.call("LGBM_BoosterAddValidData_R", ret = NULL, private$handle, data$.__enclos_env__$private$get_handle()) +private$valid_sets <- c(private$valid_sets, data) +private$name_valid_sets <- c(private$name_valid_sets, name) +private$num_dataset <- private$num_dataset + 1 +private$is_predicted_cur_iter <- +c(private$is_predicted_cur_iter, FALSE) +self +}, +reset_parameter = function(params, ...) { +params <- append(params, list(...)) +params_str <- algb.params2str(params) +lgb.call("LGBM_BoosterResetParameter_R", ret = NULL, +private$handle, +params_str) +self +}, +update = function(train_set = NULL, fobj = NULL) { +if (!is.null(train_set)) { +if (!lgb.check.r6.class(train_set, "lgb.Dataset")) { +stop("lgb.Booster.update: Only can use lgb.Dataset as training data") +} +if (!identical(train_set$predictor, private$init_predictor)) { +stop( +"lgb.Booster.update: Change train_set failed, you should use the same predictor for these data" +) +} +lgb.call("LGBM_BoosterResetTrainingData_R", ret = NULL, +private$handle, +train_set$.__enclos_env__$private$get_handle()) +private$train_set = train_set +} +if (is.null(fobj)) { +ret <- lgb.call("LGBM_BoosterUpdateOneIter_R", ret = NULL, private$handle) +} else { +if (!is.function(fobj)) { stop("lgb.Booster.update: fobj should be a function") } +gpair <- fobj(private$inner_predict(1), private$train_set) +if(is.null(gpair$grad) | is.null(gpair$hess)){ +stop("lgb.Booster.update: custom objective should +return a list with attributes (hess, grad)") +} +ret <- lgb.call( +"LGBM_BoosterUpdateOneIterCustom_R", ret = NULL, +private$handle, +gpair$grad, +gpair$hess, +length(gpair$grad) +) +} +for (i in seq_along(private$is_predicted_cur_iter)) { +private$is_predicted_cur_iter[[i]] <- FALSE +} +ret +}, +rollback_one_iter = function() { +lgb.call("LGBM_BoosterRollbackOneIter_R", ret = NULL, private$handle) +for (i in seq_along(private$is_predicted_cur_iter)) { +private$is_predicted_cur_iter[[i]] <- FALSE +} +self +}, +current_iter = function() { +cur_iter <- 0L +lgb.call("LGBM_BoosterGetCurrentIteration_R", ret = cur_iter, private$handle) +}, +eval = function(data, name, feval = NULL) { +if (!lgb.check.r6.class(data, "lgb.Dataset")) { +stop("lgb.Booster.eval: Can only use lgb.Dataset to eval") +} +data_idx <- 0 +if (identical(data, private$train_set)) { data_idx <- 1 } else { +if (length(private$valid_sets) > 0) { +for (i in seq_along(private$valid_sets)) { +if (identical(data, private$valid_sets[[i]])) { +data_idx <- i + 1 +break +} +} +} +} +if (data_idx == 0) { +self$add_valid(data, name) +data_idx <- private$num_dataset +} +private$inner_eval(name, data_idx, feval) +}, +eval_train = function(feval = NULL) { +private$inner_eval(private$name_train_set, 1, feval) +}, +eval_valid = function(feval = NULL) { +ret = list() +if (length(private$valid_sets) <= 0) { return(ret) } +for (i in seq_along(private$valid_sets)) { +ret <- append(ret, private$inner_eval(private$name_valid_sets[[i]], i + 1, feval)) +} +ret +}, +save_model = function(filename, num_iteration = NULL) { +if (is.null(num_iteration)) { num_iteration <- self$best_iter } +lgb.call( +"LGBM_BoosterSaveModel_R", +ret = NULL, +private$handle, +as.integer(num_iteration), +lgb.c_str(filename) +) +self +}, +dump_model = function(num_iteration = NULL) { +if (is.null(num_iteration)) { num_iteration <- self$best_iter } +lgb.call.return.str( +"LGBM_BoosterDumpModel_R", +private$handle, +as.integer(num_iteration) +) +}, +predict = function(data, +num_iteration = NULL, +rawscore = FALSE, +predleaf = FALSE, +header = FALSE, +reshape = FALSE) { +if (is.null(num_iteration)) { num_iteration <- self$best_iter } +predictor <- Predictor$new(private$handle) +predictor$predict(data, num_iteration, rawscore, predleaf, header, reshape) +}, +to_predictor = function() { Predictor$new(private$handle) }, +raw = NA, +save = function() { +self$raw <- self$dump_model() +} +), +private = list( +handle = NULL, +train_set = NULL, +name_train_set = "training", +valid_sets = list(), +name_valid_sets = list(), +predict_buffer = list(), +is_predicted_cur_iter = list(), +num_class = 1, +num_dataset = 0, +init_predictor = NULL, +eval_names = NULL, +higher_better_inner_eval = NULL, +inner_predict = function(idx) { +data_name <- private$name_train_set +if (idx > 1) { data_name <- private$name_valid_sets[[idx - 1]] } +if (idx > private$num_dataset) { +stop("data_idx should not be greater than num_dataset") +} +if (is.null(private$predict_buffer[[data_name]])) { +npred <- 0L +npred <- lgb.call("LGBM_BoosterGetNumPredict_R", +ret = npred, +private$handle, +as.integer(idx - 1)) +private$predict_buffer[[data_name]] <- rep(0.0, npred) +} +if (!private$is_predicted_cur_iter[[idx]]) { +private$predict_buffer[[data_name]] <- lgb.call( +"LGBM_BoosterGetPredict_R", +ret = private$predict_buffer[[data_name]], +private$handle, +as.integer(idx - 1) +) +private$is_predicted_cur_iter[[idx]] <- TRUE +} +private$predict_buffer[[data_name]] +}, +get_eval_info = function() { +if (is.null(private$eval_names)) { +names <- lgb.call.return.str("LGBM_BoosterGetEvalNames_R", private$handle) +if (nchar(names) > 0) { +names <- strsplit(names, "\t")[[1]] +private$eval_names <- names +private$higher_better_inner_eval <- rep(FALSE, length(names)) +for (i in seq_along(names)) { +if ((names[i] == "auc") | grepl("^ndcg", names[i])) { +private$higher_better_inner_eval[i] <- TRUE +} +} +} +} +private$eval_names +}, +inner_eval = function(data_name, data_idx, feval = NULL) { +if (data_idx > private$num_dataset) { +stop("data_idx should not be greater than num_dataset") +} +private$get_eval_info() +ret <- list() +if (length(private$eval_names) > 0) { +tmp_vals <- rep(0.0, length(private$eval_names)) +tmp_vals <- lgb.call("LGBM_BoosterGetEval_R", ret = tmp_vals, +private$handle, +as.integer(data_idx - 1)) +for (i in seq_along(private$eval_names)) { +res <- list() +res$data_name <- data_name +res$name <- private$eval_names[i] +res$value <- tmp_vals[i] +res$higher_better <- private$higher_better_inner_eval[i] +ret <- append(ret, list(res)) +} +} +if (!is.null(feval)) { +if (!is.function(feval)) { +stop("lgb.Booster.eval: feval should be a function") +} +data <- private$train_set +if (data_idx > 1) { data <- private$valid_sets[[data_idx - 1]] } +res <- feval(private$inner_predict(data_idx), data) +if(is.null(res$name) | is.null(res$value) | +is.null(res$higher_better)) { +stop("lgb.Booster.eval: custom eval function should return a +list with attribute (name, value, higher_better)"); +} +res$data_name <- data_name +ret <- append(ret, list(res)) +} +ret +} +) +) +library(lightgbm) +data(agaricus.train, package='lightgbm') +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label=train$label) +data(agaricus.test, package='lightgbm') +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label=test$label) +params <- list(objective="regression", metric="l2") +valids <- list(test=dtest) +model <- lgb.train(params, dtrain, 100, valids, min_data=1, learning_rate=1, early_stopping_rounds=10) +model$save() +model$raw +my_model <- readRDS("D:/model.rds") +predict(my_model, agaricus.test$data) +predict(agaricus.test$data, my_model) +predict(model, agaricus.test$data) +predict(test$data, agaricus.test$data) +predict(train$data, agaricus.test$data) +head(predict(model, train$data)) +head(model$train$data) +head(model$predict(train$data)) +head(model$predict(dtrain)) +head(model$predict(dtrain, 1, FALSE, FALSE, FALSE, FALSE)) +model$predict +predict(model, test$data) +devtools::install_github("Laurae2/LightGBM/R-package@patch-10") +library(lightgbm) +data(agaricus.train, package='lightgbm') +train <- agaricus.train +dtrain <- lgb.Dataset(train$data, label=train$label) +data(agaricus.test, package='lightgbm') +test <- agaricus.test +dtest <- lgb.Dataset.create.valid(dtrain, test$data, label=test$label) +params <- list(objective="regression", metric="l2") +valids <- list(test=dtest) +model <- lgb.train(params, dtrain, 100, valids, min_data=1, learning_rate=1, early_stopping_rounds=10) +preds <- predict(model, test$data) +saveRDS(model, "D:/model.rds") +preds <- predict(model, test$data) +new_model <- readRDS("D:/model.rds") +preds <- predict(new_model, test$data) +preds <- predict(model, test$data) +new_m$odel$raw +new_model$raw +model$save() +model$raw +?readRDS +setwd("D:/Data Science/LightGBM_GitHub/LightGBM/R-package") +devtools::document() +devtools::document() diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 305243b9b77a..9766534789a8 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -195,7 +195,11 @@ Booster <- R6Class( predictor <- Predictor$new(private$handle) predictor$predict(data, num_iteration, rawscore, predleaf, header, reshape) }, - to_predictor = function() { Predictor$new(private$handle) } + to_predictor = function() { Predictor$new(private$handle) }, + raw = NA, + save = function() { + self$raw <- self$dump_model() + } ), private = list( handle = NULL, diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 8b08851c8c2e..2fbcc60628e3 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -1,4 +1,4 @@ -#' Main training logic for LightGBM +#' Main training logic for LightGBM #' #' @param params List of parameters #' @param data a \code{lgb.Dataset} object, used for training