From 9bd8fd5ceca44a46c6246d19b11e0a15110589e5 Mon Sep 17 00:00:00 2001 From: Alexander Kowarik Date: Thu, 9 Nov 2023 08:10:10 +0100 Subject: [PATCH] First kind of working version xgboostImpute, some basic tests --- R/xgboostImpute.R | 57 ++++++++++++++++++------------ inst/tinytest/test_xgboostImpute.R | 41 +++++++++++++++++++++ 2 files changed, 76 insertions(+), 22 deletions(-) create mode 100644 inst/tinytest/test_xgboostImpute.R diff --git a/R/xgboostImpute.R b/R/xgboostImpute.R index 3e833b3..f77a070 100644 --- a/R/xgboostImpute.R +++ b/R/xgboostImpute.R @@ -1,32 +1,32 @@ #' Xgboost Imputation #' -#' Impute missing values based on a random forest model using [ranger::ranger()] +#' Impute missing values based on a random forest model using [xgboost::xgboost()] #' @param formula model formula for the imputation #' @param data A `data.frame` containing the data #' @param imp_var `TRUE`/`FALSE` if a `TRUE`/`FALSE` variables for each imputed #' variable should be created show the imputation status #' @param imp_suffix suffix used for TF imputation variables -#' @param ... Arguments passed to [ranger::ranger()] +#' @param ... Arguments passed to [xgboost::xgboost()] #' @param verbose Show the number of observations used for training #' and evaluating the RF-Model. This parameter is also passed down to -#' [ranger::ranger()] to show computation status. -#' @param median Use the median (rather than the arithmetic mean) to average -#' the values of individual trees for a more robust estimate. +#' [xgboost::xgboost()] to show computation status. #' @return the imputed data set. #' @family imputation methods #' @examples #' data(sleep) #' xgboostImpute(Dream~BodyWgt+BrainWgt,data=sleep) #' xgboostImpute(Dream+NonD~BodyWgt+BrainWgt,data=sleep) +#' xgboostImpute(Dream+NonD+Gest~BodyWgt+BrainWgt,data=sleep) #' #' sleepx <- sleep -#' sleepx$Pred <- as.factor(sleepx$Pred) +#' sleepx$Pred <- as.factor(LETTERS[sleepx$Pred]) #' sleepx$Pred[1] <- NA +#' xgboostImpute(Pred~BodyWgt+BrainWgt,data=sleepx) #' @export xgboostImpute <- function(formula, data, imp_var = TRUE, - imp_suffix = "imp", ..., verbose = FALSE, - nrounds=2, objective=NULL, - median = FALSE){ + imp_suffix = "imp", verbose = FALSE, + nrounds=100, objective=NULL, + ...){ check_data(data) formchar <- as.character(formula) lhs <- gsub(" ", "", strsplit(formchar[2], "\\+")[[1]]) @@ -39,7 +39,9 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, stopifnot(length(objective)!=length(lhs)) } for (lhsV in lhs) { - form <- as.formula(paste(lhsV, "~", rhs)) + form <- as.formula(paste(lhsV, "~", rhs,"-1")) + # formula without left side for prediction + formPred <- as.formula(paste( "~", rhs,"-1")) lhs_vector <- data[[lhsV]] num_class <- NULL if (!any(is.na(lhs_vector))) { @@ -47,18 +49,22 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, } else { lhs_na <- is.na(lhs_vector) if (verbose) - message("Training model for ", lhsV, " on ", sum(!rhs_na & !lhs_na), " observations") + message("Training model for ", lhsV, " on ", sum(!rhs_na & !lhs_na), " observations") dattmp <- subset(data, !rhs_na & !lhs_na) labtmp <- dattmp[[lhsV]] + currentClass <- NULL if(inherits(labtmp,"factor")){ + currentClass <- "factor" labtmp <- as.integer(labtmp)-1 if(length(unique(labtmp))==2){ objective <- "binary:logistic" }else if(length(unique(labtmp))>2){ objective <- "multi:softmax" + num_class <- max(labtmp)+1 } - num_class <- max(labtmp)+1 + }else if(inherits(labtmp,"numeric")){ + currentClass <- "numeric" if(length(unique(labtmp))==2){ warning("binary factor detected but not probably stored as factor.") objective <- "binary:logistic" @@ -66,6 +72,7 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, objective <- "reg:squarederror" } }else if(inherits(labtmp,"integer")){ + currentClass <- "integer" if(length(unique(labtmp))==2){ warning("binary factor detected but not probably stored as factor.") objective <- "binary:logistic" @@ -76,18 +83,24 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, mm <- model.matrix(form,dattmp) - mod <- xgboost::xgboost(data = mm, label = labtmp, - nrounds=nrounds, objective=objective, num_class = num_class) + if(!is.null(num_class)){ + mod <- xgboost::xgboost(data = mm, label = labtmp, + nrounds=nrounds, objective=objective, num_class = num_class, verbose = FALSE,...) + }else{ + mod <- xgboost::xgboost(data = mm, label = labtmp, + nrounds=nrounds, objective=objective, verbose = FALSE,...) + } + if (verbose) message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations") - if (median & inherits(lhs_vector, "numeric")) { - predictions <- apply( - predict(mod, model.matrix(form,subset(data, !rhs_na & lhs_na)), predict.all = TRUE)$predictions, - 1, median) - } else { - predictions <- predict(mod, model.matrix(as.formula(paste0("~",rhs)),subset(data, !rhs_na & lhs_na))) + predictions <- + predict(mod, model.matrix(formPred,subset(data, !rhs_na & lhs_na))) + if(currentClass=="factor"){ + data[!rhs_na & lhs_na, lhsV] <- levels(dattmp[,lhsV])[predictions+1] + }else{ + data[!rhs_na & lhs_na, lhsV] <- predictions } - data[!rhs_na & lhs_na, lhsV] <- predictions + } if (imp_var) { @@ -102,4 +115,4 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, } } data -} \ No newline at end of file +} diff --git a/inst/tinytest/test_xgboostImpute.R b/inst/tinytest/test_xgboostImpute.R new file mode 100644 index 0000000..c51a6f9 --- /dev/null +++ b/inst/tinytest/test_xgboostImpute.R @@ -0,0 +1,41 @@ +library(VIM) +set.seed(104) +x <- rnorm(100) +df <- data.frame( + y = x + rnorm(100, sd = .01), + x = x, + fac = as.factor(x >= 0) +) + +max_dist <- function(x, y) { + max(abs(x - y)) +} + +df$y[1:3] <- NA +df$fac[3:5] <- NA + +# xgboostImpute accuracy", { + df.out <- xgboostImpute(y ~ x, df) + expect_true( + max_dist(df.out$y, df$x)< + 0.06 + ) + + # xgboostImpute should do nothing for no missings", { + df.out <- xgboostImpute(x ~ y, df) + expect_identical(df.out$x, df$x) +# + +# factor response predicted accurately", { + df.out <- xgboostImpute(fac ~ x, df) + df.out[df.out$fac_imp,] + expect_identical(df.out$fac, as.factor(df$x >= 0)) +# + +# factor regressor used reasonably", { + df2 <- df + df2$x[1:10] <- NA + df.out <- xgboostImpute(x ~ fac, df2) + expect_identical(as.factor(df.out$x >= 0), df$fac) +# +