From 9bd8fd5ceca44a46c6246d19b11e0a15110589e5 Mon Sep 17 00:00:00 2001
From: Alexander Kowarik <alex@kowarik.net>
Date: Thu, 9 Nov 2023 08:10:10 +0100
Subject: [PATCH] First kind of working version xgboostImpute, some basic tests

---
 R/xgboostImpute.R                  | 57 ++++++++++++++++++------------
 inst/tinytest/test_xgboostImpute.R | 41 +++++++++++++++++++++
 2 files changed, 76 insertions(+), 22 deletions(-)
 create mode 100644 inst/tinytest/test_xgboostImpute.R

diff --git a/R/xgboostImpute.R b/R/xgboostImpute.R
index 3e833b3..f77a070 100644
--- a/R/xgboostImpute.R
+++ b/R/xgboostImpute.R
@@ -1,32 +1,32 @@
 #' Xgboost Imputation
 #'
-#' Impute missing values based on a random forest model using [ranger::ranger()]
+#' Impute missing values based on a random forest model using [xgboost::xgboost()]
 #' @param formula model formula for the imputation
 #' @param data A `data.frame` containing the data
 #' @param imp_var `TRUE`/`FALSE` if a `TRUE`/`FALSE` variables for each imputed
 #' variable should be created show the imputation status
 #' @param imp_suffix suffix used for TF imputation variables
-#' @param ... Arguments passed to [ranger::ranger()]
+#' @param ... Arguments passed to [xgboost::xgboost()]
 #' @param verbose Show the number of observations used for training
 #'   and evaluating the RF-Model. This parameter is also passed down to
-#'   [ranger::ranger()] to show computation status.
-#' @param median Use the median (rather than the arithmetic mean) to average
-#'   the values of individual trees for a more robust estimate.
+#'   [xgboost::xgboost()] to show computation status.
 #' @return the imputed data set.
 #' @family imputation methods
 #' @examples 
 #' data(sleep)
 #' xgboostImpute(Dream~BodyWgt+BrainWgt,data=sleep)
 #' xgboostImpute(Dream+NonD~BodyWgt+BrainWgt,data=sleep)
+#' xgboostImpute(Dream+NonD+Gest~BodyWgt+BrainWgt,data=sleep)
 #' 
 #' sleepx <- sleep
-#' sleepx$Pred <- as.factor(sleepx$Pred)
+#' sleepx$Pred <- as.factor(LETTERS[sleepx$Pred])
 #' sleepx$Pred[1] <- NA
+#' xgboostImpute(Pred~BodyWgt+BrainWgt,data=sleepx)
 #' @export
 xgboostImpute <- function(formula, data, imp_var = TRUE,
-                         imp_suffix = "imp", ..., verbose = FALSE,
-                         nrounds=2, objective=NULL,
-                         median = FALSE){
+                         imp_suffix = "imp", verbose = FALSE,
+                         nrounds=100, objective=NULL,
+                         ...){
   check_data(data)
   formchar <- as.character(formula)
   lhs <- gsub(" ", "", strsplit(formchar[2], "\\+")[[1]])
@@ -39,7 +39,9 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
     stopifnot(length(objective)!=length(lhs))
   }
   for (lhsV in lhs) {
-    form <- as.formula(paste(lhsV, "~", rhs))
+    form <- as.formula(paste(lhsV, "~", rhs,"-1"))
+    # formula without left side for prediction
+    formPred <- as.formula(paste( "~", rhs,"-1"))
     lhs_vector <- data[[lhsV]]
     num_class <- NULL
     if (!any(is.na(lhs_vector))) {
@@ -47,18 +49,22 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
     } else {
       lhs_na <- is.na(lhs_vector)
       if (verbose)
-        message("Training   model for ", lhsV, " on ", sum(!rhs_na & !lhs_na), " observations")
+        message("Training model for ", lhsV, " on ", sum(!rhs_na & !lhs_na), " observations")
       dattmp <- subset(data, !rhs_na & !lhs_na)
       labtmp <- dattmp[[lhsV]]
+      currentClass <- NULL
       if(inherits(labtmp,"factor")){
+        currentClass <- "factor"
         labtmp <- as.integer(labtmp)-1
         if(length(unique(labtmp))==2){
           objective <- "binary:logistic"
         }else if(length(unique(labtmp))>2){
           objective <- "multi:softmax"
+          num_class <- max(labtmp)+1
         }
-        num_class <- max(labtmp)+1
+        
       }else if(inherits(labtmp,"numeric")){
+        currentClass <- "numeric"
         if(length(unique(labtmp))==2){
           warning("binary factor detected but not probably stored as factor.")
           objective <- "binary:logistic"
@@ -66,6 +72,7 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
           objective <- "reg:squarederror"
         }
       }else if(inherits(labtmp,"integer")){
+        currentClass <- "integer"
         if(length(unique(labtmp))==2){
           warning("binary factor detected but not probably stored as factor.")
           objective <- "binary:logistic"
@@ -76,18 +83,24 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
         
       
       mm <- model.matrix(form,dattmp)
-      mod <- xgboost::xgboost(data = mm, label = labtmp,
-                       nrounds=nrounds, objective=objective, num_class = num_class)
+      if(!is.null(num_class)){
+        mod <- xgboost::xgboost(data = mm, label = labtmp,
+                                nrounds=nrounds, objective=objective, num_class = num_class, verbose = FALSE,...)
+      }else{
+        mod <- xgboost::xgboost(data = mm, label = labtmp,
+                                nrounds=nrounds, objective=objective, verbose = FALSE,...)
+      }
+      
       if (verbose)
         message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations")
-      if (median & inherits(lhs_vector, "numeric")) {
-        predictions <- apply(
-          predict(mod, model.matrix(form,subset(data, !rhs_na & lhs_na)), predict.all = TRUE)$predictions,
-          1, median)
-      } else {
-        predictions <- predict(mod, model.matrix(as.formula(paste0("~",rhs)),subset(data, !rhs_na & lhs_na)))
+      predictions <- 
+        predict(mod, model.matrix(formPred,subset(data, !rhs_na & lhs_na)))
+      if(currentClass=="factor"){
+        data[!rhs_na & lhs_na, lhsV] <- levels(dattmp[,lhsV])[predictions+1]
+      }else{
+        data[!rhs_na & lhs_na, lhsV] <- predictions  
       }
-      data[!rhs_na & lhs_na, lhsV] <- predictions
+      
     }
     
     if (imp_var) {
@@ -102,4 +115,4 @@ xgboostImpute <- function(formula, data, imp_var = TRUE,
     }
   }
   data
-}
\ No newline at end of file
+}
diff --git a/inst/tinytest/test_xgboostImpute.R b/inst/tinytest/test_xgboostImpute.R
new file mode 100644
index 0000000..c51a6f9
--- /dev/null
+++ b/inst/tinytest/test_xgboostImpute.R
@@ -0,0 +1,41 @@
+library(VIM)
+set.seed(104)
+x <- rnorm(100)
+df <- data.frame(
+  y = x + rnorm(100, sd = .01),
+  x = x,
+  fac = as.factor(x >= 0)
+)
+
+max_dist <- function(x, y) {
+  max(abs(x - y))
+}
+
+df$y[1:3] <- NA
+df$fac[3:5] <- NA
+
+# xgboostImpute accuracy", {
+  df.out <- xgboostImpute(y ~ x, df)
+  expect_true(
+    max_dist(df.out$y, df$x)<
+    0.06
+  )
+  
+  # xgboostImpute should do nothing for no missings", {
+  df.out <- xgboostImpute(x ~ y, df)
+  expect_identical(df.out$x, df$x)
+# 
+
+# factor response predicted accurately", {
+  df.out <- xgboostImpute(fac ~ x, df)
+  df.out[df.out$fac_imp,]
+  expect_identical(df.out$fac, as.factor(df$x >= 0))
+# 
+
+# factor regressor used reasonably", {
+  df2 <- df
+  df2$x[1:10] <- NA
+  df.out <- xgboostImpute(x ~ fac, df2)
+  expect_identical(as.factor(df.out$x >= 0), df$fac)
+# 
+