diff --git a/vignettes/impPCA.Rmd b/vignettes/impPCA.Rmd new file mode 100644 index 0000000..64cd5b2 --- /dev/null +++ b/vignettes/impPCA.Rmd @@ -0,0 +1,89 @@ +--- +title: "Imputation Method based on Iterative EM PCA" +author: "Birgit Karlhuber" +date: "2024-07-08" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Imputation Method based on Iterative EM PCA} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 7, + fig.height=4, + fig.align = "center" +) +``` + + +This vignette showcases the function `impPCA()`, which can be used to impute missing values with the help of a greedy algorithm for EM-PCA including robust methods. + +### Data + +The following example demonstrates the functionality of `impPCA()` using a subset of `sleep`. The columns have been selected deliberately to include some interactions between the missing values + +```{r, message=FALSE} +library(VIM) + +data(iris) +df <- iris[1:30,c(1,4)] +colnames(df) <- c("S.Length", "P.Width") # select two numerical variables +df_na <- df + +# randomly produce some missing values in the column P.Width +set.seed(1) +nbr_missing <- 10 +index_na <- sample(nrow(df_na),size = nbr_missing,replace = T) +index_na <- index_na[!duplicated(index_na)] +df_na[index_na,2] <- NA + +w <- is.na(df_na$`P.Width`) + +aggr(df_na) + +``` + + +## Imputation + +By setting method to "mcd" the robust estimation is used (instead of the default "classical"). With boot=FALSE imputed data set would be a data.frame else (boot=TRUE) it is a list where each list element contains a data.frame. + +```{r, message=FALSE, results='hide'} +imputed <- impPCA(df_na, method = "mcd", boot=TRUE)[[1]] +aggr(imputed) + +``` + +The plot shows that all missing values of the variable `P.Width` were imputed by the `impPCA()` function. + + +## Performance of method + +In the next plot the non-missing data points, the ones set to missing and the associated imputed values are visualized. Furthermore the **MAPE** short for **M**ean **A**bsolute **P**ercentage **E**rror and the **NRMSE** short for **N**ormalized **R**oot **M**ean **S**quared **E**rror are visualized in the plot. + +```{r, message=FALSE, fig.height=5} +# create plot +plot(`P.Width` ~ `S.Length`, data = df, type = "n", ylab = "P.Width", xlab="S.Length") +mtext(text = "impPCA robust", side = 3) +points(df$`S.Length`[!w], df$`P.Width`[!w]) +points(df$`S.Length`[w], df$`P.Width`[w], col = "grey", pch = 17) +points(imputed$`S.Length`[w], imputed$`P.Width`[w], col = "red", pch = 20, cex = 1.4) +segments(x0 = df$`S.Length`[w], x1 = imputed$`S.Length`[w], y0 = df$`P.Width`[w], +y1 = imputed$`P.Width`[w], lty = 2, col = "grey") +legend("topleft", legend = c("non-missings", "set to missing", "imputed values"), +pch = c(1,17,20), col = c("black","grey","red"), cex = 0.7) +mape <- round(100* 1/sum(is.na(df_na$`P.Width`)) * sum(abs((df$`P.Width` - +imputed$`P.Width`) / df$`P.Width`)), 2) +s2 <- var(df$`P.Width`) +nrmse <- round(sqrt(1/sum(is.na(df_na$`P.Width`)) * sum(abs((df$`P.Width` - +imputed$`P.Width`) / s2))), 2) +text(x = 5.6, y = 0.16, labels = paste("MAPE =", mape)) +text(x = 5.6, y = 0.12, labels = paste("NRMSE =", nrmse)) + +``` + diff --git a/vignettes/xgboostImpute.Rmd b/vignettes/xgboostImpute.Rmd new file mode 100644 index 0000000..e1d3727 --- /dev/null +++ b/vignettes/xgboostImpute.Rmd @@ -0,0 +1,141 @@ +--- +title: "Imputation Method based on Random Forest Model" +author: "Birgit Karlhuber" +date: "2024-07-08" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Imputation Method based on Random Forest Model} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 7, + fig.height=4, + fig.align = "center" +) +``` + + +This vignette showcases the function `xgboostImpute()`, which can be used to impute missing values based on a random forest model using `[xgboost::xgboost()]. + +### Data + +The following example demonstrates the functionality of `xgboostImpute()` using a subset of `sleep`. The columns have been selected deliberately to include some interactions between the missing values + +```{r, message=FALSE} +library(VIM) +dataset <- sleep[, c("Dream", "NonD", "BodyWgt", "Span")] # dataset with missings +dataset$BodyWgt <- log(dataset$BodyWgt) +dataset$Span <- log(dataset$Span) +aggr(dataset) +str(dataset) +``` + + +## Imputation + +In order to invoke the imputation methods, a formula is used to specify which +variables are to be estimated and which variables should be used as regressors.First `Dream` will be imputed based on `BodyWgt`. + +```{r, message=FALSE} +imp_xgboost <- xgboostImpute(formula=Dream~BodyWgt,data = dataset) +aggr(imp_xgboost, delimiter = "_imp") + +``` + +The plot shows that all missing values of the variable `Dream` were imputed by the `xgboostImpute()` function. + + +## Diagnosing the result + +As we can see in the next plot, the correlation structure of `Dream` and +`BodyWgt` is preserved by the imputation method. + +```{r, fig.height=5} +imp_xgboost[, c("Dream", "BodyWgt", "Dream_imp")] |> + marginplot(delimiter = "_imp") + +``` + + +## Imputing multiple variables + +To impute several variables at once, the formula can be specified with more than one column name on the +left hand side. + +```{r, message=FALSE} +imp_xgboost <- xgboostImpute(Dream+NonD+Span~BodyWgt,data=dataset) +aggr(imp_xgboost, delimiter = "_imp") + +``` + + +## Performance of method + +In order to validate the performance of `xgboostImpute()` the `iris` dataset is used. Firstly, some values are randomly set to `NA`. + +```{r} +library(reactable) + +data(iris) +df <- iris +colnames(df) <- c("S.Length","S.Width","P.Length","P.Width","Species") +# randomly produce some missing values in the data +set.seed(1) +nbr_missing <- 50 +y <- data.frame(row=sample(nrow(iris),size = nbr_missing,replace = T), + col=sample(ncol(iris)-1,size = nbr_missing,replace = T)) +y<-y[!duplicated(y),] +df[as.matrix(y)]<-NA + +aggr(df) +sapply(df, function(x)sum(is.na(x))) +``` + +We can see that there are missings in all variables and some observations reveal missing values on several points. In the next step we perform a multiple variable imputation and `Species` serves as a regressor. + +```{r, message=FALSE} +imp_xgboost <- xgboostImpute(S.Length + S.Width + P.Length + P.Width ~ Species, df) +aggr(imp_xgboost, delimiter = "imp") + +``` + +The plot indicates that all missing values have been imputed by the `xgboostImpute()` algorithm. The following table displays the rounded first five results of the imputation for all variables. + +```{r echo=F,warning=F} +results <- cbind("TRUE1" = as.numeric(iris[as.matrix(y[which(y$col==1),])]), + "IMPUTED1" = round(as.numeric(imp_xgboost[as.matrix(y[which(y$col==1),])]),2), + "TRUE2" = as.numeric(iris[as.matrix(y[which(y$col==2),])]), + "IMPUTED2" = round(as.numeric(imp_xgboost[as.matrix(y[which(y$col==2),])]),2), + "TRUE3" = as.numeric(iris[as.matrix(y[which(y$col==3),])]), + "IMPUTED3" = round(as.numeric(imp_xgboost[as.matrix(y[which(y$col==3),])]),2), + "TRUE4" = as.numeric(iris[as.matrix(y[which(y$col==4),])]), + "IMPUTED4" = round(as.numeric(imp_xgboost[as.matrix(y[which(y$col==4),])]),2))[1:5,] + +reactable(results, columns = list( + TRUE1 = colDef(name = "True"), + IMPUTED1 = colDef(name = "Imputed"), + TRUE2 = colDef(name = "True"), + IMPUTED2 = colDef(name = "Imputed"), + TRUE3 = colDef(name = "True"), + IMPUTED3 = colDef(name = "Imputed"), + TRUE4 = colDef(name = "True"), + IMPUTED4 = colDef(name = "Imputed") + ), + columnGroups = list( + colGroup(name = "S.Length", columns = c("TRUE1", "IMPUTED1")), + colGroup(name = "S.Width", columns = c("TRUE2", "IMPUTED2")), + colGroup(name = "P.Length", columns = c("TRUE3", "IMPUTED3")), + colGroup(name = "P.Width", columns = c("TRUE4", "IMPUTED4")) + ), + striped = TRUE, + highlight = TRUE, + bordered = TRUE +) + +```