From 8233d589b64a7c487d8413cc032ce921789cc7f7 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Wed, 2 Dec 2015 15:47:12 +0100
Subject: [PATCH 1/4] Improve predict function documentation

---
 R-package/R/predict.xgb.Booster.R           | 11 +++++++++++
 R-package/man/predict-xgb.Booster-method.Rd | 10 ++++++++++
 2 files changed, 21 insertions(+)

diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R
index abdb94e754b2..d608f3465177 100644
--- a/R-package/R/predict.xgb.Booster.R
+++ b/R-package/R/predict.xgb.Booster.R
@@ -20,6 +20,17 @@ setClass("xgb.Booster",
 #' only valid for gbtree, but not for gblinear. set it to be value bigger
 #' than 0. It will use all trees by default.
 #' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
+#'
+#' @details
+#' The purpose of \code{ntreelimit} is to let the user train a model with many
+#' trees but use only the first trees for prediction, to avoid overfitting
+#' without having to train a new model with fewer trees.
+#'
+#' The \code{predleaf} option is inspired by Section 3.1 of the paper
+#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
+#' The idea is to use the model as a generator of new features that capture
+#' non-linear relationships between the original features.
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd
index 13f37802e993..341ced8c6ac7 100644
--- a/R-package/man/predict-xgb.Booster-method.Rd
+++ b/R-package/man/predict-xgb.Booster-method.Rd
@@ -31,6 +31,16 @@ than 0. It will use all trees by default.}
 \description{
 Predicted values based on xgboost model object.
 }
+\details{
+The purpose of \code{ntreelimit} is to let the user train a model with many
+trees but use only the first trees for prediction, to avoid overfitting
+without having to train a new model with fewer trees.
+
+The \code{predleaf} option is inspired by Section 3.1 of the paper
+\code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
+The idea is to use the model as a generator of new features that capture
+non-linear relationships between the original features.
+}
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
From e384f549f4148f3daf650a8c1ccc701478d1b636 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Wed, 2 Dec 2015 15:47:45 +0100
Subject: [PATCH 2/4] Cleaning of demo

---
 R-package/demo/basic_walkthrough.R     | 4 ++--
 R-package/demo/boost_from_prediction.R | 2 +-
 R-package/demo/create_sparse_matrix.R  | 3 +--
 R-package/demo/cross_validation.R      | 4 ++--
 R-package/demo/predict_leaf_indices.R  | 8 ++++----
 5 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R
index 0b1e5b8172f1..193618be30e3 100644
--- a/R-package/demo/basic_walkthrough.R
+++ b/R-package/demo/basic_walkthrough.R
@@ -102,9 +102,9 @@ xgb.dump(bst, "dump.raw.txt", with.stats = T)
 
 # Finally, you can check which features are the most important.
print("Most important features (look at column Gain):") -imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt") +imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], model = bst) print(imp_matrix) # Feature importance bar plot by gain print("Feature importance Plot : ") -print(xgb.plot.importance(imp_matrix)) +print(xgb.plot.importance(importance_matrix = imp_matrix)) diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R index 9d7db806b9aa..7fa7d8545de4 100644 --- a/R-package/demo/boost_from_prediction.R +++ b/R-package/demo/boost_from_prediction.R @@ -23,4 +23,4 @@ setinfo(dtrain, "base_margin", ptrain) setinfo(dtest, "base_margin", ptest) print('this is result of boost from initial prediction') -bst <- xgb.train( param, dtrain, 1, watchlist ) +bst <- xgb.train(params = param, data = dtrain, nrounds = 1, watchlist = watchlist) diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R index 2fbf41772029..7a8dfaa82532 100644 --- a/R-package/demo/create_sparse_matrix.R +++ b/R-package/demo/create_sparse_matrix.R @@ -67,10 +67,9 @@ output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y] cat("Learning...\n") bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, eta = 1, nthread = 2, nround = 10,objective = "binary:logistic") -xgb.dump(bst, 'xgb.model.dump', with.stats = T) # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix. -importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump') +importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst) print(importance) # According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column). 
diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R
index c3148ae215b5..5d748f6797c9 100644
--- a/R-package/demo/cross_validation.R
+++ b/R-package/demo/cross_validation.R
@@ -43,9 +43,9 @@ evalerror <- function(preds, dtrain) {
 param <- list(max.depth=2,eta=1,silent=1, objective = logregobj, eval_metric = evalerror)
 
 # train with customized objective
-xgb.cv(param, dtrain, nround, nfold = 5)
+xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5)
 
 # do cross validation with prediction values for each fold
-res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
+res <- xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5, prediction = TRUE)
 res$dt
 length(res$pred)
diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R
index c03a17955f9d..110bf9602554 100644
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@@ -2,15 +2,15 @@ require(xgboost)
 # load in the agaricus dataset
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
 
-param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
+param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
 watchlist <- list(eval = dtest, train = dtrain)
 nround = 5
 
 # training the model for two rounds
-bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
+bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2, watchlist = watchlist)
 
 cat('start testing prediction from first n trees\n')
 ### predict using first 2 tree
From 0abb4338a9b01310dbabefb572fe04acee613b81 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Wed, 2 Dec 2015 15:48:01 +0100
Subject: [PATCH 3/4] Cleaning in documentation

---
 R-package/vignettes/discoverYourData.Rmd    | 4 ++--
 R-package/vignettes/xgboostPresentation.Rmd | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index 22d996b08f3c..08d6bfdf5144 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -190,7 +190,7 @@ Measure feature importance
 In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature).
 
 ```{r}
-importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst)
+importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
 head(importance)
 ```
 
@@ -213,7 +213,7 @@ One simple solution is to count the co-occurrences of a feature and a class of t
 For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
 
 ```{r}
-importanceRaw <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
+importanceRaw <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
 
 # Cleaning for better display
 importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd
index 45d2e8b8ea27..7534240ac287 100644
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@@ -345,7 +345,7 @@ Feature importance is similar to R gbm package's relative influence (rel.inf).
 ```
 importance_matrix <- xgb.importance(model = bst)
 print(importance_matrix)
-xgb.plot.importance(importance_matrix)
+xgb.plot.importance(importance_matrix = importance_matrix)
 ```
 
 View the trees from a model
From db922e8c88ff69413af288fbfb3586f5ca784874 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Wed, 2 Dec 2015 15:48:22 +0100
Subject: [PATCH 4/4] Small rewording function xgb.importance

---
 R-package/R/xgb.importance.R    | 13 ++++++++-----
 R-package/man/xgb.importance.Rd | 13 ++++++++-----
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 07211ff59ca5..e003277f07f3 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -25,14 +25,17 @@
 #' Results are returned for both linear and tree models.
 #'
 #' \code{data.table} is returned by the function.
-#' There are 3 columns :
+#' The columns are:
 #' \itemize{
-#'   \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
-#'   \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
-#'   \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
-#'   \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
+#'   \item \code{Features} names of the features as provided in \code{feature_names} or already present in the model dump;
+#'   \item \code{Gain} contribution of each feature to the model. For boosted tree models, the gain of each feature in each tree is taken into account and then averaged per feature, giving a view of the entire model. A higher percentage means a more important feature for predicting the \code{label} used for the training (only available for tree models);
+#'   \item \code{Cover} metric of the number of observations related to this feature (only available for tree models);
+#'   \item \code{Weight} percentage representing the relative number of times a feature has been used in the trees.
 #' }
 #'
+#' If you don't provide feature names, the feature indices are used instead.
+#' They are extracted from the model dump (made on the C++ side); the indices start at 0 (as usual in C++) instead of 1 (as usual in R).
+#'
 #' Co-occurence count
 #' ------------------
 #'
diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd
index c144bb85f8ff..0d59ba556e9e 100644
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -31,14 +31,17 @@ This is the function to understand the model trained (and through your model, yo
 Results are returned for both linear and tree models.
 
 \code{data.table} is returned by the function.
-There are 3 columns :
+The columns are:
 \itemize{
-  \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
-  \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
-  \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
-  \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
+  \item \code{Features} names of the features as provided in \code{feature_names} or already present in the model dump;
+  \item \code{Gain} contribution of each feature to the model. For boosted tree models, the gain of each feature in each tree is taken into account and then averaged per feature, giving a view of the entire model. A higher percentage means a more important feature for predicting the \code{label} used for the training (only available for tree models);
+  \item \code{Cover} metric of the number of observations related to this feature (only available for tree models);
+  \item \code{Weight} percentage representing the relative number of times a feature has been used in the trees.
 }
 
+If you don't provide feature names, the feature indices are used instead.
+They are extracted from the model dump (made on the C++ side); the indices start at 0 (as usual in C++) instead of 1 (as usual in R).
+
 Co-occurence count
 ------------------