From 1d504d6c6c6ca94724840a9ae39367aebde4ee00 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Sun, 27 Mar 2016 19:11:22 -0500 Subject: [PATCH 1/6] added XGDMatrixNumCol_R function --- R-package/src/xgboost_R.cc | 8 ++++++++ R-package/src/xgboost_R.h | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 665fb5faa284..846ad9f47f5f 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -195,6 +195,14 @@ SEXP XGDMatrixNumRow_R(SEXP handle) { return ScalarInteger(static_cast(nrow)); } +SEXP XGDMatrixNumCol_R(SEXP handle) { + bst_ulong ncol; + R_API_BEGIN(); + CHECK_CALL(XGDMatrixNumCol(R_ExternalPtrAddr(handle), &ncol)); + R_API_END(); + return ScalarInteger(static_cast(ncol)); +} + // functions related to booster void _BoosterFinalizer(SEXP ext) { if (R_ExternalPtrAddr(ext) == NULL) return; diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 123ebe028a9f..6e173946468a 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -83,10 +83,16 @@ XGB_DLL SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field); /*! * \brief return number of rows - * \param handle a instance of data matrix + * \param handle an instance of data matrix */ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle); +/*! + * \brief return number of columns + * \param handle an instance of data matrix + */ +XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle); + /*! * \brief create xgboost learner * \param dmats a list of dmatrix handles that will be cached From d27bfb61b07bbe234bce9f7718f1b42420865d12 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Sun, 27 Mar 2016 19:17:13 -0500 Subject: [PATCH 2/6] consolidated DMatrix&Booster stuff into xgb.DMatrix.R & xgb.Booster.R --- R-package/R/getinfo.xgb.DMatrix.R | 55 ------------- R-package/R/nrow.xgb.DMatrix.R | 19 ----- R-package/R/predict.xgb.Booster.R | 80 ------------------- R-package/R/predict.xgb.Booster.handle.R | 18 ----- R-package/R/setinfo.xgb.DMatrix.R | 37 --------- R-package/R/slice.xgb.DMatrix.R | 44 ---------- R-package/man/nrow-xgb.DMatrix-method.Rd | 23 ------ R-package/man/predict-xgb.Booster-method.Rd | 53 ------------ .../man/predict-xgb.Booster.handle-method.Rd | 18 ----- R-package/man/slice.Rd | 31 ------- 10 files changed, 378 deletions(-) delete mode 100644 R-package/R/getinfo.xgb.DMatrix.R delete mode 100644 R-package/R/nrow.xgb.DMatrix.R delete mode 100644 R-package/R/predict.xgb.Booster.R delete mode 100644 R-package/R/predict.xgb.Booster.handle.R delete mode 100644 R-package/R/setinfo.xgb.DMatrix.R delete mode 100644 R-package/R/slice.xgb.DMatrix.R delete mode 100644 R-package/man/nrow-xgb.DMatrix-method.Rd delete mode 100644 R-package/man/predict-xgb.Booster-method.Rd delete mode 100644 R-package/man/predict-xgb.Booster.handle-method.Rd delete mode 100644 R-package/man/slice.Rd diff --git a/R-package/R/getinfo.xgb.DMatrix.R b/R-package/R/getinfo.xgb.DMatrix.R deleted file mode 100644 index 3000a1e7d209..000000000000 --- a/R-package/R/getinfo.xgb.DMatrix.R +++ /dev/null @@ -1,55 +0,0 @@ -setClass('xgb.DMatrix') - -#' Get information of an xgb.DMatrix object -#' -#' Get information of an xgb.DMatrix object -#' -#' The information can be one of the following: -#' -#' \itemize{ -#' \item \code{label}: label Xgboost learn from ; -#' \item \code{weight}: to do a weight rescale ; -#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ; -#' \item \code{nrow}: number of rows of the \code{xgb.DMatrix}. -#' } -#' -#' @examples -#' data(agaricus.train, package='xgboost') -#' train <- agaricus.train -#' dtrain <- xgb.DMatrix(train$data, label=train$label) -#' labels <- getinfo(dtrain, 'label') -#' setinfo(dtrain, 'label', 1-labels) -#' labels2 <- getinfo(dtrain, 'label') -#' stopifnot(all(labels2 == 1-labels)) -#' @rdname getinfo -#' @export -getinfo <- function(object, ...){ - UseMethod("getinfo") -} - - - -#' @param object Object of class \code{xgb.DMatrix} -#' @param name the name of the field to get -#' @param ... other parameters -#' @rdname getinfo -#' @method getinfo xgb.DMatrix -setMethod("getinfo", signature = "xgb.DMatrix", - definition = function(object, name) { - if (typeof(name) != "character") { - stop("xgb.getinfo: name must be character") - } - if (class(object) != "xgb.DMatrix") { - stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix") - } - if (name != "label" && name != "weight" && - name != "base_margin" && name != "nrow") { - stop(paste("xgb.getinfo: unknown info name", name)) - } - if (name != "nrow"){ - ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost") - } else { - ret <- xgb.numrow(object) - } - return(ret) - }) diff --git a/R-package/R/nrow.xgb.DMatrix.R b/R-package/R/nrow.xgb.DMatrix.R deleted file mode 100644 index 9ea039764945..000000000000 --- a/R-package/R/nrow.xgb.DMatrix.R +++ /dev/null @@ -1,19 +0,0 @@ -setGeneric("nrow") - -#' @title Number of xgb.DMatrix rows -#' @description \code{nrow} return the number of rows present in the \code{xgb.DMatrix}. -#' @param x Object of class \code{xgb.DMatrix} -#' -#' @examples -#' data(agaricus.train, package='xgboost') -#' train <- agaricus.train -#' dtrain <- xgb.DMatrix(train$data, label=train$label) -#' stopifnot(nrow(dtrain) == nrow(train$data)) -#' -#' @export -setMethod("nrow", - signature = "xgb.DMatrix", - definition = function(x) { - xgb.numrow(x) - } -) diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R deleted file mode 100644 index d608f3465177..000000000000 --- a/R-package/R/predict.xgb.Booster.R +++ /dev/null @@ -1,80 +0,0 @@ -setClass("xgb.Booster.handle") -setClass("xgb.Booster", - slots = c(handle = "xgb.Booster.handle", - raw = "raw")) - -#' Predict method for eXtreme Gradient Boosting model -#' -#' Predicted values based on xgboost model object. -#' -#' @param object Object of class "xgb.Boost" -#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or -#' \code{xgb.DMatrix}. -#' @param missing Missing is only used when input is dense matrix, pick a float -#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. -#' @param outputmargin whether the prediction should be shown in the original -#' value of sum of functions, when outputmargin=TRUE, the prediction is -#' untransformed margin value. In logistic regression, outputmargin=T will -#' output value before logistic transformation. -#' @param ntreelimit limit number of trees used in prediction, this parameter is -#' only valid for gbtree, but not for gblinear. set it to be value bigger -#' than 0. It will use all trees by default. -#' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object. -#' -#' @details -#' The option \code{ntreelimit} purpose is to let the user train a model with lots -#' of trees but use only the first trees for prediction to avoid overfitting -#' (without having to train a new model with less trees). -#' -#' The option \code{predleaf} purpose is inspired from §3.1 of the paper -#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}. -#' The idea is to use the model as a generator of new features which capture non linear link -#' from original features. -#' -#' @examples -#' data(agaricus.train, package='xgboost') -#' data(agaricus.test, package='xgboost') -#' train <- agaricus.train -#' test <- agaricus.test -#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, -#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -#' pred <- predict(bst, test$data) -#' @export -setMethod("predict", signature = "xgb.Booster", - definition = function(object, newdata, missing = NA, - outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) { - if (class(object) != "xgb.Booster"){ - stop("predict: model in prediction must be of class xgb.Booster") - } else { - object <- xgb.Booster.check(object, saveraw = FALSE) - } - if (class(newdata) != "xgb.DMatrix") { - newdata <- xgb.DMatrix(newdata, missing = missing) - } - if (is.null(ntreelimit)) { - ntreelimit <- 0 - } else { - if (ntreelimit < 1){ - stop("predict: ntreelimit must be equal to or greater than 1") - } - } - option <- 0 - if (outputmargin) { - option <- option + 1 - } - if (predleaf) { - option <- option + 2 - } - ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option), - as.integer(ntreelimit), PACKAGE = "xgboost") - if (predleaf){ - len <- getinfo(newdata, "nrow") - if (length(ret) == len){ - ret <- matrix(ret,ncol = 1) - } else { - ret <- matrix(ret, ncol = len) - ret <- t(ret) - } - } - return(ret) -}) diff --git a/R-package/R/predict.xgb.Booster.handle.R b/R-package/R/predict.xgb.Booster.handle.R deleted file mode 100644 index 3e4013b759dc..000000000000 --- a/R-package/R/predict.xgb.Booster.handle.R +++ /dev/null @@ -1,18 +0,0 @@ -#' Predict method for eXtreme Gradient Boosting model handle -#' -#' Predicted values based on xgb.Booster.handle object. -#' -#' @param object Object of class "xgb.Boost.handle" -#' @param ... Parameters pass to \code{predict.xgb.Booster} -#' -setMethod("predict", signature = "xgb.Booster.handle", - definition = function(object, ...) { - if (class(object) != "xgb.Booster.handle"){ - stop("predict: model in prediction must be of class xgb.Booster.handle") - } - - bst <- xgb.handleToBooster(object) - - ret <- predict(bst, ...) - return(ret) -}) diff --git a/R-package/R/setinfo.xgb.DMatrix.R b/R-package/R/setinfo.xgb.DMatrix.R deleted file mode 100644 index 427de08d4ae7..000000000000 --- a/R-package/R/setinfo.xgb.DMatrix.R +++ /dev/null @@ -1,37 +0,0 @@ -#' Set information of an xgb.DMatrix object -#' -#' Set information of an xgb.DMatrix object -#' -#' It can be one of the following: -#' -#' \itemize{ -#' \item \code{label}: label Xgboost learn from ; -#' \item \code{weight}: to do a weight rescale ; -#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ; -#' \item \code{group}. -#' } -#' -#' @examples -#' data(agaricus.train, package='xgboost') -#' train <- agaricus.train -#' dtrain <- xgb.DMatrix(train$data, label=train$label) -#' labels <- getinfo(dtrain, 'label') -#' setinfo(dtrain, 'label', 1-labels) -#' labels2 <- getinfo(dtrain, 'label') -#' stopifnot(all(labels2 == 1-labels)) -#' @rdname setinfo -#' @export -setinfo <- function(object, ...){ - UseMethod("setinfo") -} - -#' @param object Object of class "xgb.DMatrix" -#' @param name the name of the field to get -#' @param info the specific field of information to set -#' @param ... other parameters -#' @rdname setinfo -#' @method setinfo xgb.DMatrix -setMethod("setinfo", signature = "xgb.DMatrix", - definition = function(object, name, info) { - xgb.setinfo(object, name, info) - }) diff --git a/R-package/R/slice.xgb.DMatrix.R b/R-package/R/slice.xgb.DMatrix.R deleted file mode 100644 index 4626c2b4d80a..000000000000 --- a/R-package/R/slice.xgb.DMatrix.R +++ /dev/null @@ -1,44 +0,0 @@ -setClass('xgb.DMatrix') - -#' Get a new DMatrix containing the specified rows of -#' orginal xgb.DMatrix object -#' -#' Get a new DMatrix containing the specified rows of -#' orginal xgb.DMatrix object -#' -#' @examples -#' data(agaricus.train, package='xgboost') -#' train <- agaricus.train -#' dtrain <- xgb.DMatrix(train$data, label=train$label) -#' dsub <- slice(dtrain, 1:3) -#' @rdname slice -#' @export -slice <- function(object, ...){ - UseMethod("slice") -} - -#' @param object Object of class "xgb.DMatrix" -#' @param idxset a integer vector of indices of rows needed -#' @param ... other parameters -#' @rdname slice -#' @method slice xgb.DMatrix -setMethod("slice", signature = "xgb.DMatrix", - definition = function(object, idxset, ...) { - if (class(object) != "xgb.DMatrix") { - stop("slice: first argument dtrain must be xgb.DMatrix") - } - ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, - PACKAGE = "xgboost") - - attr_list <- attributes(object) - nr <- xgb.numrow(object) - len <- sapply(attr_list,length) - ind <- which(len == nr) - if (length(ind) > 0) { - nms <- names(attr_list)[ind] - for (i in 1:length(ind)) { - attr(ret,nms[i]) <- attr(object,nms[i])[idxset] - } - } - return(structure(ret, class = "xgb.DMatrix")) - }) diff --git a/R-package/man/nrow-xgb.DMatrix-method.Rd b/R-package/man/nrow-xgb.DMatrix-method.Rd deleted file mode 100644 index 1fd52b9c1a8d..000000000000 --- a/R-package/man/nrow-xgb.DMatrix-method.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/nrow.xgb.DMatrix.R -\docType{methods} -\name{nrow,xgb.DMatrix-method} -\alias{nrow,xgb.DMatrix-method} -\title{Number of xgb.DMatrix rows} -\usage{ -\S4method{nrow}{xgb.DMatrix}(x) -} -\arguments{ -\item{x}{Object of class \code{xgb.DMatrix}} -} -\description{ -\code{nrow} return the number of rows present in the \code{xgb.DMatrix}. -} -\examples{ -data(agaricus.train, package='xgboost') -train <- agaricus.train -dtrain <- xgb.DMatrix(train$data, label=train$label) -stopifnot(nrow(dtrain) == nrow(train$data)) - -} - diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd deleted file mode 100644 index 341ced8c6ac7..000000000000 --- a/R-package/man/predict-xgb.Booster-method.Rd +++ /dev/null @@ -1,53 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/predict.xgb.Booster.R -\docType{methods} -\name{predict,xgb.Booster-method} -\alias{predict,xgb.Booster-method} -\title{Predict method for eXtreme Gradient Boosting model} -\usage{ -\S4method{predict}{xgb.Booster}(object, newdata, missing = NA, - outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) -} -\arguments{ -\item{object}{Object of class "xgb.Boost"} - -\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or -\code{xgb.DMatrix}.} - -\item{missing}{Missing is only used when input is dense matrix, pick a float -value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} - -\item{outputmargin}{whether the prediction should be shown in the original -value of sum of functions, when outputmargin=TRUE, the prediction is -untransformed margin value. In logistic regression, outputmargin=T will -output value before logistic transformation.} - -\item{ntreelimit}{limit number of trees used in prediction, this parameter is -only valid for gbtree, but not for gblinear. set it to be value bigger -than 0. It will use all trees by default.} - -\item{predleaf}{whether predict leaf index instead. If set to TRUE, the output will be a matrix object.} -} -\description{ -Predicted values based on xgboost model object. -} -\details{ -The option \code{ntreelimit} purpose is to let the user train a model with lots -of trees but use only the first trees for prediction to avoid overfitting -(without having to train a new model with less trees). - -The option \code{predleaf} purpose is inspired from §3.1 of the paper -\code{Practical Lessons from Predicting Clicks on Ads at Facebook}. -The idea is to use the model as a generator of new features which capture non linear link -from original features. -} -\examples{ -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') -train <- agaricus.train -test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, - eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -pred <- predict(bst, test$data) -} - diff --git a/R-package/man/predict-xgb.Booster.handle-method.Rd b/R-package/man/predict-xgb.Booster.handle-method.Rd deleted file mode 100644 index 34454e555fd9..000000000000 --- a/R-package/man/predict-xgb.Booster.handle-method.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/predict.xgb.Booster.handle.R -\docType{methods} -\name{predict,xgb.Booster.handle-method} -\alias{predict,xgb.Booster.handle-method} -\title{Predict method for eXtreme Gradient Boosting model handle} -\usage{ -\S4method{predict}{xgb.Booster.handle}(object, ...) -} -\arguments{ -\item{object}{Object of class "xgb.Boost.handle"} - -\item{...}{Parameters pass to \code{predict.xgb.Booster}} -} -\description{ -Predicted values based on xgb.Booster.handle object. -} - diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd deleted file mode 100644 index b177221157ed..000000000000 --- a/R-package/man/slice.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/slice.xgb.DMatrix.R -\docType{methods} -\name{slice} -\alias{slice} -\alias{slice,xgb.DMatrix-method} -\title{Get a new DMatrix containing the specified rows of -orginal xgb.DMatrix object} -\usage{ -slice(object, ...) - -\S4method{slice}{xgb.DMatrix}(object, idxset, ...) -} -\arguments{ -\item{object}{Object of class "xgb.DMatrix"} - -\item{...}{other parameters} - -\item{idxset}{a integer vector of indices of rows needed} -} -\description{ -Get a new DMatrix containing the specified rows of -orginal xgb.DMatrix object -} -\examples{ -data(agaricus.train, package='xgboost') -train <- agaricus.train -dtrain <- xgb.DMatrix(train$data, label=train$label) -dsub <- slice(dtrain, 1:3) -} - From 71f402ac16ea408321c41473609a1a537570f4f4 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Sun, 27 Mar 2016 19:22:22 -0500 Subject: [PATCH 3/6] convert S4 to S3; add some extra methods to DMatrix --- R-package/NAMESPACE | 12 +- R-package/R/utils.R | 129 +---------- R-package/R/xgb.Booster.R | 144 ++++++++++++ R-package/R/xgb.DMatrix.R | 322 +++++++++++++++++++++++++- R-package/R/xgb.cv.R | 4 +- R-package/man/dim.xgb.DMatrix.Rd | 29 +++ R-package/man/dimnames.xgb.DMatrix.Rd | 36 +++ R-package/man/getinfo.Rd | 13 +- R-package/man/predict.xgb.Booster.Rd | 58 +++++ R-package/man/print.xgb.DMatrix.Rd | 18 ++ R-package/man/setinfo.Rd | 12 +- R-package/man/slice.xgb.DMatrix.Rd | 41 ++++ R-package/man/xgb.DMatrix.Rd | 6 +- 13 files changed, 675 insertions(+), 149 deletions(-) create mode 100644 R-package/R/xgb.Booster.R create mode 100644 R-package/man/dim.xgb.DMatrix.Rd create mode 100644 R-package/man/dimnames.xgb.DMatrix.Rd create mode 100644 R-package/man/predict.xgb.Booster.Rd create mode 100644 R-package/man/print.xgb.DMatrix.Rd create mode 100644 R-package/man/slice.xgb.DMatrix.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 3cd80d5c2f5b..dd788ca2f38a 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -1,6 +1,16 @@ # Generated by roxygen2: do not edit by hand +S3method("[",xgb.DMatrix) +S3method("dimnames<-",xgb.DMatrix) +S3method(dim,xgb.DMatrix) +S3method(dimnames,xgb.DMatrix) +S3method(getinfo,xgb.DMatrix) +S3method(predict,xgb.Booster) +S3method(predict,xgb.Booster.handle) +S3method(setinfo,xgb.DMatrix) +S3method(slice,xgb.DMatrix) export(getinfo) +export(print.xgb.DMatrix) export(setinfo) export(slice) export(xgb.DMatrix) @@ -19,8 +29,6 @@ export(xgb.save) export(xgb.save.raw) export(xgb.train) export(xgboost) -exportMethods(nrow) -exportMethods(predict) import(methods) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgeMatrix) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 926e82994d10..514826b3c414 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -1,4 +1,4 @@ - #' @importClassesFrom Matrix dgCMatrix dgeMatrix +#' @importClassesFrom Matrix dgCMatrix dgeMatrix #' @import methods # depends on matrix @@ -9,131 +9,10 @@ library.dynam.unload("xgboost", libpath) } -# set information into dmatrix, this mutate dmatrix -xgb.setinfo <- function(dmat, name, info) { - if (class(dmat) != "xgb.DMatrix") { - stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix") - } - if (name == "label") { - if (length(info) != xgb.numrow(dmat)) - stop("The length of labels must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), - PACKAGE = "xgboost") - return(TRUE) - } - if (name == "weight") { - if (length(info) != xgb.numrow(dmat)) - stop("The length of weights must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), - PACKAGE = "xgboost") - return(TRUE) - } - if (name == "base_margin") { - # if (length(info)!=xgb.numrow(dmat)) - # stop("The length of base margin must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), - PACKAGE = "xgboost") - return(TRUE) - } - if (name == "group") { - if (sum(info) != xgb.numrow(dmat)) - stop("The sum of groups must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), - PACKAGE = "xgboost") - return(TRUE) - } - stop(paste("xgb.setinfo: unknown info name", name)) - return(FALSE) -} - -# construct a Booster from cachelist -xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { - if (typeof(cachelist) != "list") { - stop("xgb.Booster: only accepts list of DMatrix as cachelist") - } - for (dm in cachelist) { - if (class(dm) != "xgb.DMatrix") { - stop("xgb.Booster: only accepts list of DMatrix as cachelist") - } - } - handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost") - if (length(params) != 0) { - for (i in 1:length(params)) { - p <- params[i] - .Call("XGBoosterSetParam_R", handle, gsub("\\.", "_", names(p)), as.character(p), - PACKAGE = "xgboost") - } - } - if (!is.null(modelfile)) { - if (typeof(modelfile) == "character") { - .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost") - } else if (typeof(modelfile) == "raw") { - .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost") - } else { - stop("xgb.Booster: modelfile must be character or raw vector") - } - } - return(structure(handle, class = "xgb.Booster.handle")) -} -# convert xgb.Booster.handle to xgb.Booster -xgb.handleToBooster <- function(handle, raw = NULL) -{ - bst <- list(handle = handle, raw = raw) - class(bst) <- "xgb.Booster" - return(bst) -} - -# Check whether an xgb.Booster object is complete -xgb.Booster.check <- function(bst, saveraw = TRUE) -{ - isnull <- is.null(bst$handle) - if (!isnull) { - isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost") - } - if (isnull) { - bst$handle <- xgb.Booster(modelfile = bst$raw) - } else { - if (is.null(bst$raw) && saveraw) - bst$raw <- xgb.save.raw(bst$handle) - } - return(bst) -} - -## ----the following are low level iteratively function, not needed if +## ----the following are low level iterative functions, not needed if ## you do not want to use them --------------------------------------- -# get dmatrix from data, label -xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) { - inClass <- class(data) - if (inClass == "dgCMatrix" || inClass == "matrix") { - if (is.null(label)) { - stop("xgboost: need label when data is a matrix") - } - dtrain <- xgb.DMatrix(data, label = label, missing = missing) - if (!is.null(weight)){ - xgb.setinfo(dtrain, "weight", weight) - } - } else { - if (!is.null(label)) { - warning("xgboost: label will be ignored.") - } - if (inClass == "character") { - dtrain <- xgb.DMatrix(data) - } else if (inClass == "xgb.DMatrix") { - dtrain <- data - } else if (inClass == "data.frame") { - stop("xgboost only support numerical matrix input, - use 'data.matrix' to transform the data.") - } else { - stop("xgboost: Invalid input of data") - } - } - return (dtrain) -} -xgb.numrow <- function(dmat) { - nrow <- .Call("XGDMatrixNumRow_R", dmat, PACKAGE="xgboost") - return(nrow) -} + # iteratively update booster with customized statistics xgb.iter.boost <- function(booster, dtrain, gpair) { if (class(booster) != "xgb.Booster.handle") { @@ -227,7 +106,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { "\tConsider providing pre-computed CV-folds through the folds parameter.") } y <- getinfo(dall, 'label') - randidx <- sample(1 : xgb.numrow(dall)) + randidx <- sample(1 : nrow(dall)) if (stratified & length(y) == length(randidx)) { y <- y[randidx] # diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R new file mode 100644 index 000000000000..97f5ea37fd86 --- /dev/null +++ b/R-package/R/xgb.Booster.R @@ -0,0 +1,144 @@ +# Construct a Booster from cachelist +# internal utility function +xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { + if (typeof(cachelist) != "list") { + stop("xgb.Booster: only accepts list of DMatrix as cachelist") + } + for (dm in cachelist) { + if (class(dm) != "xgb.DMatrix") { + stop("xgb.Booster: only accepts list of DMatrix as cachelist") + } + } + handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost") + if (length(params) != 0) { + for (i in 1:length(params)) { + p <- params[i] + .Call("XGBoosterSetParam_R", handle, gsub("\\.", "_", names(p)), as.character(p), + PACKAGE = "xgboost") + } + } + if (!is.null(modelfile)) { + if (typeof(modelfile) == "character") { + .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost") + } else if (typeof(modelfile) == "raw") { + .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost") + } else { + stop("xgb.Booster: modelfile must be character or raw vector") + } + } + return(structure(handle, class = "xgb.Booster.handle")) +} + +# Convert xgb.Booster.handle to xgb.Booster +# internal utility function +xgb.handleToBooster <- function(handle, raw = NULL) +{ + bst <- list(handle = handle, raw = raw) + class(bst) <- "xgb.Booster" + return(bst) +} + +# Check whether an xgb.Booster object is complete +# internal utility function +xgb.Booster.check <- function(bst, saveraw = TRUE) +{ + isnull <- is.null(bst$handle) + if (!isnull) { + isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost") + } + if (isnull) { + bst$handle <- xgb.Booster(modelfile = bst$raw) + } else { + if (is.null(bst$raw) && saveraw) + bst$raw <- xgb.save.raw(bst$handle) + } + return(bst) +} + +#' Predict method for eXtreme Gradient Boosting model +#' +#' Predicted values based on either xgboost model or model handle object. +#' +#' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle} +#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or +#' \code{xgb.DMatrix}. +#' @param missing Missing is only used when input is dense matrix, pick a float +#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. +#' @param outputmargin whether the prediction should be shown in the original +#' value of sum of functions, when outputmargin=TRUE, the prediction is +#' untransformed margin value. In logistic regression, outputmargin=T will +#' output value before logistic transformation. +#' @param ntreelimit limit number of trees used in prediction, this parameter is +#' only valid for gbtree, but not for gblinear. set it to be value bigger +#' than 0. It will use all trees by default. +#' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object. +#' @param ... Parameters pass to \code{predict.xgb.Booster} +#' +#' @details +#' The option \code{ntreelimit} purpose is to let the user train a model with lots +#' of trees but use only the first trees for prediction to avoid overfitting +#' (without having to train a new model with less trees). +#' +#' The option \code{predleaf} purpose is inspired from §3.1 of the paper +#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}. +#' The idea is to use the model as a generator of new features which capture non linear link +#' from original features. +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' data(agaricus.test, package='xgboost') +#' train <- agaricus.train +#' test <- agaricus.test +#' +#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") +#' pred <- predict(bst, test$data) +#' @rdname predict.xgb.Booster +#' @export +predict.xgb.Booster <- function(object, newdata, missing = NA, + outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) { + if (class(object) != "xgb.Booster"){ + stop("predict: model in prediction must be of class xgb.Booster") + } else { + object <- xgb.Booster.check(object, saveraw = FALSE) + } + if (class(newdata) != "xgb.DMatrix") { + newdata <- xgb.DMatrix(newdata, missing = missing) + } + if (is.null(ntreelimit)) { + ntreelimit <- 0 + } else { + if (ntreelimit < 1){ + stop("predict: ntreelimit must be equal to or greater than 1") + } + } + option <- 0 + if (outputmargin) { + option <- option + 1 + } + if (predleaf) { + option <- option + 2 + } + ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option), + as.integer(ntreelimit), PACKAGE = "xgboost") + if (predleaf){ + len <- getinfo(newdata, "nrow") + if (length(ret) == len){ + ret <- matrix(ret,ncol = 1) + } else { + ret <- matrix(ret, ncol = len) + ret <- t(ret) + } + } + return(ret) +} + +#' @rdname predict.xgb.Booster +#' @export +predict.xgb.Booster.handle <- function(object, ...) { + + bst <- xgb.handleToBooster(object) + + ret <- predict(bst, ...) + return(ret) +} diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index c34c65d95b4a..744c2a05cee8 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -1,9 +1,9 @@ #' Contruct xgb.DMatrix object #' -#' Contruct xgb.DMatrix object from dense matrix, sparse matrix or local file. +#' Contruct xgb.DMatrix object from dense matrix, sparse matrix +#' or local file (that was created previously by saving an \code{xgb.DMatrix}). #' -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character -#' indicating the data file. +#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename #' @param info a list of information of the xgb.DMatrix object #' @param missing Missing is only used when input is dense matrix, pick a float #' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. @@ -18,27 +18,339 @@ #' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' @export xgb.DMatrix <- function(data, info = list(), missing = NA, ...) { + cnames <- NULL if (typeof(data) == "character") { handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE = "xgboost") } else if (is.matrix(data)) { handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, PACKAGE = "xgboost") + cnames <- colnames(data) } else if (class(data) == "dgCMatrix") { handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, PACKAGE = "xgboost") + cnames <- colnames(data) } else { stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data))) } - dmat <- structure(handle, class = "xgb.DMatrix") + dmat <- handle + attributes(dmat) <- list(.Dimnames = list(NULL, cnames), class = "xgb.DMatrix") + #dmat <- list(handle = handle, colnames = cnames) + #attr(dmat, 'class') <- "xgb.DMatrix" info <- append(info, list(...)) if (length(info) == 0) return(dmat) for (i in 1:length(info)) { p <- info[i] - xgb.setinfo(dmat, names(p), p[[1]]) + setinfo(dmat, names(p), p[[1]]) } return(dmat) } + + +# get dmatrix from data, label +# internal helper method +xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) { + inClass <- class(data) + if (inClass == "dgCMatrix" || inClass == "matrix") { + if (is.null(label)) { + stop("xgboost: need label when data is a matrix") + } + dtrain <- xgb.DMatrix(data, label = label, missing = missing) + if (!is.null(weight)){ + setinfo(dtrain, "weight", weight) + } + } else { + if (!is.null(label)) { + warning("xgboost: label will be ignored.") + } + if (inClass == "character") { + dtrain <- xgb.DMatrix(data) + } else if (inClass == "xgb.DMatrix") { + dtrain <- data + } else if (inClass == "data.frame") { + stop("xgboost only support numerical matrix input, + use 'data.matrix' to transform the data.") + } else { + stop("xgboost: Invalid input of data") + } + } + return (dtrain) +} + + +#' Dimensions of xgb.DMatrix +#' +#' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}. +#' @param x Object of class \code{xgb.DMatrix} +#' +#' @details +#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also +#' be directly used with an \code{xgb.DMatrix} object. +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' +#' stopifnot(nrow(dtrain) == nrow(train$data)) +#' stopifnot(ncol(dtrain) == ncol(train$data)) +#' stopifnot(all(dim(dtrain) == dim(train$data))) +#' +#' @export +dim.xgb.DMatrix <- function(x) { + c(.Call("XGDMatrixNumRow_R", x, PACKAGE="xgboost"), + .Call("XGDMatrixNumCol_R", x, PACKAGE="xgboost")) +} + + +#' Handling of column names of \code{xgb.DMatrix} +#' +#' Only column names are supported for \code{xgb.DMatrix}, thus setting of +#' row names would have no effect and returnten row names would be NULL. +#' +#' @param x object of class \code{xgb.DMatrix} +#' @param value a list of two elements: the first one is ignored +#' and the second one is column names +#' +#' @details +#' Generic \code{dimnames} methods are used by \code{colnames}. +#' Since row names are irrelevant, it is recommended to use \code{colnames} directly. +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' dimnames(dtrain) +#' colnames(dtrain) +#' colnames(dtrain) <- make.names(1:ncol(train$data)) +#' print(dtrain, verbose=TRUE) +#' +#' @rdname dimnames.xgb.DMatrix +#' @export +dimnames.xgb.DMatrix <- function(x) { + attr(x, '.Dimnames') +} + +#' @rdname dimnames.xgb.DMatrix +#' @export +`dimnames<-.xgb.DMatrix` <- function(x, value) { + if (!is.list(value) || length(value) != 2L) + stop("invalid 'dimnames' given: must be a list of two elements") + if (!is.null(value[[1L]])) + stop("xgb.DMatrix does not have rownames") + if (is.null(value[[2]])) { + attr(x, '.Dimnames') <- NULL + return(x) + } + if (ncol(x) != length(value[[2]])) + stop("can't assign ", length(value[[2]]), " colnames to a ", + ncol(x), " column xgb.DMatrix") + attr(x, '.Dimnames') <- value + x +} + + +#' Get information of an xgb.DMatrix object +#' +#' Get information of an xgb.DMatrix object +#' @param object Object of class \code{xgb.DMatrix} +#' @param name the name of the information field to get (see details) +#' @param ... other parameters +#' +#' @details +#' The \code{name} field can be one of the following: +#' +#' \itemize{ +#' \item \code{label}: label Xgboost learn from ; +#' \item \code{weight}: to do a weight rescale ; +#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ; +#' \item \code{nrow}: number of rows of the \code{xgb.DMatrix}. +#' } +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' +#' labels <- getinfo(dtrain, 'label') +#' setinfo(dtrain, 'label', 1-labels) +#' +#' labels2 <- getinfo(dtrain, 'label') +#' stopifnot(all(labels2 == 1-labels)) +#' @rdname getinfo +#' @export +getinfo <- function(object, ...) UseMethod("getinfo") + +#' @rdname getinfo +#' @export +getinfo.xgb.DMatrix <- function(object, name) { + if (typeof(name) != "character") { + stop("getinfo: name must be character") + } + if (name != "label" && name != "weight" && + name != "base_margin" && name != "nrow") { + stop(paste("getinfo: unknown info name", name)) + } + if (name != "nrow"){ + ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost") + } else { + ret <- nrow(object) + } + return(ret) +} + + +#' Set information of an xgb.DMatrix object +#' +#' Set information of an xgb.DMatrix object +#' +#' @param object Object of class "xgb.DMatrix" +#' @param name the name of the field to get +#' @param info the specific field of information to set +#' @param ... other parameters +#' +#' @details +#' The \code{name} field can be one of the following: +#' +#' \itemize{ +#' \item \code{label}: label Xgboost learn from ; +#' \item \code{weight}: to do a weight rescale ; +#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ; +#' \item \code{group}. +#' } +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' +#' labels <- getinfo(dtrain, 'label') +#' setinfo(dtrain, 'label', 1-labels) +#' labels2 <- getinfo(dtrain, 'label') +#' stopifnot(all.equal(labels2, 1-labels)) +#' @rdname setinfo +#' @export +setinfo <- function(object, ...) UseMethod("setinfo") + +#' @rdname setinfo +#' @export +setinfo.xgb.DMatrix <- function(object, name, info) { + if (name == "label") { + if (length(info) != nrow(object)) + stop("The length of labels must equal to the number of rows in the input data") + .Call("XGDMatrixSetInfo_R", object, name, as.numeric(info), + PACKAGE = "xgboost") + return(TRUE) + } + if (name == "weight") { + if (length(info) != nrow(object)) + stop("The length of weights must equal to the number of rows in the input data") + .Call("XGDMatrixSetInfo_R", object, name, as.numeric(info), + PACKAGE = "xgboost") + return(TRUE) + } + if (name == "base_margin") { + # if (length(info)!=nrow(object)) + # stop("The length of base margin must equal to the number of rows in the input data") + .Call("XGDMatrixSetInfo_R", object, name, as.numeric(info), + PACKAGE = "xgboost") + return(TRUE) + } + if (name == "group") { + if (sum(info) != nrow(object)) + stop("The sum of groups must equal to the number of rows in the input data") + .Call("XGDMatrixSetInfo_R", object, name, as.integer(info), + PACKAGE = "xgboost") + return(TRUE) + } + stop(paste("setinfo: unknown info name", name)) + return(FALSE) +} + + +#' Get a new DMatrix containing the specified rows of +#' orginal xgb.DMatrix object +#' +#' Get a new DMatrix containing the specified rows of +#' orginal xgb.DMatrix object +#' +#' @param object Object of class "xgb.DMatrix" +#' @param idxset a integer vector of indices of rows needed +#' @param colset currently not used (columns subsetting is not available) +#' @param ... other parameters (currently not used) +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' +#' dsub <- slice(dtrain, 1:42) +#' labels1 <- getinfo(dsub, 'label') +#' dsub <- dtrain[1:42, ] +#' labels2 <- getinfo(dsub, 'label') +#' all.equal(labels1, labels2) +#' +#' @rdname slice.xgb.DMatrix +#' @export +slice <- function(object, ...) UseMethod("slice") + +#' @rdname slice.xgb.DMatrix +#' @export +slice.xgb.DMatrix <- function(object, idxset, ...) { + if (class(object) != "xgb.DMatrix") { + stop("slice: first argument dtrain must be xgb.DMatrix") + } + ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost") + + attr_list <- attributes(object) + nr <- nrow(object) + len <- sapply(attr_list, length) + ind <- which(len == nr) + if (length(ind) > 0) { + nms <- names(attr_list)[ind] + for (i in 1:length(ind)) { + attr(ret, nms[i]) <- attr(object, nms[i])[idxset] + } + } + return(structure(ret, class = "xgb.DMatrix")) +} + +#' @rdname slice.xgb.DMatrix +#' @export +`[.xgb.DMatrix` <- function(object, idxset, colset=NULL) { + slice(object, idxset) +} + + +#' Print xgb.DMatrix +#' +#' Print information about an xgb.DMatrix. +#' Currently is would display +#' @param x an xgb.DMatrix object +#' @param verbose whether to print colnames (when present) +#' +#' @export +print.xgb.DMatrix <- function(x, verbose=FALSE, ...) { + cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ') + infos <- c() + if(length(getinfo(x, 'label')) > 0) infos <- 'label' + if(length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight') + if(length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin') + if (length(infos) == 0) infos <- 'NA' + cat(infos) + cnames <- colnames(x) + cat(' colnames:') + if (verbose & !is.null(cnames)) { + cat("\n'") + cat(cnames, sep="','") + cat("'") + } else { + if (is.null(cnames)) cat(' no') + else cat(' yes') + } + cat("\n") + invisible(x) +} diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 7ad2e5c41a62..e3faf33a08c8 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -169,11 +169,11 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = num_class <- params[['num_class']] if (is.null(num_class)) stop('must set num_class to use softmax') - predictValues <- matrix(0,xgb.numrow(dtrain),num_class) + predictValues <- matrix(0, nrow(dtrain), num_class) mat_pred <- TRUE } else - predictValues <- rep(0,xgb.numrow(dtrain)) + predictValues <- rep(0, nrow(dtrain)) history <- c() print.every.n <- max(as.integer(print.every.n), 1L) for (i in 1:nrounds) { diff --git a/R-package/man/dim.xgb.DMatrix.Rd b/R-package/man/dim.xgb.DMatrix.Rd new file mode 100644 index 000000000000..168782dec008 --- /dev/null +++ b/R-package/man/dim.xgb.DMatrix.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{dim.xgb.DMatrix} +\alias{dim.xgb.DMatrix} +\title{Dimensions of xgb.DMatrix} +\usage{ +\method{dim}{xgb.DMatrix}(x) +} +\arguments{ +\item{x}{Object of class \code{xgb.DMatrix}} +} +\description{ +Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}. +} +\details{ +Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also +be directly used with an \code{xgb.DMatrix} object. +} +\examples{ +data(agaricus.train, package='xgboost') +train <- agaricus.train +dtrain <- xgb.DMatrix(train$data, label=train$label) + +stopifnot(nrow(dtrain) == nrow(train$data)) +stopifnot(ncol(dtrain) == ncol(train$data)) +stopifnot(all(dim(dtrain) == dim(train$data))) + +} + diff --git a/R-package/man/dimnames.xgb.DMatrix.Rd b/R-package/man/dimnames.xgb.DMatrix.Rd new file mode 100644 index 000000000000..0877f294b952 --- /dev/null +++ b/R-package/man/dimnames.xgb.DMatrix.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{dimnames.xgb.DMatrix} +\alias{dimnames.xgb.DMatrix} +\alias{dimnames<-.xgb.DMatrix} +\title{Handling of column names of \code{xgb.DMatrix}} +\usage{ +\method{dimnames}{xgb.DMatrix}(x) + +\method{dimnames}{xgb.DMatrix}(x) <- value +} +\arguments{ +\item{x}{object of class \code{xgb.DMatrix}} + +\item{value}{a list of two elements: the first one is ignored +and the second one is column names} +} +\description{ +Only column names are supported for \code{xgb.DMatrix}, thus setting of +row names would have no effect and returnten row names would be NULL. +} +\details{ +Generic \code{dimnames} methods are used by \code{colnames}. +Since row names are irrelevant, it is recommended to use \code{colnames} directly. +} +\examples{ +data(agaricus.train, package='xgboost') +train <- agaricus.train +dtrain <- xgb.DMatrix(train$data, label=train$label) +dimnames(dtrain) +colnames(dtrain) +colnames(dtrain) <- make.names(1:ncol(train$data)) +print(dtrain, verbose=TRUE) + +} + diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index f8b4f6b991e8..108d7edf2039 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -1,27 +1,26 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getinfo.xgb.DMatrix.R -\docType{methods} +% Please edit documentation in R/xgb.DMatrix.R \name{getinfo} \alias{getinfo} -\alias{getinfo,xgb.DMatrix-method} +\alias{getinfo.xgb.DMatrix} \title{Get information of an xgb.DMatrix object} \usage{ getinfo(object, ...) -\S4method{getinfo}{xgb.DMatrix}(object, name) +\method{getinfo}{xgb.DMatrix}(object, name) } \arguments{ \item{object}{Object of class \code{xgb.DMatrix}} \item{...}{other parameters} -\item{name}{the name of the field to get} +\item{name}{the name of the information field to get (see details)} } \description{ Get information of an xgb.DMatrix object } \details{ -The information can be one of the following: +The \code{name} field can be one of the following: \itemize{ \item \code{label}: label Xgboost learn from ; @@ -34,8 +33,10 @@ The information can be one of the following: data(agaricus.train, package='xgboost') train <- agaricus.train dtrain <- xgb.DMatrix(train$data, label=train$label) + labels <- getinfo(dtrain, 'label') setinfo(dtrain, 'label', 1-labels) + labels2 <- getinfo(dtrain, 'label') stopifnot(all(labels2 == 1-labels)) } diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd new file mode 100644 index 000000000000..504037937a6f --- /dev/null +++ b/R-package/man/predict.xgb.Booster.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.Booster.R +\name{predict.xgb.Booster} +\alias{predict.xgb.Booster} +\alias{predict.xgb.Booster.handle} +\title{Predict method for eXtreme Gradient Boosting model} +\usage{ +\method{predict}{xgb.Booster}(object, newdata, missing = NA, + outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) + +\method{predict}{xgb.Booster.handle}(object, ...) +} +\arguments{ +\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}} + +\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or +\code{xgb.DMatrix}.} + +\item{missing}{Missing is only used when input is dense matrix, pick a float +value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} + +\item{outputmargin}{whether the prediction should be shown in the original +value of sum of functions, when outputmargin=TRUE, the prediction is +untransformed margin value. In logistic regression, outputmargin=T will +output value before logistic transformation.} + +\item{ntreelimit}{limit number of trees used in prediction, this parameter is +only valid for gbtree, but not for gblinear. set it to be value bigger +than 0. It will use all trees by default.} + +\item{predleaf}{whether predict leaf index instead. If set to TRUE, the output will be a matrix object.} + +\item{...}{Parameters pass to \code{predict.xgb.Booster}} +} +\description{ +Predicted values based on either xgboost model or model handle object. +} +\details{ +The option \code{ntreelimit} purpose is to let the user train a model with lots +of trees but use only the first trees for prediction to avoid overfitting +(without having to train a new model with less trees). + +The option \code{predleaf} purpose is inspired from §3.1 of the paper +\code{Practical Lessons from Predicting Clicks on Ads at Facebook}. +The idea is to use the model as a generator of new features which capture non linear link +from original features. +} +\examples{ +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +train <- agaricus.train +test <- agaricus.test + +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") +pred <- predict(bst, test$data) +} + diff --git a/R-package/man/print.xgb.DMatrix.Rd b/R-package/man/print.xgb.DMatrix.Rd new file mode 100644 index 000000000000..92c87eec88f5 --- /dev/null +++ b/R-package/man/print.xgb.DMatrix.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{print.xgb.DMatrix} +\alias{print.xgb.DMatrix} +\title{Print xgb.DMatrix} +\usage{ +print.xgb.DMatrix(x, verbose = FALSE, ...) +} +\arguments{ +\item{x}{an xgb.DMatrix object} + +\item{verbose}{whether to print colnames (when present)} +} +\description{ +Print information about an xgb.DMatrix. +Currently is would display +} + diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd index cb939721e964..b182a90422e8 100644 --- a/R-package/man/setinfo.Rd +++ b/R-package/man/setinfo.Rd @@ -1,14 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/setinfo.xgb.DMatrix.R -\docType{methods} +% Please edit documentation in R/xgb.DMatrix.R \name{setinfo} \alias{setinfo} -\alias{setinfo,xgb.DMatrix-method} +\alias{setinfo.xgb.DMatrix} \title{Set information of an xgb.DMatrix object} \usage{ setinfo(object, ...) -\S4method{setinfo}{xgb.DMatrix}(object, name, info) +\method{setinfo}{xgb.DMatrix}(object, name, info) } \arguments{ \item{object}{Object of class "xgb.DMatrix"} @@ -23,7 +22,7 @@ setinfo(object, ...) Set information of an xgb.DMatrix object } \details{ -It can be one of the following: +The \code{name} field can be one of the following: \itemize{ \item \code{label}: label Xgboost learn from ; @@ -36,9 +35,10 @@ It can be one of the following: data(agaricus.train, package='xgboost') train <- agaricus.train dtrain <- xgb.DMatrix(train$data, label=train$label) + labels <- getinfo(dtrain, 'label') setinfo(dtrain, 'label', 1-labels) labels2 <- getinfo(dtrain, 'label') -stopifnot(all(labels2 == 1-labels)) +stopifnot(all.equal(labels2, 1-labels)) } diff --git a/R-package/man/slice.xgb.DMatrix.Rd b/R-package/man/slice.xgb.DMatrix.Rd new file mode 100644 index 000000000000..c352203fbc80 --- /dev/null +++ b/R-package/man/slice.xgb.DMatrix.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{slice} +\alias{[.xgb.DMatrix} +\alias{slice} +\alias{slice.xgb.DMatrix} +\title{Get a new DMatrix containing the specified rows of +orginal xgb.DMatrix object} +\usage{ +slice(object, ...) + +\method{slice}{xgb.DMatrix}(object, idxset, ...) + +\method{[}{xgb.DMatrix}(object, idxset, colset = NULL) +} +\arguments{ +\item{object}{Object of class "xgb.DMatrix"} + +\item{...}{other parameters (currently not used)} + +\item{idxset}{a integer vector of indices of rows needed} + +\item{colset}{currently not used (columns subsetting is not available)} +} +\description{ +Get a new DMatrix containing the specified rows of +orginal xgb.DMatrix object +} +\examples{ +data(agaricus.train, package='xgboost') +train <- agaricus.train +dtrain <- xgb.DMatrix(train$data, label=train$label) + +dsub <- slice(dtrain, 1:42) +labels1 <- getinfo(dsub, 'label') +dsub <- dtrain[1:42, ] +labels2 <- getinfo(dsub, 'label') +all.equal(labels1, labels2) + +} + diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 2e892cc6d952..a960d585a7ae 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -7,8 +7,7 @@ xgb.DMatrix(data, info = list(), missing = NA, ...) } \arguments{ -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character -indicating the data file.} +\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} \item{info}{a list of information of the xgb.DMatrix object} @@ -18,7 +17,8 @@ value that represents missing value. Sometime a data use 0 or other extreme valu \item{...}{other information to pass to \code{info}.} } \description{ -Contruct xgb.DMatrix object from dense matrix, sparse matrix or local file. +Contruct xgb.DMatrix object from dense matrix, sparse matrix +or local file (that is a previously saved \code{xgb.DMatrix}). } \examples{ data(agaricus.train, package='xgboost') From 4b760762f9e43f9edf29e2e0398edfd89c0f7843 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Sun, 27 Mar 2016 19:23:08 -0500 Subject: [PATCH 4/6] added unit tests for xgb.DMatrix --- R-package/tests/testthat/test_dmatrix.R | 67 +++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 R-package/tests/testthat/test_dmatrix.R diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R new file mode 100644 index 000000000000..baead5099ed7 --- /dev/null +++ b/R-package/tests/testthat/test_dmatrix.R @@ -0,0 +1,67 @@ +require(xgboost) + +context("testing xgb.DMatrix functionality") + +data(agaricus.test, package='xgboost') +test_data <- agaricus.test$data[1:100,] +test_label <- agaricus.test$label[1:100] + +test_that("xgb.DMatrix: basic construction, saving, loading", { + # from sparse matrix + dtest1 <- xgb.DMatrix(test_data, label=test_label) + # from dense matrix + dtest2 <- xgb.DMatrix(as.matrix(test_data), label=test_label) + expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label')) + + # save to a local file + tmp_file <- tempfile('xgb.DMatrix_') + expect_true(xgb.DMatrix.save(dtest1, tmp_file)) + # read from a local file + dtest3 <- xgb.DMatrix(tmp_file) + unlink(tmp_file) + expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label')) +}) + +test_that("xgb.DMatrix: getinfo & setinfo", { + dtest <- xgb.DMatrix(test_data) + expect_true(setinfo(dtest, 'label', test_label)) + labels <- getinfo(dtest, 'label') + expect_equal(test_label, getinfo(dtest, 'label')) + + expect_true(length(getinfo(dtest, 'weight')) == 0) + expect_true(length(getinfo(dtest, 'base_margin')) == 0) + + expect_true(setinfo(dtest, 'weight', test_label)) + expect_true(setinfo(dtest, 'base_margin', test_label)) + expect_true(setinfo(dtest, 'group', c(50,50))) + expect_error(setinfo(dtest, 'group', test_label)) + + # providing character values will give a warning + expect_warning( setinfo(dtest, 'weight', rep('a', nrow(test_data))) ) + + # any other label should error + expect_error(setinfo(dtest, 'asdf', test_label)) +}) + +test_that("xgb.DMatrix: slice, dim", { + dtest <- xgb.DMatrix(test_data, label=test_label) + expect_equal(dim(dtest), dim(test_data)) + dsub1 <- slice(dtest, 1:42) + expect_equal(nrow(dsub1), 42) + expect_equal(ncol(dsub1), ncol(test_data)) + + dsub2 <- dtest[1:42,] + expect_equal(dim(dtest), dim(test_data)) + expect_equal(getinfo(dsub1, 'label'), getinfo(dsub2, 'label')) +}) + +test_that("xgb.DMatrix: colnames", { + dtest <- xgb.DMatrix(test_data, label=test_label) + expect_equal(colnames(dtest), colnames(test_data)) + expect_error( colnames(dtest) <- 'asdf') + new_names <- make.names(1:ncol(test_data)) + expect_silent( colnames(dtest) <- new_names) + expect_equal(colnames(dtest), new_names) + expect_silent(colnames(dtest) <- NULL) + expect_null(colnames(dtest)) +}) From fb5291271e18a533d38b169eca93cd0b73bf2031 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Sun, 27 Mar 2016 19:42:26 -0500 Subject: [PATCH 5/6] fix print.xgb.DMatrix doc --- R-package/R/xgb.DMatrix.R | 13 +++++++++++-- R-package/man/print.xgb.DMatrix.Rd | 12 ++++++++++-- R-package/man/xgb.DMatrix.Rd | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 744c2a05cee8..6bd10a189654 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -327,11 +327,20 @@ slice.xgb.DMatrix <- function(object, idxset, ...) { #' Print xgb.DMatrix #' -#' Print information about an xgb.DMatrix. -#' Currently is would display +#' Print information about xgb.DMatrix. +#' Currently it displays dimensions and presence of info-fields and colnames. +#' #' @param x an xgb.DMatrix object #' @param verbose whether to print colnames (when present) #' +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' +#' dtrain +#' print(dtrain, verbose=T) #' @export print.xgb.DMatrix <- function(x, verbose=FALSE, ...) { cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ') diff --git a/R-package/man/print.xgb.DMatrix.Rd b/R-package/man/print.xgb.DMatrix.Rd index 92c87eec88f5..c90e05c938d5 100644 --- a/R-package/man/print.xgb.DMatrix.Rd +++ b/R-package/man/print.xgb.DMatrix.Rd @@ -12,7 +12,15 @@ print.xgb.DMatrix(x, verbose = FALSE, ...) \item{verbose}{whether to print colnames (when present)} } \description{ -Print information about an xgb.DMatrix. -Currently is would display +Print information about xgb.DMatrix. +Currently it displays dimensions and presence of info-fields and colnames. +} +\examples{ +data(agaricus.train, package='xgboost') +train <- agaricus.train +dtrain <- xgb.DMatrix(train$data, label=train$label) + +dtrain +print(dtrain, verbose=T) } diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index a960d585a7ae..cdb8fdf62c43 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -18,7 +18,7 @@ value that represents missing value. Sometime a data use 0 or other extreme valu } \description{ Contruct xgb.DMatrix object from dense matrix, sparse matrix -or local file (that is a previously saved \code{xgb.DMatrix}). +or local file (that was created previously by saving an \code{xgb.DMatrix}). } \examples{ data(agaricus.train, package='xgboost') From 33131e2e13c2e0e0aaafb61f55d53fe0a8f5e492 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Sun, 27 Mar 2016 20:28:40 -0500 Subject: [PATCH 6/6] make travis happy --- R-package/R/xgb.DMatrix.R | 4 ++-- R-package/man/print.xgb.DMatrix.Rd | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 6bd10a189654..c5f4fed72576 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -332,7 +332,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) { #' #' @param x an xgb.DMatrix object #' @param verbose whether to print colnames (when present) -#' +#' @param ... not currently used #' #' @examples #' data(agaricus.train, package='xgboost') @@ -340,7 +340,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) { #' dtrain <- xgb.DMatrix(train$data, label=train$label) #' #' dtrain -#' print(dtrain, verbose=T) +#' print(dtrain, verbose=TRUE) #' @export print.xgb.DMatrix <- function(x, verbose=FALSE, ...) { cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ') diff --git a/R-package/man/print.xgb.DMatrix.Rd b/R-package/man/print.xgb.DMatrix.Rd index c90e05c938d5..95837b9f49da 100644 --- a/R-package/man/print.xgb.DMatrix.Rd +++ b/R-package/man/print.xgb.DMatrix.Rd @@ -10,6 +10,8 @@ print.xgb.DMatrix(x, verbose = FALSE, ...) \item{x}{an xgb.DMatrix object} \item{verbose}{whether to print colnames (when present)} + +\item{...}{not currently used} } \description{ Print information about xgb.DMatrix. @@ -21,6 +23,6 @@ train <- agaricus.train dtrain <- xgb.DMatrix(train$data, label=train$label) dtrain -print(dtrain, verbose=T) +print(dtrain, verbose=TRUE) }