Merge branch 'rc0.3.0'

kaz-yos · Feb 18, 2014 · 1952788 · 1952788
2 parents 9b93ff8 + b8e13e7
commit 1952788
Show file tree

Hide file tree

Showing 18 changed files with 800 additions and 700 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,12 +1,12 @@
 Package: tableone
 Type: Package
 Title: Create "Table 1" to describe baseline characteristics
-Version: 0.2.1
-Date: 2014-02-15
+Version: 0.3.0
+Date: 2014-02-17
 Author: Kazuki Yoshida, Justin Bohn
 Maintainer: Kazuki Yoshida <[email protected]>
 Description: This package creates "Table 1", i.e., description of baseline
-    patient characteristics, which is essential every medical research. This
+    patient characteristics, which is essential in every medical research. This
     package provides functions to create such summaries for continuous and
     categorical variables, optionally with subgroups comparisons. The package
     was inspired by and based on descriptive statistics functions in Deducer,

diff --git a/NEWS b/NEWS
@@ -1,3 +1,20 @@
+tableone 0.3.0 (2014-02-16)
+----------------------------------------------------------------
+NEW FEATURES
+
+* CreateTableOne has a new factorVars argument, a character
+  vector specifying numerically coded variables that should be
+  treated as factors.
+
+* The print method for the TableOne/CatTable class object has a
+  new minMax argument, a logical value specifying whether to show
+  median [min, max] instead of median [IQR] for nonnormal variables.
+
+BUG FIXES
+
+* Documentation was fixed to reflect the current version.
+
+
 tableone 0.2.1 (2014-02-15)
 ----------------------------------------------------------------
 BUG FIXES

diff --git a/R/CreateCatTable.R b/R/CreateCatTable.R
@@ -1,89 +1,81 @@
 ##' Create an object summarizing categorical variables
-##' 
+##'
 ##' Create an object summarizing categorical variables optionally stratifying
 ##' by one or more stratifying variables and performing statistical tests. The
 ##' object gives a table that is easy to use in medical research papers. See
 ##' also \code{\link{print.CatTable}} and \code{\link{summary.CatTable}}.
-##' 
+##'
 ##' @param vars Variable(s) to be summarized given as a character vector.
-##' @param strata Stratifying (grouping) variable name(s) given as a character
-##' vector. If omitted, the overall results are returned.
-##' @param data A data frame in which these variables exist. All variables
-##' (both vars and strata) must be in this data frame.
-##' @param test If TRUE, as in the default and there are more than two groups,
-##' groupwise comparisons are performed. Both tests that require the large
-##' sample approximation and exact tests are performed. Either one of the
-##' result can be obtained from the print method.
-##' @param testApprox A function used to perform the large sample approximation
-##' based tests. The default is \code{\link{chisq.test}}. This is not recommended when some
-##' of the cell have small counts like fewer than 5.
+##' @param strata Stratifying (grouping) variable name(s) given as a character vector. If omitted, the overall results are returned.
+##' @param data A data frame in which these variables exist. All variables (both vars and strata) must be in this data frame.
+##' @param test If TRUE, as in the default and there are more than two groups, groupwise comparisons are performed. Both tests that require the large sample approximation and exact tests are performed. Either one of the results can be obtained from the print method.
+##' @param testApprox A function used to perform the large sample approximation based tests. The default is \code{\link{chisq.test}}. This is not recommended when some of the cells have small counts like fewer than 5.
 ##' @param argsApprox A named list of arguments passed to the function specified in testApprox. The default is \code{list(correct = TRUE)}, which turns on the continuity correction for \code{\link{chisq.test}}.
-##' @param testExact A function used to perform the exact tests. The default is
-##' fisher.test. If the cells have large numbers, it will fail because of
-##' memory limitation. In this situation, the large sample approximation based
-##' should suffice.
+##' @param testExact A function used to perform the exact tests. The default is fisher.test. If the cells have large numbers, it will fail because of memory limitation. In this situation, the large sample approximation based tests should suffice.
 ##' @param argsExact A named list of arguments passed to the function specified in testExact. The default is \code{list(workspace = 2*10^5)}, which specifies the memory space allocated for \code{\link{fisher.test}}.
-##' @return An object of class \code{CatTable}, which really is a \code{\link{by}} object with
-##' additional attributes. Each element of the \code{\link{by}} part is a matrix with rows
-##' representing variables, and columns representing summary statistics.
+##' @return An object of class \code{CatTable}, which really is a \code{\link{by}} object with additional attributes. Each element of the \code{\link{by}} part is a matrix with rows representing variables, and columns representing summary statistics.
 ##' @author Kazuki Yoshida (based on \code{Deducer::frequencies()})
 ##' @seealso
 ##' \code{\link{CreateCatTable}}, \code{\link{print.CatTable}}, \code{\link{summary.CatTable}},
 ##' \code{\link{CreateContTable}}, \code{\link{print.ContTable}}, \code{\link{summary.ContTable}},
 ##' \code{\link{CreateTableOne}}, \code{\link{print.TableOne}}, \code{\link{summary.TableOne}}
 ##' @examples
-##' 
+##'
 ##' ## Load
 ##' library(tableone)
-##' 
+##'
 ##' ## Load Mayo Clinic Primary Biliary Cirrhosis Data
 ##' library(survival)
 ##' data(pbc)
 ##' ## Check variables
 ##' head(pbc)
-##' 
+##'
 ##' ## Create an overall table for categorical variables
 ##' catVars <- c("status","ascites","hepato","spiders","edema","stage")
 ##' catTableOverall <- CreateCatTable(vars = catVars, data = pbc)
-##' 
+##'
 ##' ## Simply typing the object name will invoke the print.CatTable method,
 ##' ## which will show the sample size, frequencies and percentages.
-##' ## For 2-level variables, only the higher level is shown for simplicity.
+##' ## For 2-level variables, only the higher level is shown for simplicity
+##' ## unless the variables are specified in the cramVars argument.
 ##' catTableOverall
+##'
+##' ## If you need to show both levels for some 2-level factors, use cramVars
+##' print(catTableOverall, cramVars = "hepato")
 ##' 
 ##' ## Use the showAllLevels argument to see all levels for all variables.
 ##' print(catTableOverall, showAllLevels = TRUE)
-##' 
+##'
 ##' ## You can choose from frequencies ("f") and/or percentages ("p") or both.
 ##' ## "fp" frequency (percentage) is the default. Row names change accordingly.
 ##' print(catTableOverall, format = "f")
 ##' print(catTableOverall, format = "p")
-##' 
+##'
 ##' ## To further examine the variables, use the summary.CatTable method,
 ##' ## which will show more details.
 ##' summary(catTableOverall)
-##' 
+##'
 ##' ## The table can be stratified by one or more variables
 ##' catTableBySexTrt <- CreateCatTable(vars = catVars,
 ##'                                    strata = c("sex","trt"), data = pbc)
-##' 
+##'
 ##' ## print now includes p-values which are by default calculated by chisq.test.
 ##' ## It is formatted at the decimal place specified by the pDigits argument
 ##' ## (3 by default). It does <0.001 for you.
 ##' catTableBySexTrt
-##' 
-##' ## The exact argument will toggle the p-values to the example test result from
+##'
+##' ## The exact argument toggles the p-values to the exact test result from
 ##' ## fisher.test. It will show which ones are from exact tests.
 ##' print(catTableBySexTrt, exact = "ascites")
-##' 
+##'
 ##' ## summary now includes both types of p-values
 ##' summary(catTableBySexTrt)
-##' 
+##'
 ##' ## If your work flow includes copying to Excel and Word when writing manuscripts,
 ##' ## you may benefit from the quote argument. This will quote everything so that
 ##' ## Excel does not mess up the cells.
 ##' print(catTableBySexTrt, exact = "ascites", quote = TRUE)
-##' 
+##'
 ##' @export
 CreateCatTable <-
     function(vars,                                 # character vector of variable names
@@ -99,33 +91,33 @@ CreateCatTable <-
 ### Data check
     ## Check if the data given is a dataframe
     ModuleStopIfNotDataFrame(data)
-    
+
     ## Check if variables exist. Drop them if not.
     vars <- ModuleReturnVarsExist(vars, data)
 
     ## Abort if no variables exist at this point
     ModuleStopIfNoVarsLeft(vars)
-    
+
     ## Extract necessary variables (unused variables are not included in dat)
     dat <- data[c(vars)]
 
+    ## Toggle test FALSE if no strata
+    test <- ModuleReturnFalseIfNoStrata(strata, test)
+
     ## Convert to a factor if it is not a factor already. (categorical version only)
     ## Not done on factors, to avoid dropping zero levels.
     datNotFactor <- sapply(dat, class) != "factor"
     dat[datNotFactor] <- lapply(dat[datNotFactor], factor)
 
-    ## Toggle test FALSE if no strata
-    test <- ModuleReturnFalseIfNoStrata(strata, test)
-
     ## Create strata data frame (data frame with only strata variables)
     strata <- ModuleReturnStrata(strata, data, dat)
 
 
-### Perform descriptive analysis
+### Actual descriptive statistics are calculated here.
 
     ## strata--variable-CreateTableForOneVar structure
     ## Divide by strata
-    result <- by(data = dat, INDICES = strata,
+    result <- by(data = dat, INDICES = strata, # INDICES can be a multi-column data frame
 
                  ## Work on each stratum
                  FUN = function(dfStrataDat) { # dfStrataDat should be a data frame
@@ -137,56 +129,47 @@ CreateCatTable <-
 
                  }, simplify = FALSE)
 
-    
-    ## Add stratification information to the column header
+
+    ## Add stratification variable information as an attribute
     if (length(result) > 1 ) {
         ## strataVarName from dimension headers
         strataVarName <- ModuleCreateStrataVarName(result)
         ## Add an attribute for the stratifying variable name
         attributes(result) <- c(attributes(result),
                                 list(strataVarName = strataVarName))
     }
-    
+
 
 ### Perform tests when necessary
     ## Initialize
-    pValues <- NULL
+    pValues   <- NULL
     listXtabs <- list()
 
-    ## Only when test is asked for              # Should always do this?
+    ## Only when test is asked for
     if (test == TRUE) {
 
-        ## Create all combinations of strata levels and collapse as a vector for level combinations.
-        dfStrataLevels <- expand.grid(attr(result, "dimnames")) # 1st var cycles fastest, consistent with by()
-        ## Create a single variable representing all strata        
-        strataLevels <- apply(X      = dfStrataLevels,
-                              MARGIN = 1,
-                              FUN    = paste0, collapse = ":")
-        ## Create the actual variable from the observed levels
-        strataVar <- as.character(interaction(strata, sep = ":"))
-
-
-        ## Make it a factor (kruskal.test requires it). Use levels not to drop defined nonexisting levels.
-        strataVar                   <- factor(strataVar, levels = strataLevels)
-
+        ## Create a single variable representation of multivariable stratification
+        strataVar <- ModuleCreateStrataVarAsFactor(result, strata)
+
         ## Loop over variables in dat, and create a list of xtabs
+        ## Empty strata are kept in the cross tables. Different behavior from the cont counterpart!
         listXtabs <- sapply(X = names(dat),
                             FUN = function(var) {
                                 ## Create a formula
                                 formula <- paste0("~ ", var, " + ", "strataVar")
                                 formula <- as.formula(formula)
-                                
+
                                 ## Create a 2-dimensional crosstable
                                 xtabs(formula = formula, data = dat)
                             },
                             simplify = FALSE)
 
         ## Rename the second dimension of the xtabs with the newly created name.
         for (i in seq_along(listXtabs)) {
-            
+
             names(dimnames(listXtabs[[i]]))[2] <- strataVarName
-        }        
-        
+        }
+
         ## Loop over xtabs, and create p-values
         pValues <- sapply(X = listXtabs,
                           FUN = function(xtabs) {
@@ -196,7 +179,7 @@ CreateCatTable <-
                                   pExact  = ModuleTestSafe(xtabs, testExact,  argsExact)
                                   )
                           },
-                          simplify = FALSE)        
+                          simplify = FALSE)
 
         ## Create a single data frame (n x 2 (normal,nonormal))
         pValues <- do.call(rbind, pValues)