From 831013a8892a6d39e12bbd45aeb559ccd258bc0a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 5 Aug 2021 18:12:04 -0700 Subject: [PATCH] format_col/format_list_item printing generics for customization (#3414) --- NAMESPACE | 8 ++++ NEWS.md | 2 + R/print.data.table.R | 86 ++++++++++++++++++++++++----------------- inst/tests/tests.Rraw | 34 ++++++++++++++++ man/print.data.table.Rd | 35 +++++++++++++++++ 5 files changed, 130 insertions(+), 35 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index fbd4f8df21..999a834304 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -187,3 +187,11 @@ S3method(unique, ITime) S3method('[<-', IDate) S3method(edit, data.table) +# generics to support custom column formatters +export(format_col) +S3method(format_col, default) +S3method(format_col, POSIXct) +S3method(format_col, expression) +export(format_list_item) +S3method(format_list_item, default) + diff --git a/NEWS.md b/NEWS.md index b5b93e3546..f333ec0356 100644 --- a/NEWS.md +++ b/NEWS.md @@ -99,6 +99,8 @@ 16. `fwrite()` now accepts `sep=""`, [#4817](https://github.com/Rdatatable/data.table/issues/4817). The motivation is an example where the result of `paste0()` needs to be written to file but `paste0()` takes 40 minutes due to constructing a very large number of unique long strings in R's global character cache. Allowing `fwrite(, sep="")` avoids the `paste0` and saves 40 mins. Thanks to Jan Gorecki for the request, and Ben Schwen for the PR. +17. `data.table` printing now supports customizable methods for both columns and list column row items, part of [#1523](https://github.com/Rdatatable/data.table/issues/1523). `format_col` is S3-generic for customizing how to print whole columns; `format_list_item` is S3-generic for customizing how to print each row of a list column. Thanks to @mllg who initially filed [#3338](https://github.com/Rdatatable/data.table/pulls/3338) with the seed of the idea, @franknarf1 who earlier suggested the idea of providing custom formatters, @fparages who submitted a patch to improve the printing of timezones for [#2842](https://github.com/Rdatatable/data.table/issues/2842), @RichardRedding for pointing out an error relating to printing wide `expression` columns in [#3011](https://github.com/Rdatatable/data.table/issues/3011), and @MichaelChirico for implementing. See `?print.data.table` for examples. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/print.data.table.R b/R/print.data.table.R index 3f19cdc391..023551074a 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -140,44 +140,11 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), invisible(x) } -format.data.table = function (x, ..., justify="none", timezone = FALSE) { +format.data.table = function (x, ..., justify="none") { if (is.atomic(x) && !is.null(x)) { stopf("Internal structure doesn't seem to be a list. Possibly corrupt data.table.") } - format.item = function(x) { - if (is.null(x)) # NULL item in a list column - "" - else if (is.atomic(x) || inherits(x,"formula")) # FR #2591 - format.data.table issue with columns of class "formula" - paste(c(format(head(x, 6L), justify=justify, ...), if (length(x) > 6L) "..."), collapse=",") # fix for #37 - format has to be added here... - else - paste0("<", class(x)[1L], paste_dims(x), ">") - } - # FR #2842 add timezone for posix timestamps - format.timezone = function(col) { # paste timezone to a time object - tz = attr(col,'tzone', exact=TRUE) - if (!is.null(tz)) { # date object with tz - nas = is.na(col) - col = paste0(as.character(col)," ",tz) # parse to character - col[nas] = NA_character_ - } - return(col) - } - # FR #1091 for pretty printing of character - # TODO: maybe instead of doing "this is...", we could do "this ... test"? - char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) { - trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) - if (!is.character(x) || trunc.char <= 0L) return(x) - idx = which(nchar(x) > trunc.char) - x[idx] = paste0(substr(x[idx], 1L, as.integer(trunc.char)), "...") - x - } - do.call("cbind",lapply(x,function(col,...) { - if (!is.null(dim(col))) return("") - if(timezone) col = format.timezone(col) - if (is.list(col)) col = vapply_1c(col, format.item) - else col = format(char.trunc(col), justify=justify, ...) # added an else here to fix #37 - col - },...)) + do.call("cbind", lapply(x, format_col, ..., justify=justify)) } mimicsAutoPrint = c("knit_print.default") @@ -205,6 +172,55 @@ paste_dims = function(x) { paste0("[", paste(dims,collapse="x"), "]") } +format_col = function(x, ...) { + UseMethod("format_col") +} + +format_list_item = function(x, ...) { + UseMethod("format_list_item") +} + +format_col.default = function(x, ...) { + if (!is.null(dim(x))) return("") + if (is.list(x)) return(vapply_1c(x, format_list_item, ...)) + format(char.trunc(x), ...) # relevant to #37 +} + +# #2842 -- different columns can have different tzone, so force usage in output +format_col.POSIXct = function(x, ..., timezone=FALSE) { + if (timezone) { + tz = attr(x,'tzone',exact=TRUE) + nas = is.na(x) + x = paste0(as.character(x)," ",tz) + is.na(x) = nas + } else { + x = format(x, usetz=FALSE) + } + x +} + +# #3011 -- expression columns can wrap to newlines which breaks printing +format_col.expression = function(x, ...) format(char.trunc(as.character(x)), ...) + +format_list_item.default = function(x, ...) { + if (is.null(x)) # NULL item in a list column + "" + else if (is.atomic(x) || inherits(x, "formula")) # FR #2591 - format.data.table issue with columns of class "formula" + paste(c(format(head(x, 6L), ...), if (length(x) > 6L) "..."), collapse=",") # fix for #5435 and #37 - format has to be added here... + else + paste0("<", class(x)[1L], paste_dims(x), ">") +} + +# FR #1091 for pretty printing of character +# TODO: maybe instead of doing "this is...", we could do "this ... test"? +char.trunc <- function(x, trunc.char = getOption("datatable.prettyprint.char")) { + trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) + if (!is.character(x) || trunc.char <= 0L) return(x) + idx = which(nchar(x) > trunc.char) + x[idx] = paste0(substr(x[idx], 1L, as.integer(trunc.char)), "...") + x +} + # to calculate widths of data.table for PR #4074 # gets the width of the data.table at each column # and compares it to the console width diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 963c8c9718..9a30d6b3d4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -34,6 +34,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { forder = data.table:::forder forderv = data.table:::forderv format.data.table = data.table:::format.data.table + format_col.default = data.table:::format_col.default + format_list_item.default = data.table:::format_list_item.default getdots = data.table:::getdots groupingsets.data.table = data.table:::groupingsets.data.table guess = data.table:::guess @@ -17002,6 +17004,38 @@ DT = data.table( s4class(x=2L, y="yes", z=1))) test(2130.03, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) +# format_col and format_list_item printing helpers/generics +## Use case: solve #2842 by defining format_col.POSIXct to have usetz = TRUE +DT = data.table( + t1 = as.POSIXct('2018-05-01 12:34:56', tz = 'UTC'), + t2 = as.POSIXct('2018-05-01 12:34:56', tz = 'Asia/Singapore') +) +test(2130.101, print(DT, timezone=TRUE), output='UTC') +test(2130.102, print(DT, timezone=FALSE), notOutput='UTC') + +# default expression printing can break format_col.default, #3011 +test(2130.11, print(data.table(e = expression(1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13))), output = '1 + 2 + 3') + +# format_col generic is used +format_col.complex = function(x, ...) sprintf('(%.1f, %.1fi)', Re(x), Im(x)) +registerS3method("format_col", "complex", format_col.complex) +# this registerS3method does seem to be necessary to work within the test.data.table() environment +# assigning the method using <<- probably works too, but we don't want to write to user's environment at all +x = data.table(z = c(1 + 3i, 2 - 1i, pi + 2.718i)) +test(2130.12, x, output = '(1.0, 3.0i)') +rm(format_col.complex) +registerS3method("format_col", "complex", format_col.default) +# otherwise it remains registered after test.data.table() and causes test 1610.1 to fail on the next run for example, and user display if they have complex data +# haven't found a way to unregister an S3 method (tried registering NULL but there's an error that NULL isn't a function) + +# format_list_item() generic is used +format_list_item.myclass <- function(x, ...) paste0("<", class(x)[1L], ":", x$id, ">") +registerS3method("format_list_item", "myclass", format_list_item.myclass) +DT = data.table(row = 1:2, objs = list(structure(list(id = "foo"), class = "myclass"), structure(list(id = "bar"), class = "myclass"))) +test(2130.13, print(DT), output = "myclass:foo.*myclass:bar") +rm(format_list_item.myclass) +registerS3method("format_list_item", "myclass", format_list_item.default) + # .SD from grouping should be unlocked, part of #4159 x = data.table(a=1:3, b=4:6) test(2131.1, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.locked'), diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index db7337a381..234fcd8ff1 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -1,10 +1,18 @@ \name{print.data.table} \alias{print.data.table} +\alias{format_col} +\alias{format_col.default} +\alias{format_col.POSIXct} +\alias{format_col.expression} +\alias{format_list_item} +\alias{format_list_item.default} \title{ data.table Printing Options } \description{ \code{print.data.table} extends the functionalities of \code{print.data.frame}. Key enhancements include automatic output compression of many observations and concise column-wise \code{class} summary. + + \code{format_col} and \code{format_list_item} generics provide flexibility for end-users to define custom printing methods for generic classes. } \usage{ \method{print}{data.table}(x, @@ -17,6 +25,14 @@ trunc.cols=getOption("datatable.print.trunc.cols"), # default: FALSE quote=FALSE, timezone=FALSE, \dots) + + format_col(x, \dots) + \method{format_col}{default}(x, \dots) + \method{format_col}{POSIXct}(x, \dots, timezone=FALSE) + \method{format_col}{expression}(x, \dots) + + format_list_item(x, \dots) + \method{format_list_item}{default}(x, \dots) } \arguments{ \item{x}{ A \code{data.table}. } @@ -31,8 +47,17 @@ \item{timezone}{ If \code{TRUE}, time columns of class POSIXct or POSIXlt will be printed with their timezones (if attribute is available). } \item{\dots}{ Other arguments ultimately passed to \code{format}. } } +\value{ + \code{print.data.table} returns \code{x} invisibly. + + \code{format_col} returns a \code{length(x)}-size \code{character} vector. + + \code{format_list_item} returns a length-1 \code{character} scalar. +} \details{ By default, with an eye to the typically large number of observations in a \code{data.table}, only the beginning and end of the object are displayed (specifically, \code{head(x, topn)} and \code{tail(x, topn)} are displayed unless \code{nrow(x) < nrows}, in which case all rows will print). + + \code{format_col} is applied at a column level; for example, \code{format_col.POSIXct} is used to tag the time zones of \code{POSIXct} columns. \code{format_list_item} is applied to the elements (rows) of \code{list} columns; see Examples. } \seealso{\code{\link{print.default}}} \examples{ @@ -72,5 +97,15 @@ thing_61 = vector("complex", 3)) print(DT, trunc.cols=TRUE) options(old_width) + + # Formatting customization + format_col.complex = function(x, ...) sprintf('(\%.1f, \%.1fi)', Re(x), Im(x)) + x = data.table(z = c(1 + 3i, 2 - 1i, pi + 2.718i)) + print(x) + + iris = as.data.table(iris) + iris_agg = iris[ , .(reg = list(lm(Sepal.Length ~ Petal.Length))), by = Species] + format_list_item.lm = function(x, ...) sprintf('', format(x$call$formula)) + print(iris_agg) }