From 72e70d6d40254b16a4132b5d9c4b88c408f50945 Mon Sep 17 00:00:00 2001 From: chainsawriot Date: Tue, 11 Jun 2024 11:21:46 +0200 Subject: [PATCH] Fix #32 --- R/cpp11.R | 4 ++-- R/parser.R | 8 +++---- R/type_convert.R | 3 ++- README.Rmd | 23 ++++++++++++++++++- README.md | 43 ++++++++++++++++++++++++++++++++--- misc/benchmark.md | 14 ++++++------ src/CollectorGuess.cpp | 27 +++++++++++----------- src/cpp11.cpp | 8 +++---- src/utils.h | 7 ++++++ tests/testthat/test-parsing.R | 10 ++++++++ 10 files changed, 111 insertions(+), 36 deletions(-) diff --git a/R/cpp11.R b/R/cpp11.R index a9958c9..abe451b 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -1,7 +1,7 @@ # Generated by cpp11: do not edit by hand -collectorGuess <- function(input, locale_, guessInteger, guess_max) { - .Call(`_minty_collectorGuess`, input, locale_, guessInteger, guess_max) +collectorGuess <- function(input, locale_, guessInteger, guess_max, trim_ws) { + .Call(`_minty_collectorGuess`, input, locale_, guessInteger, guess_max, trim_ws) } parse_vector_ <- function(x, collectorSpec, locale_, na, trim_ws) { diff --git a/R/parser.R b/R/parser.R index 50a0c91..fd5c44d 100644 --- a/R/parser.R +++ b/R/parser.R @@ -174,7 +174,6 @@ col_number <- function() { collector("number") } - #' Parse using the "best" type #' #' `parse_guess()` returns the parser vector. This function uses a number of heuristics @@ -203,7 +202,8 @@ col_number <- function() { #' # ISO 8601 date times #' parse_guess(c("2010-10-10")) parse_guess <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE, guess_integer = FALSE, guess_max = NA, .return_problems = FALSE) { - parse_vector(x, guess_parser(x, locale, guess_integer = guess_integer, na = na, guess_max = guess_max), na = na, locale = locale, trim_ws = trim_ws, + parse_vector(x, guess_parser(x, locale, guess_integer = guess_integer, na = na, guess_max = guess_max, trim_ws = trim_ws), + na = na, locale = locale, trim_ws = trim_ws, .return_problems = .return_problems) } @@ -213,7 +213,7 @@ col_guess <- function() { collector("guess") } -guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na = c("", "NA"), guess_max = 1000) { +guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na = c("", "NA"), guess_max = 1000, trim_ws = FALSE) { x[x %in% na] <- NA_character_ stopifnot(is.locale(locale)) if (is.na(guess_max)) { @@ -223,7 +223,7 @@ guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na if (abs(guess_max) == Inf || is.nan(guess_max) || guess_max < 1 || is.na(guess_max)) { guess_max <- length(x) } - collectorGuess(x, locale, guessInteger = guess_integer, as.integer(guess_max)) + collectorGuess(x, locale, guessInteger = guess_integer, as.integer(guess_max), trim_ws) } #' Parse factors diff --git a/R/type_convert.R b/R/type_convert.R index 69a5b29..a2bfb3c 100644 --- a/R/type_convert.R +++ b/R/type_convert.R @@ -46,7 +46,8 @@ type_convert <- function(df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE, locale = locale, na = na, guess_integer = guess_integer, - guess_max = guess_max + guess_max = guess_max, + trim_ws = trim_ws ) specs <- col_spec_standardise( diff --git a/README.Rmd b/README.Rmd index 5a5ed24..352cf0c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -22,7 +22,9 @@ knitr::opts_chunk$set( `minty` (**Min**imal **ty**pe guesser) is a package with the type inferencing and parsing tools (the so-called 1e parsing engine) extracted from `readr` (with permission, see this issue [tidyverse/readr#1517](https://github.com/tidyverse/readr/issues/1517)). Since July 2021, these tools are not used internally by `readr` for parsing text files. Now `vroom` is used by default, unless explicitly call the first edition parsing engine (see the explanation on [editions](https://github.com/tidyverse/readr?tab=readme-ov-file#editions)). -`readr`'s 1e type inferencing and parsing tools are used by various R packages, e.g. `readODS` and `surveytoolbox` for parsing in-memory objects, but those packages do not use the main functions (e.g. `readr::read_delim()`) of `readr`. As explained in the README of `readr`, those 1e code will be eventually removed from `readr`. `minty` aims at providing a set of minimal, long-term, and compatible type inferencing and parsing tools for those packages. +`readr`'s 1e type inferencing and parsing tools are used by various R packages, e.g. `readODS` and `surveytoolbox` for parsing in-memory objects, but those packages do not use the main functions (e.g. `readr::read_delim()`) of `readr`. As explained in the README of `readr`, those 1e code will be eventually removed from `readr`. + +`minty` aims at providing a set of minimal, long-term, and compatible type inferencing and parsing tools for those packages. You might consider `minty` to be 1.5e parsing engine. ## Installation @@ -165,6 +167,25 @@ minty::parse_guess(c("1", "2", "drei"), guess_max = 2) readr::parse_guess(c("1", "2", "drei")) ``` +For `parse_guess()` and `type_convert()`, `trim_ws` is considered before type guessing (the expected behavior of `vroom::vroom()` / `readr::read_delim()`). + +```{r} +minty::parse_guess(c(" 1", " 2 ", " 3 "), trim_ws = TRUE) +``` + +```{r} +readr::parse_guess(c(" 1", " 2 ", " 3 "), trim_ws = TRUE) +``` + +```{r} +##tidyverse/readr#1536 +minty::type_convert(data.frame(a = "1 ", b = " 2"), trim_ws = TRUE) |> str() +``` + +```{r} +readr::type_convert(data.frame(a = "1 ", b = " 2"), trim_ws = TRUE) |> str() +``` + ## Similar packages For parsing ambiguous date(time) diff --git a/README.md b/README.md index bfc8e19..e4a2ae0 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,11 @@ call the first edition parsing engine (see the explanation on packages, e.g. `readODS` and `surveytoolbox` for parsing in-memory objects, but those packages do not use the main functions (e.g. `readr::read_delim()`) of `readr`. As explained in the README of -`readr`, those 1e code will be eventually removed from `readr`. `minty` -aims at providing a set of minimal, long-term, and compatible type -inferencing and parsing tools for those packages. +`readr`, those 1e code will be eventually removed from `readr`. + +`minty` aims at providing a set of minimal, long-term, and compatible +type inferencing and parsing tools for those packages. You might +consider `minty` to be 1.5e parsing engine. ## Installation @@ -246,6 +248,41 @@ readr::parse_guess(c("1", "2", "drei")) #> [1] "1" "2" "drei" ``` +For `parse_guess()` and `type_convert()`, `trim_ws` is considered before +type guessing (the expected behavior of `vroom::vroom()` / +`readr::read_delim()`). + +``` r +minty::parse_guess(c(" 1", " 2 ", " 3 "), trim_ws = TRUE) +#> [1] 1 2 3 +``` + +``` r +readr::parse_guess(c(" 1", " 2 ", " 3 "), trim_ws = TRUE) +#> [1] "1" "2" "3" +``` + +``` r +##tidyverse/readr#1536 +minty::type_convert(data.frame(a = "1 ", b = " 2"), trim_ws = TRUE) |> str() +#> 'data.frame': 1 obs. of 2 variables: +#> $ a: num 1 +#> $ b: num 2 +``` + +``` r +readr::type_convert(data.frame(a = "1 ", b = " 2"), trim_ws = TRUE) |> str() +#> +#> ── Column specification ──────────────────────────────────────────────────────── +#> cols( +#> a = col_character(), +#> b = col_double() +#> ) +#> 'data.frame': 1 obs. of 2 variables: +#> $ a: chr "1" +#> $ b: num 2 +``` + ## Similar packages For parsing ambiguous date(time) diff --git a/misc/benchmark.md b/misc/benchmark.md index 204fcdb..d3d9509 100644 --- a/misc/benchmark.md +++ b/misc/benchmark.md @@ -7,7 +7,7 @@ suppressPackageStartupMessages(library(readr)) Sys.time() ``` - [1] "2024-06-10 13:52:45 CEST" + [1] "2024-06-11 11:20:48 CEST" Under 200 rows, simple @@ -19,7 +19,7 @@ bench::mark(minty::type_convert(iris_chr), iterations = 10) # A tibble: 1 × 6 expression min median `itr/sec` mem_alloc `gc/sec` - 1 minty::type_convert(iris_chr) 387µs 410µs 2377. 702KB 0 + 1 minty::type_convert(iris_chr) 389µs 403µs 2387. 703KB 0 ``` r bench::mark(suppressMessages(readr::type_convert(iris_chr)), iterations = 10) @@ -28,7 +28,7 @@ bench::mark(suppressMessages(readr::type_convert(iris_chr)), iterations = 10) # A tibble: 1 × 6 expression min median `itr/sec` mem_alloc `gc/sec` - 1 suppressMessages(readr::type_conve… 2.15ms 2.21ms 365. 1.81MB 0 + 1 suppressMessages(readr::type_conve… 2.14ms 2.2ms 368. 1.81MB 0 Many rows @@ -43,7 +43,7 @@ bench::mark(x <- minty::type_convert(flights_chr, guess_integer = TRUE), iterati # A tibble: 1 × 6 expression min median `itr/sec` mem_alloc `gc/sec` - 1 x <- minty::type_convert(flights_ch… 1.02s 1.06s 0.940 189MB 17.1 + 1 x <- minty::type_convert(flights_ch… 1.08s 1.14s 0.884 189MB 16.1 ``` r bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer = TRUE)), iterations = 5) @@ -55,7 +55,7 @@ bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer # A tibble: 1 × 6 expression min median `itr/sec` mem_alloc `gc/sec` - 1 y <- suppressMessages(readr::type_c… 1.01s 1.04s 0.954 153MB 17.2 + 1 y <- suppressMessages(readr::type_c… 995ms 1.02s 0.973 153MB 17.5 ``` r all.equal(x, y) @@ -75,7 +75,7 @@ bench::mark(x <- minty::type_convert(flights_chr, guess_integer = TRUE, guess_ma # A tibble: 1 × 6 expression min median `itr/sec` mem_alloc `gc/sec` - 1 x <- minty::type_convert(flights_ch… 529ms 535ms 1.84 153MB 19.9 + 1 x <- minty::type_convert(flights_ch… 534ms 538ms 1.83 153MB 19.7 ``` r bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer = TRUE)), iterations = 5) @@ -87,7 +87,7 @@ bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer # A tibble: 1 × 6 expression min median `itr/sec` mem_alloc `gc/sec` - 1 y <- suppressMessages(readr::type_c… 1.02s 1.02s 0.959 153MB 17.5 + 1 y <- suppressMessages(readr::type_c… 1s 1.02s 0.974 153MB 17.7 ``` r all.equal(x, y) diff --git a/src/CollectorGuess.cpp b/src/CollectorGuess.cpp index 3fe4109..4bd26db 100644 --- a/src/CollectorGuess.cpp +++ b/src/CollectorGuess.cpp @@ -10,13 +10,11 @@ typedef bool (*canParseFun)(const std::string&, LocaleInfo* pLocale); -bool canParse( - const cpp11::strings& x, const canParseFun& canParseF, LocaleInfo* pLocale, unsigned int guess_max) { +bool canParse(const cpp11::strings& x, const canParseFun& canParseF, LocaleInfo* pLocale, + unsigned int guess_max, bool trim_ws) { unsigned int n = 0; for (const auto & i : x) { n++; - //Rprintf("%u\n", n); - //Rprintf(i); if (n > guess_max) { break; } @@ -27,8 +25,8 @@ bool canParse( if (i.size() == 0) { continue; } - - if (!canParseF(std::string(i), pLocale)) { + auto i_str = trim_ws ? trimString(std::string(i)) : std::string(i); + if (!canParseF(i_str, pLocale)) { return false; } } @@ -130,7 +128,8 @@ static bool isDateTime(const std::string& x, LocaleInfo* pLocale) { const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger, - unsigned int guess_max) { + unsigned int guess_max, + bool trim_ws) { LocaleInfo locale(static_cast(locale_)); if (input.size() == 0) { @@ -142,25 +141,25 @@ static bool isDateTime(const std::string& x, LocaleInfo* pLocale) { } // Work from strictest to most flexible - if (canParse(input, isLogical, &locale, guess_max)) { + if (canParse(input, isLogical, &locale, guess_max, trim_ws)) { return "logical"; } - if (guessInteger && canParse(input, isInteger, &locale, guess_max)) { + if (guessInteger && canParse(input, isInteger, &locale, guess_max, trim_ws)) { return "integer"; } - if (canParse(input, isDouble, &locale, guess_max)) { + if (canParse(input, isDouble, &locale, guess_max, trim_ws)) { return "double"; } - if (canParse(input, isNumber, &locale, guess_max)) { + if (canParse(input, isNumber, &locale, guess_max, trim_ws)) { return "number"; } - if (canParse(input, isTime, &locale, guess_max)) { + if (canParse(input, isTime, &locale, guess_max, trim_ws)) { return "time"; } - if (canParse(input, isDate, &locale, guess_max)) { + if (canParse(input, isDate, &locale, guess_max, trim_ws)) { return "date"; } - if (canParse(input, isDateTime, &locale, guess_max)) { + if (canParse(input, isDateTime, &locale, guess_max, trim_ws)) { return "datetime"; } diff --git a/src/cpp11.cpp b/src/cpp11.cpp index dfd113a..de112b0 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -6,10 +6,10 @@ #include // CollectorGuess.cpp -std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger, unsigned int guess_max); -extern "C" SEXP _minty_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger, SEXP guess_max) { +std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger, unsigned int guess_max, bool trim_ws); +extern "C" SEXP _minty_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger, SEXP guess_max, SEXP trim_ws) { BEGIN_CPP11 - return cpp11::as_sexp(collectorGuess(cpp11::as_cpp>(input), cpp11::as_cpp>(locale_), cpp11::as_cpp>(guessInteger), cpp11::as_cpp>(guess_max))); + return cpp11::as_sexp(collectorGuess(cpp11::as_cpp>(input), cpp11::as_cpp>(locale_), cpp11::as_cpp>(guessInteger), cpp11::as_cpp>(guess_max), cpp11::as_cpp>(trim_ws))); END_CPP11 } // parse.cpp @@ -36,7 +36,7 @@ extern "C" SEXP _minty_r_is_string_cpp11(SEXP x) { extern "C" { static const R_CallMethodDef CallEntries[] = { - {"_minty_collectorGuess", (DL_FUNC) &_minty_collectorGuess, 4}, + {"_minty_collectorGuess", (DL_FUNC) &_minty_collectorGuess, 5}, {"_minty_parse_vector_", (DL_FUNC) &_minty_parse_vector_, 5}, {"_minty_r_is_string_cpp11", (DL_FUNC) &_minty_r_is_string_cpp11, 1}, {"_minty_type_convert_col", (DL_FUNC) &_minty_type_convert_col, 6}, diff --git a/src/utils.h b/src/utils.h index 6ec46b3..b16024a 100644 --- a/src/utils.h +++ b/src/utils.h @@ -81,4 +81,11 @@ inline bool starts_with_comment( return true; } +inline std::string trimString(std::string const &str, std::string const &whitespace=" \r\n\t\v\f") { + auto start = str.find_first_not_of(whitespace); + auto end = str.find_last_not_of(whitespace); + + return str.substr(start, end - start + 1); +} + #endif diff --git a/tests/testthat/test-parsing.R b/tests/testthat/test-parsing.R index 086f402..19ab684 100644 --- a/tests/testthat/test-parsing.R +++ b/tests/testthat/test-parsing.R @@ -16,3 +16,13 @@ test_that("parse_guess() guess_max", { expect_equal(class(parse_guess(c("1", "2", "abc"), guess_max = 2)), "numeric") expect_equal(class(parse_guess(c("1", "2", "abc"), guess_max = 3)), "character") }) + +test_that("parse_guess() trim_ws #32 or tidyverse/readr#1536", { + expect_equal(parse_guess(c(" 1", "2 ", " 3 "), trim_ws = TRUE), c(1, 2, 3)) + expect_equal(parse_guess(c(" 1", "2 ", " 3 "), trim_ws = FALSE), c(" 1", "2 ", " 3 ")) + expect_equal(parse_guess(c(" TRUE", "FALSE ", " T "), trim_ws = TRUE), c(TRUE, FALSE, TRUE)) + ## integration in type_convert() + x <- type_convert(data.frame(a = c("1 ", " 1"), b = c(" 2", " 2")), trim_ws = TRUE) + expect_equal(class(x$a), "numeric") + expect_equal(class(x$b), "numeric") +})