Skip to content

Commit

Permalink
Implement guess_max fix #33 (#34)
Browse files Browse the repository at this point in the history
* Implementation of `guess_max`

* Update doc
  • Loading branch information
chainsawriot authored Jun 10, 2024
1 parent fe1ba71 commit 7d08b05
Show file tree
Hide file tree
Showing 13 changed files with 143 additions and 35 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: minty
Title: Minimal Type Guesser
Version: 0.0.2
Version: 0.0.3
Authors@R: c(
person("Chung-hong", "Chan", role = c("aut", "cre"), email = "[email protected]", comment = c(ORCID = "0000-0002-6232-7530")),
person("Hadley", "Wickham", , "[email protected]", role = "aut", comment = "author of the ported code from readr"),
Expand Down
4 changes: 2 additions & 2 deletions R/cpp11.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by cpp11: do not edit by hand

collectorGuess <- function(input, locale_, guessInteger) {
.Call(`_minty_collectorGuess`, input, locale_, guessInteger)
collectorGuess <- function(input, locale_, guessInteger, guess_max) {
.Call(`_minty_collectorGuess`, input, locale_, guessInteger, guess_max)
}

parse_vector_ <- function(x, collectorSpec, locale_, na, trim_ws) {
Expand Down
18 changes: 12 additions & 6 deletions R/parser.R
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ col_number <- function() {
#' @inheritParams parse_atomic
#' @param guess_integer If `TRUE`, guess integer types for whole numbers, if
#' `FALSE` guess numeric type for all numbers.
#' @param guess_max Maximum number of data rows to use for guessing column types. `NA`: uses all data.
#' @family parsers
#' @return a parsed vector
#' @export
Expand All @@ -201,8 +202,8 @@ col_number <- function() {
#'
#' # ISO 8601 date times
#' parse_guess(c("2010-10-10"))
parse_guess <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE, guess_integer = FALSE, .return_problems = FALSE) {
parse_vector(x, guess_parser(x, locale, guess_integer = guess_integer, na = na), na = na, locale = locale, trim_ws = trim_ws,
parse_guess <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE, guess_integer = FALSE, guess_max = NA, .return_problems = FALSE) {
parse_vector(x, guess_parser(x, locale, guess_integer = guess_integer, na = na, guess_max = guess_max), na = na, locale = locale, trim_ws = trim_ws,
.return_problems = .return_problems)
}

Expand All @@ -212,12 +213,17 @@ col_guess <- function() {
collector("guess")
}

guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na = c("", "NA")) {
guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na = c("", "NA"), guess_max = 1000) {
x[x %in% na] <- NA_character_

stopifnot(is.locale(locale))

collectorGuess(x, locale, guessInteger = guess_integer)
if (is.na(guess_max)) {
guess_max <- NA_integer_
}
stopifnot(is.numeric(guess_max))
if (abs(guess_max) == Inf || is.nan(guess_max) || guess_max < 1 || is.na(guess_max)) {
guess_max <- length(x)
}
collectorGuess(x, locale, guessInteger = guess_integer, as.integer(guess_max))
}

#' Parse factors
Expand Down
5 changes: 3 additions & 2 deletions R/type_convert.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
#' df <- data.frame(x = c("NA", "10"), stringsAsFactors = FALSE)
#' str(type_convert(df))
type_convert <- function(df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE,
locale = default_locale(), guess_integer = FALSE,
locale = default_locale(), guess_integer = FALSE, guess_max = NA,
verbose = FALSE) {
stopifnot(is.data.frame(df))
is_character <- vapply(df, is.character, logical(1))
Expand All @@ -45,7 +45,8 @@ type_convert <- function(df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE,
guess_parser,
locale = locale,
na = na,
guess_integer = guess_integer
guess_integer = guess_integer,
guess_max = guess_max
)

specs <- col_spec_standardise(
Expand Down
18 changes: 16 additions & 2 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,25 @@ Some features from `vroom` have been ported to `minty`, but not `readr`.

```{r}
## tidyverse/readr#1526
minty::type_convert(data.frame(a=c("NaN", "Inf", "-INF"))) |> str()
minty::type_convert(data.frame(a = c("NaN", "Inf", "-INF"))) |> str()
```

```{r}
readr::type_convert(data.frame(a=c("NaN", "Inf", "-INF"))) |> str()
readr::type_convert(data.frame(a = c("NaN", "Inf", "-INF"))) |> str()
```

`guess_max` is available for `parse_guess()` and `type_convert()`, default to `NA` (same as `readr`).

```{r}
minty::parse_guess(c("1", "2", "drei"))
```

```{r}
minty::parse_guess(c("1", "2", "drei"), guess_max = 2)
```

```{r}
readr::parse_guess(c("1", "2", "drei"))
```

## Similar packages
Expand Down
22 changes: 20 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,13 +212,13 @@ Some features from `vroom` have been ported to `minty`, but not `readr`.

``` r
## tidyverse/readr#1526
minty::type_convert(data.frame(a=c("NaN", "Inf", "-INF"))) |> str()
minty::type_convert(data.frame(a = c("NaN", "Inf", "-INF"))) |> str()
#> 'data.frame': 3 obs. of 1 variable:
#> $ a: num NaN Inf -Inf
```

``` r
readr::type_convert(data.frame(a=c("NaN", "Inf", "-INF"))) |> str()
readr::type_convert(data.frame(a = c("NaN", "Inf", "-INF"))) |> str()
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
Expand All @@ -228,6 +228,24 @@ readr::type_convert(data.frame(a=c("NaN", "Inf", "-INF"))) |> str()
#> $ a: chr "NaN" "Inf" "-INF"
```

`guess_max` is available for `parse_guess()` and `type_convert()`,
default to `NA` (same as `readr`).

``` r
minty::parse_guess(c("1", "2", "drei"))
#> [1] "1" "2" "drei"
```

``` r
minty::parse_guess(c("1", "2", "drei"), guess_max = 2)
#> [1] 1 2 NA
```

``` r
readr::parse_guess(c("1", "2", "drei"))
#> [1] "1" "2" "drei"
```

## Similar packages

For parsing ambiguous date(time)
Expand Down
3 changes: 3 additions & 0 deletions man/parse_guess.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions man/type_convert.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 38 additions & 6 deletions misc/benchmark.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ suppressPackageStartupMessages(library(readr))
Sys.time()
```

[1] "2024-06-10 09:57:34 CEST"
[1] "2024-06-10 13:52:45 CEST"

Under 200 rows, simple

Expand All @@ -19,7 +19,7 @@ bench::mark(minty::type_convert(iris_chr), iterations = 10)
# A tibble: 1 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
1 minty::type_convert(iris_chr) 368µs 399µs 2383. 695KB 0
1 minty::type_convert(iris_chr) 387µs 410µs 2377. 702KB 0

``` r
bench::mark(suppressMessages(readr::type_convert(iris_chr)), iterations = 10)
Expand All @@ -28,7 +28,7 @@ bench::mark(suppressMessages(readr::type_convert(iris_chr)), iterations = 10)
# A tibble: 1 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch:> <bch:> <dbl> <bch:byt> <dbl>
1 suppressMessages(readr::type_conve… 2.11ms 2.15ms 376. 1.81MB 0
1 suppressMessages(readr::type_conve… 2.15ms 2.21ms 365. 1.81MB 0

Many rows

Expand All @@ -43,7 +43,7 @@ bench::mark(x <- minty::type_convert(flights_chr, guess_integer = TRUE), iterati
# A tibble: 1 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch> <bch:> <dbl> <bch:byt> <dbl>
1 x <- minty::type_convert(flights_ch… 991ms 1.05s 0.961 189MB 17.5
1 x <- minty::type_convert(flights_ch… 1.02s 1.06s 0.940 189MB 17.1

``` r
bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer = TRUE)), iterations = 5)
Expand All @@ -55,7 +55,39 @@ bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer
# A tibble: 1 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch> <bch:> <dbl> <bch:byt> <dbl>
1 y <- suppressMessages(readr::type_c… 991ms 1.05s 0.970 153MB 17.5
1 y <- suppressMessages(readr::type_c… 1.01s 1.04s 0.954 153MB 17.2

``` r
all.equal(x, y)
```

[1] TRUE

Many row, guess_max

``` r
bench::mark(x <- minty::type_convert(flights_chr, guess_integer = TRUE, guess_max = 500), iterations = 5)
```

Warning: Some expressions had a GC in every iteration; so filtering is
disabled.

# A tibble: 1 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch> <bch:> <dbl> <bch:byt> <dbl>
1 x <- minty::type_convert(flights_ch… 529ms 535ms 1.84 153MB 19.9

``` r
bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer = TRUE)), iterations = 5)
```

Warning: Some expressions had a GC in every iteration; so filtering is
disabled.

# A tibble: 1 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch> <bch:> <dbl> <bch:byt> <dbl>
1 y <- suppressMessages(readr::type_c… 1.02s 1.02s 0.959 153MB 17.5

``` r
all.equal(x, y)
Expand Down Expand Up @@ -90,7 +122,7 @@ sessionInfo()
[1] stats graphics grDevices utils datasets methods base

other attached packages:
[1] readr_2.1.5 minty_0.0.2
[1] readr_2.1.5 minty_0.0.3

loaded via a namespace (and not attached):
[1] crayon_1.5.2 vctrs_0.6.5 cli_3.6.2 knitr_1.46
Expand Down
8 changes: 8 additions & 0 deletions misc/benchmark.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer
all.equal(x, y)
```

Many row, guess_max

```{r}
bench::mark(x <- minty::type_convert(flights_chr, guess_integer = TRUE, guess_max = 500), iterations = 5)
bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer = TRUE)), iterations = 5)
all.equal(x, y)
```

```{r}
sessionInfo()
```
28 changes: 18 additions & 10 deletions src/CollectorGuess.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,15 @@
typedef bool (*canParseFun)(const std::string&, LocaleInfo* pLocale);

bool canParse(
const cpp11::strings& x, const canParseFun& canParse, LocaleInfo* pLocale) {
const cpp11::strings& x, const canParseFun& canParseF, LocaleInfo* pLocale, unsigned int guess_max) {
unsigned int n = 0;
for (const auto & i : x) {
n++;
//Rprintf("%u\n", n);
//Rprintf(i);
if (n > guess_max) {
break;
}
if (i == NA_STRING) {
continue;
}
Expand All @@ -21,7 +28,7 @@ bool canParse(
continue;
}

if (!canParse(std::string(i), pLocale)) {
if (!canParseF(std::string(i), pLocale)) {
return false;
}
}
Expand Down Expand Up @@ -122,7 +129,8 @@ static bool isDateTime(const std::string& x, LocaleInfo* pLocale) {
[[cpp11::register]] std::string collectorGuess(
const cpp11::strings& input,
const cpp11::list& locale_,
bool guessInteger) {
bool guessInteger,
unsigned int guess_max) {
LocaleInfo locale(static_cast<SEXP>(locale_));

if (input.size() == 0) {
Expand All @@ -134,25 +142,25 @@ static bool isDateTime(const std::string& x, LocaleInfo* pLocale) {
}

// Work from strictest to most flexible
if (canParse(input, isLogical, &locale)) {
if (canParse(input, isLogical, &locale, guess_max)) {
return "logical";
}
if (guessInteger && canParse(input, isInteger, &locale)) {
if (guessInteger && canParse(input, isInteger, &locale, guess_max)) {
return "integer";
}
if (canParse(input, isDouble, &locale)) {
if (canParse(input, isDouble, &locale, guess_max)) {
return "double";
}
if (canParse(input, isNumber, &locale)) {
if (canParse(input, isNumber, &locale, guess_max)) {
return "number";
}
if (canParse(input, isTime, &locale)) {
if (canParse(input, isTime, &locale, guess_max)) {
return "time";
}
if (canParse(input, isDate, &locale)) {
if (canParse(input, isDate, &locale, guess_max)) {
return "date";
}
if (canParse(input, isDateTime, &locale)) {
if (canParse(input, isDateTime, &locale, guess_max)) {
return "datetime";
}

Expand Down
8 changes: 4 additions & 4 deletions src/cpp11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
#include <R_ext/Visibility.h>

// CollectorGuess.cpp
std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger);
extern "C" SEXP _minty_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger) {
std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger, unsigned int guess_max);
extern "C" SEXP _minty_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger, SEXP guess_max) {
BEGIN_CPP11
return cpp11::as_sexp(collectorGuess(cpp11::as_cpp<cpp11::decay_t<const cpp11::strings&>>(input), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_), cpp11::as_cpp<cpp11::decay_t<bool>>(guessInteger)));
return cpp11::as_sexp(collectorGuess(cpp11::as_cpp<cpp11::decay_t<const cpp11::strings&>>(input), cpp11::as_cpp<cpp11::decay_t<const cpp11::list&>>(locale_), cpp11::as_cpp<cpp11::decay_t<bool>>(guessInteger), cpp11::as_cpp<cpp11::decay_t<unsigned int>>(guess_max)));
END_CPP11
}
// parse.cpp
Expand All @@ -36,7 +36,7 @@ extern "C" SEXP _minty_r_is_string_cpp11(SEXP x) {

extern "C" {
static const R_CallMethodDef CallEntries[] = {
{"_minty_collectorGuess", (DL_FUNC) &_minty_collectorGuess, 3},
{"_minty_collectorGuess", (DL_FUNC) &_minty_collectorGuess, 4},
{"_minty_parse_vector_", (DL_FUNC) &_minty_parse_vector_, 5},
{"_minty_r_is_string_cpp11", (DL_FUNC) &_minty_r_is_string_cpp11, 1},
{"_minty_type_convert_col", (DL_FUNC) &_minty_type_convert_col, 6},
Expand Down
15 changes: 15 additions & 0 deletions tests/testthat/test-parsing.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
test_that("trimmed before NA detection", {
expect_equal(parse_logical(c(" TRUE ", "FALSE", " NA ")), c(TRUE, FALSE, NA))
})

test_that("parse_guess() guess_max", {
## weird input
expected_output <- c(1, 2, 3)
expect_error(parse_guess(c("1", "2", "3"), guess_max = "123"))
expect_equal(parse_guess(c("1", "2", "3"), guess_max = NA), expected_output)
expect_equal(parse_guess(c("1", "2", "3"), guess_max = Inf), expected_output)
expect_equal(parse_guess(c("1", "2", "3"), guess_max = -Inf), expected_output)
expect_equal(parse_guess(c("1", "2", "3"), guess_max = 3.14), expected_output)
expect_equal(parse_guess(c("1", "2", "3"), guess_max = -3.14), expected_output)
expect_equal(parse_guess(c("1", "2", "3"), guess_max = 0), expected_output)
## Off by one?
expect_equal(class(parse_guess(c("1", "2", "abc"), guess_max = 2)), "numeric")
expect_equal(class(parse_guess(c("1", "2", "abc"), guess_max = 3)), "character")
})

0 comments on commit 7d08b05

Please sign in to comment.