Skip to content

Commit

Permalink
Implement $str$extract_many() (pola-rs#1163)
Browse files Browse the repository at this point in the history
  • Loading branch information
etiennebacher authored Jul 7, 2024
1 parent 4ee3340 commit 471c070
Show file tree
Hide file tree
Showing 10 changed files with 165 additions and 33 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -118,5 +118,5 @@ Collate:
'zzz.R'
Config/rextendr/version: 0.3.1
VignetteBuilder: knitr
Config/polars/LibVersion: 0.41.0
Config/polars/LibVersion: 0.41.1
Config/polars/RustToolchainVersion: nightly-2024-06-23
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## Polars R Package (development version)

### New features

- New method `$str$extract_many()` (#1163).

## Polars R Package 0.18.0

### Breaking changes
Expand Down
31 changes: 31 additions & 0 deletions R/expr__string.R
Original file line number Diff line number Diff line change
Expand Up @@ -1093,3 +1093,34 @@ ExprStr_tail = function(n) {
.pr$Expr$str_tail(self, n) |>
unwrap("in $str$tail():")
}


#' Extract matches of several patterns using the Aho-Corasick algorithm
#'
#' @param patterns Patterns to look for. Either an Expr or something that can
#' be coerced to an Expr. Strings are parsed as column names.
#' @inheritParams ExprStr_contains_any
#' @param ... Ignored.
#' @param overlapping If `TRUE`, matches are allowed to overlap.
#'
#' @inherit ExprStr_slice return
#'
#' @examples
#' df = pl$DataFrame(values = "discontent")
#' patterns = pl$lit(c("winter", "disco", "onte", "discontent"))
#'
#' df$with_columns(
#'   matches = pl$col("values")$str$extract_many(patterns),
#'   matches_overlap = pl$col("values")$str$extract_many(patterns, overlapping = TRUE)
#' )
#'
#' df = pl$DataFrame(
#'   values = c("discontent", "rhapsody"),
#'   patterns = list(c("winter", "disco", "onte", "discontent"), c("rhap", "ody", "coalesce"))
#' )
#'
#' df$select(pl$col("values")$str$extract_many("patterns"))
ExprStr_extract_many = function(patterns, ..., ascii_case_insensitive = FALSE, overlapping = FALSE) {
  # `...` is deliberately not forwarded: only the three named arguments reach
  # the private Rust binding.
  unwrap(
    .pr$Expr$str_extract_many(self, patterns, ascii_case_insensitive, overlapping),
    "in $str$extract_many():"
  )
}
2 changes: 2 additions & 0 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -1098,6 +1098,8 @@ RPolarsExpr$str_contains_any <- function(patterns, ascii_case_insensitive) .Call

# Forwards `self` and the three arguments to the native routine
# `wrap__RPolarsExpr__str_replace_many` through `.Call()`.
RPolarsExpr$str_replace_many <- function(patterns, replace_with, ascii_case_insensitive) .Call(wrap__RPolarsExpr__str_replace_many, self, patterns, replace_with, ascii_case_insensitive)

# Forwards `self`, `patterns`, `ascii_case_insensitive` and `overlapping` to the
# native routine `wrap__RPolarsExpr__str_extract_many` through `.Call()`.
RPolarsExpr$str_extract_many <- function(patterns, ascii_case_insensitive, overlapping) .Call(wrap__RPolarsExpr__str_extract_many, self, patterns, ascii_case_insensitive, overlapping)

# Forwards `self`, `pat`, `literal` and `strict` to the native routine
# `wrap__RPolarsExpr__str_find` through `.Call()`.
RPolarsExpr$str_find <- function(pat, literal, strict) .Call(wrap__RPolarsExpr__str_find, self, pat, literal, strict)

# Forwards `self` and `n` to the native routine `wrap__RPolarsExpr__str_head`
# through `.Call()`.
RPolarsExpr$str_head <- function(n) .Call(wrap__RPolarsExpr__str_head, self, n)
Expand Down
47 changes: 47 additions & 0 deletions man/ExprStr_extract_many.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/rust/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "r-polars"
version = "0.41.0"
version = "0.41.1"
edition = "2021"
rust-version = "1.79.0"
publish = false
Expand Down
18 changes: 18 additions & 0 deletions src/rust/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2458,6 +2458,24 @@ impl RPolarsExpr {
.into())
}

// Builds the `str().extract_many(...)` expression on a clone of the wrapped
// expression.
//
// Arguments arrive as raw R objects and are converted up front, in declaration
// order, so the first invalid argument is the one reported:
// - `patterns`: converted with `PLExprCol` (on the R side, strings are
//   documented as being parsed as column names).
// - `ascii_case_insensitive`, `overlapping`: converted to `bool`.
//
// Any conversion failure is returned to the caller through `?`.
fn str_extract_many(
    &self,
    patterns: Robj,
    ascii_case_insensitive: Robj,
    overlapping: Robj,
) -> RResult<Self> {
    let patterns = robj_to!(PLExprCol, patterns)?;
    let ascii_case_insensitive = robj_to!(bool, ascii_case_insensitive)?;
    let overlapping = robj_to!(bool, overlapping)?;

    let extracted = self
        .0
        .clone()
        .str()
        .extract_many(patterns, ascii_case_insensitive, overlapping);

    Ok(extracted.into())
}

pub fn str_find(&self, pat: Robj, literal: Robj, strict: Robj) -> RResult<Self> {
let pat = robj_to!(PLExpr, pat)?;
let literal = robj_to!(Option, bool, literal)?;
Expand Down
51 changes: 26 additions & 25 deletions tests/testthat/_snaps/after-wrappers.md
Original file line number Diff line number Diff line change
Expand Up @@ -424,31 +424,32 @@
[279] "str_contains" "str_contains_any"
[281] "str_count_matches" "str_ends_with"
[283] "str_extract" "str_extract_all"
[285] "str_extract_groups" "str_find"
[287] "str_head" "str_hex_decode"
[289] "str_hex_encode" "str_join"
[291] "str_json_decode" "str_json_path_match"
[293] "str_len_bytes" "str_len_chars"
[295] "str_pad_end" "str_pad_start"
[297] "str_replace" "str_replace_all"
[299] "str_replace_many" "str_reverse"
[301] "str_slice" "str_split"
[303] "str_split_exact" "str_splitn"
[305] "str_starts_with" "str_strip_chars"
[307] "str_strip_chars_end" "str_strip_chars_start"
[309] "str_tail" "str_to_date"
[311] "str_to_datetime" "str_to_integer"
[313] "str_to_lowercase" "str_to_time"
[315] "str_to_titlecase" "str_to_uppercase"
[317] "str_zfill" "struct_field_by_name"
[319] "struct_rename_fields" "struct_with_fields"
[321] "sub" "sum"
[323] "tail" "tan"
[325] "tanh" "to_physical"
[327] "top_k" "unique"
[329] "unique_counts" "unique_stable"
[331] "upper_bound" "value_counts"
[333] "var" "xor"
[285] "str_extract_groups" "str_extract_many"
[287] "str_find" "str_head"
[289] "str_hex_decode" "str_hex_encode"
[291] "str_join" "str_json_decode"
[293] "str_json_path_match" "str_len_bytes"
[295] "str_len_chars" "str_pad_end"
[297] "str_pad_start" "str_replace"
[299] "str_replace_all" "str_replace_many"
[301] "str_reverse" "str_slice"
[303] "str_split" "str_split_exact"
[305] "str_splitn" "str_starts_with"
[307] "str_strip_chars" "str_strip_chars_end"
[309] "str_strip_chars_start" "str_tail"
[311] "str_to_date" "str_to_datetime"
[313] "str_to_integer" "str_to_lowercase"
[315] "str_to_time" "str_to_titlecase"
[317] "str_to_uppercase" "str_zfill"
[319] "struct_field_by_name" "struct_rename_fields"
[321] "struct_with_fields" "sub"
[323] "sum" "tail"
[325] "tan" "tanh"
[327] "to_physical" "top_k"
[329] "unique" "unique_counts"
[331] "unique_stable" "upper_bound"
[333] "value_counts" "var"
[335] "xor"

# public and private methods of each class When

Expand Down
35 changes: 35 additions & 0 deletions tests/testthat/test-expr_string.R
Original file line number Diff line number Diff line change
Expand Up @@ -920,3 +920,38 @@ test_that("$str$tail() works", {
)
)
})

test_that("$str$extract_many() works", {
  # Patterns supplied as a literal expression; "ONTE" only matches when the
  # search is case-insensitive.
  input = pl$DataFrame(values = c("discontent", "dollar $"))
  patterns = pl$lit(c("winter", "disco", "ONTE", "discontent", "$"))

  # default (non-overlapping) vs. overlapping matches
  actual = input$select(
    matches = pl$col("values")$str$extract_many(patterns),
    matches_overlap = pl$col("values")$str$extract_many(patterns, overlapping = TRUE)
  )$to_list()
  expected = list(
    matches = list("disco", "$"),
    matches_overlap = list(c("disco", "discontent"), "$")
  )
  expect_equal(actual, expected)

  # arg "ascii_case_insensitive" works
  actual_insensitive = input$select(
    matches_overlap = pl$col("values")$str$extract_many(
      patterns,
      ascii_case_insensitive = TRUE, overlapping = TRUE
    )
  )$to_list()
  expected_insensitive = list(
    matches_overlap = list(c("disco", "onte", "discontent"), "$")
  )
  expect_equal(actual_insensitive, expected_insensitive)

  # can pass column names as strings
  input_with_patterns = pl$DataFrame(
    values = c("discontent", "rhapsody"),
    patterns = list(c("winter", "disco", "onte", "discontent"), c("rhap", "ody", "coalesce"))
  )
  actual_by_name = input_with_patterns$select(
    pl$col("values")$str$extract_many("patterns")
  )$to_list()
  expected_by_name = list(values = list("disco", c("rhap", "ody")))
  expect_equal(actual_by_name, expected_by_name)
})
6 changes: 0 additions & 6 deletions tools/lib-sums.tsv

This file was deleted.

0 comments on commit 471c070

Please sign in to comment.