-
Notifications
You must be signed in to change notification settings - Fork 917
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This PR adds the pylibcudf strings `contains` APIs and migrates the cuDF Cython code to use them. Part of #15162. Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: #15880
- Loading branch information
1 parent
d91380e
commit 7fd6918
Showing
17 changed files
with
215 additions
and
25 deletions.
There are no files selected for viewing
6 changes: 6 additions & 0 deletions
6
docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
======== | ||
contains | ||
======== | ||
|
||
.. automodule:: cudf._lib.pylibcudf.strings.contains | ||
:members: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,4 +4,5 @@ strings | |
.. toctree:: | ||
:maxdepth: 1 | ||
|
||
contains | ||
replace |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
13 changes: 8 additions & 5 deletions
13
python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,12 @@ | ||
# Copyright (c) 2022, NVIDIA CORPORATION. | ||
# Copyright (c) 2022-2024, NVIDIA CORPORATION. | ||
|
||
from libc.stdint cimport int32_t | ||
|
||
|
||
cdef extern from "cudf/strings/regex/flags.hpp" \ | ||
namespace "cudf::strings" nogil: | ||
|
||
ctypedef enum regex_flags: | ||
DEFAULT 'cudf::strings::regex_flags::DEFAULT' | ||
MULTILINE 'cudf::strings::regex_flags::MULTILINE' | ||
DOTALL 'cudf::strings::regex_flags::DOTALL' | ||
cpdef enum class regex_flags(int32_t): | ||
DEFAULT | ||
MULTILINE | ||
DOTALL |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,12 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from . cimport capitalize, case, char_types, find, replace | ||
from . cimport ( | ||
capitalize, | ||
case, | ||
char_types, | ||
contains, | ||
find, | ||
regex_flags, | ||
regex_program, | ||
replace, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,12 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from . import capitalize, case, char_types, find, replace | ||
from . import ( | ||
capitalize, | ||
case, | ||
char_types, | ||
contains, | ||
find, | ||
regex_flags, | ||
regex_program, | ||
replace, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from cudf._lib.pylibcudf.column cimport Column | ||
from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram | ||
|
||
|
||
cpdef Column contains_re(Column input, RegexProgram prog) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
from libcpp.memory cimport unique_ptr | ||
from libcpp.utility cimport move | ||
|
||
from cudf._lib.pylibcudf.column cimport Column | ||
from cudf._lib.pylibcudf.libcudf.column.column cimport column | ||
from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains | ||
from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram | ||
|
||
|
||
cpdef Column contains_re(
    Column input,
    RegexProgram prog
):
    """Return a boolean column flagging the rows of ``input`` that match
    the given regex_program object.

    For details, see :cpp:func:`cudf::strings::contains_re`.

    Parameters
    ----------
    input : Column
        The input strings
    prog : RegexProgram
        Regex program instance

    Returns
    -------
    pylibcudf.Column
        New column of boolean results for each string
    """
    cdef unique_ptr[column] c_result

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        c_result = cpp_contains.contains_re(
            input.view(), prog.c_obj.get()[0]
        )

    # Hand ownership of the libcudf column over to a pylibcudf Column.
    return Column.from_libcudf(move(c_result))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (c) 2020-2024, NVIDIA CORPORATION. | ||
from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \ | ||
regex_flags as RegexFlags # no-cython-lint |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from libcpp.memory cimport unique_ptr | ||
from libcpp.string cimport string | ||
|
||
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program | ||
|
||
|
||
cdef class RegexProgram: | ||
cdef unique_ptr[regex_program] c_obj |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
|
||
from libcpp.memory cimport unique_ptr | ||
from libcpp.string cimport string | ||
from libcpp.utility cimport move | ||
|
||
from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags | ||
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program | ||
|
||
from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags | ||
from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags | ||
|
||
|
||
cdef class RegexProgram:
    """Owner of a compiled libcudf ``regex_program``.

    Instances cannot be built through the normal constructor; use the
    :py:meth:`create` factory instead.
    """

    def __init__(self, *args, **kwargs):
        # Direct construction would leave ``c_obj`` empty, so it is refused.
        raise ValueError("Do not instantiate RegexProgram directly, use create")

    @staticmethod
    def create(str pattern, int flags):
        """Compile ``pattern`` into a new RegexProgram.

        Parameters
        ----------
        pattern : str
            Regex pattern; it is encoded with ``str.encode()`` before being
            passed to ``regex_program.create``.
        flags : int
            Regex flags value (e.g. a ``RegexFlags`` member) that is cast to
            the libcudf ``regex_flags`` enum.

        Returns
        -------
        RegexProgram
            A new instance owning the compiled program.
        """
        cdef unique_ptr[regex_program] c_prog
        cdef string c_pattern = pattern.encode()
        # ``flags`` is declared as a C ``int``, so Cython already rejects
        # non-integer arguments while unpacking the call; no additional
        # isinstance validation is needed (or reachable) here.
        cdef regex_flags c_flags = <regex_flags>flags

        # Bypass ``__init__``, which intentionally raises.
        cdef RegexProgram ret = RegexProgram.__new__(RegexProgram)

        with nogil:
            c_prog = regex_program.create(c_pattern, c_flags)

        ret.c_obj = move(c_prog)
        return ret
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
import pytest | ||
|
||
import cudf._lib.pylibcudf as plc | ||
|
||
|
||
@pytest.mark.parametrize("pat", ["(", "*", "\\"])
def test_regex_program_invalid(pat):
    """Creating a RegexProgram from a malformed pattern raises RuntimeError."""
    regex_program_cls = plc.strings.regex_program.RegexProgram
    default_flags = plc.strings.regex_flags.RegexFlags.DEFAULT
    with pytest.raises(RuntimeError):
        regex_program_cls.create(pat, default_flags)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
import pyarrow as pa | ||
import pytest | ||
from utils import assert_column_eq | ||
|
||
import cudf._lib.pylibcudf as plc | ||
|
||
|
||
@pytest.fixture(scope="module")
def pa_target_col():
    """Arrow array of mixed-case strings, including two nulls."""
    values = [
        "AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw",
    ]
    return pa.array(values)
|
||
|
||
@pytest.fixture(scope="module")
def plc_target_col(pa_target_col):
    """The ``pa_target_col`` data converted with ``plc.interop.from_arrow``."""
    converted = plc.interop.from_arrow(pa_target_col)
    return converted
|
||
|
||
@pytest.fixture(
    scope="module",
    params=[
        "A",
        "de",
        ".*",
        "^a",
        "^A",
        "[^a-z]",
        "[a-z]{3,}",
        "^[A-Z]{2,}",
        "j|u",
    ],
)
def pa_target_scalar(request):
    """Each parametrized regex pattern wrapped as an Arrow string scalar."""
    pattern = request.param
    return pa.scalar(pattern, type=pa.string())
|
||
|
||
@pytest.fixture(scope="module")
def plc_target_pat(pa_target_scalar):
    """RegexProgram built from the current pattern with DEFAULT flags."""
    pattern = pa_target_scalar.as_py()
    default_flags = plc.strings.regex_flags.RegexFlags.DEFAULT
    return plc.strings.regex_program.RegexProgram.create(
        pattern, default_flags
    )
|
||
|
||
def test_contains_re(
    pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat
):
    """contains_re must agree with pyarrow's match_substring_regex."""
    got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat)
    pattern = pa_target_scalar.as_py()
    expected = pa.compute.match_substring_regex(pa_target_col, pattern)
    assert_column_eq(got, expected)