Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate strings contains operations to pylibcudf #15880

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources char_types.pyx)
set(cython_sources char_types.pyx regex_flags.pyx)

set(linked_libraries cudf::cudf)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# Copyright (c) 2022, NVIDIA CORPORATION.

from libc.stdint cimport int32_t

cdef extern from "cudf/strings/regex/flags.hpp" \
namespace "cudf::strings" nogil:

ctypedef enum regex_flags:
DEFAULT 'cudf::strings::regex_flags::DEFAULT'
MULTILINE 'cudf::strings::regex_flags::MULTILINE'
DOTALL 'cudf::strings::regex_flags::DOTALL'
cpdef enum class regex_flags(int32_t):
DEFAULT
MULTILINE
DOTALL
Empty file.
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# the License.
# =============================================================================

set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx)
set(cython_sources case.pyx capitalize.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
regex_program.pyx
)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
10 changes: 9 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport capitalize, case, char_types, find
from . cimport (
capitalize,
case,
char_types,
contains,
find,
regex_flags,
regex_program,
)
10 changes: 9 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import capitalize, case, char_types, find
from . import (
capitalize,
case,
char_types,
contains,
find,
regex_flags,
regex_program,
)
5 changes: 5 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.scalar cimport Scalar

brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
28 changes: 28 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains
from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram
from cudf._lib.pylibcudf.scalar cimport Scalar

from cython.operator import dereference

from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar

cpdef Column contains_re(
Column input,
RegexProgram prog
):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

docstring needed

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might also want to start the sphinx docs for the strings submodule for pylibcudf.

As far as I can tell, there's no .rst files for strings in the api_docs/pylibcudf folder.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a few sphinx docs here

cdef unique_ptr[column] result

with nogil:
result = cpp_contains.contains_re(
input.view(),
prog.c_obj.get()[0]
)

return Column.from_libcudf(move(result))

2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
3 changes: 3 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.libcudf.strings.regex_flags import regex_flags as RegexFlags # no-cython-lint
8 changes: 8 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.string cimport string
from libcpp.memory cimport unique_ptr
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program

cdef class RegexProgram:
cdef unique_ptr[regex_program] c_obj
36 changes: 36 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2024, NVIDIA CORPORATION.


from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program

from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags


cdef class RegexProgram:

def __init__(self, *args, **kwargs):
raise ValueError("Do not instantiate RegexProgram directly, use create")

@staticmethod
def create(object pattern, object flags):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def create(object pattern, object flags):
def create(str pattern, RegexFlags flags):

?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My experimentation leads me to believe it must be done this way: 758755c

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, you always have to use the cimported name when typing Cython functions, not the imported name. regex_flags != RegexFlags because the latter is imported rather than cimported. This is deliberate! We import the aliased name because that is the name we want to expose to users of pylibcudf in pure Python, using Python naming conventions for classes (CapsCase) since enums in Python are classes, while regex_flags is the snake_case representation of the Cython/C enum.

cdef unique_ptr[regex_program] c_prog
cdef regex_flags c_flags
cdef string c_pattern = pattern.encode()

cdef RegexProgram ret = RegexProgram.__new__(RegexProgram)
if isinstance(flags, object):
if isinstance(flags, (int, RegexFlags)):
c_flags = flags
with nogil:
c_prog = regex_program.create(c_pattern, c_flags)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: Whose job is it to ensure that the provided input is a "valid" regex?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The libcudf method should validate the pattern at the c++ level:

cudf::logic_error If pattern is invalid or contains unsupported features

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add some simple tests of this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I think that's reasonable. Will add some

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Couple of tests at b15588a


ret.c_obj = move(c_prog)
else:
raise ValueError("flags must be of type RegexFlags")

return ret
23 changes: 7 additions & 16 deletions python/cudf/cudf/_lib/strings/contains.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
from cudf._lib.pylibcudf.libcudf.strings.contains cimport (
contains_re as cpp_contains_re,
count_re as cpp_count_re,
like as cpp_like,
matches_re as cpp_matches_re,
Expand All @@ -23,28 +22,20 @@ from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
from cudf._lib.scalar cimport DeviceScalar

from cudf._lib.pylibcudf.strings import contains
from cudf._lib.pylibcudf.strings.regex_program import RegexProgram


@acquire_spill_lock()
def contains_re(Column source_strings, object reg_ex, uint32_t flags):
"""
Returns a Column of boolean values with True for `source_strings`
that contain regular expression `reg_ex`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string reg_ex_string = <string>str(reg_ex).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(reg_ex_string, c_flags))
c_result = move(cpp_contains_re(
source_view,
dereference(c_prog)
))

return Column.from_unique_ptr(move(c_result))
prog = RegexProgram.create(str(reg_ex), flags)
return Column.from_pylibcudf(
contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog)
)


@acquire_spill_lock()
Expand Down
42 changes: 42 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_string_contains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
from utils import assert_column_eq

import cudf._lib.pylibcudf as plc


@pytest.fixture(scope="module")
def pa_target_col():
return pa.array(
["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]
)


@pytest.fixture(scope="module")
def plc_target_col(pa_target_col):
return plc.interop.from_arrow(pa_target_col)


@pytest.fixture(params=["A"], scope="module")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll need to add some more actual tests here.

def pa_target_scalar(request):
return pa.scalar(request.param, type=pa.string())


@pytest.fixture(scope="module")
def plc_target_pat(pa_target_scalar):
prog = plc.strings.regex_program.RegexProgram.create(
pa_target_scalar.as_py(), plc.strings.regex_flags.RegexFlags.DEFAULT
)
return prog


def test_contains_re(
pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat
):
got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat)
expected = pa.compute.match_substring_regex(
pa_target_col, pa_target_scalar.as_py()
)
assert_column_eq(got, expected)
Loading