Skip to content

Commit

Permalink
Add string.find_multiple APIs to pylibcudf
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Sep 25, 2024
1 parent ba7d6e7 commit cfcd0f0
Show file tree
Hide file tree
Showing 10 changed files with 103 additions and 23 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=============
find_multiple
=============

.. automodule:: pylibcudf.strings.find_multiple
    :members:
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ strings
contains
extract
find
find_multiple
findall
regex_flags
regex_program
Expand Down
27 changes: 7 additions & 20 deletions python/cudf/cudf/_lib/strings/find_multiple.pyx
Original file line number Diff line number Diff line change
@@ -1,33 +1,20 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.find_multiple cimport (
find_multiple as cpp_find_multiple,
)

from cudf._lib.column cimport Column

import pylibcudf as plc


@acquire_spill_lock()
def find_multiple(Column source_strings, Column target_strings):
    """
    Returns a column with character position values where each
    of the `target_strings` are found in each string of `source_strings`.

    Parameters
    ----------
    source_strings : Column
        Strings column to search within.
    target_strings : Column
        Strings column holding the targets to search for.

    Returns
    -------
    Column
        The result of ``pylibcudf.strings.find_multiple.find_multiple``
        converted back into a cudf ``Column``.
    """
    # Delegate to pylibcudf instead of calling libcudf directly; both inputs
    # are requested in "read" mode because they are only inspected.
    plc_result = plc.strings.find_multiple.find_multiple(
        source_strings.to_pylibcudf(mode="read"),
        target_strings.to_pylibcudf(mode="read")
    )
    return Column.from_pylibcudf(plc_result)
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \
nogil:

# Binding for cudf::strings::find_multiple; takes exactly two column views
# (the strings to search and the targets to look for).
cdef unique_ptr[column] find_multiple(
    column_view input,
    column_view targets) except +
17 changes: 15 additions & 2 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,21 @@
# =============================================================================

# Cython sources compiled for pylibcudf.strings, listed one per line so that
# adding a module produces a one-line diff. Each file must appear only once.
set(cython_sources
    capitalize.pyx
    case.pyx
    char_types.pyx
    contains.pyx
    extract.pyx
    find.pyx
    find_multiple.pyx
    findall.pyx
    regex_flags.pyx
    regex_program.pyx
    repeat.pyx
    replace.pyx
    side_type.pyx
    slice.pyx
    strip.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ from . cimport (
convert,
extract,
find,
find_multiple,
findall,
regex_flags,
regex_program,
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
convert,
extract,
find,
find_multiple,
findall,
regex_flags,
regex_program,
Expand Down
6 changes: 6 additions & 0 deletions python/pylibcudf/pylibcudf/strings/find_multiple.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column


# Declaration of the find_multiple entry point implemented in
# find_multiple.pyx, exposed here so other Cython modules can cimport it.
cpdef Column find_multiple(Column input, Column targets)
39 changes: 39 additions & 0 deletions python/pylibcudf/pylibcudf/strings/find_multiple.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple


cpdef Column find_multiple(Column input, Column targets):
    """
    Returns a lists column with character position values where each
    of the target strings are found in each string.

    For details, see :cpp:func:`cudf::strings::find_multiple`.

    Parameters
    ----------
    input : Column
        Strings instance for this operation
    targets : Column
        Strings to search for in each string

    Returns
    -------
    Column
        Lists column with character position values
    """
    cdef unique_ptr[column] c_positions

    # The libcudf call is declared nogil, so release the GIL while it runs.
    with nogil:
        c_positions = move(
            cpp_find_multiple.find_multiple(input.view(), targets.view())
        )

    # Hand ownership of the libcudf column over to a pylibcudf Column.
    return Column.from_libcudf(move(c_positions))
26 changes: 26 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc


def test_find_multiple():
    """Compare find_multiple with ``str.find`` applied to every string/target pair."""
    strings = pa.array(["abc", "def"])
    needles = pa.array(["a", "c", "e"])

    got = plc.interop.to_arrow(
        plc.strings.find_multiple.find_multiple(
            plc.interop.from_arrow(strings),
            plc.interop.from_arrow(needles),
        )
    )

    # Reference answer: one row per input string, one position per target
    # (``str.find`` yields -1 when the target is absent).
    needle_values = needles.to_pylist()
    positions = []
    for text in strings.to_pylist():
        positions.append([text.find(needle) for needle in needle_values])
    expected = pa.chunked_array([pa.array(positions, type=got.type)])

    assert got.equals(expected)

0 comments on commit cfcd0f0

Please sign in to comment.