From cfcd0f04ed7a57a04fd7695cc2392eabcc9c7f5a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 25 Sep 2024 14:39:51 -0700 Subject: [PATCH] Add string.find_multiple APIs to pylibcudf --- .../pylibcudf/strings/find_multiple.rst | 6 +++ .../api_docs/pylibcudf/strings/index.rst | 1 + .../cudf/cudf/_lib/strings/find_multiple.pyx | 27 ++++--------- .../libcudf/strings/find_multiple.pxd | 2 +- .../pylibcudf/strings/CMakeLists.txt | 17 +++++++- .../pylibcudf/pylibcudf/strings/__init__.pxd | 1 + .../pylibcudf/pylibcudf/strings/__init__.py | 1 + .../pylibcudf/strings/find_multiple.pxd | 6 +++ .../pylibcudf/strings/find_multiple.pyx | 39 +++++++++++++++++++ .../tests/test_string_find_multiple.py | 26 +++++++++++++ 10 files changed, 103 insertions(+), 23 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst new file mode 100644 index 00000000000..8e86b33b1a0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst @@ -0,0 +1,6 @@ +============= +find_multiple +============= + +.. 
automodule:: pylibcudf.strings.find_multiple + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 9b1a6b72a88..7e0d128cfb2 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -9,6 +9,7 @@ strings contains extract find + find_multiple findall regex_flags regex_program diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx index 1358f8e3c2c..39e0013769f 100644 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.find_multiple cimport ( - find_multiple as cpp_find_multiple, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def find_multiple(Column source_strings, Column target_strings): @@ -20,14 +13,8 @@ def find_multiple(Column source_strings, Column target_strings): Returns a column with character position values where each of the `target_strings` are found in each string of `source_strings`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - - with nogil: - c_result = move(cpp_find_multiple( - source_view, - target_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.find_multiple.find_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd index 0491644a10a..3d048c1f50b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd @@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] find_multiple( - column_view source_strings, + column_view input, column_view targets) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 77f20b0b917..d3da8e80be0 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -13,8 +13,21 @@ # ============================================================================= set(cython_sources - capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx - regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx + capitalize.pyx + case.pyx + char_types.pyx + contains.pyx + extract.pyx + find.pyx + find_multiple.pyx + findall.pyx + regex_flags.pyx + regex_program.pyx + repeat.pyx + replace.pyx + side_type.pyx + slice.pyx + strip.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 91d884b294b..4f373b64454 100644 --- 
a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -8,6 +8,7 @@ from . cimport ( convert, extract, find, + find_multiple, findall, regex_flags, regex_program, diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index b4856784390..99e683817cc 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -8,6 +8,7 @@ convert, extract, find, + find_multiple, findall, regex_flags, regex_program, diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd new file mode 100644 index 00000000000..b7b3aefa336 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column find_multiple(Column input, Column targets) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx new file mode 100644 index 00000000000..413fc1cb79d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple + + +cpdef Column find_multiple(Column input, Column targets): + """ + Returns a lists column with character position values where each + of the target strings is found in each string. + + For details, see :cpp:func:`cudf::strings::find_multiple`.
+ + Parameters + ---------- + input : Column + Strings instance for this operation + targets : Column + Strings to search for in each string + + Returns + ------- + Column + Lists column with character position values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_find_multiple.find_multiple( + input.view(), + targets.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py new file mode 100644 index 00000000000..1ef1462d08c --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc + + +def test_find_multiple(): + arr = pa.array(["abc", "def"]) + targets = pa.array(["a", "c", "e"]) + plc_result = plc.strings.find_multiple.find_multiple( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(targets), + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array( + [ + pa.array( + [ + [elem.find(target) for target in targets.to_pylist()] + for elem in arr.to_pylist() + ], + type=result.type, + ) + ] + ) + assert result.equals(expected)