From 3d74c523e8116a66fdea8f1707d6ca34288206d7 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 8 Apr 2024 14:56:02 -0500 Subject: [PATCH 01/18] initial file structure to build and import --- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 2 ++ python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 ++ python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 ++ .../_lib/pylibcudf/strings/CMakeLists.txt | 23 +++++++++++++++++++ .../cudf/_lib/pylibcudf/strings/__init__.pxd | 0 .../cudf/_lib/pylibcudf/strings/__init__.py | 0 .../cudf/cudf/_lib/pylibcudf/strings/case.pxd | 13 +++++++++++ .../cudf/cudf/_lib/pylibcudf/strings/case.pyx | 17 ++++++++++++++ 8 files changed, 59 insertions(+) create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/__init__.py create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/case.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/case.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 81d15cf95b4..e97a9020bb4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -44,3 +44,5 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) link_to_pyarrow_headers(pylibcudf_interop) + +add_subdirectory(strings) \ No newline at end of file diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 48c23a9dd4c..5adefa5fd93 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -17,6 +17,7 @@ from . cimport ( search, sorting, stream_compaction, + strings, types, unary, ) @@ -48,6 +49,7 @@ __all__ = [ "rolling", "search", "stream_compaction", + "strings", "sorting", "types", "unary", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 8ccb0ecc341..a623807ec07 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -16,6 +16,7 @@ rolling, search, sorting, + strings, stream_compaction, types, unary, @@ -48,6 +49,7 @@ "rolling", "search", "stream_compaction", + "strings", "sorting", "types", "unary", diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt new file mode 100644 index 00000000000..a35b766d39e --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources + case.pyx +) +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd new file mode 100644 index 00000000000..135e795eb95 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +#from .column cimport Column +#from .scalar cimport Scalar +#from .types cimport DataType + +#cpdef Column capitalize(Column input) +# TODO: title +#cpdef Column is_title(Column input) +#cpdef Column to_lower(Column input) +#cpdef Column to_upper(Column input) +#cpdef Column swapcase(Column input) + diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx new file mode 100644 index 00000000000..4fdf9b58dd6 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +#from cudf._lib.cpp.strings cimport case as cpp_case +#from cudf._lib.cpp.column.column cimport column + +#from .column cimport Column + +#cpdef Column capitalize(Column input): +# cdef unique_ptr[column] c_result +# with nogil: +# c_result = cpp_case.capitalize(input.view()) +# +# return Column.from_libcudf(move(c_result)) + From 1f88eb32a4a111e041c22fcc5cc81611a88969d4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 8 Apr 2024 16:45:17 -0500 Subject: [PATCH 02/18] updates, tests --- python/cudf/cudf/_lib/cpp/strings/case.pxd | 8 +++- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 +- .../_lib/pylibcudf/strings/CMakeLists.txt | 4 +- .../cudf/cudf/_lib/pylibcudf/strings/case.pxd | 13 ++---- .../cudf/cudf/_lib/pylibcudf/strings/case.pyx | 31 +++++++++----- .../cudf/pylibcudf_tests/test_string_case.py | 40 +++++++++++++++++++ 7 files changed, 76 insertions(+), 24 deletions(-) create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_case.py diff --git a/python/cudf/cudf/_lib/cpp/strings/case.pxd b/python/cudf/cudf/_lib/cpp/strings/case.pxd index 01cd08c10ff..8c39069022e 100644 --- a/python/cudf/cudf/_lib/cpp/strings/case.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/case.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column @@ -6,6 +6,12 @@ from cudf._lib.cpp.column.column_view cimport column_view cdef extern from "cudf/strings/case.hpp" namespace "cudf::strings" nogil: + cdef unique_ptr[column] capitalize( + const column_view & input) except + + + cdef unique_ptr[column] is_title( + const column_view & input) except + + cdef unique_ptr[column] to_lower( const column_view & strings) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index e97a9020bb4..c2b7cb7ca3d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -45,4 +45,4 @@ rapids_cython_create_modules( ) link_to_pyarrow_headers(pylibcudf_interop) -add_subdirectory(strings) \ No newline at end of file +add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index a623807ec07..89f874f5fa5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -16,8 +16,8 @@ rolling, search, sorting, - strings, stream_compaction, + strings, types, unary, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index a35b766d39e..ebe32b675a0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,9 +12,7 @@ # the License. # ============================================================================= -set(cython_sources - case.pyx -) +set(cython_sources case.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd index 135e795eb95..225d566fe06 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd @@ -1,13 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -#from .column cimport Column -#from .scalar cimport Scalar -#from .types cimport DataType +from cudf._lib.pylibcudf.column cimport Column -#cpdef Column capitalize(Column input) -# TODO: title -#cpdef Column is_title(Column input) -#cpdef Column to_lower(Column input) -#cpdef Column to_upper(Column input) -#cpdef Column swapcase(Column input) +cpdef Column to_lower(Column input) +cpdef Column to_upper(Column input) +cpdef Column swapcase(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx index 4fdf9b58dd6..69910fd8c50 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx @@ -3,15 +3,28 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -#from cudf._lib.cpp.strings cimport case as cpp_case -#from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.strings cimport case as cpp_case +from cudf._lib.pylibcudf.column cimport Column -#from .column cimport Column -#cpdef Column capitalize(Column input): -# cdef unique_ptr[column] c_result -# with nogil: -# c_result = cpp_case.capitalize(input.view()) -# -# return Column.from_libcudf(move(c_result)) +cpdef Column to_lower(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.to_lower(input.view()) + return Column.from_libcudf(move(c_result)) + +cpdef Column to_upper(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.to_upper(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column swapcase(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.swapcase(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/cudf/cudf/pylibcudf_tests/test_string_case.py new file mode 100644 index 00000000000..71aeb6fe7d0 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_case.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.strings import case + + +@pytest.fixture(scope="module") +def string_col(): + return pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]) + + +def test_to_upper(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = case.to_upper(plc_col) + expected = pa.Array.from_pandas( + string_col.to_pandas().apply(lambda x: x.upper()) + ) + assert_column_eq(got, expected) + + +def test_to_lower(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = case.to_lower(plc_col) + expected = pa.Array.from_pandas( + string_col.to_pandas().apply(lambda x: x.lower()) + ) + assert_column_eq(got, expected) + + +def test_swapcase(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = case.swapcase(plc_col) + expected = pa.Array.from_pandas( + string_col.to_pandas().apply(lambda x: x.swapcase()) + ) + assert_column_eq(got, expected) From d343a62660a8eb7e4515733aece32593704cd4d6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 9 Apr 2024 07:42:50 -0500 Subject: [PATCH 03/18] fix up tests --- python/cudf/cudf/_lib/strings/case.pyx | 50 +++++++------------ .../cudf/pylibcudf_tests/test_string_case.py | 40 +++++++++++---- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx index 09af1178946..38f242a67d6 100644 --- a/python/cudf/cudf/_lib/strings/case.pyx +++ b/python/cudf/cudf/_lib/strings/case.pyx @@ -1,48 +1,34 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.strings.case cimport ( - swapcase as cpp_swapcase, - to_lower as cpp_to_lower, - to_upper as cpp_to_upper, -) + +from cudf._lib.pylibcudf.strings import case @acquire_spill_lock() def to_upper(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_to_upper(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.to_upper( + source_strings.to_pylibcudf(mode='read') + ) + ) @acquire_spill_lock() def to_lower(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_to_lower(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.to_lower( + source_strings.to_pylibcudf(mode='read') + ) + ) @acquire_spill_lock() def swapcase(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_swapcase(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.swapcase( + source_strings.to_pylibcudf(mode='read') + ) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/cudf/cudf/pylibcudf_tests/test_string_case.py index 71aeb6fe7d0..478bfc0b9e4 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_case.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_case.py @@ -10,31 +10,51 @@ @pytest.fixture(scope="module") def string_col(): - return pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]) + return pa.array( + ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] + ) + + +def wrap_nulls(func): + def wrapper(x): + if x is None: + return None + return func(x) + + return wrapper def test_to_upper(string_col): plc_col = plc.interop.from_arrow(string_col) got = case.to_upper(plc_col) - expected = pa.Array.from_pandas( - string_col.to_pandas().apply(lambda x: x.upper()) - ) + + @wrap_nulls + def to_upper(x): + return x.upper() + + expected = pa.Array.from_pandas(string_col.to_pandas().apply(to_upper)) assert_column_eq(got, expected) def test_to_lower(string_col): plc_col = plc.interop.from_arrow(string_col) got = case.to_lower(plc_col) - expected = pa.Array.from_pandas( - string_col.to_pandas().apply(lambda x: x.lower()) - ) + + @wrap_nulls + def to_lower(x): + return x.lower() + + expected = pa.Array.from_pandas(string_col.to_pandas().apply(to_lower)) assert_column_eq(got, expected) def test_swapcase(string_col): plc_col = plc.interop.from_arrow(string_col) got = case.swapcase(plc_col) - expected = pa.Array.from_pandas( - string_col.to_pandas().apply(lambda x: x.swapcase()) - ) + + @wrap_nulls + def swapcase(x): + return x.swapcase() + + expected = pa.Array.from_pandas(string_col.to_pandas().apply(swapcase)) assert_column_eq(got, expected) From 9f18dd09bbfb9e7ab0b2d8b652975871815cb3ff Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 10 Apr 2024 06:59:20 -0500 Subject: [PATCH 04/18] fix imports --- python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd | 3 +++ python/cudf/cudf/_lib/pylibcudf/strings/__init__.py | 3 +++ python/cudf/cudf/pylibcudf_tests/test_string_case.py | 7 +++---- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index e69de29bb2d..ff87549b5b5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import case diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index e69de29bb2d..ff87549b5b5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import case diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/cudf/cudf/pylibcudf_tests/test_string_case.py index 478bfc0b9e4..88a178cfe0c 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_case.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_case.py @@ -5,7 +5,6 @@ from utils import assert_column_eq import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.strings import case @pytest.fixture(scope="module") @@ -26,7 +25,7 @@ def wrapper(x): def test_to_upper(string_col): plc_col = plc.interop.from_arrow(string_col) - got = case.to_upper(plc_col) + got = plc.strings.case.to_upper(plc_col) @wrap_nulls def to_upper(x): @@ -38,7 +37,7 @@ def to_upper(x): def test_to_lower(string_col): plc_col = plc.interop.from_arrow(string_col) - got = case.to_lower(plc_col) + got = plc.strings.case.to_lower(plc_col) @wrap_nulls def to_lower(x): @@ -50,7 +49,7 @@ def to_lower(x): def test_swapcase(string_col): plc_col = plc.interop.from_arrow(string_col) - got = case.swapcase(plc_col) + got = plc.strings.case.swapcase(plc_col) @wrap_nulls def swapcase(x): From f0ad27d44d2470cca3b2d40a2731b0f040fe789b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 10 Apr 2024 07:05:21 -0500 Subject: [PATCH 05/18] address reviews --- .../cudf/pylibcudf_tests/test_string_case.py | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/cudf/cudf/pylibcudf_tests/test_string_case.py index 88a178cfe0c..8ae74636b98 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_case.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_case.py @@ -26,34 +26,19 @@ def wrapper(x): def test_to_upper(string_col): plc_col = plc.interop.from_arrow(string_col) got = plc.strings.case.to_upper(plc_col) - - @wrap_nulls - def to_upper(x): - return x.upper() - - expected = pa.Array.from_pandas(string_col.to_pandas().apply(to_upper)) + expected = pa.compute.utf8_upper(string_col) assert_column_eq(got, expected) def test_to_lower(string_col): plc_col = plc.interop.from_arrow(string_col) got = plc.strings.case.to_lower(plc_col) - - @wrap_nulls - def to_lower(x): - return x.lower() - - expected = pa.Array.from_pandas(string_col.to_pandas().apply(to_lower)) + expected = pa.compute.utf8_lower(string_col) assert_column_eq(got, expected) def test_swapcase(string_col): plc_col = plc.interop.from_arrow(string_col) got = plc.strings.case.swapcase(plc_col) - - @wrap_nulls - def swapcase(x): - return x.swapcase() - - expected = pa.Array.from_pandas(string_col.to_pandas().apply(swapcase)) + expected = pa.compute.utf8_swapcase(string_col) assert_column_eq(got, expected) From 724c9fb098ab4932d9e3635d62d9a970427e7b62 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 10 Apr 2024 09:33:01 -0500 Subject: [PATCH 06/18] initial - need to add a scalar factory --- .../cudf/cudf/_lib/cpp/strings/capitalize.pxd | 5 ++- .../_lib/pylibcudf/strings/CMakeLists.txt | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.py | 2 +- .../_lib/pylibcudf/strings/capitalize.pxd | 9 ++++++ .../_lib/pylibcudf/strings/capitalize.pyx | 32 +++++++++++++++++++ python/cudf/cudf/_lib/strings/capitalize.pyx | 14 ++++---- 7 files changed, 54 insertions(+), 12 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx diff --git a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd b/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd index d193a8265b1..430c1b412cf 100644 --- a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd @@ -2,12 +2,15 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.column.column_view cimport column_view cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] capitalize( - const column_view & strings) except + + const column_view & strings, + const scalar & delimiters + ) except + cdef unique_ptr[column] title( const column_view & strings) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index ebe32b675a0..79190c91779 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources case.pyx) +set(cython_sources case.pyx capitalize.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index ff87549b5b5..35a4665a479 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import case +from . import case, capitalize diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index ff87549b5b5..35a4665a479 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import case +from . import case, capitalize diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd new file mode 100644 index 00000000000..9acf189fc23 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.scalar cimport Scalar + + +cpdef Column capitalize(Column input, Scalar delimiters=*) +cpdef Column title(Column input) +cpdef Column is_title(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx new file mode 100644 index 00000000000..afe218f96dc --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.strings cimport capitalize as cpp_capitalize +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.scalar cimport Scalar +from cython.operator import dereference + + +cpdef Column capitalize(Column input, Scalar delimiters=Scala): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_capitalize.capitalize(input.view(), dereference(delimiters.c_obj)) + + return Column.from_libcudf(move(c_result)) + +cpdef Column title(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_capitalize.title(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_title(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_capitalize.is_title(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx index f6a80ac8fbe..cdf3ad7b9ea 100644 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/strings/capitalize.pyx @@ -14,17 +14,15 @@ from cudf._lib.cpp.strings.capitalize cimport ( title as cpp_title, ) +import cudf._lib.pylibcudf as plc @acquire_spill_lock() def capitalize(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_capitalize(source_view)) - - return Column.from_unique_ptr(move(c_result)) - + return Column.from_pylibcudf( + plc.strings.capitalize( + source_strings.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def title(Column source_strings): From b47f4ee466aecc120dbb15d6e299dea499f75379 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 10 Apr 2024 10:29:31 -0500 Subject: [PATCH 07/18] add the scalar factory? --- python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd | 9 +++++++++ python/cudf/cudf/_lib/cpp/strings/capitalize.pxd | 4 ++-- python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx | 9 ++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd new file mode 100644 index 00000000000..b8bdf3a544f --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + + +from cudf._lib.cpp.scalar.scalar cimport scalar +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: + cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + \ No newline at end of file diff --git a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd b/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd index 430c1b412cf..919d63b36e6 100644 --- a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd @@ -2,14 +2,14 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.scalar.scalar cimport scalar +from cudf._lib.cpp.scalar.scalar cimport string_scalar from cudf._lib.cpp.column.column_view cimport column_view cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] capitalize( const column_view & strings, - const scalar & delimiters + const string_scalar & delimiters ) except + cdef unique_ptr[column] title( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx index afe218f96dc..7197f5bfad3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -5,12 +5,19 @@ from libcpp.utility cimport move from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.strings cimport capitalize as cpp_capitalize +from cudf._lib.cpp.scalar.scalar_factories cimport make_string_scalar as cpp_make_string_scalar +from cudf._lib.cpp.scalar.scalar cimport string_scalar from cudf._lib.pylibcudf.column cimport Column from cudf._lib.pylibcudf.scalar cimport Scalar from cython.operator import dereference -cpdef Column capitalize(Column input, Scalar delimiters=Scala): +cpdef Column capitalize( + Column input, + Scalar delimiters=Scalar.from_libcudf( + cpp_make_string_scalar("".encode('utf-8')) + ) +): cdef unique_ptr[column] c_result with nogil: c_result = cpp_capitalize.capitalize(input.view(), dereference(delimiters.c_obj)) From 080e552746af31905b6308b5e268a541598acf32 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 10 Apr 2024 11:16:31 -0500 Subject: [PATCH 08/18] style fixes and more plumbing --- .../cudf/_lib/cpp/scalar/scalar_factories.pxd | 6 ++- .../cudf/cudf/_lib/cpp/strings/capitalize.pxd | 4 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.py | 2 +- .../_lib/pylibcudf/strings/capitalize.pyx | 14 +++++-- python/cudf/cudf/_lib/strings/capitalize.pyx | 40 ++++++------------- 6 files changed, 31 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd index b8bdf3a544f..7eba3f9e41b 100644 --- a/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd +++ b/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd @@ -1,9 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib.cpp.scalar.scalar cimport scalar from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from cudf._lib.cpp.scalar.scalar cimport scalar + + cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: - cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + \ No newline at end of file + cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd b/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd index 919d63b36e6..24818aef44c 100644 --- a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd @@ -1,9 +1,9 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.scalar.scalar cimport string_scalar from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 35a4665a479..eb6fd128bc9 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import case, capitalize +from . import capitalize, case diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index 35a4665a479..eb6fd128bc9 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import case, capitalize +from . import capitalize, case diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx index 7197f5bfad3..9256f90f519 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -4,26 +4,31 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) from cudf._lib.cpp.strings cimport capitalize as cpp_capitalize -from cudf._lib.cpp.scalar.scalar_factories cimport make_string_scalar as cpp_make_string_scalar -from cudf._lib.cpp.scalar.scalar cimport string_scalar from cudf._lib.pylibcudf.column cimport Column from cudf._lib.pylibcudf.scalar cimport Scalar + from cython.operator import dereference cpdef Column capitalize( - Column input, + Column input, Scalar delimiters=Scalar.from_libcudf( cpp_make_string_scalar("".encode('utf-8')) ) ): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_capitalize.capitalize(input.view(), dereference(delimiters.c_obj)) + c_result = cpp_capitalize.capitalize( + input.view(), dereference(delimiters.c_obj) + ) return Column.from_libcudf(move(c_result)) + cpdef Column title(Column input): cdef unique_ptr[column] c_result with nogil: @@ -31,6 +36,7 @@ cpdef Column title(Column input): return Column.from_libcudf(move(c_result)) + cpdef Column is_title(Column input): cdef unique_ptr[column] c_result with nogil: diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx index cdf3ad7b9ea..d3e69e38803 100644 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/strings/capitalize.pyx @@ -1,21 +1,10 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.strings.capitalize cimport ( - capitalize as cpp_capitalize, - is_title as cpp_is_title, - title as cpp_title, -) - import cudf._lib.pylibcudf as plc + @acquire_spill_lock() def capitalize(Column source_strings): return Column.from_pylibcudf( @@ -24,23 +13,20 @@ def capitalize(Column source_strings): ) ) + @acquire_spill_lock() def title(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_title(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.title( + source_strings.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() def is_title(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_title(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.is_title( + source_strings.to_pylibcudf(mode="read") + ) + ) From c890a6cb78dc0db1b5aab5170299a6a40c7e4a61 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 16 Apr 2024 11:01:31 -0500 Subject: [PATCH 09/18] updates: tests, more issues with scalars --- .../cudf/_lib/cpp/scalar/scalar_factories.pxd | 11 ---- .../_lib/pylibcudf/strings/capitalize.pxd | 2 +- .../_lib/pylibcudf/strings/capitalize.pyx | 12 ++-- .../pylibcudf_tests/test_string_capitalize.py | 57 +++++++++++++++++++ 4 files changed, 63 insertions(+), 19 deletions(-) delete mode 100644 python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd deleted file mode 100644 index 7eba3f9e41b..00000000000 --- a/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string - -from cudf._lib.cpp.scalar.scalar cimport scalar - - -cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: - cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd index 9acf189fc23..2c96a5f305c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd @@ -4,6 +4,6 @@ from cudf._lib.pylibcudf.column cimport Column from cudf._lib.pylibcudf.scalar cimport Scalar -cpdef Column capitalize(Column input, Scalar delimiters=*) +cpdef Column capitalize(Column input, Scalar delimiters) cpdef Column title(Column input) cpdef Column is_title(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx index 9256f90f519..9adc9b43b78 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -4,9 +4,7 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.scalar.scalar_factories cimport ( - make_string_scalar as cpp_make_string_scalar, -) +from cudf._lib.cpp.scalar.scalar cimport string_scalar from cudf._lib.cpp.strings cimport capitalize as cpp_capitalize from cudf._lib.pylibcudf.column cimport Column from cudf._lib.pylibcudf.scalar cimport Scalar @@ -16,14 +14,14 @@ from cython.operator import dereference cpdef Column capitalize( Column input, - Scalar delimiters=Scalar.from_libcudf( - cpp_make_string_scalar("".encode('utf-8')) - ) + Scalar delimiters + # TODO: default scalar values + # https://github.com/rapidsai/cudf/issues/15505 ): cdef unique_ptr[column] c_result with nogil: c_result = cpp_capitalize.capitalize( - input.view(), dereference(delimiters.c_obj) + input.view(), (dereference(delimiters.c_obj)) ) return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py new file mode 100644 index 00000000000..052cce0d02b --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def is_title_data(): + data = [ + "leopard", + "Golden Eagle", + "SNAKE", + "", + "!A", + "hello World", + "A B C", + "#", + "AƻB", + "Ⓑⓖ", + "Art of War", + ] + return pa.array(data) + + +@pytest.fixture(scope="module") +def title_data(): + data = [ + None, + "The quick bRoWn fox juMps over the laze DOG", + '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', + "accénted", + ] + return pa.array(data) + + +def test_capitalize(title_data): + plc_col = plc.interop.from_arrow(title_data) + got = plc.strings.capitalize(plc_col) + expected = pa.compute.utf8_capitalize(title_data) + assert_column_eq(got, expected) + + +def test_title(title_data): + plc_col = plc.interop.from_arrow(title_data) + got = plc.strings.title(plc_col) + expected = pa.compute.utf8_title(title_data) + assert_column_eq(got, expected) + + +def test_is_title(is_title_data): + plc_col = plc.interop.from_arrow(is_title_data) + got = plc.strings.is_title(plc_col) + expected = pa.compute.utf8_is_title(is_title_data) + assert_column_eq(got, expected) From a15dde32b4786b637cb35c964f0846aa31beb88d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 16 Apr 2024 13:31:53 -0500 Subject: [PATCH 10/18] compiles --- python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx index 9adc9b43b78..9d00abd274a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -19,9 +19,14 @@ cpdef Column capitalize( # https://github.com/rapidsai/cudf/issues/15505 ): cdef unique_ptr[column] c_result + cdef const string_scalar* cpp_delimiters = ( + delimiters.c_obj.get() + ) + with nogil: c_result = cpp_capitalize.capitalize( - input.view(), (dereference(delimiters.c_obj)) + input.view(), + dereference(cpp_delimiters) ) return Column.from_libcudf(move(c_result)) From 16b53f8221df4245ac02a5f840b134f3d829386c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 10 May 2024 07:33:43 -0700 Subject: [PATCH 11/18] fixes and updates --- .../cudf/_lib/cpp/scalar/scalar_factories.pxd | 10 ++++++++++ .../cudf/_lib/pylibcudf/strings/CMakeLists.txt | 2 ++ .../cudf/cudf/_lib/pylibcudf/strings/__init__.pxd | 2 +- .../cudf/_lib/pylibcudf/strings/capitalize.pxd | 2 +- .../cudf/_lib/pylibcudf/strings/capitalize.pyx | 15 +++++++++++++-- python/cudf/cudf/_lib/strings/capitalize.pyx | 6 +++--- 6 files changed, 30 insertions(+), 7 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd new file mode 100644 index 00000000000..cca3fb205e8 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/scalar/scalar_factories.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.cpp.scalar.scalar cimport scalar + + +cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: + cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index ec8a25363cf..71f2c98bae6 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -20,3 +20,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) + +link_to_pyarrow_headers(pylibcudf_capitalize) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 4bc93fd20e0..09dfb81a07d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import capitalize, case, find +from . cimport capitalize, case, find diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd index 2c96a5f305c..9acf189fc23 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd @@ -4,6 +4,6 @@ from cudf._lib.pylibcudf.column cimport Column from cudf._lib.pylibcudf.scalar cimport Scalar -cpdef Column capitalize(Column input, Scalar delimiters) +cpdef Column capitalize(Column input, Scalar delimiters=*) cpdef Column title(Column input) cpdef Column is_title(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx index 9d00abd274a..94336d31bbe 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -5,6 +5,9 @@ from libcpp.utility cimport move from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.cpp.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) from cudf._lib.cpp.strings cimport capitalize as cpp_capitalize from cudf._lib.pylibcudf.column cimport Column from cudf._lib.pylibcudf.scalar cimport Scalar @@ -14,12 +17,20 @@ from cython.operator import dereference cpdef Column capitalize( Column input, - Scalar delimiters + Scalar delimiters=None # TODO: default scalar values # https://github.com/rapidsai/cudf/issues/15505 ): + cdef unique_ptr[column] c_result - cdef const string_scalar* cpp_delimiters = ( + cdef const string_scalar* cpp_delimiters + + if delimiters is None: + delimiters = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cpp_delimiters = ( delimiters.c_obj.get() ) diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx index d3e69e38803..5ebcba7cdd8 100644 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/strings/capitalize.pyx @@ -8,7 +8,7 @@ import cudf._lib.pylibcudf as plc @acquire_spill_lock() def capitalize(Column source_strings): return Column.from_pylibcudf( - plc.strings.capitalize( + plc.strings.capitalize.capitalize( source_strings.to_pylibcudf(mode="read") ) ) @@ -17,7 +17,7 @@ def capitalize(Column source_strings): @acquire_spill_lock() def title(Column source_strings): return Column.from_pylibcudf( - plc.strings.title( + plc.strings.capitalize.title( source_strings.to_pylibcudf(mode="read") ) ) @@ -26,7 +26,7 @@ def title(Column source_strings): @acquire_spill_lock() def is_title(Column source_strings): return Column.from_pylibcudf( - plc.strings.is_title( + plc.strings.capitalize.is_title( source_strings.to_pylibcudf(mode="read") ) ) From 53c5cac510600ab9e8456224e4c93d33386fef38 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 10 May 2024 07:51:53 -0700 Subject: [PATCH 12/18] fix tests --- python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py index 052cce0d02b..38d712e8aee 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -38,20 +38,20 @@ def title_data(): def test_capitalize(title_data): plc_col = plc.interop.from_arrow(title_data) - got = plc.strings.capitalize(plc_col) + got = plc.strings.capitalize.capitalize(plc_col) expected = pa.compute.utf8_capitalize(title_data) assert_column_eq(got, expected) def test_title(title_data): plc_col = plc.interop.from_arrow(title_data) - got = plc.strings.title(plc_col) + got = plc.strings.capitalize.title(plc_col) expected = pa.compute.utf8_title(title_data) assert_column_eq(got, expected) def test_is_title(is_title_data): plc_col = plc.interop.from_arrow(is_title_data) - got = plc.strings.is_title(plc_col) + got = plc.strings.capitalize.is_title(plc_col) expected = pa.compute.utf8_is_title(is_title_data) assert_column_eq(got, expected) From 8cbf5c1cabaccb78015335845e78dec907a1ca1b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 28 May 2024 07:50:56 -0700 Subject: [PATCH 13/18] address reviews --- .../_lib/pylibcudf/strings/CMakeLists.txt | 2 - .../_lib/pylibcudf/strings/capitalize.pyx | 3 +- .../cudf/cudf/pylibcudf_tests/common/utils.py | 1 - .../pylibcudf_tests/test_string_capitalize.py | 37 ++++++++----------- 4 files changed, 17 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index 71f2c98bae6..ec8a25363cf 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -20,5 +20,3 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) - -link_to_pyarrow_headers(pylibcudf_capitalize) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx index 3f6caf398cb..9162d1ee220 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -23,14 +23,13 @@ cpdef Column capitalize( ): cdef unique_ptr[column] c_result - cdef const string_scalar* cpp_delimiters if delimiters is None: delimiters = Scalar.from_libcudf( cpp_make_string_scalar("".encode()) ) - cpp_delimiters = ( + cdef const string_scalar* cpp_delimiters = ( delimiters.c_obj.get() ) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 6636ab9e5f8..596cd2c92ae 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -35,7 +35,6 @@ def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: plc_pa = plc_pa.combine_chunks() if isinstance(pa_array, pa.ChunkedArray): pa_array = pa_array.combine_chunks() - assert plc_pa.equals(pa_array) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py index 38d712e8aee..9e3386be9ac 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -8,7 +8,7 @@ @pytest.fixture(scope="module") -def is_title_data(): +def pa_data(): data = [ "leopard", "Golden Eagle", @@ -21,37 +21,32 @@ def is_title_data(): "AƻB", "Ⓑⓖ", "Art of War", - ] - return pa.array(data) - - -@pytest.fixture(scope="module") -def title_data(): - data = [ - None, "The quick bRoWn fox juMps over the laze DOG", '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', "accénted", + None, ] return pa.array(data) -def test_capitalize(title_data): - plc_col = plc.interop.from_arrow(title_data) - got = plc.strings.capitalize.capitalize(plc_col) - expected = pa.compute.utf8_capitalize(title_data) +@pytest.fixture(scope="module") +def plc_data(pa_data): + return plc.interop.from_arrow(pa_data) + + +def test_capitalize(plc_data, pa_data): + got = plc.strings.capitalize.capitalize(plc_data) + expected = pa.compute.utf8_capitalize(pa_data) assert_column_eq(got, expected) -def test_title(title_data): - plc_col = plc.interop.from_arrow(title_data) - got = plc.strings.capitalize.title(plc_col) - expected = pa.compute.utf8_title(title_data) +def test_title(plc_data, pa_data): + got = plc.strings.capitalize.title(plc_data) + expected = pa.compute.utf8_title(pa_data) assert_column_eq(got, expected) -def test_is_title(is_title_data): - plc_col = plc.interop.from_arrow(is_title_data) - got = plc.strings.capitalize.is_title(plc_col) - expected = pa.compute.utf8_is_title(is_title_data) +def test_is_title(plc_data, pa_data): + got = plc.strings.capitalize.is_title(plc_data) + expected = pa.compute.utf8_is_title(pa_data) assert_column_eq(got, expected) From 1081d6619825021a2b49b59fcb0ebc76764a370e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 28 May 2024 11:14:08 -0700 Subject: [PATCH 14/18] plumb StringCharacterTypes --- .../_lib/pylibcudf/libcudf/CMakeLists.txt | 1 + .../pylibcudf/libcudf/strings/CMakeLists.txt | 23 +++++++++++++++++++ .../pylibcudf/libcudf/strings/char_types.pxd | 23 ++++++++++--------- .../pylibcudf/libcudf/strings/char_types.pyx | 0 .../_lib/pylibcudf/strings/CMakeLists.txt | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.py | 2 +- .../_lib/pylibcudf/strings/capitalize.pyx | 6 ++++- .../_lib/pylibcudf/strings/char_types.pxd | 5 ++++ .../_lib/pylibcudf/strings/char_types.pyx | 4 ++++ .../cudf/cudf/pylibcudf_tests/common/utils.py | 1 + .../pylibcudf_tests/test_string_capitalize.py | 8 ++++++- 12 files changed, 61 insertions(+), 16 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 89d3dc66f00..034e0235c07 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -15,6 +15,7 @@ set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd stream_compaction.pyx types.pyx unary.pyx ) +add_subdirectory(strings) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt new file mode 100644 index 00000000000..930c22781d0 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources char_types.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_strings +) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd index 408b3687c4a..f63e1a93f91 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from cudf._lib.pylibcudf.libcudf.column.column cimport column @@ -10,17 +11,17 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/char_types/char_types.hpp" \ namespace "cudf::strings" nogil: - ctypedef enum string_character_types: - DECIMAL 'cudf::strings::string_character_types::DECIMAL' - NUMERIC 'cudf::strings::string_character_types::NUMERIC' - DIGIT 'cudf::strings::string_character_types::DIGIT' - ALPHA 'cudf::strings::string_character_types::ALPHA' - SPACE 'cudf::strings::string_character_types::SPACE' - UPPER 'cudf::strings::string_character_types::UPPER' - LOWER 'cudf::strings::string_character_types::LOWER' - ALPHANUM 'cudf::strings::string_character_types::ALPHANUM' - CASE_TYPES 'cudf::strings::string_character_types::CASE_TYPES' - ALL_TYPES 'cudf::strings::string_character_types::ALL_TYPES' + cpdef enum class string_character_types(uint32_t): + DECIMAL + NUMERIC + DIGIT + ALPHA + SPACE + UPPER + LOWER + ALPHANUM + CASE_TYPES + ALL_TYPES cdef extern from "cudf/strings/char_types/char_types.hpp" \ namespace "cudf::strings" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index ec8a25363cf..0e9c1c916f0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx find.pyx) +set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 09dfb81a07d..ec3dbc150b5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport capitalize, case, find +from . cimport capitalize, case, char_types, find diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index 4bc93fd20e0..3793bda0aa4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import capitalize, case, find +from . import capitalize, case, char_types, find diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx index 9162d1ee220..a5927bdd79a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -11,6 +11,7 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from cudf._lib.pylibcudf.libcudf.strings cimport capitalize as cpp_capitalize from cudf._lib.pylibcudf.scalar cimport Scalar +from cudf._lib.pylibcudf.strings.char_types cimport string_character_types from cython.operator import dereference @@ -42,7 +43,10 @@ cpdef Column capitalize( return Column.from_libcudf(move(c_result)) -cpdef Column title(Column input): +cpdef Column title( + Column input, + string_character_types sequence_type=string_character_types.ALPHA +): cdef unique_ptr[column] c_result with nogil: c_result = cpp_capitalize.title(input.view()) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd new file mode 100644 index 00000000000..a80e02f520c --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.strings.char_types cimport ( + string_character_types, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx new file mode 100644 index 00000000000..d96161951c6 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.strings.char_types import \ + string_character_types as StringCharacterTypes # no-cython-lint diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 596cd2c92ae..848081cba4e 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -35,6 +35,7 @@ def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: plc_pa = plc_pa.combine_chunks() if isinstance(pa_array, pa.ChunkedArray): pa_array = pa_array.combine_chunks() + breakpoint() assert plc_pa.equals(pa_array) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py index 9e3386be9ac..4a6e3fb906e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -41,7 +41,13 @@ def test_capitalize(plc_data, pa_data): def test_title(plc_data, pa_data): - got = plc.strings.capitalize.title(plc_data) + # A sequence shall be all characters that are not a space + # matches arrow for now + str_char_type = ( + plc.strings.char_types.StringCharacterTypes.ALL_TYPES + & ~plc.strings.char_types.StringCharacterTypes.SPACE + ) + got = plc.strings.capitalize.title(plc_data, str_char_type) expected = pa.compute.utf8_title(pa_data) assert_column_eq(got, expected) From 4314467cc15d36220c75ef4b727e0ac9b2bb4aa1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 28 May 2024 12:29:10 -0700 Subject: [PATCH 15/18] minor cleanup --- python/cudf/cudf/pylibcudf_tests/common/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 848081cba4e..596cd2c92ae 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -35,7 +35,6 @@ def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: plc_pa = plc_pa.combine_chunks() if isinstance(pa_array, pa.ChunkedArray): pa_array = pa_array.combine_chunks() - breakpoint() assert plc_pa.equals(pa_array) From 6dcd38cb1d22401665db5f32d46dbd4ef6057e5f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 28 May 2024 12:34:24 -0700 Subject: [PATCH 16/18] plumb sequence_type --- .../cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd | 7 ++++++- python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd index e7994676760..b0771e16680 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd @@ -4,6 +4,9 @@ from libcpp.memory cimport unique_ptr from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar +from cudf._lib.pylibcudf.libcudf.strings.char_types cimport ( + string_character_types, +) cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil: @@ -13,7 +16,9 @@ cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil: ) except + cdef unique_ptr[column] title( - const column_view & strings) except + + const column_view & strings, + string_character_types sequence_type + ) except + cdef unique_ptr[column] is_title( const column_view & strings) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx index a5927bdd79a..d3f79088018 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -49,7 +49,7 @@ cpdef Column title( ): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_capitalize.title(input.view()) + c_result = cpp_capitalize.title(input.view(), sequence_type) return Column.from_libcudf(move(c_result)) From cc9ff42eabd5405392c768e3ea2192443022d7db Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 28 May 2024 12:44:00 -0700 Subject: [PATCH 17/18] arrow default matches CASE_TYPES --- .../cudf/cudf/pylibcudf_tests/test_string_capitalize.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py index 4a6e3fb906e..dd7e96e871b 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -41,13 +41,9 @@ def test_capitalize(plc_data, pa_data): def test_title(plc_data, pa_data): - # A sequence shall be all characters that are not a space - # matches arrow for now - str_char_type = ( - plc.strings.char_types.StringCharacterTypes.ALL_TYPES - & ~plc.strings.char_types.StringCharacterTypes.SPACE + got = plc.strings.capitalize.title( + plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES ) - got = plc.strings.capitalize.title(plc_data, str_char_type) expected = pa.compute.utf8_title(pa_data) assert_column_eq(got, expected) From e9e8b356510b1b6170ccda212008cd1e65c867bf Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 28 May 2024 18:26:10 -0700 Subject: [PATCH 18/18] minor fixes --- python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 034e0235c07..8a6ce6a5187 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -15,12 +15,11 @@ set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd stream_compaction.pyx types.pyx unary.pyx ) -add_subdirectory(strings) set(linked_libraries cudf::cudf) - rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) +add_subdirectory(strings)