Skip to content

Commit

Permalink
Add remaining string.char_types APIs to pylibcudf
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Sep 10, 2024
1 parent 6dd5689 commit fffc838
Show file tree
Hide file tree
Showing 5 changed files with 192 additions and 123 deletions.
178 changes: 58 additions & 120 deletions python/cudf/cudf/_lib/strings/char_types.pyx
Original file line number Diff line number Diff line change
@@ -1,50 +1,28 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.


from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.char_types cimport (
all_characters_of_type as cpp_all_characters_of_type,
filter_characters_of_type as cpp_filter_characters_of_type,
string_character_types,
)

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

import pylibcudf as plc


@acquire_spill_lock()
def filter_alphanum(Column source_strings, object py_repl, bool keep=True):
"""
Returns a Column of strings keeping only alphanumeric character types.
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_repl = <const string_scalar*>(
repl.get_raw_ptr()
plc_column = plc.strings.char_types.filter_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
string_character_types.ALL_TYPES if keep
else string_character_types.ALPHANUM,
py_repl.device_value.c_value,
string_character_types.ALPHANUM if keep
else string_character_types.ALL_TYPES
)

with nogil:
c_result = move(cpp_filter_characters_of_type(
source_view,
string_character_types.ALL_TYPES if keep
else string_character_types.ALPHANUM,
scalar_repl[0],
string_character_types.ALPHANUM if keep
else string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -54,17 +32,12 @@ def is_decimal(Column source_strings):
that contain only decimal characters -- those that can be used
to extract base10 numbers.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.DECIMAL,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
string_character_types.DECIMAL,
string_character_types.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -75,17 +48,12 @@ def is_alnum(Column source_strings):
Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal()
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.ALPHANUM,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
string_character_types.ALPHANUM,
string_character_types.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -94,17 +62,12 @@ def is_alpha(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only alphabetic characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.ALPHA,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
string_character_types.ALPHA,
string_character_types.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -113,17 +76,12 @@ def is_digit(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only decimal and digit characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.DIGIT,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
string_character_types.DIGIT,
string_character_types.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -133,17 +91,12 @@ def is_numeric(Column source_strings):
that contain only numeric characters. These include digit and
numeric characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.NUMERIC,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
string_character_types.NUMERIC,
string_character_types.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -152,17 +105,12 @@ def is_upper(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only upper-case characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.UPPER,
string_character_types.CASE_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
string_character_types.UPPER,
string_character_types.CASE_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -171,17 +119,12 @@ def is_lower(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only lower-case characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.LOWER,
string_character_types.CASE_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
string_character_types.LOWER,
string_character_types.CASE_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -190,14 +133,9 @@ def is_space(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contains all characters which are spaces only.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.SPACE,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
string_character_types.SPACE,
string_character_types.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)
3 changes: 0 additions & 3 deletions python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \
CASE_TYPES
ALL_TYPES

cdef extern from "cudf/strings/char_types/char_types.hpp" \
namespace "cudf::strings" nogil:

cdef unique_ptr[column] all_characters_of_type(
column_view source_strings,
string_character_types types,
Expand Down
16 changes: 16 additions & 0 deletions python/pylibcudf/pylibcudf/strings/char_types.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.strings.char_types cimport string_character_types
from pylibcudf.scalar cimport Scalar


# Declaration of the wrapper around libcudf's all_characters_of_type; the
# implementation lives in the matching char_types.pyx.
cpdef Column all_characters_of_type(
    Column source_strings,
    string_character_types types,
    string_character_types verify_types
)

# Declaration of the wrapper around libcudf's filter_characters_of_type; the
# implementation lives in the matching char_types.pyx.
cpdef Column filter_characters_of_type(
    Column source_strings,
    string_character_types types_to_remove,
    Scalar replacement,
    string_character_types types_to_keep
)
88 changes: 88 additions & 0 deletions python/pylibcudf/pylibcudf/strings/char_types.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,92 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings cimport char_types as cpp_char_types
from pylibcudf.scalar cimport Scalar

from cython.operator import dereference
from pylibcudf.libcudf.strings.char_types import \
string_character_types as StringCharacterTypes # no-cython-lint


cpdef Column all_characters_of_type(
    Column source_strings,
    string_character_types types,
    string_character_types verify_types
):
    """
    Identify strings in which all characters are of the specified types.

    Thin wrapper around libcudf's ``cudf::strings::all_characters_of_type``;
    see the libcudf documentation for the exact matching rules.

    Parameters
    ----------
    source_strings : Column
        Strings instance for this operation
    types : StringCharacterTypes
        The character types to check in each string
    verify_types : StringCharacterTypes
        Only verify against these character types.

    Returns
    -------
    Column
        New column of boolean results for each string
    """
    cdef unique_ptr[column] c_result

    # The libcudf call is made with the GIL released.
    with nogil:
        c_result = move(
            cpp_char_types.all_characters_of_type(
                source_strings.view(),
                types,
                verify_types,
            )
        )

    # Wrap the column produced by libcudf in a pylibcudf Column.
    return Column.from_libcudf(move(c_result))

cpdef Column filter_characters_of_type(
    Column source_strings,
    string_character_types types_to_remove,
    Scalar replacement,
    string_character_types types_to_keep
):
    """
    Filter specific character types from a column of strings.

    Thin wrapper around libcudf's ``cudf::strings::filter_characters_of_type``;
    see the libcudf documentation for how ``types_to_remove`` and
    ``types_to_keep`` interact.

    Parameters
    ----------
    source_strings : Column
        Strings instance for this operation
    types_to_remove : StringCharacterTypes
        The character types to remove from each string.
    replacement : Scalar
        The replacement character to use when removing characters
    types_to_keep : StringCharacterTypes
        Passing `ALL_TYPES` means all characters of `types_to_remove`
        will be filtered. This argument is required here (no default).

    Returns
    -------
    Column
        New column of strings with the selected characters filtered out
    """
    # NOTE(review): this is an unchecked pointer cast, so `replacement` is
    # assumed to wrap a libcudf string scalar -- confirm callers guarantee it.
    cdef const string_scalar* c_replacement = <const string_scalar*>(
        replacement.c_obj.get()
    )
    cdef unique_ptr[column] c_result

    # The libcudf call is made with the GIL released.
    with nogil:
        c_result = move(
            cpp_char_types.filter_characters_of_type(
                source_strings.view(),
                types_to_remove,
                dereference(c_replacement),
                types_to_keep,
            )
        )

    # Wrap the column produced by libcudf in a pylibcudf Column.
    return Column.from_libcudf(move(c_result))
Loading

0 comments on commit fffc838

Please sign in to comment.