Skip to content

Commit

Permalink
Convert cudf.Scalar usage to pylibcudf and pyarrow usage (#17686)
Browse files Browse the repository at this point in the history
Much of the `cudf.Scalar` usage exists only to eventually produce a device scalar object (`pylibcudf.Scalar`) that is passed to a pylibcudf routine. The same result can be achieved by converting the value to a `pyarrow.Scalar` and calling `pylibcudf.interop.from_arrow`. This way, much of the scalar conversion logic in `cudf.Scalar` is offloaded to `pyarrow.Scalar`, whose result can then be converted to a device scalar with the interop method.

This PR tackles only some straightforward cases of the above. Another PR will tackle the more involved cases.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: #17686
  • Loading branch information
mroeschke authored Jan 8, 2025
1 parent f1cb88d commit 2c385c4
Show file tree
Hide file tree
Showing 15 changed files with 167 additions and 139 deletions.
4 changes: 3 additions & 1 deletion python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
# Copyright (c) 2021-2025, NVIDIA CORPORATION.

"""Define common type operations."""

Expand All @@ -13,6 +13,7 @@
import cupy as cp
import numpy as np
import pandas as pd
import pyarrow as pa
from pandas.api import types as pd_types

import cudf
Expand Down Expand Up @@ -144,6 +145,7 @@ def is_scalar(val):
cudf.Scalar,
cudf._lib.scalar.DeviceScalar,
cudf.core.tools.datetimes.DateOffset,
pa.Scalar,
),
) or (
pd_types.is_scalar(val)
Expand Down
5 changes: 2 additions & 3 deletions python/cudf/cudf/core/byte_pair_encoding.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -53,7 +53,6 @@ def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series:
1 this is it
dtype: object
"""
sep = cudf.Scalar(separator, dtype="str")
return cudf.Series._from_column(
text._column.byte_pair_encoding(self.merge_pairs, sep)
text._column.byte_pair_encoding(self.merge_pairs, separator)
)
22 changes: 13 additions & 9 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,8 +253,12 @@ def find_and_replace(
def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self:
plc_column = plc.replace.clamp(
self.to_pylibcudf(mode="read"),
cudf.Scalar(lo, self.dtype).device_value.c_value,
cudf.Scalar(hi, self.dtype).device_value.c_value,
plc.interop.from_arrow(
pa.scalar(lo, type=cudf_dtype_to_pa_type(self.dtype))
),
plc.interop.from_arrow(
pa.scalar(hi, type=cudf_dtype_to_pa_type(self.dtype))
),
)
return type(self).from_pylibcudf(plc_column) # type: ignore[return-value]

Expand Down Expand Up @@ -1029,7 +1033,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
# https://github.com/rapidsai/cudf/issues/14515 by
# providing a mode in which cudf::contains does not mask
# the result.
result = result.fillna(cudf.Scalar(rhs.null_count > 0))
result = result.fillna(rhs.null_count > 0)
return result

def as_mask(self) -> Buffer:
Expand Down Expand Up @@ -1995,12 +1999,12 @@ def as_column(
column = Column.from_pylibcudf(
plc.filling.sequence(
len(arbitrary),
cudf.Scalar(
arbitrary.start, dtype=np.dtype(np.int64)
).device_value.c_value,
cudf.Scalar(
arbitrary.step, dtype=np.dtype(np.int64)
).device_value.c_value,
plc.interop.from_arrow(
pa.scalar(arbitrary.start, type=pa.int64())
),
plc.interop.from_arrow(
pa.scalar(arbitrary.step, type=pa.int64())
),
)
)
if cudf.get_option("default_integer_bitwidth") and dtype is None:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,8 @@ def is_year_end(self) -> ColumnBase:
day_of_year = self.day_of_year
leap_dates = self.is_leap_year

leap = day_of_year == cudf.Scalar(366)
non_leap = day_of_year == cudf.Scalar(365)
leap = day_of_year == 366
non_leap = day_of_year == 365
return leap.copy_if_else(non_leap, leap_dates).fillna(False)

@property
Expand Down
14 changes: 7 additions & 7 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
with acquire_spill_lock():
plc_column = plc.strings.convert.convert_lists.format_list_column(
lc.to_pylibcudf(mode="read"),
cudf.Scalar("None").device_value.c_value,
plc.interop.from_arrow(pa.scalar("None")),
separators.to_pylibcudf(mode="read"),
)
return type(self).from_pylibcudf(plc_column) # type: ignore[return-value]
Expand Down Expand Up @@ -391,20 +391,20 @@ def extract_element_column(self, index: ColumnBase) -> ColumnBase:
)

@acquire_spill_lock()
def contains_scalar(self, search_key: cudf.Scalar) -> ColumnBase:
def contains_scalar(self, search_key: pa.Scalar) -> ColumnBase:
return type(self).from_pylibcudf(
plc.lists.contains(
self.to_pylibcudf(mode="read"),
search_key.device_value.c_value,
plc.interop.from_arrow(search_key),
)
)

@acquire_spill_lock()
def index_of_scalar(self, search_key: cudf.Scalar) -> ColumnBase:
def index_of_scalar(self, search_key: pa.Scalar) -> ColumnBase:
return type(self).from_pylibcudf(
plc.lists.index_of(
self.to_pylibcudf(mode="read"),
search_key.device_value.c_value,
plc.interop.from_arrow(search_key),
plc.lists.DuplicateFindOption.FIND_FIRST,
)
)
Expand Down Expand Up @@ -569,7 +569,7 @@ def contains(self, search_key: ScalarLike) -> ParentType:
dtype: bool
"""
return self._return_or_inplace(
self._column.contains_scalar(cudf.Scalar(search_key))
self._column.contains_scalar(pa.scalar(search_key))
)

def index(self, search_key: ScalarLike | ColumnLike) -> ParentType:
Expand Down Expand Up @@ -618,7 +618,7 @@ def index(self, search_key: ScalarLike | ColumnLike) -> ParentType:
"""

if is_scalar(search_key):
result = self._column.index_of_scalar(cudf.Scalar(search_key))
result = self._column.index_of_scalar(pa.scalar(search_key))
else:
result = self._column.index_of_column(as_column(search_key))
return self._return_or_inplace(result)
Expand Down
11 changes: 4 additions & 7 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
# Copyright (c) 2018-2025, NVIDIA CORPORATION.

from __future__ import annotations

Expand All @@ -7,6 +7,7 @@

import numpy as np
import pandas as pd
import pyarrow as pa
from numba.np import numpy_support
from typing_extensions import Self

Expand Down Expand Up @@ -382,12 +383,8 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
elif self.dtype.kind == "b":
conv_func = functools.partial(
plc.strings.convert.convert_booleans.from_booleans,
true_string=cudf.Scalar(
"True", dtype="str"
).device_value.c_value,
false_string=cudf.Scalar(
"False", dtype="str"
).device_value.c_value,
true_string=plc.interop.from_arrow(pa.scalar("True")),
false_string=plc.interop.from_arrow(pa.scalar("False")),
)
elif self.dtype.kind in {"i", "u"}:
conv_func = plc.strings.convert.convert_integers.from_integers
Expand Down
Loading

0 comments on commit 2c385c4

Please sign in to comment.