Convert cudf.Scalar usage to pylibcudf and pyarrow usage (#17686)

A lot of `cudf.Scalar` usage exists only to eventually obtain a device scalar object (`pylibcudf.Scalar`) to pass to a pylibcudf routine. The same conversion can be achieved by first converting to a `pyarrow.Scalar` and then calling `pylibcudf.interop.from_arrow`. This way we offload much of the scalar-conversion logic in `cudf.Scalar` to `pyarrow.Scalar`, which can then be converted to a device scalar using the interop method. This PR tackles only some straightforward cases of the above; another PR will tackle the more involved cases.

Authors:
- Matthew Roeschke (https://github.com/mroeschke)

Approvers:
- Matthew Murray (https://github.com/Matt711)

URL: #17686
rapidsai · Jan 8, 2025 · 2c385c4 · 2c385c4
1 parent f1cb88d
commit 2c385c4
Show file tree

Hide file tree

Showing 15 changed files with 167 additions and 139 deletions.
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 
 """Define common type operations."""
 
@@ -13,6 +13,7 @@
 import cupy as cp
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from pandas.api import types as pd_types
 
 import cudf
@@ -144,6 +145,7 @@ def is_scalar(val):
             cudf.Scalar,
             cudf._lib.scalar.DeviceScalar,
             cudf.core.tools.datetimes.DateOffset,
+            pa.Scalar,
         ),
     ) or (
         pd_types.is_scalar(val)

diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -53,7 +53,6 @@ def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series:
         1             this is it
         dtype: object
         """
-        sep = cudf.Scalar(separator, dtype="str")
         return cudf.Series._from_column(
-            text._column.byte_pair_encoding(self.merge_pairs, sep)
+            text._column.byte_pair_encoding(self.merge_pairs, separator)
         )
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -253,8 +253,12 @@ def find_and_replace(
     def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self:
         plc_column = plc.replace.clamp(
             self.to_pylibcudf(mode="read"),
-            cudf.Scalar(lo, self.dtype).device_value.c_value,
-            cudf.Scalar(hi, self.dtype).device_value.c_value,
+            plc.interop.from_arrow(
+                pa.scalar(lo, type=cudf_dtype_to_pa_type(self.dtype))
+            ),
+            plc.interop.from_arrow(
+                pa.scalar(hi, type=cudf_dtype_to_pa_type(self.dtype))
+            ),
         )
         return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
 
@@ -1029,7 +1033,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
             # https://github.com/rapidsai/cudf/issues/14515 by
             # providing a mode in which cudf::contains does not mask
             # the result.
-            result = result.fillna(cudf.Scalar(rhs.null_count > 0))
+            result = result.fillna(rhs.null_count > 0)
         return result
 
     def as_mask(self) -> Buffer:
@@ -1995,12 +1999,12 @@ def as_column(
             column = Column.from_pylibcudf(
                 plc.filling.sequence(
                     len(arbitrary),
-                    cudf.Scalar(
-                        arbitrary.start, dtype=np.dtype(np.int64)
-                    ).device_value.c_value,
-                    cudf.Scalar(
-                        arbitrary.step, dtype=np.dtype(np.int64)
-                    ).device_value.c_value,
+                    plc.interop.from_arrow(
+                        pa.scalar(arbitrary.start, type=pa.int64())
+                    ),
+                    plc.interop.from_arrow(
+                        pa.scalar(arbitrary.step, type=pa.int64())
+                    ),
                 )
             )
         if cudf.get_option("default_integer_bitwidth") and dtype is None:

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -353,8 +353,8 @@ def is_year_end(self) -> ColumnBase:
         day_of_year = self.day_of_year
         leap_dates = self.is_leap_year
 
-        leap = day_of_year == cudf.Scalar(366)
-        non_leap = day_of_year == cudf.Scalar(365)
+        leap = day_of_year == 366
+        non_leap = day_of_year == 365
         return leap.copy_if_else(non_leap, leap_dates).fillna(False)
 
     @property

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
@@ -285,7 +285,7 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
         with acquire_spill_lock():
             plc_column = plc.strings.convert.convert_lists.format_list_column(
                 lc.to_pylibcudf(mode="read"),
-                cudf.Scalar("None").device_value.c_value,
+                plc.interop.from_arrow(pa.scalar("None")),
                 separators.to_pylibcudf(mode="read"),
             )
             return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
@@ -391,20 +391,20 @@ def extract_element_column(self, index: ColumnBase) -> ColumnBase:
         )
 
     @acquire_spill_lock()
-    def contains_scalar(self, search_key: cudf.Scalar) -> ColumnBase:
+    def contains_scalar(self, search_key: pa.Scalar) -> ColumnBase:
         return type(self).from_pylibcudf(
             plc.lists.contains(
                 self.to_pylibcudf(mode="read"),
-                search_key.device_value.c_value,
+                plc.interop.from_arrow(search_key),
             )
         )
 
     @acquire_spill_lock()
-    def index_of_scalar(self, search_key: cudf.Scalar) -> ColumnBase:
+    def index_of_scalar(self, search_key: pa.Scalar) -> ColumnBase:
         return type(self).from_pylibcudf(
             plc.lists.index_of(
                 self.to_pylibcudf(mode="read"),
-                search_key.device_value.c_value,
+                plc.interop.from_arrow(search_key),
                 plc.lists.DuplicateFindOption.FIND_FIRST,
             )
         )
@@ -569,7 +569,7 @@ def contains(self, search_key: ScalarLike) -> ParentType:
         dtype: bool
         """
         return self._return_or_inplace(
-            self._column.contains_scalar(cudf.Scalar(search_key))
+            self._column.contains_scalar(pa.scalar(search_key))
         )
 
     def index(self, search_key: ScalarLike | ColumnLike) -> ParentType:
@@ -618,7 +618,7 @@ def index(self, search_key: ScalarLike | ColumnLike) -> ParentType:
         """
 
         if is_scalar(search_key):
-            result = self._column.index_of_scalar(cudf.Scalar(search_key))
+            result = self._column.index_of_scalar(pa.scalar(search_key))
         else:
             result = self._column.index_of_column(as_column(search_key))
         return self._return_or_inplace(result)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from numba.np import numpy_support
 from typing_extensions import Self
 
@@ -382,12 +383,8 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
         elif self.dtype.kind == "b":
             conv_func = functools.partial(
                 plc.strings.convert.convert_booleans.from_booleans,
-                true_string=cudf.Scalar(
-                    "True", dtype="str"
-                ).device_value.c_value,
-                false_string=cudf.Scalar(
-                    "False", dtype="str"
-                ).device_value.c_value,
+                true_string=plc.interop.from_arrow(pa.scalar("True")),
+                false_string=plc.interop.from_arrow(pa.scalar("False")),
             )
         elif self.dtype.kind in {"i", "u"}:
             conv_func = plc.strings.convert.convert_integers.from_integers