From a0fc6a89a596ebae7df436be25aed70ec908f83e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 9 Dec 2024 09:33:08 -0500 Subject: [PATCH 01/14] Use cooperative-groups instead of cub warp-reduce for strings contains (#17540) Replaces the `cub::WarpReduce` usage in `cudf::strings::contains` with cooperative-groups `any()`. The change is only for the `contains_warp_parallel` kernel which is used for wider strings. Using cooperative-groups generates more efficient code for the same results and gives an additional 11-14% performance improvement. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/17540 --- cpp/src/strings/search/find.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 0f33fcb6fe1..94bc81ec933 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -347,13 +348,15 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, string_view const d_target, bool* d_results) { - auto const idx = cudf::detail::grid_1d::global_thread_id(); - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; + auto const idx = cudf::detail::grid_1d::global_thread_id(); auto const str_idx = idx / cudf::detail::warp_size; if (str_idx >= d_strings.size()) { return; } - auto const lane_idx = idx % cudf::detail::warp_size; + + namespace cg = cooperative_groups; + auto const warp = cg::tiled_partition(cg::this_thread_block()); + auto const lane_idx = warp.thread_rank(); + if (d_strings.is_null(str_idx)) { return; } // get the string for this warp auto const d_str = d_strings.element(str_idx); @@ -373,7 +376,7 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, } } - auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max()); + auto const result = warp.any(found); if (lane_idx == 0) { d_results[str_idx] = result; } } From 0f5d4b9514b92f69465f4d76b1f9db1c5a37f33a Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:41:26 -0500 Subject: [PATCH 02/14] Remove unused IO utilities from cudf python (#17374) Removes unused IO utilities from cuDF Python. 
Depends on #17163 #16042 #17252 #17263 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17374 --- python/cudf/cudf/_lib/io/utils.pxd | 6 +-- python/cudf/cudf/_lib/io/utils.pyx | 87 ++---------------------------- 2 files changed, 5 insertions(+), 88 deletions(-) diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 96504ebdd66..9b8bab012e2 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -13,9 +13,6 @@ from pylibcudf.libcudf.io.types cimport ( from cudf._lib.column cimport Column -cdef sink_info make_sinks_info( - list src, vector[unique_ptr[data_sink]] & data) except* -cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* cdef add_df_col_struct_names( df, child_names_dict @@ -26,7 +23,8 @@ cdef update_col_struct_field_names( ) cdef update_struct_field_names( table, - vector[column_name_info]& schema_info) + vector[column_name_info]& schema_info +) cdef Column update_column_struct_field_names( Column col, column_name_info& info diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index f23980b387a..df4675be599 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,97 +1,16 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cpython.buffer cimport PyBUF_READ -from cpython.memoryview cimport PyMemoryView_FromMemory -from libcpp.memory cimport unique_ptr + from libcpp.string cimport string -from libcpp.utility cimport move + from libcpp.vector cimport vector -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport ( - column_name_info, - sink_info, -) +from pylibcudf.libcudf.io.types cimport column_name_info from cudf._lib.column cimport Column -import codecs -import io -import os - from cudf.core.dtypes import StructDtype -# Converts the Python sink input to libcudf IO sink_info. -cdef sink_info make_sinks_info( - list src, vector[unique_ptr[data_sink]] & sink -) except*: - cdef vector[data_sink *] data_sinks - cdef vector[string] paths - if isinstance(src[0], io.StringIO): - data_sinks.reserve(len(src)) - for s in src: - sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s))) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], io.TextIOBase): - data_sinks.reserve(len(src)) - for s in src: - # Files opened in text mode expect writes to be str rather than - # bytes, which requires conversion from utf-8. If the underlying - # buffer is utf-8, we can bypass this conversion by writing - # directly to it. 
- if codecs.lookup(s.encoding).name not in {"utf-8", "ascii"}: - raise NotImplementedError(f"Unsupported encoding {s.encoding}") - sink.push_back( - unique_ptr[data_sink](new iobase_data_sink(s.buffer)) - ) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], io.IOBase): - data_sinks.reserve(len(src)) - for s in src: - sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s))) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], (basestring, os.PathLike)): - paths.reserve(len(src)) - for s in src: - paths.push_back( os.path.expanduser(s).encode()) - return sink_info(move(paths)) - else: - raise TypeError("Unrecognized input type: {}".format(type(src))) - - -cdef sink_info make_sink_info(src, unique_ptr[data_sink] & sink) except*: - cdef vector[unique_ptr[data_sink]] datasinks - cdef sink_info info = make_sinks_info([src], datasinks) - if not datasinks.empty(): - sink.swap(datasinks[0]) - return info - - -# Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you -# write from cudf to any python file-like object (File/BytesIO/SocketIO etc) -cdef cppclass iobase_data_sink(data_sink): - object buf - - iobase_data_sink(object buf_): - this.buf = buf_ - - void host_write(const void * data, size_t size) with gil: - if isinstance(buf, io.StringIO): - buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ) - .tobytes().decode()) - else: - buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ)) - - void flush() with gil: - buf.flush() - - size_t bytes_written() with gil: - return buf.tell() - - cdef add_df_col_struct_names(df, child_names_dict): for name, child_names in child_names_dict.items(): col = df._data[name] From ba3ed5773171a545d43d9e0f598c6c2eb37ec122 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 9 Dec 2024 10:08:13 -0800 Subject: [PATCH 03/14] Fix nvcc-imposed UB in `constexpr` functions (#17534) nvcc does not support `constexpr` functions that are not well-defined to call from the device. This is UB even when the function is not called from the device. Throwing an exception is one such operation. This PR cleans up error handling for functions that are called from device, and removes `constexpr` from the ones that are not actually used from the device, or in the constexpr context. Authors: - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) Approvers: - Karthikeyan (https://github.com/karthikeyann) - MithunR (https://github.com/mythrocks) - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17534 --- .../cudf/detail/utilities/device_operators.cuh | 18 +++++++++++++++++- cpp/include/cudf/utilities/span.hpp | 2 ++ cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/utilities/time_utils.cuh | 6 +++--- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 46f424e051b..d16be5e22dd 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,7 +83,11 @@ struct DeviceSum { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support device operator identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support device operator identity"); +#endif return T{}; } }; @@ -141,7 +145,11 @@ struct DeviceMin { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMin identity"); +#endif return cuda::std::numeric_limits::max(); } @@ -189,7 +197,11 @@ struct DeviceMax { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMax identity"); +#endif return cuda::std::numeric_limits::lowest(); } @@ -225,7 +237,11 @@ struct DeviceProduct { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceProduct identity"); +#endif return T{1, numeric::scale_type{0}}; } }; diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 21ee4fa9e9b..2273a89892b 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -417,7 +417,9 @@ class base_2dspan { constexpr base_2dspan(RowType flat_view, size_t columns) : _flat{flat_view}, _size{columns == 0 ? 0 : flat_view.size() / columns, columns} { +#ifndef __CUDA_ARCH__ CUDF_EXPECTS(_size.first * _size.second == flat_view.size(), "Invalid 2D span size"); +#endif } /** diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index d432deb8e79..76e5369ffd0 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -506,7 +506,7 @@ size_t max_varint_size() return cudf::util::div_rounding_up_unsafe(sizeof(T) * 8, 7); } -constexpr size_t RLE_stream_size(TypeKind kind, size_t count) +size_t RLE_stream_size(TypeKind kind, size_t count) { using cudf::util::div_rounding_up_unsafe; constexpr auto byte_rle_max_len = 128; diff --git a/cpp/src/io/utilities/time_utils.cuh b/cpp/src/io/utilities/time_utils.cuh index 687766c1bcc..ff1b9f58e6c 100644 --- a/cpp/src/io/utilities/time_utils.cuh +++ b/cpp/src/io/utilities/time_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ static const __device__ __constant__ int32_t powers_of_ten[10] = { struct get_period { template - constexpr int32_t operator()() + int32_t operator()() { if constexpr (is_chrono()) { return T::period::den; } CUDF_FAIL("Invalid, non chrono type"); @@ -42,7 +42,7 @@ struct get_period { /** * @brief Function that translates cuDF time unit to clock frequency */ -constexpr int32_t to_clockrate(type_id timestamp_type_id) +inline int32_t to_clockrate(type_id timestamp_type_id) { return timestamp_type_id == type_id::EMPTY ? 
0 From ed2892c8a4f00ad376e7b020d09371902fbf6b68 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:47:22 -0500 Subject: [PATCH 04/14] Document undefined behavior in div_rounding_up_safe (#17542) Adds more description to the `div_rounding_up_safe` utility identifying undefined behavior. Closes #17539 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Lawrence Mitchell (https://github.com/wence-) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17542 --- .../cudf/detail/utilities/integer_utils.hpp | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 8b709f2a8f8..957b6b70fe2 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -134,16 +134,20 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend, } // namespace detail /** - * Divides the left-hand-side by the right-hand-side, rounding up + * @brief Divides the left-hand-side by the right-hand-side, rounding up * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. * - * @param dividend the number to divide - * @param divisor the number of by which to divide - * @return The least integer multiple of {@link divisor} which is greater than or equal to - * the non-integral division dividend/divisor. + * The result is undefined if `divisor == 0` or + * if `divisor == -1` and `dividend == min()`. + * + * Will not overflow, and may _or may not_ be slower than the intuitive + * approach of using `(dividend + divisor - 1) / divisor`. * - * @note will not overflow, and may _or may not_ be slower than the intuitive - * approach of using (dividend + divisor - 1) / divisor + * @tparam I Integer type for `dividend`, `divisor`, and the return type + * @param dividend The number to divide + * @param divisor The number by which to divide + * @return The least integer multiple of `divisor` which is greater than or equal to + * the non-integral division `dividend/divisor` */ template constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept From a79077cf67ff2154c2e0cd8b40891a8ec6d1712c Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 9 Dec 2024 13:06:53 -0600 Subject: [PATCH 05/14] [JNI] Enables fabric handles for CUDA async memory pools (#17526) This PR adds a `CUDA_ASYNC_FABRIC` allocation mode in `RmmAllocationMode` and pipes in the options to RMM's `cuda_async_memory_resource` of a `fabric` for the handle type, and `read_write` as the memory protection mode (as that's the only mode supported by the pools, and is required for IPC). If `CUDA_ASYNC` is used, fabric handles are not requested, and the memory protection is `none`. 
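For reference, a minimal C++ sketch of the pool construction this PR performs on the native side, mirroring the `RmmJni.cpp` change below. The helper name is made up for illustration, the constructor arguments follow this PR's own usage, and the access-flag parameter is dropped again in the follow-up #17553 later in this series; exact signatures depend on the RMM version in use.

```cpp
#include <rmm/mr/device/cuda_async_memory_resource.hpp>

#include <cstddef>
#include <memory>
#include <optional>

// Hypothetical helper: build the async pool the way RmmJni.cpp does when
// CUDA_ASYNC_FABRIC is requested, i.e. fabric shareable handles plus
// read_write access so peer processes can import the pool.
inline std::unique_ptr<rmm::mr::cuda_async_memory_resource> make_async_pool(
  std::size_t initial_size, std::size_t release_threshold, bool fabric)
{
  using resource = rmm::mr::cuda_async_memory_resource;
  std::optional<resource::allocation_handle_type> handle_type;  // default: no export handle
  std::optional<resource::access_flags> access;                 // default: none
  if (fabric) {
    handle_type = resource::allocation_handle_type::fabric;
    access      = resource::access_flags::read_write;
  }
  return std::make_unique<resource>(initial_size, release_threshold, handle_type, access);
}
```
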
Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Nghia Truong (https://github.com/ttnghia) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17526 --- java/src/main/java/ai/rapids/cudf/Rmm.java | 11 ++++++---- .../ai/rapids/cudf/RmmAllocationMode.java | 7 ++++++- .../cudf/RmmCudaAsyncMemoryResource.java | 15 ++++++++++++-- java/src/main/native/src/RmmJni.cpp | 20 ++++++++++++++----- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index ed029c918e4..d1cc0cc96fe 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -206,7 +206,8 @@ private static void setGlobalValsFromResource(RmmDeviceMemoryResource resource) * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, * {@link RmmAllocationMode#ARENA}, - * {@link RmmAllocationMode#CUDA_ASYNC} and + * {@link RmmAllocationMode#CUDA_ASYNC}, + * {@link RmmAllocationMode#CUDA_ASYNC_FABRIC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -221,6 +222,7 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0; boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0; boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0; + boolean isAsyncFabric = (allocationMode & RmmAllocationMode.CUDA_ASYNC_FABRIC) != 0; boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0; if (isAsync && isManaged) { @@ -246,6 +248,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, } else if (isAsync) { resource = new RmmLimitingResourceAdaptor<>( new RmmCudaAsyncMemoryResource(poolSize, poolSize), poolSize, 512); + } else if (isAsyncFabric) { + resource = new RmmLimitingResourceAdaptor<>( + new RmmCudaAsyncMemoryResource(poolSize, poolSize, true), poolSize, 512); } else if (isManaged) { resource = new RmmManagedMemoryResource(); } else { @@ -521,7 +526,6 @@ public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) { private static native long allocInternal(long size, long stream) throws RmmException; - static native void free(long ptr, long length, long stream) throws RmmException; /** @@ -562,7 +566,7 @@ static native long newArenaMemoryResource(long childHandle, static native void releaseArenaMemoryResource(long handle); - static native long newCudaAsyncMemoryResource(long size, long release) throws RmmException; + static native long newCudaAsyncMemoryResource(long size, long release, boolean fabric) throws RmmException; static native void releaseCudaAsyncMemoryResource(long handle); @@ -575,7 +579,6 @@ static native long newLoggingResourceAdaptor(long handle, int type, String path, static native void releaseLoggingResourceAdaptor(long handle); - static native long newTrackingResourceAdaptor(long handle, long alignment) throws RmmException; static native void releaseTrackingResourceAdaptor(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 966c21bee22..3f7bc1fae76 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA 
CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,4 +36,9 @@ public class RmmAllocationMode { * Use CUDA async suballocation strategy */ public static final int CUDA_ASYNC = 0x00000008; + /** + * Use CUDA async suballocation strategy with fabric handles that are + * peer accessible with read-write access + */ + public static final int CUDA_ASYNC_FABRIC = 0x00000010; } diff --git a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java index fa1f13cb7ed..cf4936e2e24 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java +++ b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,9 +30,20 @@ public class RmmCudaAsyncMemoryResource implements RmmDeviceMemoryResource { * @param releaseThreshold size in bytes for when memory is released back to cuda */ public RmmCudaAsyncMemoryResource(long size, long releaseThreshold) { + this(size, releaseThreshold, false); + } + + /** + * Create a new async memory resource + * @param size the initial size of the pool + * @param releaseThreshold size in bytes for when memory is released back to cuda + * @param fabric if true request peer read+write accessible fabric handles when + * creating the pool + */ + public RmmCudaAsyncMemoryResource(long size, long releaseThreshold, boolean fabric) { this.size = size; this.releaseThreshold = releaseThreshold; - handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold); + handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold, fabric); } @Override diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 23c7b7fb243..0f424761bfe 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -772,14 +772,24 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv CATCH_STD(env, ) } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv* env, - jclass clazz, - jlong init, - jlong release) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource( + JNIEnv* env, jclass clazz, jlong init, jlong release, jboolean fabric) { try { cudf::jni::auto_set_device(env); - auto ret = new rmm::mr::cuda_async_memory_resource(init, release); + + // When we are using fabric, we need to set the memory access to be + // read_write, in order for peer GPUs to have access to this memory. + // Otherwise, choose default parameters (optional set to nullopt). + auto [handle_type, prot_flag] = + fabric + ? 
std::pair{std::optional{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric}, + std::optional{rmm::mr::cuda_async_memory_resource::access_flags::read_write}} + : std::pair{std::nullopt, std::nullopt}; + + auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type, prot_flag); + return reinterpret_cast(ret); } CATCH_STD(env, 0) From f5955929b06e2a4609b9fca0e3f949afb9b1dadd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:22:04 -0800 Subject: [PATCH 06/14] Remove cudf._lib.string.convert/split in favor of inlining pylibcudf (#17496) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17496 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/strings/CMakeLists.txt | 15 -- python/cudf/cudf/_lib/strings/__init__.py | 15 -- .../cudf/_lib/strings/convert/CMakeLists.txt | 24 -- .../cudf/_lib/strings/convert/__init__.pxd | 0 .../cudf/_lib/strings/convert/__init__.py | 0 .../strings/convert/convert_fixed_point.pyx | 76 ------ .../_lib/strings/convert/convert_floats.pyx | 19 -- .../_lib/strings/convert/convert_integers.pyx | 20 -- .../_lib/strings/convert/convert_lists.pyx | 32 --- .../_lib/strings/convert/convert_urls.pyx | 48 ---- .../cudf/_lib/strings/split/CMakeLists.txt | 22 -- .../cudf/cudf/_lib/strings/split/__init__.pxd | 0 .../cudf/cudf/_lib/strings/split/__init__.py | 0 .../cudf/_lib/strings/split/partition.pyx | 35 --- python/cudf/cudf/_lib/strings/split/split.pyx | 155 ----------- python/cudf/cudf/core/column/decimal.py | 15 +- python/cudf/cudf/core/column/lists.py | 10 +- python/cudf/cudf/core/column/string.py | 246 +++++++++++++++--- python/cudf/cudf/core/tools/datetimes.py | 5 +- python/cudf/cudf/core/tools/numeric.py | 66 ++--- 22 files changed, 262 insertions(+), 543 deletions(-) delete mode 100644 python/cudf/cudf/_lib/strings/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/convert/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/convert/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/convert/__init__.py delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_floats.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_integers.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_lists.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_urls.pyx delete mode 100644 python/cudf/cudf/_lib/strings/split/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/split/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/split/__init__.py delete mode 100644 python/cudf/cudf/_lib/strings/split/partition.pyx delete mode 100644 python/cudf/cudf/_lib/strings/split/split.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index e98cf283bbb..f9ac3a16940 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -46,4 +46,3 @@ target_link_libraries(interop PUBLIC nanoarrow) add_subdirectory(io) add_subdirectory(nvtext) -add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 4758a933898..52e9b89da7b 100644 --- 
a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -12,7 +12,6 @@ sort, stream_compaction, string_casting, - strings, strings_udf, ) diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt deleted file mode 100644 index dca9c4cc3fc..00000000000 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -add_subdirectory(convert) -add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index b795c54c112..341ba6d11c3 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -32,18 +32,3 @@ detokenize, tokenize_with_vocabulary, ) -from cudf._lib.strings.convert.convert_fixed_point import to_decimal -from cudf._lib.strings.convert.convert_floats import is_float -from cudf._lib.strings.convert.convert_integers import is_integer -from cudf._lib.strings.convert.convert_urls import url_decode, url_encode -from cudf._lib.strings.split.partition import partition, rpartition -from cudf._lib.strings.split.split import ( - rsplit, - rsplit_re, - rsplit_record, - rsplit_record_re, - split, - split_re, - split_record, - split_record_re, -) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt deleted file mode 100644 index e8a76b476a8..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -set(cython_sources convert_fixed_point.pyx convert_floats.pyx convert_integers.pyx - convert_lists.pyx convert_urls.pyx -) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.pxd b/python/cudf/cudf/_lib/strings/convert/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.py b/python/cudf/cudf/_lib/strings/convert/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx deleted file mode 100644 index 96dcd021c3b..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import pylibcudf as plc - - -@acquire_spill_lock() -def from_decimal(Column input_col): - """ - Converts a `Decimal64Column` to a `StringColumn`. - - Parameters - ---------- - input_col : input column of type decimal - - Returns - ------- - A column of strings representing the input decimal values. - """ - plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( - input_col.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def to_decimal(Column input_col, object out_type): - """ - Returns a `Decimal64Column` from the provided `StringColumn` - using the scale in the `out_type`. - - Parameters - ---------- - input_col : input column of type string - out_type : The type and scale of the decimal column expected - - Returns - ------- - A column of decimals parsed from the string values. - """ - plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( - input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(out_type), - ) - result = Column.from_pylibcudf(plc_column) - result.dtype.precision = out_type.precision - return result - - -@acquire_spill_lock() -def is_fixed_point(Column input_col, object dtype): - """ - Returns a Column of boolean values with True for `input_col` - that have fixed-point characters. The output row also has a - False value if the corresponding string would cause an integer - overflow. The scale of the `dtype` is used to determine overflow - in the output row. - - Parameters - ---------- - input_col : input column of type string - dtype : The type and scale of a decimal column - - Returns - ------- - A Column of booleans indicating valid decimal conversion. - """ - plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( - input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(dtype), - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx deleted file mode 100644 index 5da6e3f10cc..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def is_float(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have floats. - """ - plc_column = plc.strings.convert.convert_floats.is_float( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx deleted file mode 100644 index 50113347ccb..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def is_integer(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have integers. - """ - return Column.from_pylibcudf( - plc.strings.convert.convert_integers.is_integer( - source_strings.to_pylibcudf(mode="read") - ) - ) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx deleted file mode 100644 index 3a2cb4bd5c7..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - - -@acquire_spill_lock() -def format_list_column(Column source_list, Column separators): - """ - Format a list column of strings into a strings column. - - Parameters - ---------- - input_col : input column of type list with strings child. - - separators: strings used for formatting (', ', '[', ']') - - Returns - ------- - Formatted strings column - """ - plc_column = plc.strings.convert.convert_lists.format_list_column( - source_list.to_pylibcudf(mode="read"), - as_device_scalar("None").c_value, - separators.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx deleted file mode 100644 index d5c2f771970..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def url_decode(Column source_strings): - """ - Decode each string in column. No format checking is performed. - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - URL decoded string column - """ - plc_column = plc.strings.convert.convert_urls.url_decode( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def url_encode(Column source_strings): - """ - Encode each string in column. No format checking is performed. - All characters are encoded except for ASCII letters, digits, - and these characters: '.','_','-','~'. Encoding converts to - hex using UTF-8 encoded bytes. 
- - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - URL encoded string column - """ - plc_column = plc.strings.convert.convert_urls.url_encode( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt deleted file mode 100644 index 4ede0a2fac5..00000000000 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources partition.pyx split.pyx) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/strings/split/__init__.pxd b/python/cudf/cudf/_lib/strings/split/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/split/__init__.py b/python/cudf/cudf/_lib/strings/split/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx deleted file mode 100644 index 5319addc41c..00000000000 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def partition(Column source_strings, - object py_delimiter): - """ - Returns data by splitting the `source_strings` - column at the first occurrence of the specified `py_delimiter`. - """ - plc_table = plc.strings.split.partition.partition( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rpartition(Column source_strings, - object py_delimiter): - """ - Returns a Column by splitting the `source_strings` - column at the last occurrence of the specified `py_delimiter`. - """ - plc_table = plc.strings.split.partition.rpartition( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx deleted file mode 100644 index 4ec6c7073d8..00000000000 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def split(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from beginning. - """ - plc_table = plc.strings.split.split.split( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def split_record(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from beginning. - """ - plc_column = plc.strings.split.split.split_record( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def rsplit(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from the end. - """ - plc_table = plc.strings.split.split.rsplit( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rsplit_record(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from the end. - """ - plc_column = plc.strings.split.split.rsplit_record( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def split_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the delimiters identified by `pattern`. - """ - plc_table = plc.strings.split.split.split_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rsplit_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the delimiters identified by `pattern`. - The delimiters are searched starting from the end of each string. - """ - plc_table = plc.strings.split.split.rsplit_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def split_record_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the delimiters identified by `pattern`. 
- """ - plc_column = plc.strings.split.split.split_record_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def rsplit_record_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the delimiters identified by `pattern`. - The delimiters are searched starting from the end of each string. - """ - plc_column = plc.strings.split.split.rsplit_record_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 2c22724d3d7..9e6a73f1a9c 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -10,13 +10,12 @@ import numpy as np import pyarrow as pa +import pylibcudf as plc + import cudf -from cudf._lib.strings.convert.convert_fixed_point import ( - from_decimal as cpp_from_decimal, -) from cudf.api.types import is_scalar from cudf.core._internals import binaryop, unary -from cudf.core.buffer import as_buffer +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import ( @@ -89,7 +88,13 @@ def as_decimal_column( def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: - return cpp_from_decimal(self) + with acquire_spill_lock(): + plc_column = ( + plc.strings.convert.convert_fixed_point.from_fixed_point( + self.to_pylibcudf(mode="read"), + ) + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] else: return cast( cudf.core.column.StringColumn, diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index ea384888388..b95fb0a0d39 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock @@ -272,8 +271,13 @@ def as_string_column(self) -> cudf.core.column.StringColumn: # Separator strings to match the Python format separators = as_column([", ", "[", "]"]) - # Call libcudf to format the list column - return format_list_column(lc, separators) + with acquire_spill_lock(): + plc_column = plc.strings.convert.convert_lists.format_list_column( + lc.to_pylibcudf(mode="read"), + cudf.Scalar("None").device_value.c_value, + separators.to_pylibcudf(mode="read"), + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] def _transform_leaves(self, func, *args, **kwargs) -> Self: # return a new list column with the same nested structure diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 6b45828568c..4a2483a80e3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,6 +19,7 @@ import cudf.api.types import cudf.core.column.column as column import 
cudf.core.column.datetime as datetime +from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column from cudf._lib.types import size_type_dtype @@ -44,6 +45,7 @@ SeriesOrIndex, ) from cudf.core.buffer import Buffer + from cudf.core.column.numerical import NumericalColumn def str_to_boolean(column: StringColumn): @@ -1336,7 +1338,7 @@ def isinteger(self) -> SeriesOrIndex: 2 False dtype: bool """ - return self._return_or_inplace(libstrings.is_integer(self._column)) + return self._return_or_inplace(self._column.is_integer()) def ishex(self) -> SeriesOrIndex: """ @@ -1468,7 +1470,7 @@ def isfloat(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_float(self._column)) + return self._return_or_inplace(self._column.is_float()) def isdecimal(self) -> SeriesOrIndex: """ @@ -2710,26 +2712,25 @@ def split( if len(str(pat)) <= 1: regex = False + result_table: StringColumn | dict[int, StringColumn] if expand: if self._column.null_count == len(self._column): result_table = {0: self._column.copy()} else: if regex is True: - data = libstrings.split_re(self._column, pat, n) + data = self._column.split_re(pat, n) else: - data = libstrings.split( - self._column, cudf.Scalar(pat, "str"), n - ) + data = self._column.split(cudf.Scalar(pat, "str"), n) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: result_table = data else: if regex is True: - result_table = libstrings.split_record_re(self._column, pat, n) + result_table = self._column.split_record_re(pat, n) else: - result_table = libstrings.split_record( - self._column, cudf.Scalar(pat, "str"), n + result_table = self._column.split_record( + cudf.Scalar(pat, "str"), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2883,28 +2884,25 @@ def rsplit( if regex and isinstance(pat, re.Pattern): pat = pat.pattern + result_table: StringColumn | dict[int, StringColumn] if expand: if self._column.null_count == len(self._column): result_table = {0: self._column.copy()} else: if regex is True: - data = libstrings.rsplit_re(self._column, pat, n) + data = self._column.rsplit_re(pat, n) else: - data = libstrings.rsplit( - self._column, cudf.Scalar(pat, "str"), n - ) + data = self._column.rsplit(cudf.Scalar(pat, "str"), n) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: result_table = data else: if regex is True: - result_table = libstrings.rsplit_record_re( - self._column, pat, n - ) + result_table = self._column.rsplit_record_re(pat, n) else: - result_table = libstrings.rsplit_record( - self._column, cudf.Scalar(pat, "str"), n + result_table = self._column.rsplit_record( + cudf.Scalar(pat, "str"), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2989,7 +2987,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str")), + self._column.partition(cudf.Scalar(sep, "str")), expand=expand, ) @@ -3054,7 +3052,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), + self._column.rpartition(cudf.Scalar(sep, "str")), expand=expand, ) @@ -4499,8 +4497,7 @@ def url_decode(self) -> SeriesOrIndex: 1 https://medium.com/rapids-ai dtype: object """ - - return 
self._return_or_inplace(libstrings.url_decode(self._column)) + return self._return_or_inplace(self._column.url_decode()) def url_encode(self) -> SeriesOrIndex: """ @@ -4531,7 +4528,7 @@ def url_encode(self) -> SeriesOrIndex: 1 https%3A%2F%2Fmedium.com%2Frapids-ai dtype: object """ - return self._return_or_inplace(libstrings.url_encode(self._column)) + return self._return_or_inplace(self._column.url_encode()) def code_points(self) -> SeriesOrIndex: """ @@ -6015,13 +6012,13 @@ def as_numerical_column( out_dtype = cudf.api.types.dtype(dtype) string_col = self if out_dtype.kind in {"i", "u"}: - if not libstrings.is_integer(string_col).all(): + if not string_col.is_integer().all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." ) elif out_dtype.kind == "f": - if not libstrings.is_float(string_col).all(): + if not string_col.is_float().all(): raise ValueError( "Could not convert strings to float " "type due to presence of non-floating values." @@ -6099,10 +6096,17 @@ def as_timedelta_column( ) -> cudf.core.column.TimeDeltaColumn: return self.strptime(dtype, "%D days %H:%M:%S") # type: ignore[return-value] + @acquire_spill_lock() def as_decimal_column( self, dtype: Dtype - ) -> "cudf.core.column.DecimalBaseColumn": - return libstrings.to_decimal(self, dtype) + ) -> cudf.core.column.DecimalBaseColumn: + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + self.to_pylibcudf(mode="read"), + libcudf.types.dtype_to_pylibcudf_type(dtype), + ) + result = Column.from_pylibcudf(plc_column) + result.dtype.precision = dtype.precision # type: ignore[union-attr] + return result # type: ignore[return-value] def as_string_column(self) -> StringColumn: return self @@ -6138,12 +6142,9 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: if self.dtype == to_dtype: return True - elif ( - to_dtype.kind in {"i", "u"} - and not libstrings.is_integer(self).all() - ): + elif to_dtype.kind in {"i", "u"} and not self.is_integer().all(): return False - elif to_dtype.kind == "f" and not libstrings.is_float(self).all(): + elif to_dtype.kind == "f" and not self.is_float().all(): return False else: return True @@ -6333,11 +6334,180 @@ def title(self) -> Self: def is_title(self) -> Self: return self._modify_characters(plc.strings.capitalize.is_title) + @acquire_spill_lock() def replace_multiple(self, pattern: Self, replacements: Self) -> Self: - with acquire_spill_lock(): - plc_result = plc.strings.replace.replace_multiple( - self.to_pylibcudf(mode="read"), - pattern.to_pylibcudf(mode="read"), - replacements.to_pylibcudf(mode="read"), + plc_result = plc.strings.replace.replace_multiple( + self.to_pylibcudf(mode="read"), + pattern.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + ) + return cast(Self, Column.from_pylibcudf(plc_result)) + + @acquire_spill_lock() + def _split_record_re( + self, + pattern: str, + maxsplit: int, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram, int], + plc.Column, + ], + ) -> Self: + plc_column = method( + self.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pattern, + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, + ) + return cast(Self, Column.from_pylibcudf(plc_column)) + + def split_record_re(self, pattern: str, maxsplit: int) -> Self: + return self._split_record_re( + pattern, maxsplit, plc.strings.split.split.split_record_re + ) + + def rsplit_record_re(self, pattern: str, maxsplit: int) -> Self: + return self._split_record_re( + 
pattern, maxsplit, plc.strings.split.split.rsplit_record_re + ) + + @acquire_spill_lock() + def _split_re( + self, + pattern: str, + maxsplit: int, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram, int], + plc.Table, + ], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pattern, + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() ) - return cast(Self, Column.from_pylibcudf(plc_result)) + ) + + def split_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: + return self._split_re( + pattern, maxsplit, plc.strings.split.split.split_re + ) + + def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: + return self._split_re( + pattern, maxsplit, plc.strings.split.split.rsplit_re + ) + + @acquire_spill_lock() + def _split_record( + self, + delimiter: cudf.Scalar, + maxsplit: int, + method: Callable[[plc.Column, plc.Scalar, int], plc.Column], + ) -> Self: + plc_column = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + maxsplit, + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + return self._split_record( + delimiter, maxsplit, plc.strings.split.split.split_record + ) + + def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + return self._split_record( + delimiter, maxsplit, plc.strings.split.split.rsplit_record + ) + + @acquire_spill_lock() + def _split( + self, + delimiter: cudf.Scalar, + maxsplit: int, + method: Callable[[plc.Column, plc.Scalar, int], plc.Column], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + maxsplit, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() + ) + ) + + def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + return self._split(delimiter, maxsplit, plc.strings.split.split.split) + + def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit) + + @acquire_spill_lock() + def _partition( + self, + delimiter: cudf.Scalar, + method: Callable[[plc.Column, plc.Scalar], plc.Column], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() + ) + ) + + def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + return self._partition( + delimiter, plc.strings.split.partition.partition + ) + + def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + return self._partition( + delimiter, plc.strings.split.partition.rpartition + ) + + @acquire_spill_lock() + def url_decode(self) -> Self: + plc_column = plc.strings.convert.convert_urls.url_decode( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def url_encode(self) -> Self: + plc_column = plc.strings.convert.convert_urls.url_encode( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def is_integer(self) -> NumericalColumn: + 
plc_column = plc.strings.convert.convert_integers.is_integer( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def is_float(self) -> NumericalColumn: + plc_column = plc.strings.convert.convert_floats.is_float( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 80ee078917a..8be336021b1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -15,9 +15,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.strings.convert.convert_integers import ( - is_integer as cpp_is_integer, -) from cudf.api.types import is_integer, is_scalar from cudf.core import column from cudf.core.buffer import acquire_spill_lock @@ -232,7 +229,7 @@ def to_datetime( ) break elif arg_col.dtype.kind == "O": - if not cpp_is_integer(arg_col).all(): + if not arg_col.is_integer().all(): col = new_series._column.strptime( cudf.dtype("datetime64[ns]"), format=format ) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 91f23490031..40348461f8c 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -2,14 +2,13 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd import cudf from cudf import _lib as libcudf -from cudf._lib import strings as libstrings from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core._internals import unary from cudf.core.column import as_column @@ -18,10 +17,16 @@ from cudf.utils.dtypes import can_convert_to_column if TYPE_CHECKING: - from cudf.core.column import ColumnBase + from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.string import StringColumn -def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): +def to_numeric( + arg, + errors: Literal["raise", "coerce", "ignore"] = "raise", + downcast: Literal["integer", "signed", "unsigned", "float", None] = None, + dtype_backend=None, +): """ Convert argument into numerical types. 
@@ -130,7 +135,9 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): else: try: col = _convert_str_col( - col._get_decategorized_column(), errors, downcast + col._get_decategorized_column(), # type: ignore[attr-defined] + errors, + downcast, ) except ValueError as e: if errors == "ignore": @@ -139,7 +146,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): raise e elif is_string_dtype(dtype): try: - col = _convert_str_col(col, errors, downcast) + col = _convert_str_col(col, errors, downcast) # type: ignore[arg-type] except ValueError as e: if errors == "ignore": return arg @@ -186,7 +193,11 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): return col.values -def _convert_str_col(col, errors, _downcast=None): +def _convert_str_col( + col: StringColumn, + errors: Literal["raise", "coerce", "ignore"], + _downcast: Literal["integer", "signed", "unsigned", "float", None] = None, +) -> NumericalColumn: """ Converts a string column to numeric column @@ -212,13 +223,21 @@ def _convert_str_col(col, errors, _downcast=None): if not is_string_dtype(col): raise TypeError("col must be string dtype.") - is_integer = libstrings.is_integer(col) - if is_integer.all(): - return col.astype(dtype=cudf.dtype("i8")) + if col.is_integer().all(): + return col.astype(dtype=cudf.dtype("i8")) # type: ignore[return-value] - col = _proc_inf_empty_strings(col) + # TODO: This can be handled by libcudf in + # future see StringColumn.as_numerical_column + converted_col = ( + col.to_lower() + .find_and_replace(as_column([""]), as_column(["NaN"])) + .replace_multiple( + as_column(["+", "inf", "inity"]), # type: ignore[arg-type] + as_column(["", "Inf", ""]), # type: ignore[arg-type] + ) + ) - is_float = libstrings.is_float(col) + is_float = converted_col.is_float() if is_float.all(): if _downcast in {"unsigned", "signed", "integer"}: warnings.warn( @@ -227,27 +246,14 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." ) ) - return col.astype(dtype=cudf.dtype("float32")) + return converted_col.astype(dtype=cudf.dtype("float32")) # type: ignore[return-value] else: - return col.astype(dtype=cudf.dtype("float64")) + return converted_col.astype(dtype=cudf.dtype("float64")) # type: ignore[return-value] else: if errors == "coerce": - col = libcudf.string_casting.stod(col) + converted_col = libcudf.string_casting.stod(converted_col) non_numerics = is_float.unary_operator("not") - col[non_numerics] = None - return col + converted_col[non_numerics] = None + return converted_col # type: ignore[return-value] else: raise ValueError("Unable to convert some strings to numerics.") - - -def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: - """Handles empty and infinity strings""" - col = col.to_lower() # type: ignore[attr-defined] - col = col.find_and_replace(as_column([""]), as_column(["NaN"])) - # TODO: This can be handled by libcudf in - # future see StringColumn.as_numerical_column - col = col.replace_multiple( # type: ignore[attr-defined] - as_column(["+", "inf", "inity"]), - as_column(["", "Inf", ""]), - ) - return col From 5b412dc14d047959d1a2b70bf27ffea139769f7a Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 9 Dec 2024 16:54:59 -0600 Subject: [PATCH 07/14] [JNI] remove rmm argument to set rw access for fabric handles (#17553) This is a follow up from https://github.com/rapidsai/cudf/pull/17526, where fabric handles can be enabled from RMM. 
That PR also sets the memory access protection flag (`cudaMemPoolSetAccess`), but I have learned that this second flag is not needed from the owner device. In fact, it causes confusion because the owning device fails to call this function with some of the flags (access none). `cudaMemPoolSetAccess` is meant to only be called from peer processes that have imported the pool's handle. In our case, UCX handles this from the peer's side and it does not need to be anywhere in RMM or cuDF. Sorry for the noise. I'd like to get this fix in, and then I am going to fix RMM by removing that API. Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17553 --- java/src/main/native/src/RmmJni.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 0f424761bfe..8c733018fa7 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -778,17 +778,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource( try { cudf::jni::auto_set_device(env); - // When we are using fabric, we need to set the memory access to be - // read_write, in order for peer GPUs to have access to this memory. - // Otherwise, choose default parameters (optional set to nullopt). - auto [handle_type, prot_flag] = - fabric - ? std::pair{std::optional{ - rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric}, - std::optional{rmm::mr::cuda_async_memory_resource::access_flags::read_write}} - : std::pair{std::nullopt, std::nullopt}; - - auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type, prot_flag); + auto handle_type = + fabric ? 
std::optional{rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric} + : std::nullopt; + + auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type); return reinterpret_cast(ret); } From 9df95d1c5fd41b1b87976fd3680a1d06f2d26310 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:55:16 -0800 Subject: [PATCH 08/14] Remove cudf._lib.transform in favor of inlining pylibcudf (#17505) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17505 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/transform.pyx | 113 -------------------- python/cudf/cudf/core/column/categorical.py | 4 +- python/cudf/cudf/core/column/column.py | 34 ++++-- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/column/numerical.py | 30 +++++- python/cudf/cudf/core/dataframe.py | 30 +++--- python/cudf/cudf/core/df_protocol.py | 3 +- python/cudf/cudf/core/frame.py | 9 +- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/reshape.py | 7 +- python/cudf/cudf/core/series.py | 3 +- python/cudf/cudf/datasets.py | 3 +- python/cudf/cudf/tests/test_column.py | 9 +- 14 files changed, 85 insertions(+), 165 deletions(-) delete mode 100644 python/cudf/cudf/_lib/transform.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index f9ac3a16940..084fc19a61e 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -25,7 +25,6 @@ set(cython_sources stream_compaction.pyx string_casting.pyx strings_udf.pyx - transform.pyx types.pyx utils.pyx ) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx deleted file mode 100644 index a163bb07888..00000000000 --- a/python/cudf/cudf/_lib/transform.pyx +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from numba.np import numpy_support - -import cudf -from cudf.core.buffer import acquire_spill_lock, as_buffer -from cudf.utils import cudautils - -from pylibcudf cimport transform as plc_transform -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def bools_to_mask(Column col): - """ - Given an int8 (boolean) column, compress the data from booleans to bits and - return a Buffer - """ - mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) - return as_buffer(mask) - - -@acquire_spill_lock() -def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): - """ - Given a mask buffer, returns a boolean column representng bit 0 -> False - and 1 -> True within range of [begin_bit, end_bit), - """ - if not isinstance(mask_buffer, cudf.core.buffer.Buffer): - raise TypeError("mask_buffer is not an instance of " - "cudf.core.buffer.Buffer") - plc_column = plc_transform.mask_to_bools( - mask_buffer.get_ptr(mode="read"), begin_bit, end_bit - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def nans_to_nulls(Column input): - mask, _ = plc_transform.nans_to_nulls( - input.to_pylibcudf(mode="read") - ) - return as_buffer(mask) - - -@acquire_spill_lock() -def transform(Column input, op): - nb_type = numpy_support.from_dtype(input.dtype) - nb_signature = (nb_type,) - compiled_op = cudautils.compile_udf(op, nb_signature) - np_dtype = cudf.dtype(compiled_op[1]) - - plc_column = plc_transform.transform( - input.to_pylibcudf(mode="read"), - compiled_op[0], - plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), - True - ) - return Column.from_pylibcudf(plc_column) - - -def table_encode(list source_columns): - plc_table, plc_column = plc_transform.encode( - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) - ) - - return ( - [Column.from_pylibcudf(col) for col in plc_table.columns()], - Column.from_pylibcudf(plc_column) - ) - - -def one_hot_encode(Column input_column, Column categories): - plc_table = plc_transform.one_hot_encode( - input_column.to_pylibcudf(mode="read"), - categories.to_pylibcudf(mode="read"), - ) - result_columns = [ - Column.from_pylibcudf(col, data_ptr_exposed=True) - for col in plc_table.columns() - ] - result_labels = [ - x if x is not None else '' - for x in categories.to_arrow().to_pylist() - ] - return dict(zip(result_labels, result_columns)) - - -@acquire_spill_lock() -def compute_column(list columns, tuple column_names, str expr): - """Compute a new column by evaluating an expression on a set of columns. - - Parameters - ---------- - columns : list - The set of columns forming the table to evaluate the expression on. - column_names : tuple[str] - The names associated with each column. These names are necessary to map - column names in the expression to indices in the provided list of - columns, which are what will be used by libcudf to evaluate the - expression on the table. - expr : str - The expression to evaluate. 
- """ - result = plc_transform.compute_column( - plc.Table([col.to_pylibcudf(mode="read") for col in columns]), - plc.expressions.to_expression(expr, column_names), - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c849a9d3d2b..71ec11e75af 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -13,7 +13,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.transform import bools_to_mask from cudf.core._internals import unary from cudf.core.column import column from cudf.core.column.methods import ColumnMethods @@ -775,12 +774,11 @@ def to_pandas( raise NotImplementedError(f"{arrow_type=} is not implemented.") if self.categories.dtype.kind == "f": - new_mask = bools_to_mask(self.notnull()) col = type(self)( data=self.data, # type: ignore[arg-type] size=self.size, dtype=self.dtype, - mask=new_mask, + mask=self.notnull().fillna(False).as_mask(), children=self.children, ) else: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1ddc79e8970..b317858077f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -32,7 +32,6 @@ drop_duplicates, drop_nulls, ) -from cudf._lib.transform import bools_to_mask from cudf._lib.types import size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -373,10 +372,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) + @acquire_spill_lock() def _get_mask_as_column(self) -> ColumnBase: - return libcudf.transform.mask_to_bools( - self.base_mask, self.offset, self.offset + len(self) + plc_column = plc.transform.mask_to_bools( + self.base_mask.get_ptr(mode="read"), # type: ignore[union-attr] + self.offset, + self.offset + len(self), ) + return type(self).from_pylibcudf(plc_column) @cached_property def memory_usage(self) -> int: @@ -981,11 +984,14 @@ def as_mask(self) -> Buffer: ------- Buffer """ - if self.has_nulls(): raise ValueError("Column must have no nulls.") - return bools_to_mask(self) + with acquire_spill_lock(): + mask, _ = plc.transform.bools_to_mask( + self.to_pylibcudf(mode="read") + ) + return as_buffer(mask) @property def is_unique(self) -> bool: @@ -1514,6 +1520,18 @@ def _return_sentinel_column(): ) return codes.fillna(na_sentinel.value) + def one_hot_encode( + self, categories: ColumnBase + ) -> abc.Generator[ColumnBase]: + plc_table = plc.transform.one_hot_encode( + self.to_pylibcudf(mode="read"), + categories.to_pylibcudf(mode="read"), + ) + return ( + type(self).from_pylibcudf(col, data_ptr_exposed=True) + for col in plc_table.columns() + ) + def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" @@ -2093,8 +2111,7 @@ def as_column( ) # Consider NaT as NA in the mask # but maintain NaT as a value - bool_mask = as_column(~is_nat) - mask = as_buffer(bools_to_mask(bool_mask)) + mask = as_column(~is_nat).as_mask() buffer = as_buffer(arbitrary.view("|u1")) col = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype) if dtype: @@ -2264,8 +2281,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: ) return as_buffer(data=desc["data"][0], size=mask_size, owner=obj) elif typecode == "b": - col = as_column(cai_mask) - return bools_to_mask(col) + return as_column(cai_mask).as_mask() else: raise 
NotImplementedError(f"Cannot infer mask from typestr {typestr}") diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b95fb0a0d39..ba98e28f6a2 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -255,7 +255,7 @@ def from_sequences( data=None, size=len(arbitrary), dtype=cudf.ListDtype(data_col.dtype), - mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)), + mask=as_column(mask_col).as_mask(), offset=0, null_count=0, children=(offset_col, data_col), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 9514aaeab50..790cd6ea9bb 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -7,9 +7,10 @@ import numpy as np import pandas as pd +from numba.np import numpy_support from typing_extensions import Self -import pylibcudf +import pylibcudf as plc import cudf import cudf.core.column.column as column @@ -17,11 +18,13 @@ from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core._internals import binaryop, unary +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError +from cudf.utils import cudautils from cudf.utils.dtypes import ( find_common_type, min_column_type, @@ -179,13 +182,27 @@ def __setitem__(self, key: Any, value: Any): if out: self._mimic_inplace(out, inplace=True) + @acquire_spill_lock() + def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase: + plc_column = plc.transform.transform( + self.to_pylibcudf(mode="read"), + compiled_op[0], + plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), + True, + ) + return type(self).from_pylibcudf(plc_column) + def unary_operator(self, unaryop: str | Callable) -> ColumnBase: if callable(unaryop): - return libcudf.transform.transform(self, unaryop) + nb_type = numpy_support.from_dtype(self.dtype) + nb_signature = (nb_type,) + compiled_op = cudautils.compile_udf(unaryop, nb_signature) + np_dtype = np.dtype(compiled_op[1]) + return self.transform(compiled_op, np_dtype) unaryop = unaryop.upper() unaryop = _unaryop_map.get(unaryop, unaryop) - unaryop = pylibcudf.unary.UnaryOperator[unaryop] + unaryop = plc.unary.UnaryOperator[unaryop] return unary.unary_operation(self, unaryop) def __invert__(self): @@ -298,8 +315,11 @@ def nans_to_nulls(self: Self) -> Self: # Only floats can contain nan. 
if self.dtype.kind != "f" or self.nan_count == 0: return self - newmask = libcudf.transform.nans_to_nulls(self) - return self.set_mask(newmask) + with acquire_spill_lock(): + mask, _ = plc.transform.nans_to_nulls( + self.to_pylibcudf(mode="read") + ) + return self.set_mask(as_buffer(mask)) def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar: if isinstance(other, ColumnBase): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 656274bca38..325601e5311 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6772,9 +6772,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) result = column.as_column(result, dtype=result_dtype) if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) + result = result.set_mask(mask._column.as_mask()) return Series._from_column(result, index=self.index) else: result_df = DataFrame(result, index=self.index) @@ -7883,6 +7881,16 @@ def interleave_columns(self): ) return self._constructor_sliced._from_column(result_col) + @acquire_spill_lock() + def _compute_columns(self, expr: str) -> ColumnBase: + plc_column = plc.transform.compute_column( + plc.Table( + [col.to_pylibcudf(mode="read") for col in self._columns] + ), + plc.expressions.to_expression(expr, self._column_names), + ) + return libcudf.column.Column.from_pylibcudf(plc_column) + @_performance_tracking def eval(self, expr: str, inplace: bool = False, **kwargs): """Evaluate a string describing operations on DataFrame columns. @@ -8010,11 +8018,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): raise ValueError( "Cannot operate inplace if there is no assignment" ) - return Series._from_column( - libcudf.transform.compute_column( - [*self._columns], self._column_names, statements[0] - ) - ) + return Series._from_column(self._compute_columns(statements[0])) targets = [] exprs = [] @@ -8030,15 +8034,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): targets.append(t.strip()) exprs.append(e.strip()) - cols = ( - libcudf.transform.compute_column( - [*self._columns], self._column_names, e - ) - for e in exprs - ) ret = self if inplace else self.copy(deep=False) - for name, col in zip(targets, cols): - ret._data[name] = col + for name, expr in zip(targets, exprs): + ret._data[name] = self._compute_columns(expr) if not inplace: return ret diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index aa601a2b322..a798041699e 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -799,8 +799,7 @@ def _set_missing_values( valid_mask = _ensure_gpu_buffer( valid_mask[0], valid_mask[1], allow_copy ) - boolmask = as_column(valid_mask._buf, dtype="bool") - bitmask = cudf._lib.transform.bools_to_mask(boolmask) + bitmask = as_column(valid_mask._buf, dtype="bool").as_mask() return cudf_col.set_mask(bitmask) elif null == _MaskKind.BITMASK: valid_mask = _ensure_gpu_buffer( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0a7e6fefe6e..84a3caf905f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1457,7 +1457,14 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode(list(self._columns)) + plc_table, plc_column = plc.transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in self._columns]) + ) + columns = [ + 
libcudf.column.Column.from_pylibcudf(col) + for col in plc_table.columns() + ] + indices = libcudf.column.Column.from_pylibcudf(plc_column) keys = self._from_columns_like_self(columns) return keys, indices diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 21ac009e7ff..95f3d4d01d5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3507,7 +3507,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): col = _post_process_output_col(ans_col, retty) - col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) + col.set_base_mask(ans_mask.as_mask()) result = cudf.Series._from_column(col, index=self.index) return result diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 84c653c5b3f..59a3e9dbf3b 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,7 +12,6 @@ import cudf from cudf._lib.column import Column -from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar @@ -1338,7 +1337,11 @@ def _one_hot_encode_column( f"np.iinfo({size_type_dtype}).max. Consider reducing " "size of category" ) - data = one_hot_encode(column, categories) + result_labels = ( + x if x is not None else "" + for x in categories.to_arrow().to_pylist() + ) + data = dict(zip(result_labels, column.one_hot_encode(categories))) if drop_first and len(data): data.pop(next(iter(data))) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 58cefc6554e..be74b0f867a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -17,7 +17,6 @@ from typing_extensions import Self, assert_never import cudf -from cudf import _lib as libcudf from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -526,7 +525,7 @@ def from_categorical(cls, categorical, codes=None): mask = None if not valid_codes.all(): - mask = libcudf.transform.bools_to_mask(valid_codes) + mask = valid_codes.as_mask() col = CategoricalColumn( data=col.data, size=codes.size, diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index e8d634598f4..a91a4951306 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -4,7 +4,6 @@ import pandas as pd import cudf -from cudf._lib.transform import bools_to_mask __all__ = ["randomdata", "timeseries"] @@ -70,7 +69,7 @@ def timeseries( size=len(index), p=[1 - nulls_frequency, nulls_frequency], ) - mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) + mask_buf = cudf.core.column.as_column(mask).as_mask() masked_col = gdf[col]._column.set_mask(mask_buf) gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 65947efc2df..c3c9a1c5338 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -7,7 +7,6 @@ import pytest import cudf -from cudf._lib.transform import mask_to_bools from cudf.core.column.column import as_column from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal @@ -489,9 +488,7 @@ def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data["a"]] - got_mask = mask_to_bools( - gd_data["a"]._column.base_mask, 0, len(gd_data) - ).values_host + 
got_mask = gd_data["a"]._column._get_mask_as_column().values_host np.testing.assert_array_equal(expect_mask, got_mask) @@ -527,9 +524,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data] - got_mask = mask_to_bools( - gd_data._column.base_mask, 0, len(gd_data) - ).values_host + got_mask = gd_data._column._get_mask_as_column().values_host np.testing.assert_array_equal(expect_mask, got_mask) From ebad043967e8bb6a2a56ecfcb0b0612ea2894fa2 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:37:41 -0500 Subject: [PATCH 09/14] Remove unused `BufferArrayFromVector` (#17549) Follow up to #17506. This PR removes an unused buffer class. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17549 --- python/cudf/cudf/_lib/parquet.pyx | 46 +------------------------------ 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 1b4c18d13a7..00c434ae374 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -20,11 +20,8 @@ from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from libc.stdint cimport int64_t, uint8_t +from libc.stdint cimport int64_t from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector from pylibcudf.expressions cimport Expression from pylibcudf.io.parquet cimport ChunkedParquetReader @@ -47,47 +44,6 @@ from pylibcudf cimport Table from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata from pylibcudf.io.parquet cimport ParquetChunkedWriter -from cython.operator cimport dereference - - -cdef class BufferArrayFromVector: - cdef Py_ssize_t length - cdef unique_ptr[vector[uint8_t]] in_vec - - # these two things declare part of the buffer interface - cdef Py_ssize_t shape[1] - cdef Py_ssize_t strides[1] - - @staticmethod - cdef BufferArrayFromVector from_unique_ptr( - unique_ptr[vector[uint8_t]] in_vec - ): - cdef BufferArrayFromVector buf = BufferArrayFromVector() - buf.in_vec = move(in_vec) - buf.length = dereference(buf.in_vec).size() - return buf - - def __getbuffer__(self, Py_buffer *buffer, int flags): - cdef Py_ssize_t itemsize = sizeof(uint8_t) - - self.shape[0] = self.length - self.strides[0] = 1 - - buffer.buf = dereference(self.in_vec).data() - - buffer.format = NULL # byte - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.length * itemsize # product(shape) * itemsize - buffer.ndim = 1 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - - def __releasebuffer__(self, Py_buffer *buffer): - pass def _parse_metadata(meta): From 47643959aaa7331523d79178bf37ea5106a01c05 Mon Sep 17 00:00:00 2001 From: Hirota Akio <33370421+a-hirota@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:13:02 +0900 Subject: [PATCH 10/14] Enable rounding for Decimal32 and Decimal64 in cuDF (#17332) Authors: - Hirota Akio (https://github.com/a-hirota) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: 
https://github.com/rapidsai/cudf/pull/17332 --- python/cudf/cudf/core/indexed_frame.py | 8 +++- python/cudf/cudf/tests/test_series.py | 63 ++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 95f3d4d01d5..0e6a5e03ea6 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3970,7 +3970,13 @@ def round(self, decimals=0, how="half_even"): cols = ( col.round(decimals[name], how=how) - if name in decimals and col.dtype.kind in "fiu" + if name in decimals + and ( + col.dtype.kind in "fiu" + or isinstance( + col.dtype, (cudf.Decimal32Dtype, cudf.Decimal64Dtype) + ) + ) else col.copy(deep=True) for name, col in self._column_labels_and_values ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 99bd9adb034..f8697c5c6b8 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -772,6 +772,69 @@ def test_round_nan_as_null_false(series, decimal): assert_eq(result, expected, atol=1e-10) +@pytest.mark.parametrize( + "data, dtype, decimals, expected_half_up, expected_half_even", + [ + ( + [1.234, 2.345, 3.456], + cudf.Decimal32Dtype(precision=5, scale=3), + 2, + [1.23, 2.35, 3.46], + [1.23, 2.34, 3.46], + ), + ( + [1.234, 2.345, 3.456], + cudf.Decimal32Dtype(precision=5, scale=3), + 0, + [1.0, 2.0, 3.0], + [1.0, 2.0, 3.0], + ), + ( + [1.234, 2.345, 3.456], + cudf.Decimal32Dtype(precision=5, scale=3), + 3, + [1.234, 2.345, 3.456], + [1.234, 2.345, 3.456], + ), + ( + [1.234567, 2.345678, 3.456789], + cudf.Decimal64Dtype(precision=10, scale=6), + 4, + [1.2346, 2.3457, 3.4568], + [1.2346, 2.3457, 3.4568], + ), + ( + [1.234567, 2.345678, 3.456789], + cudf.Decimal64Dtype(precision=10, scale=6), + 2, + [1.23, 2.35, 3.46], + [1.23, 2.35, 3.46], + ), + ( + [1.234567, 2.345678, 3.456789], + cudf.Decimal64Dtype(precision=10, scale=6), + 6, + [1.234567, 2.345678, 3.456789], + [1.234567, 2.345678, 3.456789], + ), + ], +) +def test_series_round_decimal( + data, dtype, decimals, expected_half_up, expected_half_even +): + ser = cudf.Series(data).astype(dtype) + + result_half_up = ser.round(decimals=decimals, how="half_up").astype(dtype) + expected_ser_half_up = cudf.Series(expected_half_up).astype(dtype) + assert_eq(result_half_up, expected_ser_half_up) + + result_half_even = ser.round(decimals=decimals, how="half_even").astype( + dtype + ) + expected_ser_half_even = cudf.Series(expected_half_even).astype(dtype) + assert_eq(result_half_even, expected_ser_half_even) + + @pytest.mark.parametrize("ps", _series_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_series_isnull_isna(ps, nan_as_null): From c53ace8f381af7c9e9dce161dcc756d07f8f147c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:35:11 -0800 Subject: [PATCH 11/14] Fix CMake format in cudf/_lib/CMakeLists.txt (#17559) Due to 2 of my cudf._lib refactoring PRs going in which then impacted formatting of `cudf/_lib/CMakeLists.txt` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17559 --- python/cudf/cudf/_lib/CMakeLists.txt | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 084fc19a61e..efe96ff6c3e 
100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -13,20 +13,8 @@ # ============================================================================= set(cython_sources - column.pyx - copying.pyx - csv.pyx - groupby.pyx - interop.pyx - parquet.pyx - reduce.pyx - scalar.pyx - sort.pyx - stream_compaction.pyx - string_casting.pyx - strings_udf.pyx - types.pyx - utils.pyx + column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx parquet.pyx reduce.pyx scalar.pyx + sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) From e16b3a3c499bda40082c1990f94ef0aa3bb23b35 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 10 Dec 2024 01:16:54 -0600 Subject: [PATCH 12/14] Remove Thrust patch in favor of CMake definition for Thrust 32-bit offset types. (#17527) Follow-up for #17523 to use `target_compile_definitions` and drop the Thrust patch. Authors: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17527 --- cpp/CMakeLists.txt | 4 ++++ .../thirdparty/patches/cccl_override.json | 5 ----- .../thrust_disable_64bit_dispatching.diff | 22 ------------------- 3 files changed, 4 insertions(+), 27 deletions(-) delete mode 100644 cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12e6826f301..e54c71de4fa 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -928,6 +928,10 @@ if(TARGET CUDA::cuFile${_cufile_suffix}) target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND) endif() +# Remove this after upgrading to a CCCL that has a proper CMake option. See +# https://github.com/NVIDIA/cccl/pull/2844 +target_compile_definitions(cudf PRIVATE THRUST_FORCE_32_BIT_OFFSET_TYPE=1) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 2f29578f7ae..d5cadce40c2 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff deleted file mode 100644 index 9f68d85e7db..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ /dev/null @@ -1,22 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 3d004aa55..71ce86bea 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -63,7 +63,7 @@ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count1) \ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count2) - --#if defined(THRUST_FORCE_64_BIT_OFFSET_TYPE) -+#if 0 - //! 
@brief Always dispatches to 64 bit offset version of an algorithm - # define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ -@@ -89,7 +89,7 @@ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ - _THRUST_INDEX_TYPE_DISPATCH(std::uint64_t, status, call_64, count, arguments) - --#elif defined(THRUST_FORCE_32_BIT_OFFSET_TYPE) -+#elif 1 - - //! @brief Ensures that the size of the input does not overflow the offset type - # define _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(index_type, count) \ From 13e983eafecad5a3d4053157febd714e40a410c3 Mon Sep 17 00:00:00 2001 From: Mike Sarahan Date: Tue, 10 Dec 2024 09:15:37 -0600 Subject: [PATCH 13/14] gate telemetry dispatch calls on TELEMETRY_ENABLED env var (#17551) Because of the switch away from certificates/mTLS, we are having to rework a few things. In the meantime, telemetry jobs are failing. This PR adds a switch to turn all of the telemetry stuff off - to skip it instead. It is meant to be controlled by an org-wide environment variable, which can be applied to individual repos by ops. At the time of submitting this PR, the environment variable is 'false' and no telemetry is being reported. Authors: - Mike Sarahan (https://github.com/msarahan) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17551 --- .github/workflows/pr.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7c0bd6d52e2..49ca5ca0fb9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -52,6 +52,7 @@ jobs: OTEL_SERVICE_NAME: 'pr-cudf' steps: - name: Telemetry setup + if: ${{ vars.TELEMETRY_ENABLED == 'true' }} uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main changed-files: secrets: inherit @@ -329,7 +330,7 @@ jobs: telemetry-summarize: runs-on: ubuntu-latest needs: pr-builder - if: always() + if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} continue-on-error: true steps: - name: Load stashed telemetry env vars From 3468e9259960b4f16cd849e8497be4f5bee0839b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 10 Dec 2024 12:32:10 -0500 Subject: [PATCH 14/14] Replace cudf::detail::copy_if logic with thrust::copy_if and gather (#17520) Replaces the custom kernels for `cudf::detail::copy_if` with a call to `thrust::copy_if` to build indices to call `cudf::detail::gather`. This is easier to maintain and faster for some cases but slower in others. 
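For reference, the pattern this moves to (`thrust::copy_if` over a counting iterator to build a gather map, then a gather through that map) is sketched below as a standalone Thrust program. This is illustrative only and not the libcudf code in the diff, which goes through `cudf::detail::gather` and handles nulls and all column types; `keep_even` is an arbitrary stand-in for the row predicate.

```cpp
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>

#include <vector>

struct keep_even {
  __device__ bool operator()(int row) const { return row % 2 == 0; }
};

int main()
{
  std::vector<int> h_values{10, 20, 30, 40, 50};
  thrust::device_vector<int> values(h_values.begin(), h_values.end());

  // 1. Build the gather map: indices of the rows that pass the predicate.
  thrust::device_vector<int> indices(values.size());
  auto indices_end = thrust::copy_if(thrust::device,
                                     thrust::counting_iterator<int>(0),
                                     thrust::counting_iterator<int>(static_cast<int>(values.size())),
                                     indices.begin(),
                                     keep_even{});
  indices.resize(indices_end - indices.begin());

  // 2. Gather the surviving rows through the map.
  thrust::device_vector<int> out(indices.size());
  thrust::gather(thrust::device, indices.begin(), indices.end(), values.begin(), out.begin());
  // out now holds {10, 30, 50}
  return 0;
}
```
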
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17520 --- .../stream_compaction/apply_boolean_mask.cpp | 8 +- cpp/include/cudf/detail/copy_if.cuh | 363 +----------------- cpp/src/dictionary/remove_keys.cu | 1 + 3 files changed, 23 insertions(+), 349 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index fa017ca9e29..267aa3a93f3 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -63,8 +63,8 @@ void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list -#include #include -#include #include #include -#include -#include -#include -#include #include #include #include #include -#include #include -#include -#include #include -#include #include #include -#include -#include #include #include -#include - namespace cudf { namespace detail { -// Compute the count of elements that pass the mask within each block -template -CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - int count = 0; - - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - count += __syncthreads_count(mask_true); - tid += block_size; - } - - if (threadIdx.x == 0) block_counts[blockIdx.x] = count; -} - -// Compute the exclusive prefix sum of each thread's mask value within each block -template -__device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& block_sum) -{ - int offset = 0; - - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - BlockScan(temp_storage).ExclusiveSum(mask_true, offset, block_sum); - - return offset; -} - -// This kernel scatters data and validity mask of a column based on the -// scan of the boolean mask. The block offsets for the scan are already computed. -// Just compute the scan of the mask in each block and add it to the block's -// output offset. This is the output index of each element. Scattering -// the valid mask is not as easy, because each thread is only responsible for -// one bit. Warp-level processing (ballot) makes this simpler. -// To make scattering efficient, we "coalesce" the block's scattered data and -// valids in shared memory, and then write from shared memory to global memory -// in a contiguous manner. -// The has_validity template parameter specializes this kernel for the -// non-nullable case for performance without writing another kernel. 
-// -// Note: `filter` is not run on indices larger than the input column size -template -__launch_bounds__(block_size) CUDF_KERNEL - void scatter_kernel(cudf::mutable_column_device_view output_view, - cudf::size_type* output_null_count, - cudf::column_device_view input_view, - cudf::size_type const* __restrict__ block_offsets, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - T* __restrict__ output_data = output_view.data(); - cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask(); - static_assert(block_size <= 1024, "Maximum thread block size exceeded"); - - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - cudf::size_type block_offset = block_offsets[blockIdx.x]; - - // one extra warp worth in case the block is not aligned - __shared__ bool temp_valids[has_validity ? block_size + cudf::detail::warp_size : 1]; - __shared__ T temp_data[block_size]; - - cudf::size_type warp_valid_counts{0}; // total valid sum over the `per_thread` loop below - cudf::size_type block_sum = 0; // count passing filter over the `per_thread` loop below - - // Note that since the maximum gridDim.x on all supported GPUs is as big as - // cudf::size_type, this loop is sufficient to cover our maximum column size - // regardless of the value of block_size and per_thread. - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - - cudf::size_type tmp_block_sum = 0; - // get output location using a scan of the mask result - cudf::size_type const local_index = block_scan_mask(mask_true, tmp_block_sum); - block_sum += tmp_block_sum; - - if (has_validity) { - temp_valids[threadIdx.x] = false; // init shared memory - if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false; - __syncthreads(); // wait for init - } - - if (mask_true) { - temp_data[local_index] = input_view.data()[tid]; // scatter data to shared - - // scatter validity mask to shared memory - if (has_validity and input_view.is_valid(tid)) { - // determine aligned offset for this warp's output - cudf::size_type const aligned_offset = block_offset % cudf::detail::warp_size; - temp_valids[local_index + aligned_offset] = true; - } - } - - __syncthreads(); // wait for shared data and validity mask to be complete - - // Copy output data coalesced from shared to global - if (threadIdx.x < tmp_block_sum) - output_data[block_offset + threadIdx.x] = temp_data[threadIdx.x]; - - if (has_validity) { - // Since the valid bools are contiguous in shared memory now, we can use - // __popc to combine them into a single mask element. - // Then, most mask elements can be directly copied from shared to global - // memory. Only the first and last 32-bit mask elements of each block must - // use an atomicOr, because these are where other blocks may overlap. 
- - constexpr int num_warps = block_size / cudf::detail::warp_size; - // account for partial blocks with non-warp-aligned offsets - int const last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; - int const last_warp = min(num_warps, last_index / cudf::detail::warp_size); - int const wid = threadIdx.x / cudf::detail::warp_size; - int const lane = threadIdx.x % cudf::detail::warp_size; - - cudf::size_type tmp_warp_valid_counts{0}; - - if (tmp_block_sum > 0 && wid <= last_warp) { - int valid_index = (block_offset / cudf::detail::warp_size) + wid; - - // compute the valid mask for this warp - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[threadIdx.x]); - - // Note the atomicOr's below assume that output_valid has been set to - // all zero before the kernel - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts = __popc(valid_warp); - if (wid > 0 && wid < last_warp) - output_valid[valid_index] = valid_warp; - else { - cuda::atomic_ref ref{ - output_valid[valid_index]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - - // if the block is full and not aligned then we have one more warp to cover - if ((wid == 0) && (last_warp == num_warps)) { - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts += __popc(valid_warp); - cuda::atomic_ref ref{ - output_valid[valid_index + num_warps]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - } - warp_valid_counts += tmp_warp_valid_counts; - } - - block_offset += tmp_block_sum; - tid += block_size; - } - // Compute total null_count for this block and add it to global count - constexpr cudf::size_type leader_lane{0}; - cudf::size_type block_valid_count = - cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); - - if (threadIdx.x == 0) { // one thread computes and adds to null count - cuda::atomic_ref ref{*output_null_count}; - ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); - } -} - -template -struct DeviceType { - using type = T; -}; - -template -struct DeviceType()>> { - using type = typename T::rep; -}; - -template -struct DeviceType()>> { - using type = typename cudf::device_storage_type_t; -}; - -// Dispatch functor which performs the scatter for fixed column types and gather for other -template -struct scatter_gather_functor { - template ()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const* block_offsets, - Filter filter, - cudf::size_type per_thread, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto output_column = - cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); - auto output = output_column->mutable_view(); - - bool has_valid = input.nullable(); - - using Type = typename DeviceType::type; - - auto scatter = (has_valid) ? scatter_kernel - : scatter_kernel; - - cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - - cudf::detail::device_scalar null_count{0, stream}; - if (output.nullable()) { - // Have to initialize the output mask to all zeros because we may update - // it with atomicOr(). 
- CUDF_CUDA_TRY(cudaMemsetAsync(static_cast(output.null_mask()), - 0, - cudf::bitmask_allocation_size_bytes(output.size()), - stream.value())); - } - - auto output_device_view = cudf::mutable_column_device_view::create(output, stream); - auto input_device_view = cudf::column_device_view::create(input, stream); - scatter<<>>(*output_device_view, - null_count.data(), - *input_device_view, - block_offsets, - input.size(), - per_thread, - filter); - - if (has_valid) { output_column->set_null_count(null_count.value(stream)); } - return output_column; - } - - template () and !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const*, - Filter filter, - cudf::size_type, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - rmm::device_uvector indices(output_size, stream); - - thrust::copy_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - indices.begin(), - filter); - - auto output_table = cudf::detail::gather(cudf::table_view{{input}}, - indices, - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - - // There will be only one column - return std::make_unique(std::move(output_table->get_column(0))); - } -}; - /** * @brief Filters `input` using a Filter function object * @@ -319,9 +44,11 @@ struct scatter_gather_functor { * false otherwise. * * @tparam Filter the filter functor type - * @param[in] input The table_view to filter - * @param[in] filter A function object that takes an index and returns a bool - * @return unique_ptr The table generated from filtered `input`. + * @param input The table_view to filter + * @param filter A function object that takes an index and returns a bool + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for allocating the returned memory + * @return The table generated from filtered `input` */ template std::unique_ptr
<table> copy_if(table_view const& input,
@@ -333,76 +60,22 @@ std::unique_ptr<table>
copy_if(table_view const& input, if (0 == input.num_rows() || 0 == input.num_columns()) { return empty_like(input); } - constexpr int block_size = 256; - cudf::size_type per_thread = - elements_per_thread(compute_block_counts, input.num_rows(), block_size); - cudf::detail::grid_1d grid{input.num_rows(), block_size, per_thread}; - - // temp storage for block counts and offsets - rmm::device_uvector block_counts(grid.num_blocks, stream); - rmm::device_uvector block_offsets(grid.num_blocks + 1, stream); - - // 1. Find the count of elements in each block that "pass" the mask - compute_block_counts<<>>( - block_counts.begin(), input.num_rows(), per_thread, filter); - - // initialize just the first element of block_offsets to 0 since the InclusiveSum below - // starts at the second element. - CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value())); - - // 2. Find the offset for each block's output using a scan of block counts - if (grid.num_blocks > 1) { - // Determine and allocate temporary device storage - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(nullptr, - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - // Run exclusive prefix sum - cub::DeviceScan::InclusiveSum(d_temp_storage.data(), - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - } - - // As it is InclusiveSum, last value in block_offsets will be output_size - // unless num_blocks == 1, in which case output_size is just block_counts[0] - cudf::size_type output_size{0}; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &output_size, - grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(), - sizeof(cudf::size_type), - cudaMemcpyDefault, - stream.value())); + auto indices = rmm::device_uvector(input.num_rows(), stream); + auto const begin = thrust::counting_iterator(0); + auto const end = begin + input.num_rows(); + auto const indices_end = + thrust::copy_if(rmm::exec_policy(stream), begin, end, indices.begin(), filter); - stream.synchronize(); + auto const output_size = static_cast(thrust::distance(indices.begin(), indices_end)); - if (output_size == input.num_rows()) { - return std::make_unique
(input, stream, mr); - } else if (output_size > 0) { - std::vector> out_columns(input.num_columns()); - std::transform(input.begin(), input.end(), out_columns.begin(), [&](auto col_view) { - return cudf::type_dispatcher(col_view.type(), - scatter_gather_functor{}, - col_view, - output_size, - block_offsets.begin(), - filter, - per_thread, - stream, - mr); - }); + // nothing selected + if (output_size == 0) { return empty_like(input); } + // everything selected + if (output_size == input.num_rows()) { return std::make_unique
<table>(input, stream, mr); }

-    return std::make_unique<table>
(std::move(out_columns)); - } else { - return empty_like(input); - } + auto const map = device_span(indices.data(), output_size); + return cudf::detail::gather( + input, map, out_of_bounds_policy::DONT_CHECK, negative_index_policy::NOT_ALLOWED, stream, mr); } } // namespace detail diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 59c8453cf33..4715931a7a9 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include
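
As a usage-level companion to the `apply_boolean_mask` benchmark touched at the top of this patch, a minimal sketch of the public API that funnels into `cudf::detail::copy_if` could look like the following. This is an illustration only, not part of the change; it assumes the `cudf::test` fixed-width column wrappers purely to keep column construction short.

```cpp
#include <cudf_test/column_wrapper.hpp>

#include <cudf/stream_compaction.hpp>
#include <cudf/table/table_view.hpp>

int main()
{
  // Four rows of input and a BOOL8 mask selecting rows 0 and 2.
  cudf::test::fixed_width_column_wrapper<int32_t> values{{10, 20, 30, 40}};
  cudf::test::fixed_width_column_wrapper<bool> keep{{true, false, true, false}};

  // Rows where `keep` is true survive; the result here has two rows (10 and 30).
  auto filtered = cudf::apply_boolean_mask(cudf::table_view{{values}}, keep);
  return filtered->num_rows() == 2 ? 0 : 1;
}
```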