From 305182e58c19add98a5abd6a5b00d9b266f41732 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 22 Nov 2024 08:45:32 -0600
Subject: [PATCH 01/23] Enable unified memory by default in `cudf_polars`
 (#17375)

This PR enables Unified memory as the default memory resource for
`cudf_polars`

---------

Co-authored-by: Vyas Ramasubramani <vyasr@nvidia.com>
Co-authored-by: Vyas Ramasubramani <vyas.ramasubramani@gmail.com>
Co-authored-by: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Co-authored-by: Lawrence Mitchell <wence@gmx.li>
Co-authored-by: Matthew Murray <matthewmurray711@gmail.com>
---
 .../cudf/source/cudf_polars/engine_options.md |  7 +++
 docs/cudf/source/cudf_polars/index.rst        |  6 ++
 python/cudf_polars/cudf_polars/callback.py    | 56 +++++++++++++++++--
 python/cudf_polars/tests/test_config.py       | 20 +++++++
 4 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/docs/cudf/source/cudf_polars/engine_options.md b/docs/cudf/source/cudf_polars/engine_options.md
index 4c930c7392d..afb2bb6e8b9 100644
--- a/docs/cudf/source/cudf_polars/engine_options.md
+++ b/docs/cudf/source/cudf_polars/engine_options.md
@@ -23,3 +23,10 @@ engine = GPUEngine(
 result = query.collect(engine=engine)
 ```
 Note that passing `chunked: False` disables chunked reading entirely, and thus `chunk_read_limit` and `pass_read_limit` will have no effect.
+
+## Disabling CUDA Managed Memory
+
+By default, `cudf_polars` uses [CUDA managed memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#unified-memory-introduction) with RMM's pool allocator. On systems that don't support managed memory, a non-managed asynchronous pool
+allocator is used.
+Managed memory can be turned off by setting `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` to `0`. System requirements for managed memory can be found [here](
+https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory).
diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst
index 6fd98a6b5da..a9b4bb2dff2 100644
--- a/docs/cudf/source/cudf_polars/index.rst
+++ b/docs/cudf/source/cudf_polars/index.rst
@@ -9,6 +9,12 @@ and run on the CPU.
 
 Benchmark
 ---------
+
+.. note::
+   The following benchmarks were performed with `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` environment variable set to `"0"`.
+   Using managed memory (the default) imposes a performance cost in order to avoid out of memory errors.
+   Peak performance can still be attained by setting the environment variable to `"0"`, which disables managed memory.
+
 We reproduced the `Polars Decision Support (PDS) <https://github.com/pola-rs/polars-benchmark>`__ benchmark to compare Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:
 
 .. figure:: ../_static/pds_benchmark_polars.png
diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index 7915c9e6b18..8dc5715195d 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -15,6 +15,7 @@
 
 from polars.exceptions import ComputeError, PerformanceWarning
 
+import pylibcudf
 import rmm
 from rmm._cuda import gpu
 
@@ -32,8 +33,26 @@
 __all__: list[str] = ["execute_with_cudf"]
 
 
+_SUPPORTED_PREFETCHES = {
+    "column_view::get_data",
+    "mutable_column_view::get_data",
+    "gather",
+    "hash_join",
+}
+
+
+def _env_get_int(name, default):
+    try:
+        return int(os.getenv(name, default))
+    except (ValueError, TypeError):  # pragma: no cover
+        return default  # pragma: no cover
+
+
 @cache
-def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
+def default_memory_resource(
+    device: int,
+    cuda_managed_memory: bool,  # noqa: FBT001
+) -> rmm.mr.DeviceMemoryResource:
     """
     Return the default memory resource for cudf-polars.
 
@@ -42,15 +61,35 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
     device
         Disambiguating device id when selecting the device. Must be
         the active device when this function is called.
+    cuda_managed_memory
+        Whether to use managed memory or not.
 
     Returns
     -------
     rmm.mr.DeviceMemoryResource
         The default memory resource that cudf-polars uses. Currently
-        an async pool resource.
+        a managed memory resource if `cuda_managed_memory` is `True` and
+        managed memory is supported; otherwise an async pool resource.
     """
     try:
-        return rmm.mr.CudaAsyncMemoryResource()
+        if (
+            cuda_managed_memory
+            and pylibcudf.utils._is_concurrent_managed_access_supported()
+        ):
+            # Allocating 80% of the available memory for the pool.
+            # Leaving a 20% headroom to avoid OOM errors.
+            free_memory, _ = rmm.mr.available_device_memory()
+            free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
+            for key in _SUPPORTED_PREFETCHES:
+                pylibcudf.experimental.enable_prefetching(key)
+            mr = rmm.mr.PrefetchResourceAdaptor(
+                rmm.mr.PoolMemoryResource(
+                    rmm.mr.ManagedMemoryResource(),
+                    initial_pool_size=free_memory,
+                )
+            )
+        else:
+            mr = rmm.mr.CudaAsyncMemoryResource()
     except RuntimeError as e:  # pragma: no cover
         msg, *_ = e.args
         if (
@@ -64,6 +103,8 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
             ) from None
         else:
             raise
+    else:
+        return mr
 
 
 @contextlib.contextmanager
@@ -89,10 +130,15 @@ def set_memory_resource(
     at entry. If a memory resource is provided, it must be valid to
     use with the currently active device.
     """
+    previous = rmm.mr.get_current_device_resource()
     if mr is None:
         device: int = gpu.getDevice()
-        mr = default_memory_resource(device)
-    previous = rmm.mr.get_current_device_resource()
+        mr = default_memory_resource(
+            device=device,
+            cuda_managed_memory=bool(
+                _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0
+            ),
+        )
     rmm.mr.set_current_device_resource(mr)
     try:
         yield mr
diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py
index 25b71716eed..52c5c9894fe 100644
--- a/python/cudf_polars/tests/test_config.py
+++ b/python/cudf_polars/tests/test_config.py
@@ -10,6 +10,7 @@
 
 import rmm
 
+from cudf_polars.callback import default_memory_resource
 from cudf_polars.dsl.ir import DataFrameScan
 from cudf_polars.testing.asserts import (
     assert_gpu_result_equal,
@@ -58,6 +59,25 @@ def test_invalid_memory_resource_raises(mr):
         q.collect(engine=pl.GPUEngine(memory_resource=mr))
 
 
+@pytest.mark.parametrize("disable_managed_memory", ["1", "0"])
+def test_cudf_polars_enable_disable_managed_memory(monkeypatch, disable_managed_memory):
+    q = pl.LazyFrame({"a": [1, 2, 3]})
+
+    with monkeypatch.context() as monkeycontext:
+        monkeycontext.setenv(
+            "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", disable_managed_memory
+        )
+        result = q.collect(engine=pl.GPUEngine())
+        mr = default_memory_resource(0, bool(disable_managed_memory == "1"))
+        if disable_managed_memory == "1":
+            assert isinstance(mr, rmm.mr.PrefetchResourceAdaptor)
+            assert isinstance(mr.upstream_mr, rmm.mr.PoolMemoryResource)
+        else:
+            assert isinstance(mr, rmm.mr.CudaAsyncMemoryResource)
+        monkeycontext.delenv("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY")
+    assert_frame_equal(q.collect(), result)
+
+
 def test_explicit_device_zero():
     q = pl.LazyFrame({"a": [1, 2, 3]})
 

From 53e452539070fe86f68b543fa7237816d657a01a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 22 Nov 2024 11:01:36 -0800
Subject: [PATCH 02/23] Add write_parquet to pylibcudf (#17263)

Broken off from https://github.com/rapidsai/cudf/pull/17252 since also replacing cudf Python's `write_parquet` usage would have made the PR fairly large.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Matthew Murray (https://github.com/Matt711)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17263
---
 .../pylibcudf/pylibcudf/contiguous_split.pxd  |  13 +
 .../pylibcudf/pylibcudf/contiguous_split.pyx  |  13 +-
 python/pylibcudf/pylibcudf/io/parquet.pxd     |  62 ++-
 python/pylibcudf/pylibcudf/io/parquet.pyi     |  46 ++-
 python/pylibcudf/pylibcudf/io/parquet.pyx     | 364 +++++++++++++++++-
 python/pylibcudf/pylibcudf/io/types.pxd       |  39 ++
 python/pylibcudf/pylibcudf/io/types.pyi       |  19 +
 python/pylibcudf/pylibcudf/io/types.pyx       | 227 +++++++++++
 .../pylibcudf/libcudf/io/parquet.pxd          |   6 +-
 .../pylibcudf/tests/io/test_parquet.py        |  69 ++++
 10 files changed, 840 insertions(+), 18 deletions(-)

diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd
index 3745e893c3e..14ad84709d5 100644
--- a/python/pylibcudf/pylibcudf/contiguous_split.pxd
+++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd
@@ -1,12 +1,25 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport uint8_t
 from libcpp.memory cimport unique_ptr
+from libcpp.vector cimport vector
 from pylibcudf.libcudf.contiguous_split cimport packed_columns
 
 from .gpumemoryview cimport gpumemoryview
 from .table cimport Table
 
 
+cdef class HostBuffer:
+    cdef unique_ptr[vector[uint8_t]] c_obj
+    cdef size_t nbytes
+    cdef Py_ssize_t[1] shape
+    cdef Py_ssize_t[1] strides
+
+    @staticmethod
+    cdef HostBuffer from_unique_ptr(
+        unique_ptr[vector[uint8_t]] vec
+    )
+
 cdef class PackedColumns:
     cdef unique_ptr[packed_columns] c_obj
 
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx
index 2a40d42e6e9..b24b7a21af1 100644
--- a/python/pylibcudf/pylibcudf/contiguous_split.pyx
+++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx
@@ -29,18 +29,14 @@ __all__ = [
 
 cdef class HostBuffer:
     """Owning host buffer that implements the buffer protocol"""
-    cdef unique_ptr[vector[uint8_t]] c_obj
-    cdef size_t nbytes
-    cdef Py_ssize_t[1] shape
-    cdef Py_ssize_t[1] strides
-
     @staticmethod
     cdef HostBuffer from_unique_ptr(
         unique_ptr[vector[uint8_t]] vec
     ):
-        cdef HostBuffer out = HostBuffer()
+        cdef HostBuffer out = HostBuffer.__new__(HostBuffer)
+        # Allow construction from nullptr
+        out.nbytes = 0 if vec.get() == NULL else dereference(vec).size()
         out.c_obj = move(vec)
-        out.nbytes = dereference(out.c_obj).size()
         out.shape[0] = out.nbytes
         out.strides[0] = 1
         return out
@@ -48,7 +44,8 @@ cdef class HostBuffer:
     __hash__ = None
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
-        buffer.buf = dereference(self.c_obj).data()
+        # Empty vec produces empty buffer
+        buffer.buf = NULL if self.nbytes == 0 else dereference(self.c_obj).data()
         buffer.format = NULL  # byte
         buffer.internal = NULL
         buffer.itemsize = 1
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd
index 9c476030ded..1a61c20d783 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pxd
+++ b/python/pylibcudf/pylibcudf/io/parquet.pxd
@@ -1,14 +1,26 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport int64_t
+from libc.stdint cimport int64_t, uint8_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
+from libcpp.vector cimport vector
 from pylibcudf.expressions cimport Expression
-from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.io.types cimport (
+    compression_type,
+    dictionary_policy,
+    statistics_freq,
+    SinkInfo,
+    SourceInfo,
+    TableInputMetadata,
+    TableWithMetadata,
+)
 from pylibcudf.libcudf.io.parquet cimport (
     chunked_parquet_reader as cpp_chunked_parquet_reader,
+    parquet_writer_options,
+    parquet_writer_options_builder,
 )
 from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.table cimport Table
 from pylibcudf.types cimport DataType
 
 
@@ -33,3 +45,49 @@ cpdef read_parquet(
     # ReaderColumnSchema reader_column_schema = *,
     # DataType timestamp_type = *
 )
+
+cdef class ParquetWriterOptions:
+    cdef parquet_writer_options c_obj
+    cdef Table table_ref
+    cdef SinkInfo sink_ref
+
+    cpdef void set_partitions(self, list partitions)
+
+    cpdef void set_column_chunks_file_paths(self, list file_paths)
+
+    cpdef void set_row_group_size_bytes(self, size_t size_bytes)
+
+    cpdef void set_row_group_size_rows(self, size_type size_rows)
+
+    cpdef void set_max_page_size_bytes(self, size_t size_bytes)
+
+    cpdef void set_max_page_size_rows(self, size_type size_rows)
+
+    cpdef void set_max_dictionary_size(self, size_t size_bytes)
+
+cdef class ParquetWriterOptionsBuilder:
+    cdef parquet_writer_options_builder c_obj
+    cdef Table table_ref
+    cdef SinkInfo sink_ref
+
+    cpdef ParquetWriterOptionsBuilder metadata(self, TableInputMetadata metadata)
+
+    cpdef ParquetWriterOptionsBuilder key_value_metadata(self, list metadata)
+
+    cpdef ParquetWriterOptionsBuilder compression(self, compression_type compression)
+
+    cpdef ParquetWriterOptionsBuilder stats_level(self, statistics_freq sf)
+
+    cpdef ParquetWriterOptionsBuilder int96_timestamps(self, bool enabled)
+
+    cpdef ParquetWriterOptionsBuilder write_v2_headers(self, bool enabled)
+
+    cpdef ParquetWriterOptionsBuilder dictionary_policy(self, dictionary_policy val)
+
+    cpdef ParquetWriterOptionsBuilder utc_timestamps(self, bool enabled)
+
+    cpdef ParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled)
+
+    cpdef ParquetWriterOptions build(self)
+
+cpdef memoryview write_parquet(ParquetWriterOptions options)
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi
index bcf1d1cce09..eb2ca68109b 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyi
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyi
@@ -1,7 +1,20 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from collections.abc import Mapping
+from typing import Self
+
 from pylibcudf.expressions import Expression
-from pylibcudf.io.types import SourceInfo, TableWithMetadata
+from pylibcudf.io.types import (
+    CompressionType,
+    DictionaryPolicy,
+    PartitionInfo,
+    SinkInfo,
+    SourceInfo,
+    StatisticsFreq,
+    TableInputMetadata,
+    TableWithMetadata,
+)
+from pylibcudf.table import Table
 
 class ChunkedParquetReader:
     def __init__(
@@ -34,3 +47,34 @@ def read_parquet(
     # reader_column_schema: ReaderColumnSchema = *,
     # timestamp_type: DataType = *
 ) -> TableWithMetadata: ...
+
+class ParquetWriterOptions:
+    def __init__(self): ...
+    @staticmethod
+    def builder(
+        sink: SinkInfo, table: Table
+    ) -> ParquetWriterOptionsBuilder: ...
+    def set_partitions(self, partitions: list[PartitionInfo]) -> None: ...
+    def set_column_chunks_file_paths(self, file_paths: list[str]) -> None: ...
+    def set_row_group_size_bytes(self, size_bytes: int) -> None: ...
+    def set_row_group_size_rows(self, size_rows: int) -> None: ...
+    def set_max_page_size_bytes(self, size_bytes: int) -> None: ...
+    def set_max_page_size_rows(self, size_rows: int) -> None: ...
+    def set_max_dictionary_size(self, size_bytes: int) -> None: ...
+
+class ParquetWriterOptionsBuilder:
+    def __init__(self): ...
+    def metadata(self, metadata: TableInputMetadata) -> Self: ...
+    def key_value_metadata(
+        self, metadata: list[Mapping[str, str]]
+    ) -> Self: ...
+    def compression(self, compression: CompressionType) -> Self: ...
+    def stats_level(self, sf: StatisticsFreq) -> Self: ...
+    def int96_timestamps(self, enabled: bool) -> Self: ...
+    def write_v2_headers(self, enabled: bool) -> Self: ...
+    def dictionary_policy(self, val: DictionaryPolicy) -> Self: ...
+    def utc_timestamps(self, enabled: bool) -> Self: ...
+    def write_arrow_schema(self, enabled: bool) -> Self: ...
+    def build(self) -> ParquetWriterOptions: ...
+
+def write_parquet(options: ParquetWriterOptions) -> memoryview: ...
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index b76a352d633..b95b1f39de1 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -1,22 +1,45 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from cython.operator cimport dereference
-from libc.stdint cimport int64_t
+from libc.stdint cimport int64_t, uint8_t
 from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
+from pylibcudf.contiguous_split cimport HostBuffer
 from pylibcudf.expressions cimport Expression
-from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.io.types cimport (
+    SinkInfo,
+    SourceInfo,
+    PartitionInfo,
+    TableInputMetadata,
+    TableWithMetadata
+)
 from pylibcudf.libcudf.expressions cimport expression
 from pylibcudf.libcudf.io.parquet cimport (
     chunked_parquet_reader as cpp_chunked_parquet_reader,
     parquet_reader_options,
     read_parquet as cpp_read_parquet,
+    write_parquet as cpp_write_parquet,
+    parquet_writer_options,
+)
+from pylibcudf.libcudf.io.types cimport (
+    compression_type,
+    dictionary_policy as dictionary_policy_t,
+    partition_info,
+    statistics_freq,
+    table_with_metadata,
 )
-from pylibcudf.libcudf.io.types cimport table_with_metadata
 from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.table cimport Table
 
-__all__ = ["ChunkedParquetReader", "read_parquet"]
+__all__ = [
+    "ChunkedParquetReader",
+    "ParquetWriterOptions",
+    "ParquetWriterOptionsBuilder",
+    "read_parquet",
+    "write_parquet"
+]
 
 
 cdef parquet_reader_options _setup_parquet_reader_options(
@@ -221,3 +244,336 @@ cpdef read_parquet(
         c_result = move(cpp_read_parquet(opts))
 
     return TableWithMetadata.from_libcudf(c_result)
+
+
+cdef class ParquetWriterOptions:
+
+    @staticmethod
+    def builder(SinkInfo sink, Table table):
+        """
+        Create a ParquetWriterOptionsBuilder used to build ParquetWriterOptions.
+
+        Parameters
+        ----------
+        sink : SinkInfo
+            The sink used for writer output
+
+        table : Table
+            Table to be written to output
+
+        Returns
+        -------
+        ParquetWriterOptionsBuilder
+        """
+        cdef ParquetWriterOptionsBuilder bldr = ParquetWriterOptionsBuilder.__new__(
+            ParquetWriterOptionsBuilder
+        )
+        bldr.c_obj = parquet_writer_options.builder(sink.c_obj, table.view())
+        bldr.table_ref = table
+        bldr.sink_ref = sink
+        return bldr
+
+    cpdef void set_partitions(self, list partitions):
+        """
+        Sets partitions.
+
+        Parameters
+        ----------
+        partitions : list[PartitionInfo]
+            Partitions of input table in {start_row, num_rows} pairs.
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[partition_info] c_partions
+        cdef PartitionInfo partition
+
+        c_partions.reserve(len(partitions))
+        for partition in partitions:
+            c_partions.push_back(partition.c_obj)
+
+        self.c_obj.set_partitions(c_partions)
+
+    cpdef void set_column_chunks_file_paths(self, list file_paths):
+        """
+        Sets column chunks file path to be set in the raw output metadata.
+
+        Parameters
+        ----------
+        file_paths : list[str]
+            Vector of strings which indicate file paths.
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_column_chunks_file_paths([fp.encode() for fp in file_paths])
+
+    cpdef void set_row_group_size_bytes(self, size_t size_bytes):
+        """
+        Sets the maximum row group size, in bytes.
+
+        Parameters
+        ----------
+        size_bytes : int
+            Maximum row group size, in bytes to set
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_row_group_size_bytes(size_bytes)
+
+    cpdef void set_row_group_size_rows(self, size_type size_rows):
+        """
+        Sets the maximum row group size, in rows.
+
+        Parameters
+        ----------
+        size_rows : int
+            Maximum row group size, in rows to set
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_row_group_size_rows(size_rows)
+
+    cpdef void set_max_page_size_bytes(self, size_t size_bytes):
+        """
+        Sets the maximum uncompressed page size, in bytes.
+
+        Parameters
+        ----------
+        size_bytes : int
+            Maximum uncompressed page size, in bytes to set
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_max_page_size_bytes(size_bytes)
+
+    cpdef void set_max_page_size_rows(self, size_type size_rows):
+        """
+        Sets the maximum page size, in rows.
+
+        Parameters
+        ----------
+        size_rows : int
+            Maximum page size, in rows to set.
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_max_page_size_rows(size_rows)
+
+    cpdef void set_max_dictionary_size(self, size_t size_bytes):
+        """
+        Sets the maximum dictionary size, in bytes.
+
+        Parameters
+        ----------
+        size_bytes : int
+            Maximum dictionary size, in bytes to set
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_max_dictionary_size(size_bytes)
+
+
+cdef class ParquetWriterOptionsBuilder:
+
+    cpdef ParquetWriterOptionsBuilder metadata(self, TableInputMetadata metadata):
+        """
+        Sets metadata.
+
+        Parameters
+        ----------
+        metadata : TableInputMetadata
+            Associated metadata
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.metadata(metadata.c_obj)
+        return self
+
+    cpdef ParquetWriterOptionsBuilder key_value_metadata(self, list metadata):
+        """
+        Sets Key-Value footer metadata.
+
+        Parameters
+        ----------
+        metadata : list[dict[str, str]]
+            Key-Value footer metadata
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.key_value_metadata(
+            [
+                {key.encode(): value.encode() for key, value in mapping.items()}
+                for mapping in metadata
+            ]
+        )
+        return self
+
+    cpdef ParquetWriterOptionsBuilder compression(self, compression_type compression):
+        """
+        Sets compression type.
+
+        Parameters
+        ----------
+        compression : CompressionType
+            The compression type to use
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.compression(compression)
+        return self
+
+    cpdef ParquetWriterOptionsBuilder stats_level(self, statistics_freq sf):
+        """
+        Sets the level of statistics.
+
+        Parameters
+        ----------
+        sf : StatisticsFreq
+            Level of statistics requested in the output file
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.stats_level(sf)
+        return self
+
+    cpdef ParquetWriterOptionsBuilder int96_timestamps(self, bool enabled):
+        """
+        Sets whether timestamps are written as int96 or timestamp micros.
+
+        Parameters
+        ----------
+        enabled : bool
+            Boolean value to enable/disable int96 timestamps
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.int96_timestamps(enabled)
+        return self
+
+    cpdef ParquetWriterOptionsBuilder write_v2_headers(self, bool enabled):
+        """
+        Set to true to write V2 page headers, otherwise false to write V1 page headers.
+
+        Parameters
+        ----------
+        enabled : bool
+            Boolean value to enable/disable writing of V2 page headers.
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.write_v2_headers(enabled)
+        return self
+
+    cpdef ParquetWriterOptionsBuilder dictionary_policy(self, dictionary_policy_t val):
+        """
+        Sets the policy for dictionary use.
+
+        Parameters
+        ----------
+        val : DictionaryPolicy
+            Policy for dictionary use.
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.dictionary_policy(val)
+        return self
+
+    cpdef ParquetWriterOptionsBuilder utc_timestamps(self, bool enabled):
+        """
+        Set to true if timestamps are to be written as UTC.
+
+        Parameters
+        ----------
+        enabled : bool
+            Boolean value to enable/disable writing of timestamps as UTC.
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.utc_timestamps(enabled)
+        return self
+
+    cpdef ParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled):
+        """
+        Set to true if arrow schema is to be written.
+
+        Parameters
+        ----------
+        enabled : bool
+            Boolean value to enable/disable writing of arrow schema.
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.write_arrow_schema(enabled)
+        return self
+
+    cpdef ParquetWriterOptions build(self):
+        """
+        Create a ParquetWriterOptions from the set options.
+
+        Returns
+        -------
+        ParquetWriterOptions
+        """
+        cdef ParquetWriterOptions parquet_options = ParquetWriterOptions.__new__(
+            ParquetWriterOptions
+        )
+        parquet_options.c_obj = move(self.c_obj.build())
+        parquet_options.table_ref = self.table_ref
+        parquet_options.sink_ref = self.sink_ref
+        return parquet_options
+
+
+cpdef memoryview write_parquet(ParquetWriterOptions options):
+    """
+    Writes a set of columns to parquet format.
+
+    Parameters
+    ----------
+    options : ParquetWriterOptions
+        Settings for controlling writing behavior
+
+    Returns
+    -------
+    memoryview
+        A blob that contains the file metadata
+        (parquet FileMetadata thrift message) if requested in
+        parquet_writer_options (empty blob otherwise).
+    """
+    cdef parquet_writer_options c_options = options.c_obj
+    cdef unique_ptr[vector[uint8_t]] c_result
+
+    with nogil:
+        c_result = cpp_write_parquet(c_options)
+
+    return memoryview(HostBuffer.from_unique_ptr(move(c_result)))
diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd
index 0ab28cb0973..90b43cf0ff5 100644
--- a/python/pylibcudf/pylibcudf/io/types.pxd
+++ b/python/pylibcudf/pylibcudf/io/types.pxd
@@ -1,4 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
+from libc.stdint cimport uint8_t, int32_t
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 from pylibcudf.libcudf.io.data_sink cimport data_sink
@@ -18,9 +20,46 @@ from pylibcudf.libcudf.io.types cimport (
     table_metadata,
     table_with_metadata,
 )
+from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.table cimport Table
 
 
+cdef class PartitionInfo:
+    cdef partition_info c_obj
+
+cdef class ColumnInMetadata:
+    cdef column_in_metadata c_obj
+
+    @staticmethod
+    cdef ColumnInMetadata from_metadata(column_in_metadata metadata)
+
+    cpdef ColumnInMetadata set_name(self, str name)
+
+    cpdef ColumnInMetadata set_name(self, str name)
+
+    cpdef ColumnInMetadata set_nullability(self, bool nullable)
+
+    cpdef ColumnInMetadata set_list_column_as_map(self)
+
+    cpdef ColumnInMetadata set_int96_timestamps(self, bool req)
+
+    cpdef ColumnInMetadata set_decimal_precision(self, uint8_t req)
+
+    cpdef ColumnInMetadata child(self, size_type i)
+
+    cpdef ColumnInMetadata set_output_as_binary(self, bool binary)
+
+    cpdef ColumnInMetadata set_type_length(self, int32_t type_length)
+
+    cpdef ColumnInMetadata set_skip_compression(self, bool skip)
+
+    cpdef ColumnInMetadata set_encoding(self, column_encoding encoding)
+
+    cpdef str get_name(self)
+
+cdef class TableInputMetadata:
+    cdef table_input_metadata c_obj
+
 cdef class TableWithMetadata:
     cdef public Table tbl
     cdef table_metadata metadata
diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi
index a4f4fc13bdc..04f276cfeee 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyi
+++ b/python/pylibcudf/pylibcudf/io/types.pyi
@@ -59,6 +59,25 @@ class QuoteStyle(IntEnum):
 ColumnNameSpec: TypeAlias = tuple[str, list[ColumnNameSpec]]
 ChildNameSpec: TypeAlias = Mapping[str, ChildNameSpec]
 
+class PartitionInfo:
+    def __init__(self, start_row: int, num_rows: int): ...
+
+class TableInputMetadata:
+    def __init__(self, table: Table): ...
+
+class ColumnInMetadata:
+    def set_name(self, name: str) -> ColumnInMetadata: ...
+    def set_nullability(self, nullable: bool) -> ColumnInMetadata: ...
+    def set_list_column_as_map(self) -> ColumnInMetadata: ...
+    def set_int96_timestamps(self, req: bool) -> ColumnInMetadata: ...
+    def set_decimal_precision(self, precision: int) -> ColumnInMetadata: ...
+    def child(self, i: int) -> ColumnInMetadata: ...
+    def set_output_as_binary(self, binary: bool) -> ColumnInMetadata: ...
+    def set_type_length(self, type_length: int) -> ColumnInMetadata: ...
+    def set_skip_compression(self, skip: bool) -> ColumnInMetadata: ...
+    def set_encoding(self, encoding: ColumnEncoding) -> ColumnInMetadata: ...
+    def get_name(self) -> str: ...
+
 class TableWithMetadata:
     tbl: Table
     def __init__(
diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
index 51d5bda75c7..460ab6844c3 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -2,6 +2,8 @@
 
 from cpython.buffer cimport PyBUF_READ
 from cpython.memoryview cimport PyMemoryView_FromMemory
+from libc.stdint cimport uint8_t, int32_t
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
@@ -10,11 +12,16 @@ from pylibcudf.io.datasource cimport Datasource
 from pylibcudf.libcudf.io.data_sink cimport data_sink
 from pylibcudf.libcudf.io.datasource cimport datasource
 from pylibcudf.libcudf.io.types cimport (
+    column_encoding,
+    column_in_metadata,
     column_name_info,
     host_buffer,
+    partition_info,
     source_info,
+    table_input_metadata,
     table_with_metadata,
 )
+from pylibcudf.libcudf.types cimport size_type
 
 import codecs
 import errno
@@ -37,13 +44,233 @@ __all__ = [
     "CompressionType",
     "DictionaryPolicy",
     "JSONRecoveryMode",
+    "PartitionInfo",
     "QuoteStyle",
     "SinkInfo",
     "SourceInfo",
     "StatisticsFreq",
+    "TableInputMetadata",
     "TableWithMetadata",
 ]
 
+cdef class PartitionInfo:
+    """
+    Information used while writing partitioned datasets.
+
+    Parameters
+    ----------
+    start_row : int
+        The start row of the partition.
+
+    num_rows : int
+        The number of rows in the partition.
+    """
+    def __init__(self, size_type start_row, size_type num_rows):
+        self.c_obj = partition_info(start_row, num_rows)
+
+
+cdef class ColumnInMetadata:
+    """
+    Metadata for a column
+    """
+
+    @staticmethod
+    cdef ColumnInMetadata from_metadata(column_in_metadata metadata):
+        """
+        Construct a ColumnInMetadata.
+
+        Parameters
+        ----------
+        metadata : column_in_metadata
+        """
+        cdef ColumnInMetadata col_metadata = ColumnInMetadata.__new__(ColumnInMetadata)
+        col_metadata.c_obj = metadata
+        return col_metadata
+
+    cpdef ColumnInMetadata set_name(self, str name):
+        """
+        Set the name of this column.
+
+        Parameters
+        ----------
+        name : str
+            Name of the column
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.set_name(name.encode())
+        return self
+
+    cpdef ColumnInMetadata set_nullability(self, bool nullable):
+        """
+        Set the nullability of this column.
+
+        Parameters
+        ----------
+        nullable : bool
+            Whether this column is nullable
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.set_nullability(nullable)
+        return self
+
+    cpdef ColumnInMetadata set_list_column_as_map(self):
+        """
+        Specify that this list column should be encoded as a map in the
+        written file.
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.set_list_column_as_map()
+        return self
+
+    cpdef ColumnInMetadata set_int96_timestamps(self, bool req):
+        """
+        Specifies whether this timestamp column should be encoded using
+        the deprecated int96.
+
+        Parameters
+        ----------
+        req : bool
+            True = use int96 physical type. False = use int64 physical type.
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.set_int96_timestamps(req)
+        return self
+
+    cpdef ColumnInMetadata set_decimal_precision(self, uint8_t precision):
+        """
+        Set the decimal precision of this column.
+        Only valid if this column is a decimal (fixed-point) type.
+
+        Parameters
+        ----------
+        precision : int
+            The integer precision to set for this decimal column
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.set_decimal_precision(precision)
+        return self
+
+    cpdef ColumnInMetadata child(self, size_type i):
+        """
+        Get a child of this column. NOTE(review): looks like a copy; confirm.
+
+        Parameters
+        ----------
+        i : int
+            Index of the child to get.
+
+        Returns
+        -------
+        ColumnInMetadata
+        """
+        return ColumnInMetadata.from_metadata(self.c_obj.child(i))
+
+    cpdef ColumnInMetadata set_output_as_binary(self, bool binary):
+        """
+        Specifies whether this column should be written as binary or string data.
+
+        Parameters
+        ----------
+        binary : bool
+            True = use binary data type. False = use string data type
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.set_output_as_binary(binary)
+        return self
+
+    cpdef ColumnInMetadata set_type_length(self, int32_t type_length):
+        """
+        Sets the length of fixed length data.
+
+        Parameters
+        ----------
+        type_length : int
+            Size of the data type in bytes
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.set_type_length(type_length)
+        return self
+
+    cpdef ColumnInMetadata set_skip_compression(self, bool skip):
+        """
+        Specifies whether compression should be skipped for this
+        column, regardless of the compression.
+
+        Parameters
+        ----------
+        skip : bool
+            If True, do not compress this column
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.set_skip_compression(skip)
+        return self
+
+    cpdef ColumnInMetadata set_encoding(self, column_encoding encoding):
+        """
+        Sets the encoding of this column.
+        Forwards ``encoding`` to the underlying ``set_encoding`` call.
+
+        Parameters
+        ----------
+        encoding : ColumnEncoding
+            The encoding to use
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.set_encoding(encoding)
+        return self
+
+    cpdef str get_name(self):
+        """
+        Get the name of this column.
+
+        Returns
+        -------
+        str
+            The name of this column
+        """
+        return self.c_obj.get_name().decode()
+
+
+cdef class TableInputMetadata:
+    """
+    Metadata for a table
+
+    Parameters
+    ----------
+    table : Table
+        The Table to construct metadata for
+    """
+    def __init__(self, Table table):
+        self.c_obj = table_input_metadata(table.view())
+
+
 cdef class TableWithMetadata:
     """A container holding a table and its associated metadata
     (e.g. column names)
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
index 110c9d4a0b9..e03fe7e921e 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
@@ -156,7 +156,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
             table_input_metadata m
         ) except +libcudf_exception_handler
         BuilderT& key_value_metadata(
-            vector[map[string, string]] kvm
+            vector[map[string, string]] metadata
         ) except +libcudf_exception_handler
         BuilderT& stats_level(
             statistics_freq sf
@@ -189,7 +189,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
             size_t val
         ) except +libcudf_exception_handler
         BuilderT& write_v2_headers(
-            bool val
+            bool enabled
         ) except +libcudf_exception_handler
         BuilderT& dictionary_policy(
             dictionary_policy val
@@ -212,7 +212,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         ) except +libcudf_exception_handler
 
     cdef unique_ptr[vector[uint8_t]] write_parquet(
-        parquet_writer_options args
+        parquet_writer_options options
     ) except +libcudf_exception_handler
 
     cdef cppclass chunked_parquet_writer_options(parquet_writer_options_base):
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
index 41298601539..94524acbcc8 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
@@ -1,4 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
+import io
+
 import pyarrow as pa
 import pyarrow.compute as pc
 import pytest
@@ -107,3 +109,70 @@ def test_read_parquet_filters(
 # ^^^ This one is not tested since it's not in pyarrow/pandas, deprecate?
 # bool convert_strings_to_categories = False,
 # bool use_pandas_metadata = True
+
+
+@pytest.mark.parametrize("write_v2_headers", [True, False])
+@pytest.mark.parametrize("utc_timestamps", [True, False])
+@pytest.mark.parametrize("write_arrow_schema", [True, False])
+@pytest.mark.parametrize(
+    "partitions",
+    [None, [plc.io.types.PartitionInfo(0, 10)]],
+)
+@pytest.mark.parametrize("column_chunks_file_paths", [None, ["tmp.parquet"]])
+@pytest.mark.parametrize("row_group_size_bytes", [None, 1024])
+@pytest.mark.parametrize("row_group_size_rows", [None, 1])
+@pytest.mark.parametrize("max_page_size_bytes", [None, 1024])
+@pytest.mark.parametrize("max_page_size_rows", [None, 1])
+@pytest.mark.parametrize("max_dictionary_size", [None, 100])
+def test_write_parquet(
+    table_data,
+    write_v2_headers,
+    utc_timestamps,
+    write_arrow_schema,
+    partitions,
+    column_chunks_file_paths,
+    row_group_size_bytes,
+    row_group_size_rows,
+    max_page_size_bytes,
+    max_page_size_rows,
+    max_dictionary_size,
+):
+    _, pa_table = table_data
+    if len(pa_table) == 0 and partitions is not None:
+        pytest.skip("https://github.com/rapidsai/cudf/issues/17361")
+    plc_table = plc.interop.from_arrow(pa_table)
+    table_meta = plc.io.types.TableInputMetadata(plc_table)
+    sink = plc.io.SinkInfo([io.BytesIO()])
+    user_data = [{"foo": "{'bar': 'baz'}"}]
+    compression = plc.io.types.CompressionType.SNAPPY
+    stats_level = plc.io.types.StatisticsFreq.STATISTICS_COLUMN
+    dictionary_policy = plc.io.types.DictionaryPolicy.ADAPTIVE
+    options = (
+        plc.io.parquet.ParquetWriterOptions.builder(sink, plc_table)
+        .metadata(table_meta)
+        .key_value_metadata(user_data)
+        .compression(compression)
+        .stats_level(stats_level)
+        .write_v2_headers(write_v2_headers)
+        .dictionary_policy(dictionary_policy)
+        .utc_timestamps(utc_timestamps)
+        .write_arrow_schema(write_arrow_schema)
+        .build()
+    )
+    if partitions is not None:
+        options.set_partitions(partitions)
+    if column_chunks_file_paths is not None:
+        options.set_column_chunks_file_paths(column_chunks_file_paths)
+    if row_group_size_bytes is not None:
+        options.set_row_group_size_bytes(row_group_size_bytes)
+    if row_group_size_rows is not None:
+        options.set_row_group_size_rows(row_group_size_rows)
+    if max_page_size_bytes is not None:
+        options.set_max_page_size_bytes(max_page_size_bytes)
+    if max_page_size_rows is not None:
+        options.set_max_page_size_rows(max_page_size_rows)
+    if max_dictionary_size is not None:
+        options.set_max_dictionary_size(max_dictionary_size)
+
+    result = plc.io.parquet.write_parquet(options)
+    assert isinstance(result, memoryview)

From 5a89d0066b5cfbb38d5a392b425865d66b82a8b6 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 22 Nov 2024 13:26:13 -0600
Subject: [PATCH 03/23] Run clang-tidy checks in PR CI (#17407)

We discussed clang-tidy during the cuDF brown bag session. We decided to enable clang-tidy in PR CI and follow up by enabling more checks later.

Authors:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17407
---
 .github/workflows/pr.yaml            |  8 +++
 ci/cpp_linters.sh                    |  7 ++-
 cpp/.clang-tidy                      |  2 +-
 cpp/CMakeLists.txt                   | 82 +++++++++++++++-------------
 cpp/include/cudf/ast/expressions.hpp |  6 +-
 cpp/src/io/json/parser_features.cpp  |  2 +-
 6 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a8c4e481683..a8afede4821 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -16,6 +16,7 @@ jobs:
       - changed-files
       - checks
       - conda-cpp-build
+      - cpp-linters
       - conda-cpp-checks
       - conda-cpp-tests
       - conda-python-build
@@ -113,6 +114,13 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
     with:
       build_type: pull-request
+  cpp-linters:
+    secrets: inherit
+    needs: checks
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
+    with:
+      build_type: pull-request
+      run_script: "ci/cpp_linters.sh"
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh
index 286c7bfbc66..4d5b62ba280 100755
--- a/ci/cpp_linters.sh
+++ b/ci/cpp_linters.sh
@@ -25,7 +25,12 @@ RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
 source rapids-configure-sccache
 
 # Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled.
-cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_STATIC_LINTERS=ON -GNinja
+
+iwyu_flag=""
+if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then
+  iwyu_flag="-DCUDF_IWYU=ON"
+fi
+cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON ${iwyu_flag} -DBUILD_TESTS=OFF -GNinja
 cmake --build cpp/build 2>&1 | python cpp/scripts/parse_iwyu_output.py
 
 # Remove invalid components of the path for local usage. The path below is
diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy
index 0e5699876fc..60c0b5d3ba7 100644
--- a/cpp/.clang-tidy
+++ b/cpp/.clang-tidy
@@ -39,7 +39,7 @@ Checks:
        -clang-analyzer-optin.core.EnumCastOutOfRange,
        -clang-analyzer-optin.cplusplus.UninitializedObject'
 
-WarningsAsErrors: ''
+WarningsAsErrors: '*'
 HeaderFilterRegex: '.*cudf/cpp/(src|include).*'
 ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*'
 FormatStyle:     none
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 506f6c185f5..e4fa3b28383 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -91,7 +91,8 @@ option(
   ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL}
 )
 mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL)
-option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF)
+option(CUDF_CLANG_TIDY "Enable clang-tidy during compilation" OFF)
+option(CUDF_IWYU "Enable IWYU during compilation" OFF)
 
 option(
   CUDF_KVIKIO_REMOTE_IO
@@ -159,9 +160,7 @@ endif()
 
 # ##################################################################################################
 # * linter configuration ---------------------------------------------------------------------------
-if(CUDF_STATIC_LINTERS)
-  # For simplicity, for now we assume that all linters can be installed into an environment where
-  # any linter is being run. We could relax this requirement if desired.
+if(CUDF_CLANG_TIDY)
   find_program(
     CLANG_TIDY_EXE
     NAMES "clang-tidy"
@@ -188,7 +187,9 @@ if(CUDF_STATIC_LINTERS)
         "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}"
     )
   endif()
+endif()
 
+if(CUDF_IWYU)
   find_program(IWYU_EXE NAMES include-what-you-use iwyu REQUIRED)
 endif()
 
@@ -201,38 +202,36 @@ function(enable_static_checkers target)
     _LINT "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN}
   )
 
-  if(CUDF_STATIC_LINTERS)
-    if(_LINT_CLANG_TIDY)
-      # clang will complain about unused link libraries on the compile line unless we specify
-      # -Qunused-arguments.
-      set_target_properties(
-        ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments"
-      )
-    endif()
-    if(_LINT_IWYU)
-      # A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not
-      # relevant since they don't show up in any other build so it's better to suppress them until
-      # we can figure out the cause. Setting this as part of CXX_INCLUDE_WHAT_YOU_USE does not
-      # appear to be sufficient, we must also ensure that it is set to the underlying target's CXX
-      # compile flags. To do this completely cleanly we should modify the flags on the target rather
-      # than the global CUDF_CXX_FLAGS, but this solution is good enough for now since we never run
-      # the linters on real builds.
-      foreach(_flag -Wno-missing-braces -Wno-unneeded-internal-declaration)
-        list(FIND CUDF_CXX_FLAGS "${_flag}" _flag_index)
-        if(_flag_index EQUAL -1)
-          list(APPEND CUDF_CXX_FLAGS ${_flag})
-        endif()
-      endforeach()
-      set(CUDF_CXX_FLAGS
-          "${CUDF_CXX_FLAGS}"
-          PARENT_SCOPE
-      )
-      set_target_properties(${target} PROPERTIES CXX_INCLUDE_WHAT_YOU_USE "${IWYU_EXE}")
-    endif()
-    foreach(file IN LISTS _LINT_SKIPPED_FILES)
-      set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON)
+  if(_LINT_CLANG_TIDY)
+    # clang will complain about unused link libraries on the compile line unless we specify
+    # -Qunused-arguments.
+    set_target_properties(
+      ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments"
+    )
+  endif()
+  if(_LINT_IWYU)
+    # A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not
+    # relevant since they don't show up in any other build so it's better to suppress them until we
+    # can figure out the cause. Setting this as part of CXX_INCLUDE_WHAT_YOU_USE does not appear to
+    # be sufficient, we must also ensure that it is set to the underlying target's CXX compile
+    # flags. To do this completely cleanly we should modify the flags on the target rather than the
+    # global CUDF_CXX_FLAGS, but this solution is good enough for now since we never run the linters
+    # on real builds.
+    foreach(_flag -Wno-missing-braces -Wno-unneeded-internal-declaration)
+      list(FIND CUDF_CXX_FLAGS "${_flag}" _flag_index)
+      if(_flag_index EQUAL -1)
+        list(APPEND CUDF_CXX_FLAGS ${_flag})
+      endif()
     endforeach()
+    set(CUDF_CXX_FLAGS
+        "${CUDF_CXX_FLAGS}"
+        PARENT_SCOPE
+    )
+    set_target_properties(${target} PROPERTIES CXX_INCLUDE_WHAT_YOU_USE "${IWYU_EXE}")
   endif()
+  foreach(file IN LISTS _LINT_SKIPPED_FILES)
+    set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON)
+  endforeach()
 endfunction()
 
 # ##################################################################################################
@@ -812,9 +811,18 @@ set_target_properties(
 
 # Note: This must come before the target_compile_options below so that the function can modify the
 # flags if necessary.
-enable_static_checkers(
-  cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp CLANG_TIDY IWYU
-)
+if(CUDF_CLANG_TIDY OR CUDF_IWYU)
+  set(linters)
+  if(CUDF_CLANG_TIDY)
+    list(APPEND linters CLANG_TIDY)
+  endif()
+  if(CUDF_IWYU)
+    list(APPEND linters IWYU)
+  endif()
+  enable_static_checkers(
+    cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp ${linters}
+  )
+endif()
 target_compile_options(
   cudf PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
                "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp
index bcc9ad1b391..85289a52831 100644
--- a/cpp/include/cudf/ast/expressions.hpp
+++ b/cpp/include/cudf/ast/expressions.hpp
@@ -612,19 +612,19 @@ class tree {
    * @brief get the first expression in the tree
    * @returns the first inserted expression into the tree
    */
-  expression const& front() const { return *expressions.front(); }
+  [[nodiscard]] expression const& front() const { return *expressions.front(); }
 
   /**
    * @brief get the last expression in the tree
    * @returns the last inserted expression into the tree
    */
-  expression const& back() const { return *expressions.back(); }
+  [[nodiscard]] expression const& back() const { return *expressions.back(); }
 
   /**
    * @brief get the number of expressions added to the tree
    * @returns the number of expressions added to the tree
    */
-  size_t size() const { return expressions.size(); }
+  [[nodiscard]] size_t size() const { return expressions.size(); }
 
   /**
    * @brief get the expression at an index in the tree. Index is checked.
diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp
index 401a6e992de..e795e8e09d8 100644
--- a/cpp/src/io/json/parser_features.cpp
+++ b/cpp/src/io/json/parser_features.cpp
@@ -109,7 +109,7 @@ struct allnull_column_functor {
   rmm::device_async_resource_ref mr;
 
  private:
-  auto make_zeroed_offsets(size_type size) const
+  [[nodiscard]] auto make_zeroed_offsets(size_type size) const
   {
     auto offsets_buff =
       cudf::detail::make_zeroed_device_uvector_async<size_type>(size + 1, stream, mr);

From 881afd123572dac518e0baeb0537572a869e751c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 22 Nov 2024 12:18:25 -0800
Subject: [PATCH 04/23] Remove cudf._lib.transpose in favor of inlining
 pylibcudf (#17365)

Contributes to https://github.com/rapidsai/cudf/issues/17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/17365
---
 python/cudf/cudf/_lib/CMakeLists.txt |  1 -
 python/cudf/cudf/_lib/__init__.py    |  1 -
 python/cudf/cudf/_lib/transpose.pyx  | 18 ------------------
 python/cudf/cudf/core/dataframe.py   | 10 +++++++++-
 4 files changed, 9 insertions(+), 21 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/transpose.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 2958c286d20..8a521f19350 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -44,7 +44,6 @@ set(cython_sources
     text.pyx
     timezone.pyx
     transform.pyx
-    transpose.pyx
     types.pyx
     utils.pyx
 )
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 19dc4488560..27bb486f55b 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -30,7 +30,6 @@
     strings_udf,
     text,
     timezone,
-    transpose,
 )
 
 MAX_COLUMN_SIZE = np.iinfo(np.int32).max
diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx
deleted file mode 100644
index 995d278cb88..00000000000
--- a/python/cudf/cudf/_lib/transpose.pyx
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import pylibcudf as plc
-
-from cudf._lib.column cimport Column
-
-
-def transpose(list source_columns):
-    """Transpose m n-row columns into n m-row columns
-    """
-    input_table = plc.table.Table(
-        [col.to_pylibcudf(mode="read") for col in source_columns]
-    )
-    result_table = plc.transpose.transpose(input_table)
-    return [
-        Column.from_pylibcudf(col, data_ptr_exposed=True)
-        for col in result_table.columns()
-    ]
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index bd78d5dd9f1..728cc47a7c9 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4113,7 +4113,15 @@ def transpose(self):
         if any(c.dtype != source_columns[0].dtype for c in source_columns):
             raise ValueError("Columns must all have the same dtype")
 
-        result_columns = libcudf.transpose.transpose(source_columns)
+        result_table = plc.transpose.transpose(
+            plc.table.Table(
+                [col.to_pylibcudf(mode="read") for col in source_columns]
+            )
+        )
+        result_columns = [
+            libcudf.column.Column.from_pylibcudf(col, data_ptr_exposed=True)
+            for col in result_table.columns()
+        ]
 
         if isinstance(source_dtype, cudf.CategoricalDtype):
             result_columns = [

From 092fdff63ec6d166b4feca1c59bd0582dde932f5 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 22 Nov 2024 14:24:05 -0800
Subject: [PATCH 05/23] Update xxhash_64 to utilize the cuco equivalent
 implementation (#17393)

This PR updates `xxhash_64` to leverage the cuco equivalent implementation and makes it available as a cudf detail utility, as requested by @mhaseeb123. Additionally, it fixes a minor issue in the `murmurhash3_x64_128.cuh` header by adding the missing `#pragma once` directive.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17393
---
 .../hashing/detail/murmurhash3_x64_128.cuh    |   4 +-
 cpp/include/cudf/hashing/detail/xxhash_64.cuh |  99 +++++++++
 cpp/src/hash/xxhash_64.cu                     | 203 +-----------------
 3 files changed, 103 insertions(+), 203 deletions(-)
 create mode 100644 cpp/include/cudf/hashing/detail/xxhash_64.cuh

diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh
index c986a908706..5e88b905023 100644
--- a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh
+++ b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,6 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#pragma once
+
 #include <cudf/hashing/detail/hash_functions.cuh>
 #include <cudf/strings/string_view.cuh>
 
diff --git a/cpp/include/cudf/hashing/detail/xxhash_64.cuh b/cpp/include/cudf/hashing/detail/xxhash_64.cuh
new file mode 100644
index 00000000000..b00e8297ac9
--- /dev/null
+++ b/cpp/include/cudf/hashing/detail/xxhash_64.cuh
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "hash_functions.cuh"
+
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/types.hpp>
+
+#include <cuco/hash_functions.cuh>
+#include <cuda/std/cstddef>
+
+namespace cudf::hashing::detail {
+
+template <typename Key>
+struct XXHash_64 : public cuco::xxhash_64<Key> {
+  using result_type = typename cuco::xxhash_64<Key>::result_type;
+
+  __device__ result_type operator()(Key const& key) const
+  {
+    return cuco::xxhash_64<Key>::operator()(key);
+  }
+
+  template <typename Extent>
+  __device__ result_type compute_hash(cuda::std::byte const* bytes, Extent size) const
+  {
+    return cuco::xxhash_64<Key>::compute_hash(bytes, size);
+  }
+};
+
+template <>
+XXHash_64<bool>::result_type __device__ inline XXHash_64<bool>::operator()(bool const& key) const
+{
+  return this->compute_hash(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(key));
+}
+
+template <>
+XXHash_64<float>::result_type __device__ inline XXHash_64<float>::operator()(float const& key) const
+{
+  return cuco::xxhash_64<float>::operator()(normalize_nans(key));
+}
+
+template <>
+XXHash_64<double>::result_type __device__ inline XXHash_64<double>::operator()(
+  double const& key) const
+{
+  return cuco::xxhash_64<double>::operator()(normalize_nans(key));
+}
+
+template <>
+XXHash_64<cudf::string_view>::result_type
+  __device__ inline XXHash_64<cudf::string_view>::operator()(cudf::string_view const& key) const
+{
+  return this->compute_hash(reinterpret_cast<cuda::std::byte const*>(key.data()), key.size_bytes());
+}
+
+template <>
+XXHash_64<numeric::decimal32>::result_type
+  __device__ inline XXHash_64<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
+{
+  auto const val = key.value();
+  auto const len = sizeof(val);
+  return this->compute_hash(reinterpret_cast<cuda::std::byte const*>(&val), len);
+}
+
+template <>
+XXHash_64<numeric::decimal64>::result_type
+  __device__ inline XXHash_64<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
+{
+  auto const val = key.value();
+  auto const len = sizeof(val);
+  return this->compute_hash(reinterpret_cast<cuda::std::byte const*>(&val), len);
+}
+
+template <>
+XXHash_64<numeric::decimal128>::result_type
+  __device__ inline XXHash_64<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
+{
+  auto const val = key.value();
+  auto const len = sizeof(val);
+  return this->compute_hash(reinterpret_cast<cuda::std::byte const*>(&val), len);
+}
+
+}  // namespace cudf::hashing::detail
diff --git a/cpp/src/hash/xxhash_64.cu b/cpp/src/hash/xxhash_64.cu
index fad8383210b..bdbe13b1ffb 100644
--- a/cpp/src/hash/xxhash_64.cu
+++ b/cpp/src/hash/xxhash_64.cu
@@ -16,8 +16,8 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/algorithm.cuh>
-#include <cudf/hashing/detail/hash_functions.cuh>
 #include <cudf/hashing/detail/hashing.hpp>
+#include <cudf/hashing/detail/xxhash_64.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
@@ -35,207 +35,6 @@ namespace {
 
 using hash_value_type = uint64_t;
 
-template <typename Key>
-struct XXHash_64 {
-  using result_type = hash_value_type;
-
-  constexpr XXHash_64() = default;
-  constexpr XXHash_64(hash_value_type seed) : m_seed(seed) {}
-
-  __device__ inline uint32_t getblock32(std::byte const* data, std::size_t offset) const
-  {
-    // Read a 4-byte value from the data pointer as individual bytes for safe
-    // unaligned access (very likely for string types).
-    auto block = reinterpret_cast<uint8_t const*>(data + offset);
-    return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
-  }
-
-  __device__ inline uint64_t getblock64(std::byte const* data, std::size_t offset) const
-  {
-    uint64_t result = getblock32(data, offset + 4);
-    result          = result << 32;
-    return result | getblock32(data, offset);
-  }
-
-  result_type __device__ inline operator()(Key const& key) const { return compute(key); }
-
-  template <typename T>
-  result_type __device__ inline compute(T const& key) const
-  {
-    auto data = device_span<std::byte const>(reinterpret_cast<std::byte const*>(&key), sizeof(T));
-    return compute_bytes(data);
-  }
-
-  result_type __device__ inline compute_remaining_bytes(device_span<std::byte const>& in,
-                                                        std::size_t offset,
-                                                        result_type h64) const
-  {
-    // remaining data can be processed in 8-byte chunks
-    if ((in.size() % 32) >= 8) {
-      for (; offset <= in.size() - 8; offset += 8) {
-        uint64_t k1 = getblock64(in.data(), offset) * prime2;
-
-        k1 = rotate_bits_left(k1, 31) * prime1;
-        h64 ^= k1;
-        h64 = rotate_bits_left(h64, 27) * prime1 + prime4;
-      }
-    }
-
-    // remaining data can be processed in 4-byte chunks
-    if ((in.size() % 8) >= 4) {
-      for (; offset <= in.size() - 4; offset += 4) {
-        h64 ^= (getblock32(in.data(), offset) & 0xfffffffful) * prime1;
-        h64 = rotate_bits_left(h64, 23) * prime2 + prime3;
-      }
-    }
-
-    // and the rest
-    if (in.size() % 4) {
-      while (offset < in.size()) {
-        h64 ^= (std::to_integer<uint8_t>(in[offset]) & 0xff) * prime5;
-        h64 = rotate_bits_left(h64, 11) * prime1;
-        ++offset;
-      }
-    }
-    return h64;
-  }
-
-  result_type __device__ compute_bytes(device_span<std::byte const>& in) const
-  {
-    uint64_t offset = 0;
-    uint64_t h64;
-    // data can be processed in 32-byte chunks
-    if (in.size() >= 32) {
-      auto limit  = in.size() - 32;
-      uint64_t v1 = m_seed + prime1 + prime2;
-      uint64_t v2 = m_seed + prime2;
-      uint64_t v3 = m_seed;
-      uint64_t v4 = m_seed - prime1;
-
-      do {
-        // pipeline 4*8byte computations
-        v1 += getblock64(in.data(), offset) * prime2;
-        v1 = rotate_bits_left(v1, 31);
-        v1 *= prime1;
-        offset += 8;
-        v2 += getblock64(in.data(), offset) * prime2;
-        v2 = rotate_bits_left(v2, 31);
-        v2 *= prime1;
-        offset += 8;
-        v3 += getblock64(in.data(), offset) * prime2;
-        v3 = rotate_bits_left(v3, 31);
-        v3 *= prime1;
-        offset += 8;
-        v4 += getblock64(in.data(), offset) * prime2;
-        v4 = rotate_bits_left(v4, 31);
-        v4 *= prime1;
-        offset += 8;
-      } while (offset <= limit);
-
-      h64 = rotate_bits_left(v1, 1) + rotate_bits_left(v2, 7) + rotate_bits_left(v3, 12) +
-            rotate_bits_left(v4, 18);
-
-      v1 *= prime2;
-      v1 = rotate_bits_left(v1, 31);
-      v1 *= prime1;
-      h64 ^= v1;
-      h64 = h64 * prime1 + prime4;
-
-      v2 *= prime2;
-      v2 = rotate_bits_left(v2, 31);
-      v2 *= prime1;
-      h64 ^= v2;
-      h64 = h64 * prime1 + prime4;
-
-      v3 *= prime2;
-      v3 = rotate_bits_left(v3, 31);
-      v3 *= prime1;
-      h64 ^= v3;
-      h64 = h64 * prime1 + prime4;
-
-      v4 *= prime2;
-      v4 = rotate_bits_left(v4, 31);
-      v4 *= prime1;
-      h64 ^= v4;
-      h64 = h64 * prime1 + prime4;
-    } else {
-      h64 = m_seed + prime5;
-    }
-
-    h64 += in.size();
-
-    h64 = compute_remaining_bytes(in, offset, h64);
-
-    return finalize(h64);
-  }
-
-  constexpr __host__ __device__ std::uint64_t finalize(std::uint64_t h) const noexcept
-  {
-    h ^= h >> 33;
-    h *= prime2;
-    h ^= h >> 29;
-    h *= prime3;
-    h ^= h >> 32;
-    return h;
-  }
-
- private:
-  hash_value_type m_seed{};
-  static constexpr uint64_t prime1 = 0x9e3779b185ebca87ul;
-  static constexpr uint64_t prime2 = 0xc2b2ae3d27d4eb4ful;
-  static constexpr uint64_t prime3 = 0x165667b19e3779f9ul;
-  static constexpr uint64_t prime4 = 0x85ebca77c2b2ae63ul;
-  static constexpr uint64_t prime5 = 0x27d4eb2f165667c5ul;
-};
-
-template <>
-hash_value_type __device__ inline XXHash_64<bool>::operator()(bool const& key) const
-{
-  return compute(static_cast<uint8_t>(key));
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<float>::operator()(float const& key) const
-{
-  return compute(normalize_nans(key));
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<double>::operator()(double const& key) const
-{
-  return compute(normalize_nans(key));
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<cudf::string_view>::operator()(
-  cudf::string_view const& key) const
-{
-  auto const len = key.size_bytes();
-  auto data = device_span<std::byte const>(reinterpret_cast<std::byte const*>(key.data()), len);
-  return compute_bytes(data);
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<numeric::decimal32>::operator()(
-  numeric::decimal32 const& key) const
-{
-  return compute(key.value());
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<numeric::decimal64>::operator()(
-  numeric::decimal64 const& key) const
-{
-  return compute(key.value());
-}
-
-template <>
-hash_value_type __device__ inline XXHash_64<numeric::decimal128>::operator()(
-  numeric::decimal128 const& key) const
-{
-  return compute(key.value());
-}
-
 /**
  * @brief Computes the hash value of a row in the given table.
  *

From 8b7127fe92aa28321b6422643a3d1d3ca2301ae1 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Fri, 22 Nov 2024 18:16:42 -0500
Subject: [PATCH 06/23] Fix Debug-mode failing Arrow test (#17405)

Fixes #17153 by correcting the nullability of the list element fields in the Arrow data types used by the interop tests

Authors:
  - Matt Topol (https://github.com/zeroshade)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17405
---
 cpp/tests/interop/arrow_utils.hpp     | 3 +--
 cpp/tests/interop/from_arrow_test.cpp | 5 ++++-
 cpp/tests/interop/to_arrow_test.cpp   | 4 ++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp
index 70a9fe64d70..e785845394a 100644
--- a/cpp/tests/interop/arrow_utils.hpp
+++ b/cpp/tests/interop/arrow_utils.hpp
@@ -212,9 +212,8 @@ std::shared_ptr<arrow::Array> get_arrow_list_array(std::vector<T> data,
                "Failed to append values to buffer builder");
   CUDF_EXPECTS(buff_builder.Finish(&offset_buffer).ok(), "Failed to allocate buffer");
 
-  auto nullable = std::accumulate(list_validity.begin(), list_validity.end(), 0) > 0;
   return std::make_shared<arrow::ListArray>(
-    arrow::list(arrow::field("", data_array->type(), nullable)),
+    arrow::list(arrow::field("element", data_array->type(), data_array->null_count() > 0)),
     offsets.size() - 1,
     offset_buffer,
     data_array,
diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp
index 18efae75cb1..62e38cbfd45 100644
--- a/cpp/tests/interop/from_arrow_test.cpp
+++ b/cpp/tests/interop/from_arrow_test.cpp
@@ -279,7 +279,10 @@ TEST_F(FromArrowTest, StructColumn)
   auto list_arr = get_arrow_list_array<int64_t>({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9});
   std::vector<int32_t> offset{0, 3, 4, 6};
   auto nested_list_arr = std::make_shared<arrow::ListArray>(
-    arrow::list(list(arrow::int64())), offset.size() - 1, arrow::Buffer::Wrap(offset), list_arr);
+    arrow::list(list(arrow::field("element", arrow::int64(), false))),
+    offset.size() - 1,
+    arrow::Buffer::Wrap(offset),
+    list_arr);
 
   std::vector<std::shared_ptr<arrow::Array>> child_arrays2({str2_array, int2_array});
   auto fields2 = std::vector<std::shared_ptr<arrow::Field>>{
diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp
index 86295d8efb1..faa07ba3311 100644
--- a/cpp/tests/interop/to_arrow_test.cpp
+++ b/cpp/tests/interop/to_arrow_test.cpp
@@ -261,14 +261,14 @@ TEST_F(ToArrowTest, NestedList)
   std::vector<int32_t> offset{0, 0, 2};
   auto mask_buffer     = arrow::internal::BytesToBits({0, 1}).ValueOrDie();
   auto nested_list_arr = std::make_shared<arrow::ListArray>(
-    arrow::list(arrow::field("a", arrow::list(arrow::int64()), false)),
+    arrow::list(arrow::field("element", arrow::list(arrow::int64()), false)),
     offset.size() - 1,
     arrow::Buffer::Wrap(offset),
     list_arr,
     mask_buffer);
 
   std::vector<std::shared_ptr<arrow::Field>> schema_vector(
-    {arrow::field("a", nested_list_arr->type())});
+    {arrow::field("a", nested_list_arr->type(), false)});
   auto schema = std::make_shared<arrow::Schema>(schema_vector);
 
   auto expected_arrow_table                         = arrow::Table::Make(schema, {nested_list_arr});

From d1d44209673f3b789bcbc8dc628231d4ace76bef Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 22 Nov 2024 17:24:49 -0800
Subject: [PATCH 07/23] Move cudf._lib.search to cudf.core._internals (#17411)

Contributes to https://github.com/rapidsai/cudf/issues/17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17411
---
 python/cudf/cudf/_lib/CMakeLists.txt       |  1 -
 python/cudf/cudf/_lib/__init__.py          |  1 -
 python/cudf/cudf/_lib/search.pyx           | 68 ----------------------
 python/cudf/cudf/core/_internals/search.py | 56 ++++++++++++++++++
 python/cudf/cudf/core/column/column.py     | 23 +++++++-
 python/cudf/cudf/core/column/datetime.py   |  2 +-
 python/cudf/cudf/core/column/numerical.py  |  4 +-
 python/cudf/cudf/core/column/string.py     | 10 +---
 python/cudf/cudf/core/frame.py             |  8 ++-
 python/cudf/cudf/core/index.py             |  2 +-
 10 files changed, 87 insertions(+), 88 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/search.pyx
 create mode 100644 python/cudf/cudf/core/_internals/search.py

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 8a521f19350..1c2b24d2391 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -36,7 +36,6 @@ set(cython_sources
     rolling.pyx
     round.pyx
     scalar.pyx
-    search.pyx
     sort.pyx
     stream_compaction.pyx
     string_casting.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 27bb486f55b..13d05033c11 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -22,7 +22,6 @@
     reshape,
     rolling,
     round,
-    search,
     sort,
     stream_compaction,
     string_casting,
diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx
deleted file mode 100644
index 8108361052b..00000000000
--- a/python/cudf/cudf/_lib/search.pyx
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-import pylibcudf
-
-
-@acquire_spill_lock()
-def search_sorted(
-    list source, list values, side, ascending=True, na_position="last"
-):
-    """Find indices where elements should be inserted to maintain order
-
-    Parameters
-    ----------
-    source : list of columns
-        List of columns to search in
-    values : List of columns
-        List of value columns to search for
-    side : str {'left', 'right'} optional
-        If 'left', the index of the first suitable location is given.
-        If 'right', return the last such index
-    """
-    # Note: We are ignoring index columns here
-    column_order = [
-        pylibcudf.types.Order.ASCENDING
-        if ascending
-        else pylibcudf.types.Order.DESCENDING
-    ] * len(source)
-    null_precedence = [
-        pylibcudf.types.NullOrder.AFTER
-        if na_position == "last"
-        else pylibcudf.types.NullOrder.BEFORE
-    ] * len(source)
-
-    func = getattr(
-        pylibcudf.search,
-        "lower_bound" if side == "left" else "upper_bound",
-    )
-    return Column.from_pylibcudf(
-        func(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source]),
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]),
-            column_order,
-            null_precedence,
-        )
-    )
-
-
-@acquire_spill_lock()
-def contains(Column haystack, Column needles):
-    """Check whether column contains multiple values
-
-    Parameters
-    ----------
-    column : NumericalColumn
-        Column to search in
-    needles :
-        A column of values to search for
-    """
-    return Column.from_pylibcudf(
-        pylibcudf.search.contains(
-            haystack.to_pylibcudf(mode="read"),
-            needles.to_pylibcudf(mode="read"),
-        )
-    )
diff --git a/python/cudf/cudf/core/_internals/search.py b/python/cudf/cudf/core/_internals/search.py
new file mode 100644
index 00000000000..a0ffe078de9
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/search.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+import pylibcudf as plc
+
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from cudf.core.column import ColumnBase
+
+
+@acquire_spill_lock()
+def search_sorted(
+    source: list[ColumnBase],
+    values: list[ColumnBase],
+    side: Literal["left", "right"],
+    ascending: bool = True,
+    na_position: Literal["first", "last"] = "last",
+) -> ColumnBase:
+    """Find indices where elements should be inserted to maintain order
+
+    Parameters
+    ----------
+    source : list of columns
+        List of columns to search in
+    values : List of columns
+        List of value columns to search for
+    side : {'left', 'right'}
+        If 'left', the index of the first suitable location is given.
+        If 'right', return the last such index
+    """
+    # Note: We are ignoring index columns here
+    column_order = [
+        plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING
+    ] * len(source)
+    null_precedence = [
+        plc.types.NullOrder.AFTER
+        if na_position == "last"
+        else plc.types.NullOrder.BEFORE
+    ] * len(source)
+
+    func = getattr(
+        plc.search,
+        "lower_bound" if side == "left" else "upper_bound",
+    )
+    return Column.from_pylibcudf(
+        func(
+            plc.Table([col.to_pylibcudf(mode="read") for col in source]),
+            plc.Table([col.to_pylibcudf(mode="read") for col in values]),
+            column_order,
+            null_precedence,
+        )
+    )
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index f6eaea4b783..53946be1c49 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -757,7 +757,7 @@ def indices_of(
             raise ValueError("value must be a scalar")
         else:
             value = as_column(value, dtype=self.dtype, length=1)
-        mask = libcudf.search.contains(value, self)
+        mask = value.contains(self)
         return apply_boolean_mask(
             [as_column(range(0, len(self)), dtype=size_type_dtype)], mask
         )[0]
@@ -914,7 +914,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
         # self.isin(other) asks "which values of self are in other"
         # contains(haystack, needles) asks "which needles are in haystack"
         # hence this argument ordering.
-        result = libcudf.search.contains(rhs, self)
+        result = rhs.contains(self)
         if self.null_count > 0:
             # If one of the needles is null, then the result contains
             # nulls, these nulls should be replaced by whether or not the
@@ -956,6 +956,23 @@ def is_monotonic_decreasing(self) -> bool:
             [self], [False], None
         )
 
+    def contains(self, other: ColumnBase) -> ColumnBase:
+        """
+        Check whether column contains multiple values.
+
+        Parameters
+        ----------
+        other : Column
+            A column of values to search for
+        """
+        with acquire_spill_lock():
+            return Column.from_pylibcudf(
+                plc.search.contains(
+                    self.to_pylibcudf(mode="read"),
+                    other.to_pylibcudf(mode="read"),
+                )
+            )
+
     def sort_values(
         self: Self,
         ascending: bool = True,
@@ -1190,7 +1207,7 @@ def searchsorted(
             raise ValueError(
                 "Column searchsorted expects values to be column of same dtype"
             )
-        return libcudf.search.search_sorted(
+        return cudf.core._internals.search.search_sorted(  # type: ignore[return-value]
             [self],
             [value],
             side=side,
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 16124cf0a7d..b40ea4eedd3 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -18,9 +18,9 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.search import search_sorted
 from cudf.core._compat import PANDAS_GE_220
 from cudf.core._internals import unary
+from cudf.core._internals.search import search_sorted
 from cudf.core._internals.timezones import (
     check_ambiguous_and_nonexistent,
     get_compatible_timezone,
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 36d1bdb45b6..a7538c1c947 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -111,8 +111,8 @@ def __contains__(self, item: ScalarLike) -> bool:
         except (TypeError, ValueError):
             return False
         # TODO: Use `scalar`-based `contains` wrapper
-        return libcudf.search.contains(
-            self, column.as_column([search_item], dtype=self.dtype)
+        return self.contains(
+            column.as_column([search_item], dtype=self.dtype)
         ).any()
 
     def indices_of(self, value: ScalarLike) -> NumericalColumn:
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 3d70b01b7e4..a9ab2d373fd 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5857,14 +5857,8 @@ def sum(
             return result_col
 
     def __contains__(self, item: ScalarLike) -> bool:
-        if is_scalar(item):
-            return True in libcudf.search.contains(
-                self, column.as_column([item], dtype=self.dtype)
-            )
-        else:
-            return True in libcudf.search.contains(
-                self, column.as_column(item, dtype=self.dtype)
-            )
+        other = [item] if is_scalar(item) else item
+        return self.contains(column.as_column(other, dtype=self.dtype)).any()
 
     def as_numerical_column(
         self, dtype: Dtype
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 30868924bcd..838fde260df 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -8,8 +8,6 @@
 from collections import abc
 from typing import TYPE_CHECKING, Any, Literal
 
-# TODO: The `numpy` import is needed for typing purposes during doc builds
-# only, need to figure out why the `np` alias is insufficient then remove.
 import cupy
 import numpy
 import numpy as np
@@ -19,9 +17,13 @@
 import pylibcudf as plc
 
 import cudf
+
+# TODO: The `numpy` import is needed for typing purposes during doc builds
+# only, need to figure out why the `np` alias is insufficient then remove.
 from cudf import _lib as libcudf
 from cudf.api.types import is_dtype_equal, is_scalar
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals.search import search_sorted
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
     ColumnBase,
@@ -1302,7 +1304,7 @@ def searchsorted(
             for val, common_dtype in zip(values, common_dtype_list)
         ]
 
-        outcol = libcudf.search.search_sorted(
+        outcol = search_sorted(
             sources,
             values,
             side,
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 1b90e9f9df0..0a2b15a16b9 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -19,7 +19,6 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.filling import sequence
-from cudf._lib.search import search_sorted
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
 from cudf.api.types import (
@@ -32,6 +31,7 @@
 )
 from cudf.core._base_index import BaseIndex, _return_get_indexer_result
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals.search import search_sorted
 from cudf.core.column import (
     CategoricalColumn,
     ColumnBase,

From 44b2e798bd6c280985d052634c0c1e495f57a609 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Sun, 24 Nov 2024 15:11:32 -0800
Subject: [PATCH 08/23] Remove the unused detail `int_fastdiv.h` header
 (#17426)

This PR removes the unused detail `int_fastdiv.h` header.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17426
---
 .../cudf/detail/utilities/int_fastdiv.h       | 175 ------------------
 1 file changed, 175 deletions(-)
 delete mode 100644 cpp/include/cudf/detail/utilities/int_fastdiv.h

diff --git a/cpp/include/cudf/detail/utilities/int_fastdiv.h b/cpp/include/cudf/detail/utilities/int_fastdiv.h
deleted file mode 100644
index ff442af5194..00000000000
--- a/cpp/include/cudf/detail/utilities/int_fastdiv.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
- *
- * Copyright 2014 Maxim Milakov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-class int_fastdiv {
- public:
-  // divisor != 0
-  __host__ __device__ __forceinline__ int_fastdiv(int divisor = 0) : d(divisor)
-  {
-    update_magic_numbers();
-  }
-
-  __host__ __device__ __forceinline__ int_fastdiv& operator=(int divisor)
-  {
-    this->d = divisor;
-    update_magic_numbers();
-    return *this;
-  }
-
-  __host__ __device__ __forceinline__ operator int() const { return d; }
-
- private:
-  int d;
-  int M;
-  int s;
-  int n_add_sign;
-
-  // Hacker's Delight, Second Edition, Chapter 10, Integer Division By Constants
-  __host__ __device__ __forceinline__ void update_magic_numbers()
-  {
-    if (d == 1) {
-      M          = 0;
-      s          = -1;
-      n_add_sign = 1;
-      return;
-    } else if (d == -1) {
-      M          = 0;
-      s          = -1;
-      n_add_sign = -1;
-      return;
-    }
-
-    int p;
-    unsigned int ad, anc, delta, q1, r1, q2, r2, t;
-    unsigned const two31 = 0x8000'0000u;
-    ad                   = (d == 0) ? 1 : abs(d);
-    t                    = two31 + ((unsigned int)d >> 31);
-    anc                  = t - 1 - t % ad;
-    p                    = 31;
-    q1                   = two31 / anc;
-    r1                   = two31 - q1 * anc;
-    q2                   = two31 / ad;
-    r2                   = two31 - q2 * ad;
-    do {
-      ++p;
-      q1 = 2 * q1;
-      r1 = 2 * r1;
-      if (r1 >= anc) {
-        ++q1;
-        r1 -= anc;
-      }
-      q2 = 2 * q2;
-      r2 = 2 * r2;
-      if (r2 >= ad) {
-        ++q2;
-        r2 -= ad;
-      }
-      delta = ad - r2;
-    } while (q1 < delta || (q1 == delta && r1 == 0));
-    this->M = q2 + 1;
-    if (d < 0) this->M = -this->M;
-    this->s = p - 32;
-
-    if ((d > 0) && (M < 0))
-      n_add_sign = 1;
-    else if ((d < 0) && (M > 0))
-      n_add_sign = -1;
-    else
-      n_add_sign = 0;
-  }
-
-  __host__ __device__ __forceinline__ friend int operator/(int const divident,
-                                                           int_fastdiv const& divisor);
-};
-
-__host__ __device__ __forceinline__ int operator/(int const n, int_fastdiv const& divisor)
-{
-  int q;
-#ifdef __CUDA_ARCH__
-  asm("mul.hi.s32 %0, %1, %2;" : "=r"(q) : "r"(divisor.M), "r"(n));
-#else
-  q = (((unsigned long long)((long long)divisor.M * (long long)n)) >> 32);
-#endif
-  q += n * divisor.n_add_sign;
-  if (divisor.s >= 0) {
-    q >>= divisor.s;  // we rely on this to be implemented as arithmetic shift
-    q += (((unsigned int)q) >> 31);
-  }
-  return q;
-}
-
-__host__ __device__ __forceinline__ int operator%(int const n, int_fastdiv const& divisor)
-{
-  int quotient  = n / divisor;
-  int remainder = n - quotient * divisor;
-  return remainder;
-}
-
-__host__ __device__ __forceinline__ int operator/(unsigned int const n, int_fastdiv const& divisor)
-{
-  return ((int)n) / divisor;
-}
-
-__host__ __device__ __forceinline__ int operator%(unsigned int const n, int_fastdiv const& divisor)
-{
-  return ((int)n) % divisor;
-}
-
-__host__ __device__ __forceinline__ int operator/(short const n, int_fastdiv const& divisor)
-{
-  return ((int)n) / divisor;
-}
-
-__host__ __device__ __forceinline__ int operator%(short const n, int_fastdiv const& divisor)
-{
-  return ((int)n) % divisor;
-}
-
-__host__ __device__ __forceinline__ int operator/(unsigned short const n,
-                                                  int_fastdiv const& divisor)
-{
-  return ((int)n) / divisor;
-}
-
-__host__ __device__ __forceinline__ int operator%(unsigned short const n,
-                                                  int_fastdiv const& divisor)
-{
-  return ((int)n) % divisor;
-}
-
-__host__ __device__ __forceinline__ int operator/(char const n, int_fastdiv const& divisor)
-{
-  return ((int)n) / divisor;
-}
-
-__host__ __device__ __forceinline__ int operator%(char const n, int_fastdiv const& divisor)
-{
-  return ((int)n) % divisor;
-}
-
-__host__ __device__ __forceinline__ int operator/(unsigned char const n, int_fastdiv const& divisor)
-{
-  return ((int)n) / divisor;
-}
-
-__host__ __device__ __forceinline__ int operator%(unsigned char const n, int_fastdiv const& divisor)
-{
-  return ((int)n) % divisor;
-}

From 8d8cd7818b310845f4c2ad3ffc1521a267df3973 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 25 Nov 2024 05:57:47 -0500
Subject: [PATCH 09/23] Expose stream-ordering to groupby APIs (#17324)

Adds stream parameter to
```
cudf::groupby::scan
cudf::groupby::aggregate
cudf::groupby::shift
cudf::groupby::get_groups
cudf::groupby::replace_nulls
```

Added stream gtests to verify correct stream forwarding.

Reference: https://github.com/rapidsai/cudf/issues/13744

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17324
---
 cpp/include/cudf/groupby.hpp       | 20 +++++------
 cpp/src/groupby/groupby.cu         | 22 +++++-------
 cpp/tests/streams/groupby_test.cpp | 57 +++++++++++++++++++++++++++++-
 3 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
index c9df02f167a..ca3c97880df 100644
--- a/cpp/include/cudf/groupby.hpp
+++ b/cpp/include/cudf/groupby.hpp
@@ -178,6 +178,7 @@ class groupby {
    *
    * @param requests The set of columns to aggregate and the aggregations to
    * perform
+   * @param stream CUDA stream used for device memory operations and kernel launches.
    * @param mr Device memory resource used to allocate the returned table and columns' device memory
    * @return Pair containing the table with each group's unique key and
    * a vector of aggregation_results for each request in the same order as
@@ -185,16 +186,7 @@ class groupby {
    */
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
     host_span<aggregation_request const> requests,
-    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
-
-  /**
-   * @copydoc aggregate(host_span<aggregation_request const>, rmm::device_async_resource_ref)
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   */
-  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
-    host_span<aggregation_request const> requests,
-    rmm::cuda_stream_view stream,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
   /**
    * @brief Performs grouped scans on the specified values.
@@ -242,6 +234,7 @@ class groupby {
    * ```
    *
    * @param requests The set of columns to scan and the scans to perform
+   * @param stream CUDA stream used for device memory operations and kernel launches.
    * @param mr Device memory resource used to allocate the returned table and columns' device memory
    * @return Pair containing the table with each group's key and
    * a vector of aggregation_results for each request in the same order as
@@ -249,6 +242,7 @@ class groupby {
    */
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> scan(
     host_span<scan_request const> requests,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
@@ -295,6 +289,7 @@ class groupby {
    * @param values Table whose columns to be shifted
    * @param offsets The offsets by which to shift the input
    * @param fill_values Fill values for indeterminable outputs
+   * @param stream CUDA stream used for device memory operations and kernel launches.
    * @param mr Device memory resource used to allocate the returned table and columns' device memory
    * @return Pair containing the tables with each group's key and the columns shifted
    *
@@ -305,6 +300,7 @@ class groupby {
     table_view const& values,
     host_span<size_type const> offsets,
     std::vector<std::reference_wrapper<scalar const>> const& fill_values,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
@@ -329,11 +325,13 @@ class groupby {
    * and the `values` of the `groups` object will be `nullptr`.
    *
    * @param values Table representing values on which a groupby operation is to be performed
+   * @param stream CUDA stream used for device memory operations and kernel launches.
    * @param mr Device memory resource used to allocate the returned tables's device memory in the
    * returned groups
    * @return A `groups` object representing grouped keys and values
    */
   groups get_groups(cudf::table_view values           = {},
+                    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
                     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
@@ -367,6 +365,7 @@ class groupby {
    * @param[in] values A table whose column null values will be replaced
    * @param[in] replace_policies Specify the position of replacement values relative to null values,
    * one for each column
+   * @param[in] stream CUDA stream used for device memory operations and kernel launches.
    * @param[in] mr Device memory resource used to allocate device memory of the returned column
    *
    * @return Pair that contains a table with the sorted keys and the result column
@@ -374,6 +373,7 @@ class groupby {
   std::pair<std::unique_ptr<table>, std::unique_ptr<table>> replace_nulls(
     table_view const& values,
     host_span<cudf::replace_policy const> replace_policies,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
  private:
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 6eb82618e2a..c42038026e5 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -191,13 +191,6 @@ void verify_valid_requests(host_span<RequestType const> requests)
 
 }  // namespace
 
-// Compute aggregation requests
-std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggregate(
-  host_span<aggregation_request const> requests, rmm::device_async_resource_ref mr)
-{
-  return aggregate(requests, cudf::get_default_stream(), mr);
-}
-
 // Compute aggregation requests
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggregate(
   host_span<aggregation_request const> requests,
@@ -220,7 +213,9 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggr
 
 // Compute scan requests
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan(
-  host_span<scan_request const> requests, rmm::device_async_resource_ref mr)
+  host_span<scan_request const> requests,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(
@@ -233,13 +228,14 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan
 
   if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); }
 
-  return sort_scan(requests, cudf::get_default_stream(), mr);
+  return sort_scan(requests, stream, mr);
 }
 
-groupby::groups groupby::get_groups(table_view values, rmm::device_async_resource_ref mr)
+groupby::groups groupby::get_groups(table_view values,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  auto const stream = cudf::get_default_stream();
   auto grouped_keys = helper().sorted_keys(stream, mr);
 
   auto const& group_offsets       = helper().group_offsets(stream);
@@ -262,6 +258,7 @@ groupby::groups groupby::get_groups(table_view values, rmm::device_async_resourc
 std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::replace_nulls(
   table_view const& values,
   host_span<cudf::replace_policy const> replace_policies,
+  rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
@@ -271,7 +268,6 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::replace_nulls
                "Size mismatch between num_columns and replace_policies.");
 
   if (values.is_empty()) { return std::pair(empty_like(_keys), empty_like(values)); }
-  auto const stream = cudf::get_default_stream();
 
   auto const& group_labels = helper().group_labels(stream);
   std::vector<std::unique_ptr<column>> results;
@@ -306,6 +302,7 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
   table_view const& values,
   host_span<size_type const> offsets,
   std::vector<std::reference_wrapper<scalar const>> const& fill_values,
+  rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
@@ -320,7 +317,6 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
                           }),
                "values and fill_value should have the same type.",
                cudf::data_type_error);
-  auto stream = cudf::get_default_stream();
   std::vector<std::unique_ptr<column>> results;
   auto const& group_offsets = helper().group_offsets(stream);
   std::transform(
diff --git a/cpp/tests/streams/groupby_test.cpp b/cpp/tests/streams/groupby_test.cpp
index 03cabbc4de0..73d6d31b282 100644
--- a/cpp/tests/streams/groupby_test.cpp
+++ b/cpp/tests/streams/groupby_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,9 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/type_lists.hpp>
 
+#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
 
 using K = int32_t;  // Key type.
 
@@ -65,3 +67,56 @@ TYPED_TEST(groupby_stream_test, test_count)
   this->test_groupby(make_count_agg(), force_use_sort_impl::YES);
   this->test_groupby(make_count_agg(cudf::null_policy::INCLUDE));
 }
+
+struct GroupbyTest : public cudf::test::BaseFixture {};
+
+TEST_F(GroupbyTest, Scan)
+{
+  using key_wrapper   = cudf::test::fixed_width_column_wrapper<int32_t>;
+  using value_wrapper = cudf::test::fixed_width_column_wrapper<int32_t>;
+
+  key_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4});
+
+  auto agg = cudf::make_min_aggregation<cudf::groupby_scan_aggregation>();
+  std::vector<cudf::groupby::scan_request> requests;
+  requests.emplace_back();
+  requests[0].values = vals;
+  requests[0].aggregations.push_back(std::move(agg));
+
+  cudf::groupby::groupby gb_obj(cudf::table_view({keys}));
+  // cudf::groupby scan uses sort implementation
+  auto result = gb_obj.scan(requests, cudf::test::get_default_stream());
+}
+
+TEST_F(GroupbyTest, Shift)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> key{1, 2, 1, 2, 2, 1, 1};
+  cudf::test::fixed_width_column_wrapper<int32_t> val{3, 4, 5, 6, 7, 8, 9};
+  cudf::size_type offset = 2;
+  auto slr               = cudf::make_default_constructed_scalar(cudf::column_view(val).type(),
+                                                   cudf::test::get_default_stream());
+
+  cudf::groupby::groupby gb_obj(cudf::table_view({key}));
+  std::vector<cudf::size_type> offsets{offset};
+  auto got =
+    gb_obj.shift(cudf::table_view{{val}}, offsets, {*slr}, cudf::test::get_default_stream());
+}
+
+TEST_F(GroupbyTest, GetGroups)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> keys{1, 1, 2, 1, 2, 3};
+  cudf::test::fixed_width_column_wrapper<int32_t> values({0, 0, 1, 1, 2, 2});
+  cudf::groupby::groupby gb(cudf::table_view({keys}));
+  auto gb_groups = gb.get_groups(cudf::table_view({values}), cudf::test::get_default_stream());
+}
+
+TEST_F(GroupbyTest, ReplaceNullsTest)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> key{0, 1, 0, 1, 0, 1};
+  cudf::test::fixed_width_column_wrapper<int32_t> val({42, 7, 24, 10, 1, 1000}, {1, 1, 1, 0, 0, 0});
+  cudf::groupby::groupby gb_obj(cudf::table_view({key}));
+  std::vector<cudf::replace_policy> policies{cudf::replace_policy::PRECEDING};
+  auto p =
+    gb_obj.replace_nulls(cudf::table_view({val}), policies, cudf::test::get_default_stream());
+}

From d93e9c267ac7a1a8792d9fc77d2ba8ab7be2683c Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 25 Nov 2024 05:58:16 -0500
Subject: [PATCH 10/23] Expose stream-ordering to strings attribute APIs
 (#17398)

Adds stream parameter to
```
cudf::strings::count_characters
cudf::strings::count_bytes
cudf::strings::code_points
```
Added stream gtests to verify correct stream forwarding.

Reference: https://github.com/rapidsai/cudf/issues/13744

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17398
---
 cpp/include/cudf/strings/attributes.hpp       |  6 ++
 cpp/src/strings/attributes.cu                 |  9 ++-
 cpp/tests/CMakeLists.txt                      |  1 +
 cpp/tests/streams/strings/attributes_test.cpp | 59 +++++++++++++++++++
 4 files changed, 72 insertions(+), 3 deletions(-)
 create mode 100644 cpp/tests/streams/strings/attributes_test.cpp

diff --git a/cpp/include/cudf/strings/attributes.hpp b/cpp/include/cudf/strings/attributes.hpp
index 5f2eda8fa5b..0de57b556ad 100644
--- a/cpp/include/cudf/strings/attributes.hpp
+++ b/cpp/include/cudf/strings/attributes.hpp
@@ -41,11 +41,13 @@ namespace strings {
  * Any null string will result in a null entry for that row in the output column.
  *
  * @param input Strings instance for this operation
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New column with lengths for each string
  */
 std::unique_ptr<column> count_characters(
   strings_column_view const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
@@ -59,11 +61,13 @@ std::unique_ptr<column> count_characters(
  * Any null string will result in a null entry for that row in the output column.
  *
  * @param input Strings instance for this operation
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New column with the number of bytes for each string
  */
 std::unique_ptr<column> count_bytes(
   strings_column_view const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
@@ -79,11 +83,13 @@ std::unique_ptr<column> count_bytes(
  * Any null string is ignored. No null entries will appear in the output column.
  *
  * @param input Strings instance for this operation
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New INT32 column with code point integer values for each character
  */
 std::unique_ptr<column> code_points(
   strings_column_view const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of strings_apis group
diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu
index c56d25fde2b..1c14cab4b1f 100644
--- a/cpp/src/strings/attributes.cu
+++ b/cpp/src/strings/attributes.cu
@@ -264,24 +264,27 @@ std::unique_ptr<column> code_points(strings_column_view const& input,
 // external APIS
 
 std::unique_ptr<column> count_characters(strings_column_view const& input,
+                                         rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::count_characters(input, cudf::get_default_stream(), mr);
+  return detail::count_characters(input, stream, mr);
 }
 
 std::unique_ptr<column> count_bytes(strings_column_view const& input,
+                                    rmm::cuda_stream_view stream,
                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::count_bytes(input, cudf::get_default_stream(), mr);
+  return detail::count_bytes(input, stream, mr);
 }
 
 std::unique_ptr<column> code_points(strings_column_view const& input,
+                                    rmm::cuda_stream_view stream,
                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::code_points(input, cudf::get_default_stream(), mr);
+  return detail::code_points(input, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 91c00d6af34..8928d27a871 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -724,6 +724,7 @@ ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_STREAM_COMPACTION_TEST streams/stream_compaction_test.cpp STREAM_MODE testing)
 ConfigureTest(
   STREAM_STRINGS_TEST
+  streams/strings/attributes_test.cpp
   streams/strings/case_test.cpp
   streams/strings/combine_test.cpp
   streams/strings/contains_test.cpp
diff --git a/cpp/tests/streams/strings/attributes_test.cpp b/cpp/tests/streams/strings/attributes_test.cpp
new file mode 100644
index 00000000000..e15681eb8a7
--- /dev/null
+++ b/cpp/tests/streams/strings/attributes_test.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/strings/attributes.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+struct StringsAttributesTest : public cudf::test::BaseFixture {};
+
+TEST_F(StringsAttributesTest, CodePoints)
+{
+  std::vector<char const*> h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"};
+  cudf::test::strings_column_wrapper strings(
+    h_strings.begin(),
+    h_strings.end(),
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  auto strings_view = cudf::strings_column_view(strings);
+
+  auto results = cudf::strings::code_points(strings_view, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsAttributesTest, CountCharacters)
+{
+  std::vector<std::string> h_strings(
+    40000, "something a bit longer than 32 bytes ééé ééé ééé ééé ééé ééé ééé");
+  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
+  auto strings_view = cudf::strings_column_view(strings);
+
+  auto results = cudf::strings::count_characters(strings_view, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsAttributesTest, CountBytes)
+{
+  std::vector<char const*> h_strings{
+    "eee", "bb", nullptr, "", "aa", "ééé", "something a bit longer than 32 bytes"};
+  cudf::test::strings_column_wrapper strings(
+    h_strings.begin(),
+    h_strings.end(),
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  auto strings_view = cudf::strings_column_view(strings);
+
+  auto results = cudf::strings::count_bytes(strings_view, cudf::test::get_default_stream());
+}

From f05e89db8f6750232a452d072fa9f9ea988a6b34 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 25 Nov 2024 13:03:54 -0600
Subject: [PATCH 11/23] Single-partition Dask executor for cuDF-Polars (#17262)

The goal here is to lay down the initial foundation for dask-based evaluation of `IR` graphs in cudf-polars. The first pass will only support single-partition workloads. This functionality could be achieved with much less-complicated changes to cudf-polars. However, we **do** want to build multi-partition support on top of this.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/17262
---
 ci/run_cudf_polars_pytests.sh                 |   4 +
 python/cudf_polars/cudf_polars/callback.py    |  18 +-
 python/cudf_polars/cudf_polars/dsl/ir.py      |  25 +-
 .../cudf_polars/cudf_polars/dsl/translate.py  |   3 +-
 .../cudf_polars/experimental/parallel.py      | 236 ++++++++++++++++++
 .../cudf_polars/testing/asserts.py            |  11 +-
 python/cudf_polars/tests/conftest.py          |  16 ++
 .../tests/experimental/test_parallel.py       |  21 ++
 python/cudf_polars/tests/test_executors.py    |  68 +++++
 9 files changed, 388 insertions(+), 14 deletions(-)
 create mode 100644 python/cudf_polars/cudf_polars/experimental/parallel.py
 create mode 100644 python/cudf_polars/tests/experimental/test_parallel.py
 create mode 100644 python/cudf_polars/tests/test_executors.py

diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh
index c10612a065a..bf5a3ccee8e 100755
--- a/ci/run_cudf_polars_pytests.sh
+++ b/ci/run_cudf_polars_pytests.sh
@@ -8,4 +8,8 @@ set -euo pipefail
 # Support invoking run_cudf_polars_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/
 
+# Test the default "cudf" executor
 python -m pytest --cache-clear "$@" tests
+
+# Test the "dask-experimental" executor
+python -m pytest --cache-clear "$@" tests --executor dask-experimental
diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index 8dc5715195d..95527028aa9 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -9,7 +9,7 @@
 import os
 import warnings
 from functools import cache, partial
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 import nvtx
 
@@ -181,6 +181,7 @@ def _callback(
     *,
     device: int | None,
     memory_resource: int | None,
+    executor: Literal["pylibcudf", "dask-experimental"] | None,
 ) -> pl.DataFrame:
     assert with_columns is None
     assert pyarrow_predicate is None
@@ -191,7 +192,14 @@ def _callback(
         set_device(device),
         set_memory_resource(memory_resource),
     ):
-        return ir.evaluate(cache={}).to_polars()
+        if executor is None or executor == "pylibcudf":
+            return ir.evaluate(cache={}).to_polars()
+        elif executor == "dask-experimental":
+            from cudf_polars.experimental.parallel import evaluate_dask
+
+            return evaluate_dask(ir).to_polars()
+        else:
+            raise ValueError(f"Unknown executor '{executor}'")
 
 
 def validate_config_options(config: dict) -> None:
@@ -208,7 +216,9 @@ def validate_config_options(config: dict) -> None:
     ValueError
         If the configuration contains unsupported options.
     """
-    if unsupported := (config.keys() - {"raise_on_fail", "parquet_options"}):
+    if unsupported := (
+        config.keys() - {"raise_on_fail", "parquet_options", "executor"}
+    ):
         raise ValueError(
             f"Engine configuration contains unsupported settings: {unsupported}"
         )
@@ -243,6 +253,7 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     device = config.device
     memory_resource = config.memory_resource
     raise_on_fail = config.config.get("raise_on_fail", False)
+    executor = config.config.get("executor", None)
     validate_config_options(config.config)
 
     with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
@@ -272,5 +283,6 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
                     ir,
                     device=device,
                     memory_resource=memory_resource,
+                    executor=executor,
                 )
             )
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 62a2da9dcea..6617b71be81 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -1599,13 +1599,15 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR):
                 # polars requires that all to-explode columns have the
                 # same sub-shapes
                 raise NotImplementedError("Explode with more than one column")
+            self.options = (tuple(to_explode),)
         elif self.name == "rename":
-            old, new, _ = self.options
+            old, new, strict = self.options
             # TODO: perhaps polars should validate renaming in the IR?
             if len(new) != len(set(new)) or (
                 set(new) & (set(df.schema.keys()) - set(old))
             ):
                 raise NotImplementedError("Duplicate new names in rename.")
+            self.options = (tuple(old), tuple(new), strict)
         elif self.name == "unpivot":
             indices, pivotees, variable_name, value_name = self.options
             value_name = "value" if value_name is None else value_name
@@ -1623,13 +1625,15 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR):
             self.options = (
                 tuple(indices),
                 tuple(pivotees),
-                (variable_name, schema[variable_name]),
-                (value_name, schema[value_name]),
+                variable_name,
+                value_name,
             )
-        self._non_child_args = (name, self.options)
+        self._non_child_args = (schema, name, self.options)
 
     @classmethod
-    def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame:
+    def do_evaluate(
+        cls, schema: Schema, name: str, options: Any, df: DataFrame
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         if name == "rechunk":
             # No-op in our data model
@@ -1651,8 +1655,8 @@ def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame:
             (
                 indices,
                 pivotees,
-                (variable_name, variable_dtype),
-                (value_name, value_dtype),
+                variable_name,
+                value_name,
             ) = options
             npiv = len(pivotees)
             index_columns = [
@@ -1669,7 +1673,7 @@ def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame:
                         plc.interop.from_arrow(
                             pa.array(
                                 pivotees,
-                                type=plc.interop.to_arrow(variable_dtype),
+                                type=plc.interop.to_arrow(schema[variable_name]),
                             ),
                         )
                     ]
@@ -1677,7 +1681,10 @@ def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame:
                 df.num_rows,
             ).columns()
             value_column = plc.concatenate.concatenate(
-                [df.column_map[pivotee].astype(value_dtype).obj for pivotee in pivotees]
+                [
+                    df.column_map[pivotee].astype(schema[value_name]).obj
+                    for pivotee in pivotees
+                ]
             )
             return DataFrame(
                 [
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 12fc2a196cd..9480ce6e535 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -633,9 +633,10 @@ def _(node: pl_expr.Sort, translator: Translator, dtype: plc.DataType) -> expr.E
 
 @_translate_expr.register
 def _(node: pl_expr.SortBy, translator: Translator, dtype: plc.DataType) -> expr.Expr:
+    options = node.sort_options
     return expr.SortBy(
         dtype,
-        node.sort_options,
+        (options[0], tuple(options[1]), tuple(options[2])),
         translator.translate_expr(n=node.expr),
         *(translator.translate_expr(n=n) for n in node.by),
     )
diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py
new file mode 100644
index 00000000000..6518dd60c7d
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/experimental/parallel.py
@@ -0,0 +1,236 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Partitioned LogicalPlan nodes."""
+
+from __future__ import annotations
+
+import operator
+from functools import reduce, singledispatch
+from typing import TYPE_CHECKING, Any
+
+from cudf_polars.dsl.ir import IR
+from cudf_polars.dsl.traversal import traversal
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+    from typing import TypeAlias
+
+    from cudf_polars.containers import DataFrame
+    from cudf_polars.dsl.nodebase import Node
+    from cudf_polars.typing import GenericTransformer
+
+
+class PartitionInfo:
+    """
+    Partitioning information.
+
+    This class only tracks the partition count (for now).
+    """
+
+    __slots__ = ("count",)
+
+    def __init__(self, count: int):
+        self.count = count
+
+
+LowerIRTransformer: TypeAlias = (
+    "GenericTransformer[IR, MutableMapping[IR, PartitionInfo]]"
+)
+"""Protocol for Lowering IR nodes."""
+
+
+def get_key_name(node: Node) -> str:
+    """Generate the key name for a Node."""
+    return f"{type(node).__name__.lower()}-{hash(node)}"
+
+
+@singledispatch
+def lower_ir_node(
+    ir: IR, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    """
+    Rewrite an IR node and extract partitioning information.
+
+    Parameters
+    ----------
+    ir
+        IR node to rewrite.
+    rec
+        Recursive LowerIRTransformer callable.
+
+    Returns
+    -------
+    new_ir, partition_info
+        The rewritten node, and a mapping from unique nodes in
+        the full IR graph to associated partitioning information.
+
+    Notes
+    -----
+    This function is used by `lower_ir_graph`.
+
+    See Also
+    --------
+    lower_ir_graph
+    """
+    raise AssertionError(f"Unhandled type {type(ir)}")  # pragma: no cover
+
+
+@lower_ir_node.register(IR)
+def _(ir: IR, rec: LowerIRTransformer) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    if len(ir.children) == 0:
+        # Default leaf node has single partition
+        return ir, {ir: PartitionInfo(count=1)}
+
+    # Lower children
+    children, _partition_info = zip(*(rec(c) for c in ir.children), strict=False)
+    partition_info = reduce(operator.or_, _partition_info)
+
+    # Check that child partitioning is supported
+    count = max(partition_info[c].count for c in children)
+    if count > 1:
+        raise NotImplementedError(
+            f"Class {type(ir)} does not support multiple partitions."
+        )  # pragma: no cover
+
+    # Return reconstructed node and partition-info dict
+    partition = PartitionInfo(count=1)
+    new_node = ir.reconstruct(children)
+    partition_info[new_node] = partition
+    return new_node, partition_info
+
+
+def lower_ir_graph(ir: IR) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    """
+    Rewrite an IR graph and extract partitioning information.
+
+    Parameters
+    ----------
+    ir
+        Root of the graph to rewrite.
+
+    Returns
+    -------
+    new_ir, partition_info
+        The rewritten graph, and a mapping from unique nodes
+        in the new graph to associated partitioning information.
+
+    Notes
+    -----
+    This function traverses the unique nodes of the graph with
+    root `ir`, and applies :func:`lower_ir_node` to each node.
+
+    See Also
+    --------
+    lower_ir_node
+    """
+    from cudf_polars.dsl.traversal import CachingVisitor
+
+    mapper = CachingVisitor(lower_ir_node)
+    return mapper(ir)
+
+
+@singledispatch
+def generate_ir_tasks(
+    ir: IR, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    """
+    Generate a task graph for evaluation of an IR node.
+
+    Parameters
+    ----------
+    ir
+        IR node to generate tasks for.
+    partition_info
+        Partitioning information, obtained from :func:`lower_ir_graph`.
+
+    Returns
+    -------
+    mapping
+        A (partial) dask task graph for the evaluation of an ir node.
+
+    Notes
+    -----
+    Task generation should only produce the tasks for the current node,
+    referring to child tasks by name.
+
+    See Also
+    --------
+    task_graph
+    """
+    raise AssertionError(f"Unhandled type {type(ir)}")  # pragma: no cover
+
+
+@generate_ir_tasks.register(IR)
+def _(
+    ir: IR, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    # Single-partition default behavior.
+    # This is used by `generate_ir_tasks` for all unregistered IR sub-types.
+    if partition_info[ir].count > 1:
+        raise NotImplementedError(
+            f"Failed to generate multiple output tasks for {ir}."
+        )  # pragma: no cover
+
+    child_names = []
+    for child in ir.children:
+        child_names.append(get_key_name(child))
+        if partition_info[child].count > 1:
+            raise NotImplementedError(
+                f"Failed to generate tasks for {ir} with child {child}."
+            )  # pragma: no cover
+
+    key_name = get_key_name(ir)
+    return {
+        (key_name, 0): (
+            ir.do_evaluate,
+            *ir._non_child_args,
+            *((child_name, 0) for child_name in child_names),
+        )
+    }
+
+
+def task_graph(
+    ir: IR, partition_info: MutableMapping[IR, PartitionInfo]
+) -> tuple[MutableMapping[Any, Any], str | tuple[str, int]]:
+    """
+    Construct a task graph for evaluation of an IR graph.
+
+    Parameters
+    ----------
+    ir
+        Root of the graph to generate tasks for.
+    partition_info
+        A mapping from all unique IR nodes to the
+        associated partitioning information.
+
+    Returns
+    -------
+    graph
+        A Dask-compatible task graph for the entire
+        IR graph with root `ir`.
+
+    Notes
+    -----
+    This function traverses the unique nodes of the
+    graph with root `ir`, and extracts the tasks for
+    each node with :func:`generate_ir_tasks`.
+
+    See Also
+    --------
+    generate_ir_tasks
+    """
+    graph = reduce(
+        operator.or_,
+        (generate_ir_tasks(node, partition_info) for node in traversal(ir)),
+    )
+    return graph, (get_key_name(ir), 0)
+
+
+def evaluate_dask(ir: IR) -> DataFrame:
+    """Evaluate an IR graph with Dask."""
+    from dask import get
+
+    ir, partition_info = lower_ir_graph(ir)
+
+    graph, key = task_graph(ir, partition_info)
+    return get(graph, key)
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index ba0bb12a0fb..d986f150b2e 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -20,6 +20,11 @@
 __all__: list[str] = ["assert_gpu_result_equal", "assert_ir_translation_raises"]
 
 
+# Will be overridden by `conftest.py` with the value from the `--executor`
+# command-line argument
+Executor = None
+
+
 def assert_gpu_result_equal(
     lazydf: pl.LazyFrame,
     *,
@@ -34,6 +39,7 @@ def assert_gpu_result_equal(
     rtol: float = 1e-05,
     atol: float = 1e-08,
     categorical_as_str: bool = False,
+    executor: str | None = None,
 ) -> None:
     """
     Assert that collection of a lazyframe on GPU produces correct results.
@@ -71,6 +77,9 @@ def assert_gpu_result_equal(
         Absolute tolerance for float comparisons
     categorical_as_str
         Decat categoricals to strings before comparing
+    executor
+        The executor configuration to pass to `GPUEngine`. If not specified,
+        uses the module-level `Executor` attribute.
 
     Raises
     ------
@@ -80,7 +89,7 @@ def assert_gpu_result_equal(
         If GPU collection failed in some way.
     """
     if engine is None:
-        engine = GPUEngine(raise_on_fail=True)
+        engine = GPUEngine(raise_on_fail=True, executor=executor or Executor)
 
     final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs(
         collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs
diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py
index 9bbce6bc080..6338bf0cae1 100644
--- a/python/cudf_polars/tests/conftest.py
+++ b/python/cudf_polars/tests/conftest.py
@@ -8,3 +8,19 @@
 @pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"], scope="session")
 def with_nulls(request):
     return request.param
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--executor",
+        action="store",
+        default="pylibcudf",
+        choices=("pylibcudf", "dask-experimental"),
+        help="Executor to use for GPUEngine.",
+    )
+
+
+def pytest_configure(config):
+    import cudf_polars.testing.asserts
+
+    cudf_polars.testing.asserts.Executor = config.getoption("--executor")
diff --git a/python/cudf_polars/tests/experimental/test_parallel.py b/python/cudf_polars/tests/experimental/test_parallel.py
new file mode 100644
index 00000000000..d46ab88eebf
--- /dev/null
+++ b/python/cudf_polars/tests/experimental/test_parallel.py
@@ -0,0 +1,21 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import polars as pl
+from polars import GPUEngine
+from polars.testing import assert_frame_equal
+
+
+def test_evaluate_dask():
+    df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": [5, 6, 7], "d": [7, 9, 8]})
+    q = df.select(pl.col("a") - (pl.col("b") + pl.col("c") * 2), pl.col("d")).sort("d")
+
+    expected = q.collect(engine="cpu")
+    got_gpu = q.collect(engine=GPUEngine(raise_on_fail=True))
+    got_dask = q.collect(
+        engine=GPUEngine(raise_on_fail=True, executor="dask-experimental")
+    )
+    assert_frame_equal(expected, got_gpu)
+    assert_frame_equal(expected, got_dask)
diff --git a/python/cudf_polars/tests/test_executors.py b/python/cudf_polars/tests/test_executors.py
new file mode 100644
index 00000000000..3eaea2ec9ea
--- /dev/null
+++ b/python/cudf_polars/tests/test_executors.py
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize("executor", [None, "pylibcudf", "dask-experimental"])
+def test_executor_basics(executor):
+    if executor == "dask-experimental":
+        pytest.importorskip("dask")
+
+    df = pl.LazyFrame(
+        {
+            "a": pl.Series([[1, 2], [3]], dtype=pl.List(pl.Int8())),
+            "b": pl.Series([[1], [2]], dtype=pl.List(pl.UInt16())),
+            "c": pl.Series(
+                [
+                    [["1", "2", "3"], ["4", "567"]],
+                    [["8", "9"], []],
+                ],
+                dtype=pl.List(pl.List(pl.String())),
+            ),
+            "d": pl.Series([[[1, 2]], []], dtype=pl.List(pl.List(pl.UInt16()))),
+        }
+    )
+
+    assert_gpu_result_equal(df, executor=executor)
+
+
+def test_cudf_cache_evaluate():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+    ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"), pl.col("a"))
+    query = pl.concat([ldf, ldf2], how="diagonal")
+    assert_gpu_result_equal(query, executor="pylibcudf")
+
+
+def test_dask_experimental_map_function_get_hashable():
+    df = pl.LazyFrame(
+        {
+            "a": pl.Series([11, 12, 13], dtype=pl.UInt16),
+            "b": pl.Series([1, 3, 5], dtype=pl.Int16),
+            "c": pl.Series([2, 4, 6], dtype=pl.Float32),
+            "d": ["a", "b", "c"],
+        }
+    )
+    q = df.unpivot(index="d")
+    assert_gpu_result_equal(q, executor="dask-experimental")
+
+
+def test_unknown_executor():
+    df = pl.LazyFrame({})
+
+    with pytest.raises(
+        pl.exceptions.ComputeError,
+        match="ValueError: Unknown executor 'unknown-executor'",
+    ):
+        assert_gpu_result_equal(df, executor="unknown-executor")

From 4e3afd26127ebd0c04b739032873d3fce01eb1b7 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 25 Nov 2024 20:00:11 -0500
Subject: [PATCH 12/23] Change binops for-each kernel to thrust::for_each_n
 (#17419)

Replaces the custom `for_each_kernel` in `binary_ops.cuh` with `thrust::for_each_n`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17419
---
 cpp/src/binaryop/compiled/binary_ops.cuh  | 56 ++++-------------------
 cpp/src/binaryop/compiled/equality_ops.cu | 38 ++++++++-------
 2 files changed, 31 insertions(+), 63 deletions(-)

diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index 06987139188..ec63504a414 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -244,44 +244,6 @@ struct binary_op_double_device_dispatcher {
   }
 };
 
-/**
- * @brief Simplified for_each kernel
- *
- * @param size number of elements to process.
- * @param f Functor object to call for each element.
- */
-template <typename Functor>
-CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f)
-{
-  auto start        = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
-
-#pragma unroll
-  for (auto i = start; i < size; i += stride) {
-    f(i);
-  }
-}
-
-/**
- * @brief Launches Simplified for_each kernel with maximum occupancy grid dimensions.
- *
- * @tparam Functor
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param size number of elements to process.
- * @param f Functor object to call for each element.
- */
-template <typename Functor>
-void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
-{
-  int block_size;
-  int min_grid_size;
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel<decltype(f)>));
-  auto grid = cudf::detail::grid_1d(size, block_size, 2 /* elements_per_thread */);
-  for_each_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    size, std::forward<Functor&&>(f));
-}
-
 template <class BinaryOperator>
 void apply_binary_op(mutable_column_view& out,
                      column_view const& lhs,
@@ -298,16 +260,18 @@ void apply_binary_op(mutable_column_view& out,
   // Create binop functor instance
   if (common_dtype) {
     // Execute it on every element
-    for_each(stream,
-             out.size(),
-             binary_op_device_dispatcher<BinaryOperator>{
-               *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator<size_type>(0),
+                       out.size(),
+                       binary_op_device_dispatcher<BinaryOperator>{
+                         *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
   } else {
     // Execute it on every element
-    for_each(stream,
-             out.size(),
-             binary_op_double_device_dispatcher<BinaryOperator>{
-               *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator<size_type>(0),
+                       out.size(),
+                       binary_op_double_device_dispatcher<BinaryOperator>{
+                         *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
   }
 }
 
diff --git a/cpp/src/binaryop/compiled/equality_ops.cu b/cpp/src/binaryop/compiled/equality_ops.cu
index 041fca76494..d8c50683026 100644
--- a/cpp/src/binaryop/compiled/equality_ops.cu
+++ b/cpp/src/binaryop/compiled/equality_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,27 +34,31 @@ void dispatch_equality_op(mutable_column_view& out,
   auto rhsd         = column_device_view::create(rhs, stream);
   if (common_dtype) {
     if (op == binary_operator::EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_device_dispatcher<ops::Equal>{
-                 *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_device_dispatcher<ops::Equal>{
+                           *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     } else if (op == binary_operator::NOT_EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_device_dispatcher<ops::NotEqual>{
-                 *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_device_dispatcher<ops::NotEqual>{
+                           *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     }
   } else {
     if (op == binary_operator::EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_double_device_dispatcher<ops::Equal>{
-                 *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_double_device_dispatcher<ops::Equal>{
+                           *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     } else if (op == binary_operator::NOT_EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_double_device_dispatcher<ops::NotEqual>{
-                 *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_double_device_dispatcher<ops::NotEqual>{
+                           *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     }
   }
 }

From ccc8833170b8099bf0de56f885ad1fdad5c43ada Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 25 Nov 2024 18:19:13 -0800
Subject: [PATCH 13/23] Remove cudf._lib.timezone in favor of inlining
 pylibcudf (#17366)

Contributes to https://github.com/rapidsai/cudf/issues/17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17366
---
 python/cudf/cudf/_lib/CMakeLists.txt          |  1 -
 python/cudf/cudf/_lib/__init__.py             |  1 -
 python/cudf/cudf/_lib/timezone.pyx            | 10 ----------
 python/cudf/cudf/core/_internals/timezones.py | 11 ++++++++---
 4 files changed, 8 insertions(+), 15 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/timezone.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 1c2b24d2391..391c0eac858 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -41,7 +41,6 @@ set(cython_sources
     string_casting.pyx
     strings_udf.pyx
     text.pyx
-    timezone.pyx
     transform.pyx
     types.pyx
     utils.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 13d05033c11..8ce6d5bf5c9 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -28,7 +28,6 @@
     strings,
     strings_udf,
     text,
-    timezone,
 )
 
 MAX_COLUMN_SIZE = np.iinfo(np.int32).max
diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx
deleted file mode 100644
index 54624a5a2fd..00000000000
--- a/python/cudf/cudf/_lib/timezone.pyx
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-import pylibcudf as plc
-
-from cudf._lib.column cimport Column
-
-
-def make_timezone_transition_table(tzdir, tzname):
-    plc_table = plc.io.timezone.make_timezone_transition_table(tzdir, tzname)
-    return [Column.from_pylibcudf(col) for col in plc_table.columns()]
diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
index fd89904e766..4d001577581 100644
--- a/python/cudf/cudf/core/_internals/timezones.py
+++ b/python/cudf/cudf/core/_internals/timezones.py
@@ -10,8 +10,10 @@
 import numpy as np
 import pandas as pd
 
+import pylibcudf as plc
+
 import cudf
-from cudf._lib.timezone import make_timezone_transition_table
+from cudf._lib.column import Column
 
 if TYPE_CHECKING:
     from cudf.core.column.datetime import DatetimeColumn
@@ -109,11 +111,14 @@ def _find_and_read_tzfile_tzdata(
 
 
 def _read_tzfile_as_columns(
-    tzdir, zone_name: str
+    tzdir: str, zone_name: str
 ) -> tuple[DatetimeColumn, TimeDeltaColumn]:
-    transition_times_and_offsets = make_timezone_transition_table(
+    plc_table = plc.io.timezone.make_timezone_transition_table(
         tzdir, zone_name
     )
+    transition_times_and_offsets = [
+        Column.from_pylibcudf(col) for col in plc_table.columns()
+    ]
 
     if not transition_times_and_offsets:
         from cudf.core.column.column import as_column

From df177400e2aa956651f3ee5343416b5fb3d9e86c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 25 Nov 2024 18:21:48 -0800
Subject: [PATCH 14/23] Remove cudf._lib.partitioning in favor of inlining
 pylibcudf (#17369)

Contributes to https://github.com/rapidsai/cudf/issues/17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17369
---
 python/cudf/cudf/_lib/CMakeLists.txt   |  1 -
 python/cudf/cudf/_lib/__init__.py      |  1 -
 python/cudf/cudf/_lib/partitioning.pyx | 53 --------------------------
 python/cudf/cudf/core/dataframe.py     | 43 +++++++++++++++++++--
 4 files changed, 39 insertions(+), 59 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/partitioning.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 391c0eac858..b430d9f3d76 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -29,7 +29,6 @@ set(cython_sources
     null_mask.pyx
     orc.pyx
     parquet.pyx
-    partitioning.pyx
     reduce.pyx
     replace.pyx
     reshape.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 8ce6d5bf5c9..942e32747bc 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -16,7 +16,6 @@
     nvtext,
     orc,
     parquet,
-    partitioning,
     reduce,
     replace,
     reshape,
diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx
deleted file mode 100644
index 13997da8403..00000000000
--- a/python/cudf/cudf/_lib/partitioning.pyx
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-
-from cudf._lib.reduce import minmax
-from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count
-
-
-@acquire_spill_lock()
-def partition(list source_columns, Column partition_map,
-              object num_partitions):
-    """Partition source columns given a partitioning map
-
-    Parameters
-    ----------
-    source_columns: list[Column]
-        Columns to partition
-    partition_map: Column
-        Column of integer values that map each row in the input to a
-        partition
-    num_partitions: Optional[int]
-        Number of output partitions (deduced from unique values in
-        partition_map if None)
-
-    Returns
-    -------
-    Pair of reordered columns and partition offsets
-
-    Raises
-    ------
-    ValueError
-        If the partition map has invalid entries (not all in [0,
-        num_partitions)).
-    """
-
-    if num_partitions is None:
-        num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True)
-
-    if partition_map.size > 0:
-        lo, hi = minmax(partition_map)
-        if lo < 0 or hi >= num_partitions:
-            raise ValueError("Partition map has invalid values")
-
-    plc_table, offsets = plc.partitioning.partition(
-        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]),
-        partition_map.to_pylibcudf(mode="read"),
-        num_partitions
-    )
-    return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 728cc47a7c9..7d523d2c5ad 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2487,11 +2487,46 @@ def scatter_by_map(
                     f"ERROR: map_size must be >= {count} (got {map_size})."
                 )
 
-        partitioned_columns, output_offsets = libcudf.partitioning.partition(
-            [*(self.index._columns if keep_index else ()), *self._columns],
-            map_index,
-            map_size,
+        source_columns = (
+            itertools.chain(self.index._columns, self._columns)
+            if keep_index
+            else self._columns
         )
+
+        with acquire_spill_lock():
+            if map_size is None:
+                map_size = plc.stream_compaction.distinct_count(
+                    map_index.to_pylibcudf(mode="read"),
+                    plc.types.NullPolicy.EXCLUDE,
+                    plc.types.NanPolicy.NAN_IS_VALID,
+                )
+
+            if map_index.size > 0:
+                plc_lo, plc_hi = plc.reduce.minmax(
+                    map_index.to_pylibcudf(mode="read")
+                )
+                # TODO: Use pylibcudf Scalar once APIs are more developed
+                lo = libcudf.column.Column.from_pylibcudf(
+                    plc.Column.from_scalar(plc_lo, 1)
+                ).element_indexing(0)
+                hi = libcudf.column.Column.from_pylibcudf(
+                    plc.Column.from_scalar(plc_hi, 1)
+                ).element_indexing(0)
+                if lo < 0 or hi >= map_size:
+                    raise ValueError("Partition map has invalid values")
+
+            plc_table, output_offsets = plc.partitioning.partition(
+                plc.Table(
+                    [col.to_pylibcudf(mode="read") for col in source_columns]
+                ),
+                map_index.to_pylibcudf(mode="read"),
+                map_size,
+            )
+            partitioned_columns = [
+                libcudf.column.Column.from_pylibcudf(col)
+                for col in plc_table.columns()
+            ]
+
         partitioned = self._from_columns_like_self(
             partitioned_columns,
             column_names=self._column_names,

From d8277bffacc8fd91f4009be310b5347e9c8a6397 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 25 Nov 2024 18:23:24 -0800
Subject: [PATCH 15/23] Remove cudf._lib.lists in favor of inlining pylibcudf
 (#17425)

Contributes to https://github.com/rapidsai/cudf/issues/17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17425
---
 python/cudf/cudf/_lib/CMakeLists.txt   |   1 -
 python/cudf/cudf/_lib/lists.pyx        | 150 ---------------------
 python/cudf/cudf/core/column/lists.py  | 180 ++++++++++++++++++++-----
 python/cudf/cudf/core/indexed_frame.py |  19 ++-
 4 files changed, 159 insertions(+), 191 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/lists.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index b430d9f3d76..a0457120935 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -24,7 +24,6 @@ set(cython_sources
     interop.pyx
     join.pyx
     json.pyx
-    lists.pyx
     merge.pyx
     null_mask.pyx
     orc.pyx
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
deleted file mode 100644
index 90a137dd546..00000000000
--- a/python/cudf/cudf/_lib/lists.pyx
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp cimport bool
-
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_pylibcudf_table
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def count_elements(Column col):
-    return Column.from_pylibcudf(
-        plc.lists.count_elements(
-            col.to_pylibcudf(mode="read"))
-    )
-
-
-@acquire_spill_lock()
-def explode_outer(list source_columns, int explode_column_idx):
-    return columns_from_pylibcudf_table(
-        plc.lists.explode_outer(
-            plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
-            explode_column_idx,
-        )
-    )
-
-
-@acquire_spill_lock()
-def distinct(Column col, bool nulls_equal, bool nans_all_equal):
-    return Column.from_pylibcudf(
-        plc.lists.distinct(
-            col.to_pylibcudf(mode="read"),
-            (
-                plc.types.NullEquality.EQUAL
-                if nulls_equal
-                else plc.types.NullEquality.UNEQUAL
-            ),
-            (
-                plc.types.NanEquality.ALL_EQUAL
-                if nans_all_equal
-                else plc.types.NanEquality.UNEQUAL
-            ),
-        )
-    )
-
-
-@acquire_spill_lock()
-def sort_lists(Column col, bool ascending, str na_position):
-    return Column.from_pylibcudf(
-        plc.lists.sort_lists(
-            col.to_pylibcudf(mode="read"),
-            plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING,
-            (
-                plc.types.NullOrder.BEFORE
-                if na_position == "first"
-                else plc.types.NullOrder.AFTER
-            ),
-            False,
-        )
-    )
-
-
-@acquire_spill_lock()
-def extract_element_scalar(Column col, size_type index):
-    return Column.from_pylibcudf(
-        plc.lists.extract_list_element(
-            col.to_pylibcudf(mode="read"),
-            index,
-        )
-    )
-
-
-@acquire_spill_lock()
-def extract_element_column(Column col, Column index):
-    return Column.from_pylibcudf(
-        plc.lists.extract_list_element(
-            col.to_pylibcudf(mode="read"),
-            index.to_pylibcudf(mode="read"),
-        )
-    )
-
-
-@acquire_spill_lock()
-def contains_scalar(Column col, py_search_key):
-    return Column.from_pylibcudf(
-        plc.lists.contains(
-            col.to_pylibcudf(mode="read"),
-            py_search_key.device_value.c_value,
-        )
-    )
-
-
-@acquire_spill_lock()
-def index_of_scalar(Column col, object py_search_key):
-    return Column.from_pylibcudf(
-        plc.lists.index_of(
-            col.to_pylibcudf(mode="read"),
-            py_search_key.device_value.c_value,
-            plc.lists.DuplicateFindOption.FIND_FIRST,
-        )
-    )
-
-
-@acquire_spill_lock()
-def index_of_column(Column col, Column search_keys):
-    return Column.from_pylibcudf(
-        plc.lists.index_of(
-            col.to_pylibcudf(mode="read"),
-            search_keys.to_pylibcudf(mode="read"),
-            plc.lists.DuplicateFindOption.FIND_FIRST,
-        )
-    )
-
-
-@acquire_spill_lock()
-def concatenate_rows(list source_columns):
-    return Column.from_pylibcudf(
-        plc.lists.concatenate_rows(
-            plc.Table([
-                c.to_pylibcudf(mode="read") for c in source_columns
-            ])
-        )
-    )
-
-
-@acquire_spill_lock()
-def concatenate_list_elements(Column input_column, dropna=False):
-    return Column.from_pylibcudf(
-        plc.lists.concatenate_list_elements(
-            input_column.to_pylibcudf(mode="read"),
-            plc.lists.ConcatenateNullPolicy.IGNORE
-            if dropna
-            else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW,
-        )
-    )
-
-
-@acquire_spill_lock()
-def segmented_gather(Column source_column, Column gather_map):
-    return Column.from_pylibcudf(
-        plc.lists.segmented_gather(
-            source_column.to_pylibcudf(mode="read"),
-            gather_map.to_pylibcudf(mode="read"),
-        )
-    )
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 6b25e568f00..9962663e811 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -2,31 +2,21 @@
 
 from __future__ import annotations
 
+import itertools
 from functools import cached_property
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Literal, cast
 
-import numpy as np
 import pandas as pd
 import pyarrow as pa
 from typing_extensions import Self
 
+import pylibcudf as plc
+
 import cudf
-from cudf._lib.lists import (
-    concatenate_list_elements,
-    concatenate_rows,
-    contains_scalar,
-    count_elements,
-    distinct,
-    extract_element_column,
-    extract_element_scalar,
-    index_of_column,
-    index_of_scalar,
-    segmented_gather,
-    sort_lists,
-)
 from cudf._lib.strings.convert.convert_lists import format_list_column
 from cudf._lib.types import size_type_dtype
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethods, ParentType
 from cudf.core.column.numerical import NumericalColumn
@@ -139,7 +129,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             return NotImplemented
         if isinstance(other.dtype, ListDtype):
             if op == "__add__":
-                return concatenate_rows([self, other])
+                return self.concatenate_rows([other])  # type: ignore[list-item]
             else:
                 raise NotImplementedError(
                     "Lists concatenation for this operation is not yet"
@@ -326,6 +316,129 @@ def to_pandas(
         else:
             return pd.Index(self.to_arrow().tolist(), dtype="object")
 
+    @acquire_spill_lock()
+    def count_elements(self) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.count_elements(self.to_pylibcudf(mode="read"))
+        )
+
+    @acquire_spill_lock()
+    def distinct(self, nulls_equal: bool, nans_all_equal: bool) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.distinct(
+                self.to_pylibcudf(mode="read"),
+                (
+                    plc.types.NullEquality.EQUAL
+                    if nulls_equal
+                    else plc.types.NullEquality.UNEQUAL
+                ),
+                (
+                    plc.types.NanEquality.ALL_EQUAL
+                    if nans_all_equal
+                    else plc.types.NanEquality.UNEQUAL
+                ),
+            )
+        )
+
+    @acquire_spill_lock()
+    def sort_lists(
+        self, ascending: bool, na_position: Literal["first", "last"]
+    ) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.sort_lists(
+                self.to_pylibcudf(mode="read"),
+                plc.types.Order.ASCENDING
+                if ascending
+                else plc.types.Order.DESCENDING,
+                (
+                    plc.types.NullOrder.BEFORE
+                    if na_position == "first"
+                    else plc.types.NullOrder.AFTER
+                ),
+                False,
+            )
+        )
+
+    @acquire_spill_lock()
+    def extract_element_scalar(self, index: int) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.extract_list_element(
+                self.to_pylibcudf(mode="read"),
+                index,
+            )
+        )
+
+    @acquire_spill_lock()
+    def extract_element_column(self, index: ColumnBase) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.extract_list_element(
+                self.to_pylibcudf(mode="read"),
+                index.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def contains_scalar(self, search_key: cudf.Scalar) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.contains(
+                self.to_pylibcudf(mode="read"),
+                search_key.device_value.c_value,
+            )
+        )
+
+    @acquire_spill_lock()
+    def index_of_scalar(self, search_key: cudf.Scalar) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.index_of(
+                self.to_pylibcudf(mode="read"),
+                search_key.device_value.c_value,
+                plc.lists.DuplicateFindOption.FIND_FIRST,
+            )
+        )
+
+    @acquire_spill_lock()
+    def index_of_column(self, search_keys: ColumnBase) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.index_of(
+                self.to_pylibcudf(mode="read"),
+                search_keys.to_pylibcudf(mode="read"),
+                plc.lists.DuplicateFindOption.FIND_FIRST,
+            )
+        )
+
+    @acquire_spill_lock()
+    def concatenate_rows(self, other_columns: list[ColumnBase]) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.concatenate_rows(
+                plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read")
+                        for col in itertools.chain([self], other_columns)
+                    ]
+                )
+            )
+        )
+
+    @acquire_spill_lock()
+    def concatenate_list_elements(self, dropna: bool) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.concatenate_list_elements(
+                self.to_pylibcudf(mode="read"),
+                plc.lists.ConcatenateNullPolicy.IGNORE
+                if dropna
+                else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW,
+            )
+        )
+
+    @acquire_spill_lock()
+    def segmented_gather(self, gather_map: ColumnBase) -> ColumnBase:
+        return type(self).from_pylibcudf(
+            plc.lists.segmented_gather(
+                self.to_pylibcudf(mode="read"),
+                gather_map.to_pylibcudf(mode="read"),
+            )
+        )
+
 
 class ListMethods(ColumnMethods):
     """
@@ -397,18 +510,16 @@ def get(
         2   6
         dtype: int64
         """
-        if is_scalar(index):
-            out = extract_element_scalar(self._column, cudf.Scalar(index))
+        if isinstance(index, int):
+            out = self._column.extract_element_scalar(index)
         else:
             index = as_column(index)
-            out = extract_element_column(self._column, as_column(index))
+            out = self._column.extract_element_column(index)
 
         if not (default is None or default is NA):
             # determine rows for which `index` is out-of-bounds
-            lengths = count_elements(self._column)
-            out_of_bounds_mask = (np.negative(index) > lengths) | (
-                index >= lengths
-            )
+            lengths = self._column.count_elements()
+            out_of_bounds_mask = ((-1 * index) > lengths) | (index >= lengths)
 
             # replace the value in those rows (should be NA) with `default`
             if out_of_bounds_mask.any():
@@ -445,7 +556,7 @@ def contains(self, search_key: ScalarLike) -> ParentType:
         dtype: bool
         """
         return self._return_or_inplace(
-            contains_scalar(self._column, cudf.Scalar(search_key))
+            self._column.contains_scalar(cudf.Scalar(search_key))
         )
 
     def index(self, search_key: ScalarLike | ColumnLike) -> ParentType:
@@ -494,13 +605,10 @@ def index(self, search_key: ScalarLike | ColumnLike) -> ParentType:
         """
 
         if is_scalar(search_key):
-            return self._return_or_inplace(
-                index_of_scalar(self._column, cudf.Scalar(search_key))
-            )
+            result = self._column.index_of_scalar(cudf.Scalar(search_key))
         else:
-            return self._return_or_inplace(
-                index_of_column(self._column, as_column(search_key))
-            )
+            result = self._column.index_of_column(as_column(search_key))
+        return self._return_or_inplace(result)
 
     @property
     def leaves(self) -> ParentType:
@@ -550,7 +658,7 @@ def len(self) -> ParentType:
         2       2
         dtype: int32
         """
-        return self._return_or_inplace(count_elements(self._column))
+        return self._return_or_inplace(self._column.count_elements())
 
     def take(self, lists_indices: ColumnLike) -> ParentType:
         """
@@ -598,7 +706,7 @@ def take(self, lists_indices: ColumnLike) -> ParentType:
             )
 
         return self._return_or_inplace(
-            segmented_gather(self._column, lists_indices_col)
+            self._column.segmented_gather(lists_indices_col)
         )
 
     def unique(self) -> ParentType:
@@ -631,7 +739,7 @@ def unique(self) -> ParentType:
             raise NotImplementedError("Nested lists unique is not supported.")
 
         return self._return_or_inplace(
-            distinct(self._column, nulls_equal=True, nans_all_equal=True)
+            self._column.distinct(nulls_equal=True, nans_all_equal=True)
         )
 
     def sort_values(
@@ -639,7 +747,7 @@ def sort_values(
         ascending: bool = True,
         inplace: bool = False,
         kind: str = "quicksort",
-        na_position: str = "last",
+        na_position: Literal["first", "last"] = "last",
         ignore_index: bool = False,
     ) -> ParentType:
         """
@@ -692,7 +800,7 @@ def sort_values(
             raise NotImplementedError("Nested lists sort is not supported.")
 
         return self._return_or_inplace(
-            sort_lists(self._column, ascending, na_position),
+            self._column.sort_lists(ascending, na_position),
             retain_index=not ignore_index,
         )
 
@@ -742,7 +850,7 @@ def concat(self, dropna=True) -> ParentType:
         dtype: list
         """
         return self._return_or_inplace(
-            concatenate_list_elements(self._column, dropna=dropna)
+            self._column.concatenate_list_elements(dropna)
         )
 
     def astype(self, dtype):
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 9130779c3e9..4e839aaeb6a 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import itertools
 import operator
 import textwrap
 import warnings
@@ -5316,10 +5317,20 @@ def _explode(self, explode_column: Any, ignore_index: bool):
         else:
             idx_cols = ()
 
-        exploded = libcudf.lists.explode_outer(
-            [*idx_cols, *self._columns],
-            column_index + len(idx_cols),
-        )
+        with acquire_spill_lock():
+            plc_table = plc.lists.explode_outer(
+                plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read")
+                        for col in itertools.chain(idx_cols, self._columns)
+                    ]
+                ),
+                column_index + len(idx_cols),
+            )
+            exploded = [
+                libcudf.column.Column.from_pylibcudf(col)
+                for col in plc_table.columns()
+            ]
         # We must copy inner datatype of the exploded list column to
         # maintain struct dtype key names
         element_type = cast(

From 0bd95c93b44bb626ce26bd0788f9f2158008fe4d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 25 Nov 2024 18:24:41 -0800
Subject: [PATCH 16/23] Remove cudf._lib.rolling in favor of inlining pylibcudf
 (#17423)

Contributes to https://github.com/rapidsai/cudf/issues/17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17423
---
 python/cudf/cudf/_lib/CMakeLists.txt    |  1 -
 python/cudf/cudf/_lib/__init__.py       |  1 -
 python/cudf/cudf/_lib/rolling.pyx       | 67 -------------------------
 python/cudf/cudf/core/window/rolling.py | 44 ++++++++++++----
 4 files changed, 34 insertions(+), 79 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/rolling.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index a0457120935..8d3af5205fa 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -31,7 +31,6 @@ set(cython_sources
     reduce.pyx
     replace.pyx
     reshape.pyx
-    rolling.pyx
     round.pyx
     scalar.pyx
     sort.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 942e32747bc..1e0bf931c97 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -19,7 +19,6 @@
     reduce,
     replace,
     reshape,
-    rolling,
     round,
     sort,
     stream_compaction,
diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx
deleted file mode 100644
index 687b261c2c7..00000000000
--- a/python/cudf/cudf/_lib/rolling.pyx
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-import pylibcudf
-
-from cudf._lib.aggregation import make_aggregation
-
-
-@acquire_spill_lock()
-def rolling(Column source_column,
-            Column pre_column_window,
-            Column fwd_column_window,
-            window,
-            min_periods,
-            center,
-            op,
-            agg_params):
-    """
-    Rolling on input executing operation within the given window for each row
-
-    Parameters
-    ----------
-    source_column : input column on which rolling operation is executed
-    pre_column_window : prior window for each element of source_column
-    fwd_column_window : forward window for each element of source_column
-    window : Size of the moving window, can be integer or None
-    min_periods : Minimum number of observations in window required to have
-                  a value (otherwise result is null)
-    center : Set the labels at the center of the window
-    op : operation to be executed
-    agg_params : dict, parameter for the aggregation (e.g. ddof for VAR/STD)
-
-    Returns
-    -------
-    A Column with rolling calculations
-    """
-
-    if window is None:
-        if center:
-            # TODO: we can support this even though Pandas currently does not
-            raise NotImplementedError(
-                "center is not implemented for offset-based windows"
-            )
-        pre = pre_column_window.to_pylibcudf(mode="read")
-        fwd = fwd_column_window.to_pylibcudf(mode="read")
-    else:
-        if center:
-            pre = (window // 2) + 1
-            fwd = window - (pre)
-        else:
-            pre = window
-            fwd = 0
-
-    return Column.from_pylibcudf(
-        pylibcudf.rolling.rolling_window(
-            source_column.to_pylibcudf(mode="read"),
-            pre,
-            fwd,
-            min_periods,
-            make_aggregation(
-                op, {'dtype': source_column.dtype} if callable(op) else agg_params
-            ).c_obj,
-        )
-    )
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index 967edc2ab15..d2cb5e8c190 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -8,8 +8,11 @@
 import pandas as pd
 from pandas.api.indexers import BaseIndexer
 
+import pylibcudf as plc
+
 import cudf
 from cudf import _lib as libcudf
+from cudf._lib.aggregation import make_aggregation
 from cudf.api.types import is_integer, is_number
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column.column import as_column
@@ -284,16 +287,37 @@ def _apply_agg_column(self, source_column, agg_name):
             )
             window = None
 
-        return libcudf.rolling.rolling(
-            source_column=source_column,
-            pre_column_window=preceding_window,
-            fwd_column_window=following_window,
-            window=window,
-            min_periods=min_periods,
-            center=self.center,
-            op=agg_name,
-            agg_params=self.agg_params,
-        )
+        with acquire_spill_lock():
+            if window is None:
+                if self.center:
+                    # TODO: we can support this even though Pandas currently does not
+                    raise NotImplementedError(
+                        "center is not implemented for offset-based windows"
+                    )
+                pre = preceding_window.to_pylibcudf(mode="read")
+                fwd = following_window.to_pylibcudf(mode="read")
+            else:
+                if self.center:
+                    pre = (window // 2) + 1
+                    fwd = window - (pre)
+                else:
+                    pre = window
+                    fwd = 0
+
+            return libcudf.column.Column.from_pylibcudf(
+                plc.rolling.rolling_window(
+                    source_column.to_pylibcudf(mode="read"),
+                    pre,
+                    fwd,
+                    min_periods,
+                    make_aggregation(
+                        agg_name,
+                        {"dtype": source_column.dtype}
+                        if callable(agg_name)
+                        else self.agg_params,
+                    ).c_obj,
+                )
+            )
 
     def _reduce(
         self,

From ab36fc6f7ac91f63841666e59176229944f869b2 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 25 Nov 2024 19:56:46 -0800
Subject: [PATCH 17/23] Return categorical values in to_numpy/to_cupy (#17388)

closes #17381

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17388
---
 python/cudf/cudf/core/frame.py           |  5 +++++
 python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 838fde260df..0c0f271fe6f 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -427,6 +427,8 @@ def to_array(
         ) -> cupy.ndarray | numpy.ndarray:
             if na_value is not None:
                 col = col.fillna(na_value)
+            if isinstance(col.dtype, cudf.CategoricalDtype):
+                col = col._get_decategorized_column()  # type: ignore[attr-defined]
             array = get_array(col)
             casted_array = module.asarray(array, dtype=dtype)
             if copy and casted_array is array:
@@ -448,6 +450,9 @@ def to_array(
             else:
                 dtype = find_common_type([dtype for _, dtype in self._dtypes])
 
+            if isinstance(dtype, cudf.CategoricalDtype):
+                dtype = dtype.categories.dtype
+
             if not isinstance(dtype, numpy.dtype):
                 raise NotImplementedError(
                     f"{dtype} cannot be exposed as an array"
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 0f2b41888fa..509ee0d65a5 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1306,6 +1306,18 @@ def test_dataframe_to_cupy_null_values():
         np.testing.assert_array_equal(refvalues[k], mat[:, i])
 
 
+@pytest.mark.parametrize("method", ["to_cupy", "to_numpy"])
+@pytest.mark.parametrize("value", [1, True, 1.5])
+@pytest.mark.parametrize("constructor", ["DataFrame", "Series"])
+def test_to_array_categorical(method, value, constructor):
+    data = [value]
+    expected = getattr(pd, constructor)(data, dtype="category").to_numpy()
+    result = getattr(
+        getattr(cudf, constructor)(data, dtype="category"), method
+    )()
+    assert_eq(result, expected)
+
+
 def test_dataframe_append_empty():
     pdf = pd.DataFrame(
         {

From 79a986067688c18b3d431c7a3acc23e2307fb668 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 25 Nov 2024 22:02:06 -0800
Subject: [PATCH 18/23] Remove cudf._lib.reshape in favor of inlining pylibcudf
 (#17368)

Contributes to https://github.com/rapidsai/cudf/issues/17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17368
---
 python/cudf/cudf/_lib/CMakeLists.txt     |  1 -
 python/cudf/cudf/_lib/__init__.py        |  1 -
 python/cudf/cudf/_lib/reshape.pyx        | 35 --------------
 python/cudf/cudf/core/dataframe.py       | 61 +++++++++++++++++-------
 python/cudf/cudf/core/groupby/groupby.py | 15 +++++-
 python/cudf/cudf/core/indexed_frame.py   | 22 +++++++--
 6 files changed, 75 insertions(+), 60 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/reshape.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 8d3af5205fa..61d3bcbe24e 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -30,7 +30,6 @@ set(cython_sources
     parquet.pyx
     reduce.pyx
     replace.pyx
-    reshape.pyx
     round.pyx
     scalar.pyx
     sort.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 1e0bf931c97..efa437eebb7 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -18,7 +18,6 @@
     parquet,
     reduce,
     replace,
-    reshape,
     round,
     sort,
     stream_compaction,
diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx
deleted file mode 100644
index 6cebeb2bc16..00000000000
--- a/python/cudf/cudf/_lib/reshape.pyx
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_pylibcudf_table
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def interleave_columns(list source_columns):
-    return Column.from_pylibcudf(
-        plc.reshape.interleave_columns(
-            plc.Table([
-                c.to_pylibcudf(mode="read") for c in source_columns
-            ])
-        )
-    )
-
-
-@acquire_spill_lock()
-def tile(list source_columns, size_type count):
-    cdef size_type c_count = count
-
-    return columns_from_pylibcudf_table(
-        plc.reshape.tile(
-            plc.Table([
-                c.to_pylibcudf(mode="read") for c in source_columns
-            ]),
-            c_count
-        )
-    )
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7d523d2c5ad..73c0af45293 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7251,13 +7251,22 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
         repeated_index = self.index.repeat(len(unique_named_levels))
 
         # Each column name should tile itself by len(df) times
-        tiled_index = libcudf.reshape.tile(
-            [
-                as_column(unique_named_levels.get_level_values(i))
-                for i in range(unique_named_levels.nlevels)
-            ],
-            self.shape[0],
-        )
+        with acquire_spill_lock():
+            plc_table = plc.reshape.tile(
+                plc.Table(
+                    [
+                        as_column(
+                            unique_named_levels.get_level_values(i)
+                        ).to_pylibcudf(mode="read")
+                        for i in range(unique_named_levels.nlevels)
+                    ]
+                ),
+                self.shape[0],
+            )
+            tiled_index = [
+                libcudf.column.Column.from_pylibcudf(plc)
+                for plc in plc_table.columns()
+            ]
 
         # Assemble the final index
         new_index_columns = [*repeated_index._columns, *tiled_index]
@@ -7271,7 +7280,6 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
             data=range(self._num_columns), index=named_levels
         )
 
-        column_indices: list[list[int]] = []
         if has_unnamed_levels:
             unnamed_level_values = list(
                 map(column_name_idx.get_level_values, unnamed_levels_indices)
@@ -7307,13 +7315,11 @@ def unnamed_group_generator():
                 else:
                     yield column_idx_df.sort_index().values
 
-        column_indices = list(unnamed_group_generator())
-
         # For each of the group constructed from the unnamed levels,
         # invoke `interleave_columns` to stack the values.
         stacked = []
 
-        for column_idx in column_indices:
+        for column_idx in unnamed_group_generator():
             # Collect columns based on indices, append None for -1 indices.
             columns = [
                 None if i == -1 else self._data.select_by_index(i).columns[0]
@@ -7332,12 +7338,23 @@ def unnamed_group_generator():
             )
 
             # homogenize the dtypes of the columns
-            homogenized = [
+            homogenized = (
                 col.astype(common_type) if col is not None else all_nulls()
                 for col in columns
-            ]
+            )
 
-            stacked.append(libcudf.reshape.interleave_columns(homogenized))
+            with acquire_spill_lock():
+                interleaved_col = libcudf.column.Column.from_pylibcudf(
+                    plc.reshape.interleave_columns(
+                        plc.Table(
+                            [
+                                col.to_pylibcudf(mode="read")
+                                for col in homogenized
+                            ]
+                        )
+                    )
+                )
+            stacked.append(interleaved_col)
 
         # Construct the resulting dataframe / series
         if not has_unnamed_levels:
@@ -7838,10 +7855,18 @@ def interleave_columns(self):
             raise ValueError(
                 "interleave_columns does not support 'category' dtype."
             )
-
-        return self._constructor_sliced._from_column(
-            libcudf.reshape.interleave_columns([*self._columns])
-        )
+        with acquire_spill_lock():
+            result_col = libcudf.column.Column.from_pylibcudf(
+                plc.reshape.interleave_columns(
+                    plc.Table(
+                        [
+                            col.to_pylibcudf(mode="read")
+                            for col in self._columns
+                        ]
+                    )
+                )
+            )
+        return self._constructor_sliced._from_column(result_col)
 
     @_performance_tracking
     def eval(self, expr: str, inplace: bool = False, **kwargs):
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index e59b948aba9..b274bdea76d 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -14,17 +14,19 @@
 import numpy as np
 import pandas as pd
 
+import pylibcudf as plc
+
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib import groupby as libgroupby
 from cudf._lib.null_mask import bitmask_or
-from cudf._lib.reshape import interleave_columns
 from cudf._lib.sort import segmented_sort_by_key
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
 from cudf.api.types import is_list_like, is_numeric_dtype
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.abc import Serializable
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column.column import ColumnBase, StructDtype, as_column
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.copy_types import GatherMap
@@ -2201,6 +2203,17 @@ def _cov_or_corr(self, func, method_name):
 
         # interleave: combines the correlation or covariance results for each
         # column-pair into a single column
+
+        @acquire_spill_lock()
+        def interleave_columns(source_columns):
+            return libcudf.column.Column.from_pylibcudf(
+                plc.reshape.interleave_columns(
+                    plc.Table(
+                        [c.to_pylibcudf(mode="read") for c in source_columns]
+                    )
+                )
+            )
+
         res = cudf.DataFrame._from_data(
             {
                 x: interleave_columns([gb_cov_corr._data[y] for y in ys])
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 4e839aaeb6a..2f8c2587937 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -5349,7 +5349,7 @@ def _explode(self, explode_column: Any, ignore_index: bool):
         )
 
     @_performance_tracking
-    def tile(self, count):
+    def tile(self, count: int):
         """Repeats the rows `count` times to form a new Frame.
 
         Parameters
@@ -5373,10 +5373,24 @@ def tile(self, count):
         -------
         The indexed frame containing the tiled "rows".
         """
+        with acquire_spill_lock():
+            plc_table = plc.reshape.tile(
+                plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read")
+                        for col in itertools.chain(
+                            self.index._columns, self._columns
+                        )
+                    ]
+                ),
+                count,
+            )
+            tiled = [
+                libcudf.column.Column.from_pylibcudf(plc)
+                for plc in plc_table.columns()
+            ]
         return self._from_columns_like_self(
-            libcudf.reshape.tile(
-                [*self.index._columns, *self._columns], count
-            ),
+            tiled,
             column_names=self._column_names,
             index_names=self._index_names,
         )

From d10eae79ec06ec002a34420a34380e077c4540f7 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 26 Nov 2024 10:01:00 -0500
Subject: [PATCH 19/23] Update strings/text source to use grid_1d for
 thread/block/stride calculations (#17404)

Replaces `threadIdx.x + blockDim.x * blockIdx.x` logic with `grid_1d::global_thread_id()` and `blockDim.x * gridDim.x` with `grid_1d::grid_stride()` in libcudf strings and text source.

Reference #10368

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/17404
---
 cpp/include/cudf/strings/detail/gather.cuh  | 10 +++++-----
 cpp/src/strings/convert/convert_urls.cu     |  6 ++++--
 cpp/src/strings/copying/concatenate.cu      | 10 +++++-----
 cpp/src/strings/regex/utilities.cuh         |  4 ++--
 cpp/src/strings/search/find.cu              | 16 +++++++---------
 cpp/src/text/minhash.cu                     |  9 +++------
 cpp/src/text/subword/data_normalizer.cu     |  5 ++---
 cpp/src/text/subword/subword_tokenize.cu    | 11 +++--------
 cpp/src/text/subword/wordpiece_tokenizer.cu | 14 ++++----------
 cpp/src/text/vocabulary_tokenize.cu         |  9 +++------
 10 files changed, 38 insertions(+), 56 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 4216523df97..58665fbf27e 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -85,15 +85,15 @@ CUDF_KERNEL void gather_chars_fn_string_parallel(StringIterator strings_begin,
   constexpr size_t out_datatype_size = sizeof(uint4);
   constexpr size_t in_datatype_size  = sizeof(uint);
 
-  int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  int global_warp_id   = global_thread_id / cudf::detail::warp_size;
-  int warp_lane        = global_thread_id % cudf::detail::warp_size;
-  int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
+  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
+  auto const global_warp_id   = global_thread_id / cudf::detail::warp_size;
+  auto const warp_lane        = global_thread_id % cudf::detail::warp_size;
+  auto const nwarps           = cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size;
 
   auto const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % out_datatype_size;
   uint4* out_chars_aligned    = reinterpret_cast<uint4*>(out_chars - alignment_offset);
 
-  for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
+  for (auto istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
     auto const out_start = out_offsets[istring];
     auto const out_end   = out_offsets[istring + 1];
 
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index 520f5897415..9d0186b7a51 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -207,7 +207,8 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
   auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
   auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
   auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
-  auto const nwarps     = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
+  auto const nwarps =
+    static_cast<size_type>(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];
 
   // Loop through strings, and assign each string to a warp.
@@ -293,7 +294,8 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
   auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
   auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
   auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
-  auto const nwarps     = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
+  auto const nwarps =
+    static_cast<size_type>(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];
 
   // Loop through strings, and assign each string to a warp
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 9e4ef47ff79..3712b0e7fc6 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -123,8 +123,8 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel(
   bitmask_type* output_mask,
   size_type* out_valid_count)
 {
-  cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x;
-  size_type warp_valid_count           = 0;
+  auto output_index          = cudf::detail::grid_1d::global_thread_id();
+  size_type warp_valid_count = 0;
 
   unsigned active_mask;
   if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
@@ -156,7 +156,7 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel(
       warp_valid_count += __popc(new_word);
     }
 
-    output_index += blockDim.x * gridDim.x;
+    output_index += cudf::detail::grid_1d::grid_stride();
     if (Nullable) { active_mask = __ballot_sync(active_mask, output_index < output_size); }
   }
 
@@ -178,7 +178,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const*
                                                        size_type const output_size,
                                                        char* output_data)
 {
-  cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x;
+  auto output_index = cudf::detail::grid_1d::global_thread_id();
 
   while (output_index < output_size) {
     // Lookup input index by searching for output index in offsets
@@ -198,7 +198,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const*
     auto const first_char     = input_offsets_data[input_view.offset()];
     output_data[output_index] = input_chars_data[offset_index + first_char];
 
-    output_index += blockDim.x * gridDim.x;
+    output_index += cudf::detail::grid_1d::grid_stride();
   }
 }
 
diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh
index 679907788bb..2594fd7b6da 100644
--- a/cpp/src/strings/regex/utilities.cuh
+++ b/cpp/src/strings/regex/utilities.cuh
@@ -47,7 +47,7 @@ CUDF_KERNEL void for_each_kernel(ForEachFunction fn, reprog_device const d_prog,
   __syncthreads();
   auto const s_prog = reprog_device::load(d_prog, shmem);
 
-  auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const thread_idx = cudf::detail::grid_1d::global_thread_id();
   auto const stride     = s_prog.thread_count();
   if (thread_idx < stride) {
     for (auto idx = thread_idx; idx < size; idx += stride) {
@@ -84,7 +84,7 @@ CUDF_KERNEL void transform_kernel(TransformFunction fn,
   __syncthreads();
   auto const s_prog = reprog_device::load(d_prog, shmem);
 
-  auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const thread_idx = cudf::detail::grid_1d::global_thread_id();
   auto const stride     = s_prog.thread_count();
   if (thread_idx < stride) {
     for (auto idx = thread_idx; idx < size; idx += stride) {
diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
index 9bd1abb5542..3cf4970d36e 100644
--- a/cpp/src/strings/search/find.cu
+++ b/cpp/src/strings/search/find.cu
@@ -121,11 +121,10 @@ CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings,
                                          size_type const stop,
                                          size_type* d_results)
 {
-  size_type const idx = static_cast<size_type>(threadIdx.x + blockIdx.x * blockDim.x);
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
 
-  if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }
-
-  auto const str_idx  = idx / cudf::detail::warp_size;
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = idx % cudf::detail::warp_size;
 
   if (d_strings.is_null(str_idx)) { return; }
@@ -350,13 +349,12 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
                                            string_view const d_target,
                                            bool* d_results)
 {
-  size_type const idx = static_cast<size_type>(threadIdx.x + blockIdx.x * blockDim.x);
-  using warp_reduce   = cub::WarpReduce<bool>;
+  auto const idx    = cudf::detail::grid_1d::global_thread_id();
+  using warp_reduce = cub::WarpReduce<bool>;
   __shared__ typename warp_reduce::TempStorage temp_storage;
 
-  if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }
-
-  auto const str_idx  = idx / cudf::detail::warp_size;
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = idx % cudf::detail::warp_size;
   if (d_strings.is_null(str_idx)) { return; }
   // get the string for this warp
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index aee83ab35ed..b7a719a2041 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -74,13 +74,10 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
                                 cudf::size_type width,
                                 hash_value_type* d_hashes)
 {
-  auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
-  if (idx >= (static_cast<std::size_t>(d_strings.size()) *
-              static_cast<std::size_t>(cudf::detail::warp_size))) {
-    return;
-  }
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
 
-  auto const str_idx  = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);
 
   if (d_strings.is_null(str_idx)) { return; }
diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu
index c662581b3f4..a3bed45e4bd 100644
--- a/cpp/src/text/subword/data_normalizer.cu
+++ b/cpp/src/text/subword/data_normalizer.cu
@@ -217,9 +217,8 @@ CUDF_KERNEL void kernel_data_normalizer(unsigned char const* strings,
   constexpr uint32_t init_val                     = (1 << FILTER_BIT);
   uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val};
 
-  cudf::thread_index_type const char_for_thread =
-    threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x);
-  uint32_t num_new_chars = 0;
+  auto const char_for_thread = cudf::detail::grid_1d::global_thread_id();
+  uint32_t num_new_chars     = 0;
 
   if (char_for_thread < total_bytes) {
     auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread);
diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index dee589d6daf..6302b478c14 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -73,15 +73,10 @@ CUDF_KERNEL void kernel_compute_tensor_metadata(
   uint32_t* attn_mask,
   uint32_t* metadata)
 {
-  cudf::thread_index_type const output_idx =
-    threadIdx.x + static_cast<cudf::thread_index_type>(blockIdx.x) *
-                    static_cast<cudf::thread_index_type>(blockDim.x);
-  if (output_idx >= (static_cast<cudf::thread_index_type>(nrows_tensor_token_ids) *
-                     static_cast<cudf::thread_index_type>(max_sequence_length))) {
-    return;
-  }
+  auto const output_idx = cudf::detail::grid_1d::global_thread_id();
 
-  uint32_t const absolute_row_id         = output_idx / max_sequence_length;
+  uint32_t const absolute_row_id = output_idx / max_sequence_length;
+  if (absolute_row_id >= nrows_tensor_token_ids) { return; }
   uint32_t const tensor_id               = row2tensor[absolute_row_id];
   uint32_t const row_within_tensor       = row2row_within_tensor[absolute_row_id];
   uint32_t const offset_token_ids_tensor = offsets[tensor_id];
diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu
index c094537ebc2..dd1e8ddb027 100644
--- a/cpp/src/text/subword/wordpiece_tokenizer.cu
+++ b/cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -83,9 +83,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi
                                                         uint32_t* token_ids,
                                                         uint8_t* tokens_per_word)
 {
-  cudf::thread_index_type char_for_thread = static_cast<cudf::thread_index_type>(blockDim.x) *
-                                              static_cast<cudf::thread_index_type>(blockIdx.x) +
-                                            threadIdx.x;
+  auto const char_for_thread = cudf::detail::grid_1d::global_thread_id();
 
   // Deal with the start_word_indices array
   if (char_for_thread < num_code_points) {
@@ -138,9 +136,7 @@ CUDF_KERNEL void mark_string_start_and_ends(uint32_t const* code_points,
                                             uint32_t* end_word_indices,
                                             uint32_t num_strings)
 {
-  cudf::thread_index_type idx = static_cast<cudf::thread_index_type>(blockDim.x) *
-                                  static_cast<cudf::thread_index_type>(blockIdx.x) +
-                                threadIdx.x;
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
   // Ensure the starting character of each strings is written to the word start array.
   if (idx <= num_strings) {
     auto const offset = strings_offsets[idx];
@@ -335,11 +331,9 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points,
                                             uint32_t* token_ids,
                                             uint8_t* tokens_per_word)
 {
-  cudf::thread_index_type word_to_tokenize = static_cast<cudf::thread_index_type>(blockDim.x) *
-                                               static_cast<cudf::thread_index_type>(blockIdx.x) +
-                                             threadIdx.x;
+  auto const word_to_tokenize = cudf::detail::grid_1d::global_thread_id();
 
-  if (word_to_tokenize >= total_words) return;
+  if (word_to_tokenize >= total_words) { return; }
   // Each thread gets the start code_point offset for each word and resets the token_id memory to
   // the default value. In a post processing step, all of these values will be removed.
   auto const token_start = word_starts[word_to_tokenize];
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index a2297987732..caf2b1d8b30 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -222,12 +222,9 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings,
                                  int8_t* d_results)
 {
   // string per warp
-  auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
-  if (idx >= (static_cast<std::size_t>(d_strings.size()) *
-              static_cast<std::size_t>(cudf::detail::warp_size))) {
-    return;
-  }
-  auto const str_idx  = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);
 
   if (d_strings.is_null(str_idx)) {

From e7022fbc22eda538783e67f32d35ea8ea0798be8 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 26 Nov 2024 10:01:53 -0500
Subject: [PATCH 20/23] Use thread_index_type in binary-ops jit kernel.cu
 (#17420)

Follow-on to #17354 to prevent overflow in the binary-ops jit kernel.
This uses `thread_index_type` directly since `detail/utilities/cuda.cuh` cannot be included in the jit'd kernel source.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/17420
---
 cpp/src/binaryop/jit/kernel.cu | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu
index 985fc87521c..1133e9ac22e 100644
--- a/cpp/src/binaryop/jit/kernel.cu
+++ b/cpp/src/binaryop/jit/kernel.cu
@@ -51,15 +51,10 @@ CUDF_KERNEL void kernel_v_v(cudf::size_type size,
                             TypeLhs* lhs_data,
                             TypeRhs* rhs_data)
 {
-  int tid    = threadIdx.x;
-  int blkid  = blockIdx.x;
-  int blksz  = blockDim.x;
-  int gridsz = gridDim.x;
+  auto const start = threadIdx.x + static_cast<cudf::thread_index_type>(blockIdx.x) * blockDim.x;
+  auto const step  = static_cast<cudf::thread_index_type>(blockDim.x) * gridDim.x;
 
-  int start = tid + blkid * blksz;
-  int step  = blksz * gridsz;
-
-  for (cudf::size_type i = start; i < size; i += step) {
+  for (auto i = start; i < size; i += step) {
     out_data[i] = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(lhs_data[i], rhs_data[i]);
   }
 }
@@ -75,15 +70,10 @@ CUDF_KERNEL void kernel_v_v_with_validity(cudf::size_type size,
                                           cudf::bitmask_type const* rhs_mask,
                                           cudf::size_type rhs_offset)
 {
-  int tid    = threadIdx.x;
-  int blkid  = blockIdx.x;
-  int blksz  = blockDim.x;
-  int gridsz = gridDim.x;
-
-  int start = tid + blkid * blksz;
-  int step  = blksz * gridsz;
+  auto const start = threadIdx.x + static_cast<cudf::thread_index_type>(blockIdx.x) * blockDim.x;
+  auto const step  = static_cast<cudf::thread_index_type>(blockDim.x) * gridDim.x;
 
-  for (cudf::size_type i = start; i < size; i += step) {
+  for (auto i = start; i < size; i += step) {
     bool output_valid = false;
     out_data[i]       = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(
       lhs_data[i],

From f5954a44ce86af82b7750f64e511d063e35e9625 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 26 Nov 2024 12:04:36 -0500
Subject: [PATCH 21/23] Remove nvtx/ranges.hpp include from cuda.cuh (#17427)

Removes unused header include `nvtx/ranges.hpp` from `cuda.cuh` and fixes up all the source files that were dependent on this include.
Found while trying to include `cuda.cuh` in a jit'd kernel source.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17427
---
 cpp/include/cudf/detail/utilities/cuda.cuh        | 6 ++----
 cpp/src/copying/contiguous_split.cu               | 1 +
 cpp/src/join/conditional_join.cu                  | 1 +
 cpp/src/join/hash_join.cu                         | 1 +
 cpp/src/join/join.cu                              | 1 +
 cpp/src/join/mixed_join.cu                        | 1 +
 cpp/src/join/mixed_join_semi.cu                   | 1 +
 cpp/src/json/json_path.cu                         | 1 +
 cpp/src/labeling/label_bins.cu                    | 1 +
 cpp/src/lists/contains.cu                         | 1 +
 cpp/src/lists/copying/segmented_gather.cu         | 1 +
 cpp/src/quantiles/tdigest/tdigest.cu              | 1 +
 cpp/src/reductions/minmax.cu                      | 1 +
 cpp/src/rolling/detail/rolling_fixed_window.cu    | 1 +
 cpp/src/rolling/detail/rolling_variable_window.cu | 1 +
 cpp/src/rolling/grouped_rolling.cu                | 1 +
 cpp/src/strings/copying/concatenate.cu            | 1 +
 17 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index d31ca3d92d1..61a8e9f7ec3 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -25,8 +24,7 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <cub/cub.cuh>
-
-#include <type_traits>
+#include <cuda/std/type_traits>
 
 namespace cudf {
 namespace detail {
@@ -164,7 +162,7 @@ template <int32_t block_size, int32_t leader_lane = 0, typename T>
 __device__ T single_lane_block_sum_reduce(T lane_value)
 {
   static_assert(block_size <= 1024, "Invalid block size.");
-  static_assert(std::is_arithmetic_v<T>, "Invalid non-arithmetic type.");
+  static_assert(cuda::std::is_arithmetic_v<T>, "Invalid non-arithmetic type.");
   constexpr auto warps_per_block{block_size / warp_size};
   auto const lane_id{threadIdx.x % warp_size};
   auto const warp_id{threadIdx.x / warp_size};
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 15aa31ff5ee..e9443980320 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -21,6 +21,7 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/lists/lists_column_view.hpp>
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index 781fda215fd..81287e9a3fd 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -22,6 +22,7 @@
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/ast/expressions.hpp>
 #include <cudf/detail/device_scalar.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/join.hpp>
 #include <cudf/table/table.hpp>
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index beeaabfdaab..05b85fed1a8 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -18,6 +18,7 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/join.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/join.hpp>
diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu
index 7b13c260364..bbafb159caf 100644
--- a/cpp/src/join/join.cu
+++ b/cpp/src/join/join.cu
@@ -16,6 +16,7 @@
 #include "join_common_utils.hpp"
 
 #include <cudf/detail/gather.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/join.hpp>
 #include <cudf/table/table.hpp>
diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index 90b0d0a45ad..56044bb1264 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -21,6 +21,7 @@
 
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/ast/expressions.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/join.hpp>
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index 62ba558b0bd..6c37f801693 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -22,6 +22,7 @@
 #include <cudf/ast/expressions.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/join.hpp>
diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu
index fb5cf66dd60..fd8629ed6f3 100644
--- a/cpp/src/json/json_path.cu
+++ b/cpp/src/json/json_path.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu
index 18a500069ad..40a48d919cd 100644
--- a/cpp/src/labeling/label_bins.cu
+++ b/cpp/src/labeling/label_bins.cu
@@ -19,6 +19,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/label_bins.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/labeling/label_bins.hpp>
 #include <cudf/types.hpp>
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 9556ef23784..03fbd8e5e89 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -17,6 +17,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/lists/contains.hpp>
 #include <cudf/lists/detail/contains.hpp>
diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu
index f6e48f141e1..9d11035cfdc 100644
--- a/cpp/src/lists/copying/segmented_gather.cu
+++ b/cpp/src/lists/copying/segmented_gather.cu
@@ -19,6 +19,7 @@
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/lists/detail/gather.cuh>
 #include <cudf/lists/gather.hpp>
 #include <cudf/utilities/default_stream.hpp>
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index fb5aebb4b39..3a365477366 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -18,6 +18,7 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/tdigest/tdigest.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/valid_if.cuh>
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 4f6eb23ce5b..98fd9f679c8 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -19,6 +19,7 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
diff --git a/cpp/src/rolling/detail/rolling_fixed_window.cu b/cpp/src/rolling/detail/rolling_fixed_window.cu
index 23424da13cd..0603f27852a 100644
--- a/cpp/src/rolling/detail/rolling_fixed_window.cu
+++ b/cpp/src/rolling/detail/rolling_fixed_window.cu
@@ -19,6 +19,7 @@
 #include <cudf_test/column_utilities.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
diff --git a/cpp/src/rolling/detail/rolling_variable_window.cu b/cpp/src/rolling/detail/rolling_variable_window.cu
index c2324947ef6..d4851df740b 100644
--- a/cpp/src/rolling/detail/rolling_variable_window.cu
+++ b/cpp/src/rolling/detail/rolling_variable_window.cu
@@ -17,6 +17,7 @@
 #include "rolling.cuh"
 
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index ac6c7b11ef5..3cf292f5abb 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -21,6 +21,7 @@
 #include "detail/rolling_jit.hpp"
 
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/rolling.hpp>
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 3712b0e7fc6..ba96e2cb988 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -18,6 +18,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>

From b89728b7ccdb7f39b70087eccc2c8c36765742bd Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Tue, 26 Nov 2024 18:10:45 +0100
Subject: [PATCH 22/23] Abstract polars function expression nodes to ensure
 they are serializable (#17418)

Use `Enum`s to define Python types as references to `polars.polars._expr_nodes.*Function`, so as to ensure that the `cudf_polars.dsl.expressions` specializations of `Expr` are serializable.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/17418
---
 .../cudf_polars/dsl/expressions/boolean.py    |  77 +++++++---
 .../cudf_polars/dsl/expressions/datetime.py   |  98 ++++++++++---
 .../cudf_polars/dsl/expressions/string.py     | 135 +++++++++++++-----
 python/cudf_polars/cudf_polars/dsl/to_ast.py  |  10 +-
 .../cudf_polars/cudf_polars/dsl/translate.py  |  14 +-
 .../tests/dsl/test_serialization.py           |  56 ++++++++
 6 files changed, 304 insertions(+), 86 deletions(-)
 create mode 100644 python/cudf_polars/tests/dsl/test_serialization.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
index 8db8172ebd1..1682e7a8a9c 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
@@ -6,13 +6,12 @@
 
 from __future__ import annotations
 
+from enum import IntEnum, auto
 from functools import partial, reduce
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
 
-from polars.polars import _expr_nodes as pl_expr
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
@@ -24,7 +23,10 @@
 if TYPE_CHECKING:
     from collections.abc import Mapping
 
+    from typing_extensions import Self
+
     import polars.type_aliases as pl_types
+    from polars.polars import _expr_nodes as pl_expr
 
     from cudf_polars.containers import DataFrame
 
@@ -32,13 +34,46 @@
 
 
 class BooleanFunction(Expr):
+    class Name(IntEnum):
+        """Internal and picklable representation of polars' `BooleanFunction`."""
+
+        All = auto()
+        AllHorizontal = auto()
+        Any = auto()
+        AnyHorizontal = auto()
+        IsBetween = auto()
+        IsDuplicated = auto()
+        IsFinite = auto()
+        IsFirstDistinct = auto()
+        IsIn = auto()
+        IsInfinite = auto()
+        IsLastDistinct = auto()
+        IsNan = auto()
+        IsNotNan = auto()
+        IsNotNull = auto()
+        IsNull = auto()
+        IsUnique = auto()
+        Not = auto()
+
+        @classmethod
+        def from_polars(cls, obj: pl_expr.BooleanFunction) -> Self:
+            """Convert from polars' `BooleanFunction`."""
+            try:
+                function, name = str(obj).split(".", maxsplit=1)
+            except ValueError:
+                # Failed to unpack string
+                function = None
+            if function != "BooleanFunction":
+                raise ValueError("BooleanFunction required")
+            return getattr(cls, name)
+
     __slots__ = ("name", "options")
     _non_child = ("dtype", "name", "options")
 
     def __init__(
         self,
         dtype: plc.DataType,
-        name: pl_expr.BooleanFunction,
+        name: BooleanFunction.Name,
         options: tuple[Any, ...],
         *children: Expr,
     ) -> None:
@@ -46,7 +81,7 @@ def __init__(
         self.options = options
         self.name = name
         self.children = children
-        if self.name == pl_expr.BooleanFunction.IsIn and not all(
+        if self.name is BooleanFunction.Name.IsIn and not all(
             c.dtype == self.children[0].dtype for c in self.children
         ):
             # TODO: If polars IR doesn't put the casts in, we need to
@@ -110,12 +145,12 @@ def do_evaluate(
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if self.name in (
-            pl_expr.BooleanFunction.IsFinite,
-            pl_expr.BooleanFunction.IsInfinite,
+            BooleanFunction.Name.IsFinite,
+            BooleanFunction.Name.IsInfinite,
         ):
             # Avoid evaluating the child if the dtype tells us it's unnecessary.
             (child,) = self.children
-            is_finite = self.name == pl_expr.BooleanFunction.IsFinite
+            is_finite = self.name is BooleanFunction.Name.IsFinite
             if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
                 value = plc.interop.from_arrow(
                     pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
@@ -142,10 +177,10 @@ def do_evaluate(
         ]
         # Kleene logic for Any (OR) and All (AND) if ignore_nulls is
         # False
-        if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All):
+        if self.name in (BooleanFunction.Name.Any, BooleanFunction.Name.All):
             (ignore_nulls,) = self.options
             (column,) = columns
-            is_any = self.name == pl_expr.BooleanFunction.Any
+            is_any = self.name is BooleanFunction.Name.Any
             agg = plc.aggregation.any() if is_any else plc.aggregation.all()
             result = plc.reduce.reduce(column.obj, agg, self.dtype)
             if not ignore_nulls and column.obj.null_count() > 0:
@@ -165,27 +200,27 @@ def do_evaluate(
                     # False || Null => Null   True && Null => Null
                     return Column(plc.Column.all_null_like(column.obj, 1))
             return Column(plc.Column.from_scalar(result, 1))
-        if self.name == pl_expr.BooleanFunction.IsNull:
+        if self.name is BooleanFunction.Name.IsNull:
             (column,) = columns
             return Column(plc.unary.is_null(column.obj))
-        elif self.name == pl_expr.BooleanFunction.IsNotNull:
+        elif self.name is BooleanFunction.Name.IsNotNull:
             (column,) = columns
             return Column(plc.unary.is_valid(column.obj))
-        elif self.name == pl_expr.BooleanFunction.IsNan:
+        elif self.name is BooleanFunction.Name.IsNan:
             (column,) = columns
             return Column(
                 plc.unary.is_nan(column.obj).with_mask(
                     column.obj.null_mask(), column.obj.null_count()
                 )
             )
-        elif self.name == pl_expr.BooleanFunction.IsNotNan:
+        elif self.name is BooleanFunction.Name.IsNotNan:
             (column,) = columns
             return Column(
                 plc.unary.is_not_nan(column.obj).with_mask(
                     column.obj.null_mask(), column.obj.null_count()
                 )
             )
-        elif self.name == pl_expr.BooleanFunction.IsFirstDistinct:
+        elif self.name is BooleanFunction.Name.IsFirstDistinct:
             (column,) = columns
             return self._distinct(
                 column,
@@ -197,7 +232,7 @@ def do_evaluate(
                     pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
                 ),
             )
-        elif self.name == pl_expr.BooleanFunction.IsLastDistinct:
+        elif self.name is BooleanFunction.Name.IsLastDistinct:
             (column,) = columns
             return self._distinct(
                 column,
@@ -209,7 +244,7 @@ def do_evaluate(
                     pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
                 ),
             )
-        elif self.name == pl_expr.BooleanFunction.IsUnique:
+        elif self.name is BooleanFunction.Name.IsUnique:
             (column,) = columns
             return self._distinct(
                 column,
@@ -221,7 +256,7 @@ def do_evaluate(
                     pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
                 ),
             )
-        elif self.name == pl_expr.BooleanFunction.IsDuplicated:
+        elif self.name is BooleanFunction.Name.IsDuplicated:
             (column,) = columns
             return self._distinct(
                 column,
@@ -233,7 +268,7 @@ def do_evaluate(
                     pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
                 ),
             )
-        elif self.name == pl_expr.BooleanFunction.AllHorizontal:
+        elif self.name is BooleanFunction.Name.AllHorizontal:
             return Column(
                 reduce(
                     partial(
@@ -244,7 +279,7 @@ def do_evaluate(
                     (c.obj for c in columns),
                 )
             )
-        elif self.name == pl_expr.BooleanFunction.AnyHorizontal:
+        elif self.name is BooleanFunction.Name.AnyHorizontal:
             return Column(
                 reduce(
                     partial(
@@ -255,10 +290,10 @@ def do_evaluate(
                     (c.obj for c in columns),
                 )
             )
-        elif self.name == pl_expr.BooleanFunction.IsIn:
+        elif self.name is BooleanFunction.Name.IsIn:
             needles, haystack = columns
             return Column(plc.search.contains(haystack.obj, needles.obj))
-        elif self.name == pl_expr.BooleanFunction.Not:
+        elif self.name is BooleanFunction.Name.Not:
             (column,) = columns
             return Column(
                 plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT)
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
index cd8e5c6a4eb..c2dddfd9940 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -6,12 +6,11 @@
 
 from __future__ import annotations
 
+from enum import IntEnum, auto
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
 
-from polars.polars import _expr_nodes as pl_expr
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
@@ -20,33 +19,94 @@
 if TYPE_CHECKING:
     from collections.abc import Mapping
 
+    from typing_extensions import Self
+
+    from polars.polars import _expr_nodes as pl_expr
+
     from cudf_polars.containers import DataFrame
 
 __all__ = ["TemporalFunction"]
 
 
 class TemporalFunction(Expr):
+    class Name(IntEnum):
+        """Internal and picklable representation of polars' `TemporalFunction`."""
+
+        BaseUtcOffset = auto()
+        CastTimeUnit = auto()
+        Century = auto()
+        Combine = auto()
+        ConvertTimeZone = auto()
+        DSTOffset = auto()
+        Date = auto()
+        Datetime = auto()
+        DatetimeFunction = auto()
+        Day = auto()
+        Duration = auto()
+        Hour = auto()
+        IsLeapYear = auto()
+        IsoYear = auto()
+        Microsecond = auto()
+        Millennium = auto()
+        Millisecond = auto()
+        Minute = auto()
+        Month = auto()
+        MonthEnd = auto()
+        MonthStart = auto()
+        Nanosecond = auto()
+        OffsetBy = auto()
+        OrdinalDay = auto()
+        Quarter = auto()
+        ReplaceTimeZone = auto()
+        Round = auto()
+        Second = auto()
+        Time = auto()
+        TimeStamp = auto()
+        ToString = auto()
+        TotalDays = auto()
+        TotalHours = auto()
+        TotalMicroseconds = auto()
+        TotalMilliseconds = auto()
+        TotalMinutes = auto()
+        TotalNanoseconds = auto()
+        TotalSeconds = auto()
+        Truncate = auto()
+        Week = auto()
+        WeekDay = auto()
+        WithTimeUnit = auto()
+        Year = auto()
+
+        @classmethod
+        def from_polars(cls, obj: pl_expr.TemporalFunction) -> Self:
+            """Convert from polars' `TemporalFunction`."""
+            try:
+                function, name = str(obj).split(".", maxsplit=1)
+            except ValueError:
+                # Failed to unpack string
+                function = None
+            if function != "TemporalFunction":
+                raise ValueError("TemporalFunction required")
+            return getattr(cls, name)
+
     __slots__ = ("name", "options")
-    _COMPONENT_MAP: ClassVar[
-        dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent]
-    ] = {
-        pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR,
-        pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH,
-        pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY,
-        pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY,
-        pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR,
-        pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE,
-        pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND,
-        pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND,
-        pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND,
-        pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
-    }
     _non_child = ("dtype", "name", "options")
+    _COMPONENT_MAP: ClassVar[dict[Name, plc.datetime.DatetimeComponent]] = {
+        Name.Year: plc.datetime.DatetimeComponent.YEAR,
+        Name.Month: plc.datetime.DatetimeComponent.MONTH,
+        Name.Day: plc.datetime.DatetimeComponent.DAY,
+        Name.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY,
+        Name.Hour: plc.datetime.DatetimeComponent.HOUR,
+        Name.Minute: plc.datetime.DatetimeComponent.MINUTE,
+        Name.Second: plc.datetime.DatetimeComponent.SECOND,
+        Name.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND,
+        Name.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND,
+        Name.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
+    }
 
     def __init__(
         self,
         dtype: plc.DataType,
-        name: pl_expr.TemporalFunction,
+        name: TemporalFunction.Name,
         options: tuple[Any, ...],
         *children: Expr,
     ) -> None:
@@ -70,7 +130,7 @@ def do_evaluate(
             for child in self.children
         ]
         (column,) = columns
-        if self.name == pl_expr.TemporalFunction.Microsecond:
+        if self.name is TemporalFunction.Name.Microsecond:
             millis = plc.datetime.extract_datetime_component(
                 column.obj, plc.datetime.DatetimeComponent.MILLISECOND
             )
@@ -90,7 +150,7 @@ def do_evaluate(
                 plc.types.DataType(plc.types.TypeId.INT32),
             )
             return Column(total_micros)
-        elif self.name == pl_expr.TemporalFunction.Nanosecond:
+        elif self.name is TemporalFunction.Name.Nanosecond:
             millis = plc.datetime.extract_datetime_component(
                 column.obj, plc.datetime.DatetimeComponent.MILLISECOND
             )
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py
index 8b66c9d4676..92c3c658c21 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py
@@ -6,13 +6,13 @@
 
 from __future__ import annotations
 
+from enum import IntEnum, auto
 from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
 import pyarrow.compute as pc
 
 from polars.exceptions import InvalidOperationError
-from polars.polars import _expr_nodes as pl_expr
 
 import pylibcudf as plc
 
@@ -23,19 +23,82 @@
 if TYPE_CHECKING:
     from collections.abc import Mapping
 
+    from typing_extensions import Self
+
+    from polars.polars import _expr_nodes as pl_expr
+
     from cudf_polars.containers import DataFrame
 
 __all__ = ["StringFunction"]
 
 
 class StringFunction(Expr):
+    class Name(IntEnum):
+        """Internal and picklable representation of polars' `StringFunction`."""
+
+        Base64Decode = auto()
+        Base64Encode = auto()
+        ConcatHorizontal = auto()
+        ConcatVertical = auto()
+        Contains = auto()
+        ContainsMany = auto()
+        CountMatches = auto()
+        EndsWith = auto()
+        EscapeRegex = auto()
+        Extract = auto()
+        ExtractAll = auto()
+        ExtractGroups = auto()
+        Find = auto()
+        Head = auto()
+        HexDecode = auto()
+        HexEncode = auto()
+        JsonDecode = auto()
+        JsonPathMatch = auto()
+        LenBytes = auto()
+        LenChars = auto()
+        Lowercase = auto()
+        PadEnd = auto()
+        PadStart = auto()
+        Replace = auto()
+        ReplaceMany = auto()
+        Reverse = auto()
+        Slice = auto()
+        Split = auto()
+        SplitExact = auto()
+        SplitN = auto()
+        StartsWith = auto()
+        StripChars = auto()
+        StripCharsEnd = auto()
+        StripCharsStart = auto()
+        StripPrefix = auto()
+        StripSuffix = auto()
+        Strptime = auto()
+        Tail = auto()
+        Titlecase = auto()
+        ToDecimal = auto()
+        ToInteger = auto()
+        Uppercase = auto()
+        ZFill = auto()
+
+        @classmethod
+        def from_polars(cls, obj: pl_expr.StringFunction) -> Self:
+            """Convert from polars' `StringFunction`."""
+            try:
+                function, name = str(obj).split(".", maxsplit=1)
+            except ValueError:
+                # Failed to unpack string
+                function = None
+            if function != "StringFunction":
+                raise ValueError("StringFunction required")
+            return getattr(cls, name)
+
     __slots__ = ("name", "options", "_regex_program")
     _non_child = ("dtype", "name", "options")
 
     def __init__(
         self,
         dtype: plc.DataType,
-        name: pl_expr.StringFunction,
+        name: StringFunction.Name,
         options: tuple[Any, ...],
         *children: Expr,
     ) -> None:
@@ -47,21 +110,21 @@ def __init__(
 
     def _validate_input(self):
         if self.name not in (
-            pl_expr.StringFunction.Contains,
-            pl_expr.StringFunction.EndsWith,
-            pl_expr.StringFunction.Lowercase,
-            pl_expr.StringFunction.Replace,
-            pl_expr.StringFunction.ReplaceMany,
-            pl_expr.StringFunction.Slice,
-            pl_expr.StringFunction.Strptime,
-            pl_expr.StringFunction.StartsWith,
-            pl_expr.StringFunction.StripChars,
-            pl_expr.StringFunction.StripCharsStart,
-            pl_expr.StringFunction.StripCharsEnd,
-            pl_expr.StringFunction.Uppercase,
+            StringFunction.Name.Contains,
+            StringFunction.Name.EndsWith,
+            StringFunction.Name.Lowercase,
+            StringFunction.Name.Replace,
+            StringFunction.Name.ReplaceMany,
+            StringFunction.Name.Slice,
+            StringFunction.Name.Strptime,
+            StringFunction.Name.StartsWith,
+            StringFunction.Name.StripChars,
+            StringFunction.Name.StripCharsStart,
+            StringFunction.Name.StripCharsEnd,
+            StringFunction.Name.Uppercase,
         ):
             raise NotImplementedError(f"String function {self.name}")
-        if self.name == pl_expr.StringFunction.Contains:
+        if self.name is StringFunction.Name.Contains:
             literal, strict = self.options
             if not literal:
                 if not strict:
@@ -82,7 +145,7 @@ def _validate_input(self):
                     raise NotImplementedError(
                         f"Unsupported regex {pattern} for GPU engine."
                     ) from e
-        elif self.name == pl_expr.StringFunction.Replace:
+        elif self.name is StringFunction.Name.Replace:
             _, literal = self.options
             if not literal:
                 raise NotImplementedError("literal=False is not supported for replace")
@@ -93,7 +156,7 @@ def _validate_input(self):
                 raise NotImplementedError(
                     "libcudf replace does not support empty strings"
                 )
-        elif self.name == pl_expr.StringFunction.ReplaceMany:
+        elif self.name is StringFunction.Name.ReplaceMany:
             (ascii_case_insensitive,) = self.options
             if ascii_case_insensitive:
                 raise NotImplementedError(
@@ -109,12 +172,12 @@ def _validate_input(self):
                     "libcudf replace_many is implemented differently from polars "
                     "for empty strings"
                 )
-        elif self.name == pl_expr.StringFunction.Slice:
+        elif self.name is StringFunction.Name.Slice:
             if not all(isinstance(child, Literal) for child in self.children[1:]):
                 raise NotImplementedError(
                     "Slice only supports literal start and stop values"
                 )
-        elif self.name == pl_expr.StringFunction.Strptime:
+        elif self.name is StringFunction.Name.Strptime:
             format, _, exact, cache = self.options
             if cache:
                 raise NotImplementedError("Strptime cache is a CPU feature")
@@ -123,9 +186,9 @@ def _validate_input(self):
             if not exact:
                 raise NotImplementedError("Strptime does not support exact=False")
         elif self.name in {
-            pl_expr.StringFunction.StripChars,
-            pl_expr.StringFunction.StripCharsStart,
-            pl_expr.StringFunction.StripCharsEnd,
+            StringFunction.Name.StripChars,
+            StringFunction.Name.StripCharsStart,
+            StringFunction.Name.StripCharsEnd,
         }:
             if not isinstance(self.children[1], Literal):
                 raise NotImplementedError(
@@ -140,7 +203,7 @@ def do_evaluate(
         mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        if self.name == pl_expr.StringFunction.Contains:
+        if self.name is StringFunction.Name.Contains:
             child, arg = self.children
             column = child.evaluate(df, context=context, mapping=mapping)
 
@@ -157,7 +220,7 @@ def do_evaluate(
                 return Column(
                     plc.strings.contains.contains_re(column.obj, self._regex_program)
                 )
-        elif self.name == pl_expr.StringFunction.Slice:
+        elif self.name is StringFunction.Name.Slice:
             child, expr_offset, expr_length = self.children
             assert isinstance(expr_offset, Literal)
             assert isinstance(expr_length, Literal)
@@ -188,16 +251,16 @@ def do_evaluate(
                 )
             )
         elif self.name in {
-            pl_expr.StringFunction.StripChars,
-            pl_expr.StringFunction.StripCharsStart,
-            pl_expr.StringFunction.StripCharsEnd,
+            StringFunction.Name.StripChars,
+            StringFunction.Name.StripCharsStart,
+            StringFunction.Name.StripCharsEnd,
         }:
             column, chars = (
                 c.evaluate(df, context=context, mapping=mapping) for c in self.children
             )
-            if self.name == pl_expr.StringFunction.StripCharsStart:
+            if self.name is StringFunction.Name.StripCharsStart:
                 side = plc.strings.SideType.LEFT
-            elif self.name == pl_expr.StringFunction.StripCharsEnd:
+            elif self.name is StringFunction.Name.StripCharsEnd:
                 side = plc.strings.SideType.RIGHT
             else:
                 side = plc.strings.SideType.BOTH
@@ -207,13 +270,13 @@ def do_evaluate(
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children
         ]
-        if self.name == pl_expr.StringFunction.Lowercase:
+        if self.name is StringFunction.Name.Lowercase:
             (column,) = columns
             return Column(plc.strings.case.to_lower(column.obj))
-        elif self.name == pl_expr.StringFunction.Uppercase:
+        elif self.name is StringFunction.Name.Uppercase:
             (column,) = columns
             return Column(plc.strings.case.to_upper(column.obj))
-        elif self.name == pl_expr.StringFunction.EndsWith:
+        elif self.name is StringFunction.Name.EndsWith:
             column, suffix = columns
             return Column(
                 plc.strings.find.ends_with(
@@ -223,7 +286,7 @@ def do_evaluate(
                     else suffix.obj,
                 )
             )
-        elif self.name == pl_expr.StringFunction.StartsWith:
+        elif self.name is StringFunction.Name.StartsWith:
             column, prefix = columns
             return Column(
                 plc.strings.find.starts_with(
@@ -233,7 +296,7 @@ def do_evaluate(
                     else prefix.obj,
                 )
             )
-        elif self.name == pl_expr.StringFunction.Strptime:
+        elif self.name is StringFunction.Name.Strptime:
             # TODO: ignores ambiguous
             format, strict, exact, cache = self.options
             col = self.children[0].evaluate(df, context=context, mapping=mapping)
@@ -265,7 +328,7 @@ def do_evaluate(
                         res.columns()[0], self.dtype, format
                     )
                 )
-        elif self.name == pl_expr.StringFunction.Replace:
+        elif self.name is StringFunction.Name.Replace:
             column, target, repl = columns
             n, _ = self.options
             return Column(
@@ -273,7 +336,7 @@ def do_evaluate(
                     column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n
                 )
             )
-        elif self.name == pl_expr.StringFunction.ReplaceMany:
+        elif self.name is StringFunction.Name.ReplaceMany:
             column, target, repl = columns
             return Column(
                 plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj)
diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py
index acc4b3669af..c3febc833e2 100644
--- a/python/cudf_polars/cudf_polars/dsl/to_ast.py
+++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py
@@ -8,8 +8,6 @@
 from functools import partial, reduce, singledispatch
 from typing import TYPE_CHECKING, TypeAlias
 
-from polars.polars import _expr_nodes as pl_expr
-
 import pylibcudf as plc
 from pylibcudf import expressions as plc_expr
 
@@ -185,7 +183,7 @@ def _(node: expr.BinOp, self: Transformer) -> plc_expr.Expression:
 
 @_to_ast.register
 def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression:
-    if node.name == pl_expr.BooleanFunction.IsIn:
+    if node.name is expr.BooleanFunction.Name.IsIn:
         needles, haystack = node.children
         if isinstance(haystack, expr.LiteralColumn) and len(haystack.value) < 16:
             # 16 is an arbitrary limit
@@ -204,14 +202,14 @@ def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression:
         raise NotImplementedError(
             f"Parquet filters don't support {node.name} on columns"
         )
-    if node.name == pl_expr.BooleanFunction.IsNull:
+    if node.name is expr.BooleanFunction.Name.IsNull:
         return plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0]))
-    elif node.name == pl_expr.BooleanFunction.IsNotNull:
+    elif node.name is expr.BooleanFunction.Name.IsNotNull:
         return plc_expr.Operation(
             plc_expr.ASTOperator.NOT,
             plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])),
         )
-    elif node.name == pl_expr.BooleanFunction.Not:
+    elif node.name is expr.BooleanFunction.Name.Not:
         return plc_expr.Operation(plc_expr.ASTOperator.NOT, self(node.children[0]))
     raise NotImplementedError(f"AST conversion does not support {node.name}")
 
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 9480ce6e535..b1e2de63ba6 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -531,10 +531,16 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex
                         column.dtype,
                         pa.scalar("", type=plc.interop.to_arrow(column.dtype)),
                     )
-            return expr.StringFunction(dtype, name, options, column, chars)
+            return expr.StringFunction(
+                dtype,
+                expr.StringFunction.Name.from_polars(name),
+                options,
+                column,
+                chars,
+            )
         return expr.StringFunction(
             dtype,
-            name,
+            expr.StringFunction.Name.from_polars(name),
             options,
             *(translator.translate_expr(n=n) for n in node.input),
         )
@@ -551,7 +557,7 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex
             )
         return expr.BooleanFunction(
             dtype,
-            name,
+            expr.BooleanFunction.Name.from_polars(name),
             options,
             *(translator.translate_expr(n=n) for n in node.input),
         )
@@ -571,7 +577,7 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex
         }
         result_expr = expr.TemporalFunction(
             dtype,
-            name,
+            expr.TemporalFunction.Name.from_polars(name),
             options,
             *(translator.translate_expr(n=n) for n in node.input),
         )
diff --git a/python/cudf_polars/tests/dsl/test_serialization.py b/python/cudf_polars/tests/dsl/test_serialization.py
new file mode 100644
index 00000000000..7de8f959843
--- /dev/null
+++ b/python/cudf_polars/tests/dsl/test_serialization.py
@@ -0,0 +1,56 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pickle
+
+import pytest
+
+from polars.polars import _expr_nodes as pl_expr
+
+from cudf_polars.dsl.expressions.boolean import BooleanFunction
+from cudf_polars.dsl.expressions.datetime import TemporalFunction
+from cudf_polars.dsl.expressions.string import StringFunction
+
+
+@pytest.fixture(params=[BooleanFunction, StringFunction, TemporalFunction])
+def function(request):
+    return request.param
+
+
+def test_function_name_serialization_all_values(function):
+    # Test serialization and deserialization for all values of function.Name
+    for name in function.Name:
+        serialized_name = pickle.dumps(name)
+        deserialized_name = pickle.loads(serialized_name)
+        assert deserialized_name is name
+
+
+def test_function_name_invalid(function):
+    # Test invalid attribute name
+    with pytest.raises(AttributeError, match="InvalidAttribute"):
+        assert function.Name.InvalidAttribute is function.Name.InvalidAttribute
+
+
+def test_from_polars_all_names(function):
+    # Test that all valid names of polars expressions are correctly converted
+    polars_function = getattr(pl_expr, function.__name__)
+    polars_names = [name for name in dir(polars_function) if not name.startswith("_")]
+    # Check names advertised by polars are the same as we advertise
+    assert set(polars_names) == set(function.Name.__members__)
+    for name in function.Name:
+        attr = getattr(polars_function, name.name)
+        assert function.Name.from_polars(attr) == name
+
+
+def test_from_polars_invalid_attribute(function):
+    # Test converting from invalid attribute name
+    with pytest.raises(ValueError, match=f"{function.__name__} required"):
+        function.Name.from_polars("InvalidAttribute")
+
+
+def test_from_polars_invalid_polars_attribute(function):
+    # Test converting from polars function with invalid attribute name
+    with pytest.raises(AttributeError, match="InvalidAttribute"):
+        function.Name.from_polars(f"{function.__name__}.InvalidAttribute")

From 165d756f7f7cb558d1cab62a81a1c91368648d12 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 26 Nov 2024 13:38:22 -0500
Subject: [PATCH 23/23] Migrate ORC Writer to pylibcudf (#17310)

Part of #15162.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/17310
---
 python/cudf/cudf/_lib/orc.pyx                 | 167 ++++---
 python/pylibcudf/pylibcudf/io/orc.pxd         |  65 ++-
 python/pylibcudf/pylibcudf/io/orc.pyi         |  51 ++-
 python/pylibcudf/pylibcudf/io/orc.pyx         | 413 +++++++++++++++++-
 python/pylibcudf/pylibcudf/io/types.pxd       |  17 +-
 python/pylibcudf/pylibcudf/io/types.pyi       |  22 +-
 python/pylibcudf/pylibcudf/io/types.pyx       |  61 ++-
 .../pylibcudf/pylibcudf/tests/io/test_orc.py  |  62 +++
 .../pylibcudf/tests/io/test_types.py          |  28 ++
 9 files changed, 762 insertions(+), 124 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_types.py

diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 32a5e463916..c829cac6409 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -3,11 +3,9 @@
 from libc.stdint cimport int64_t
 from libcpp cimport bool, int
 from libcpp.map cimport map
-from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
+import itertools
 from collections import OrderedDict
 
 try:
@@ -16,23 +14,10 @@ except ImportError:
     import json
 
 cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view
-from pylibcudf.libcudf.io.data_sink cimport data_sink
-from pylibcudf.libcudf.io.orc cimport (
-    chunked_orc_writer_options,
-    orc_chunked_writer,
-    orc_writer_options,
-    write_orc as libcudf_write_orc,
-)
-from pylibcudf.libcudf.io.types cimport (
-    column_in_metadata,
-    sink_info,
-    table_input_metadata,
-)
-from pylibcudf.libcudf.table.table_view cimport table_view
 
 from cudf._lib.column cimport Column
-from cudf._lib.io.utils cimport make_sink_info, update_col_struct_field_names
-from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table
+from cudf._lib.io.utils cimport update_col_struct_field_names
+from cudf._lib.utils cimport data_from_pylibcudf_io
 
 import pylibcudf as plc
 
@@ -40,7 +25,8 @@ import cudf
 from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES
 from cudf._lib.utils import _index_level_name, generate_pandas_metadata
 from cudf.core.buffer import acquire_spill_lock
-
+from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata
+from pylibcudf.io.orc cimport OrcChunkedWriter
 
 # TODO: Consider inlining this function since it seems to only be used in one place.
 cpdef read_parsed_orc_statistics(filepath_or_buffer):
@@ -246,36 +232,33 @@ def write_orc(
     --------
     cudf.read_orc
     """
-    cdef unique_ptr[data_sink] data_sink_c
-    cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
-    cdef table_input_metadata tbl_meta
-    cdef map[string, string] user_data
-    user_data[str.encode("pandas")] = str.encode(generate_pandas_metadata(
-        table, index)
-    )
-
+    user_data = {}
+    user_data["pandas"] = generate_pandas_metadata(table, index)
     if index is True or (
         index is None and not isinstance(table._index, cudf.RangeIndex)
     ):
-        tv = table_view_from_table(table)
-        tbl_meta = table_input_metadata(tv)
+        columns = table._columns if table._index is None else [
+            *table.index._columns, *table._columns
+        ]
+        plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns])
+        tbl_meta = TableInputMetadata(plc_table)
         for level, idx_name in enumerate(table._index.names):
             tbl_meta.column_metadata[level].set_name(
-                str.encode(
-                    _index_level_name(idx_name, level, table._column_names)
-                )
+                _index_level_name(idx_name, level, table._column_names)
             )
         num_index_cols_meta = len(table._index.names)
     else:
-        tv = table_view_from_table(table, ignore_index=True)
-        tbl_meta = table_input_metadata(tv)
+        plc_table = plc.Table(
+            [col.to_pylibcudf(mode="read") for col in table._columns]
+        )
+        tbl_meta = TableInputMetadata(plc_table)
         num_index_cols_meta = 0
 
     if cols_as_map_type is not None:
         cols_as_map_type = set(cols_as_map_type)
 
     for i, name in enumerate(table._column_names, num_index_cols_meta):
-        tbl_meta.column_metadata[i].set_name(name.encode())
+        tbl_meta.column_metadata[i].set_name(name)
         _set_col_children_metadata(
             table[name]._column,
             tbl_meta.column_metadata[i],
@@ -283,24 +266,24 @@ def write_orc(
             and (name in cols_as_map_type),
         )
 
-    cdef orc_writer_options c_orc_writer_options = move(
-        orc_writer_options.builder(
-            sink_info_c, tv
-        ).metadata(tbl_meta)
-        .key_value_metadata(move(user_data))
+    options = (
+        plc.io.orc.OrcWriterOptions.builder(
+            plc.io.SinkInfo([path_or_buf]), plc_table
+        )
+        .metadata(tbl_meta)
+        .key_value_metadata(user_data)
         .compression(_get_comp_type(compression))
         .enable_statistics(_get_orc_stat_freq(statistics))
         .build()
     )
     if stripe_size_bytes is not None:
-        c_orc_writer_options.set_stripe_size_bytes(stripe_size_bytes)
+        options.set_stripe_size_bytes(stripe_size_bytes)
     if stripe_size_rows is not None:
-        c_orc_writer_options.set_stripe_size_rows(stripe_size_rows)
+        options.set_stripe_size_rows(stripe_size_rows)
     if row_index_stride is not None:
-        c_orc_writer_options.set_row_index_stride(row_index_stride)
+        options.set_row_index_stride(row_index_stride)
 
-    with nogil:
-        libcudf_write_orc(c_orc_writer_options)
+    plc.io.orc.write_orc(options)
 
 
 cdef int64_t get_skiprows_arg(object arg) except*:
@@ -326,13 +309,12 @@ cdef class ORCWriter:
     cudf.io.orc.to_orc
     """
     cdef bool initialized
-    cdef unique_ptr[orc_chunked_writer] writer
-    cdef sink_info sink
-    cdef unique_ptr[data_sink] _data_sink
+    cdef OrcChunkedWriter writer
+    cdef SinkInfo sink
     cdef str statistics
     cdef object compression
     cdef object index
-    cdef table_input_metadata tbl_meta
+    cdef TableInputMetadata tbl_meta
     cdef object cols_as_map_type
     cdef object stripe_size_bytes
     cdef object stripe_size_rows
@@ -347,8 +329,7 @@ cdef class ORCWriter:
                   object stripe_size_bytes=None,
                   object stripe_size_rows=None,
                   object row_index_stride=None):
-
-        self.sink = make_sink_info(path, self._data_sink)
+        self.sink = plc.io.SinkInfo([path])
         self.statistics = statistics
         self.compression = compression
         self.index = index
@@ -368,17 +349,21 @@ cdef class ORCWriter:
             table._index.name is not None or
             isinstance(table._index, cudf.core.multiindex.MultiIndex)
         )
-        tv = table_view_from_table(table, not keep_index)
+        if keep_index:
+            columns = [
+                col.to_pylibcudf(mode="read")
+                for col in itertools.chain(table.index._columns, table._columns)
+            ]
+        else:
+            columns = [col.to_pylibcudf(mode="read") for col in table._columns]
 
-        with nogil:
-            self.writer.get()[0].write(tv)
+        self.writer.write(plc.Table(columns))
 
     def close(self):
         if not self.initialized:
             return
 
-        with nogil:
-            self.writer.get()[0].close()
+        self.writer.close()
 
     def __dealloc__(self):
         self.close()
@@ -387,32 +372,47 @@ cdef class ORCWriter:
         """
         Prepare all the values required to build the
         chunked_orc_writer_options anb creates a writer"""
-        cdef table_view tv
 
         num_index_cols_meta = 0
-        self.tbl_meta = table_input_metadata(
-            table_view_from_table(table, ignore_index=True),
+        plc_table = plc.Table(
+            [
+                col.to_pylibcudf(mode="read")
+                for col in table._columns
+            ]
         )
+        self.tbl_meta = TableInputMetadata(plc_table)
         if self.index is not False:
             if isinstance(table._index, cudf.core.multiindex.MultiIndex):
-                tv = table_view_from_table(table)
-                self.tbl_meta = table_input_metadata(tv)
+                plc_table = plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read")
+                        for col in itertools.chain(table.index._columns, table._columns)
+                    ]
+                )
+                self.tbl_meta = TableInputMetadata(plc_table)
                 for level, idx_name in enumerate(table._index.names):
                     self.tbl_meta.column_metadata[level].set_name(
-                        (str.encode(idx_name))
+                        idx_name
                     )
                 num_index_cols_meta = len(table._index.names)
             else:
                 if table._index.name is not None:
-                    tv = table_view_from_table(table)
-                    self.tbl_meta = table_input_metadata(tv)
+                    plc_table = plc.Table(
+                        [
+                            col.to_pylibcudf(mode="read")
+                            for col in itertools.chain(
+                                table.index._columns, table._columns
+                            )
+                        ]
+                    )
+                    self.tbl_meta = TableInputMetadata(plc_table)
                     self.tbl_meta.column_metadata[0].set_name(
-                        str.encode(table._index.name)
+                        table._index.name
                     )
                     num_index_cols_meta = 1
 
         for i, name in enumerate(table._column_names, num_index_cols_meta):
-            self.tbl_meta.column_metadata[i].set_name(name.encode())
+            self.tbl_meta.column_metadata[i].set_name(name)
             _set_col_children_metadata(
                 table[name]._column,
                 self.tbl_meta.column_metadata[i],
@@ -420,38 +420,37 @@ cdef class ORCWriter:
                 and (name in self.cols_as_map_type),
             )
 
-        cdef map[string, string] user_data
+        user_data = {}
         pandas_metadata = generate_pandas_metadata(table, self.index)
-        user_data[str.encode("pandas")] = str.encode(pandas_metadata)
-
-        cdef chunked_orc_writer_options c_opts = move(
-                chunked_orc_writer_options.builder(self.sink)
-                .metadata(self.tbl_meta)
-                .key_value_metadata(move(user_data))
-                .compression(_get_comp_type(self.compression))
-                .enable_statistics(_get_orc_stat_freq(self.statistics))
-                .build()
-            )
+        user_data["pandas"] = pandas_metadata
+
+        options = (
+            plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink)
+            .metadata(self.tbl_meta)
+            .key_value_metadata(user_data)
+            .compression(_get_comp_type(self.compression))
+            .enable_statistics(_get_orc_stat_freq(self.statistics))
+            .build()
+        )
         if self.stripe_size_bytes is not None:
-            c_opts.set_stripe_size_bytes(self.stripe_size_bytes)
+            options.set_stripe_size_bytes(self.stripe_size_bytes)
         if self.stripe_size_rows is not None:
-            c_opts.set_stripe_size_rows(self.stripe_size_rows)
+            options.set_stripe_size_rows(self.stripe_size_rows)
         if self.row_index_stride is not None:
-            c_opts.set_row_index_stride(self.row_index_stride)
+            options.set_row_index_stride(self.row_index_stride)
 
-        with nogil:
-            self.writer.reset(new orc_chunked_writer(c_opts))
+        self.writer = plc.io.orc.OrcChunkedWriter.from_options(options)
 
         self.initialized = True
 
 cdef _set_col_children_metadata(Column col,
-                                column_in_metadata& col_meta,
+                                ColumnInMetadata col_meta,
                                 list_column_as_map=False):
     if isinstance(col.dtype, cudf.StructDtype):
         for i, (child_col, name) in enumerate(
             zip(col.children, list(col.dtype.fields))
         ):
-            col_meta.child(i).set_name(name.encode())
+            col_meta.child(i).set_name(name)
             _set_col_children_metadata(
                 child_col, col_meta.child(i), list_column_as_map
             )
diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd
index b111d617b1b..671f0692444 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pxd
+++ b/python/pylibcudf/pylibcudf/io/orc.pxd
@@ -4,15 +4,33 @@ from libcpp cimport bool
 from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from libcpp.memory cimport unique_ptr
+from libcpp.map cimport map
+from pylibcudf.io.types cimport (
+    SourceInfo,
+    SinkInfo,
+    TableWithMetadata,
+    TableInputMetadata,
+)
 from pylibcudf.libcudf.io.orc_metadata cimport (
     column_statistics,
     parsed_orc_statistics,
     statistics_type,
 )
+from pylibcudf.libcudf.io.orc cimport (
+    orc_chunked_writer,
+    orc_writer_options,
+    orc_writer_options_builder,
+    chunked_orc_writer_options,
+    chunked_orc_writer_options_builder,
+)
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.types cimport DataType
-
+from pylibcudf.table cimport Table
+from pylibcudf.libcudf.io.types cimport (
+    compression_type,
+    statistics_freq,
+)
 
 cpdef TableWithMetadata read_orc(
     SourceInfo source_info,
@@ -48,3 +66,46 @@ cdef class ParsedOrcStatistics:
 cpdef ParsedOrcStatistics read_parsed_orc_statistics(
     SourceInfo source_info
 )
+
+cdef class OrcWriterOptions:
+    cdef orc_writer_options c_obj
+    cdef Table table
+    cdef SinkInfo sink
+    cpdef void set_stripe_size_bytes(self, size_t size_bytes)
+    cpdef void set_stripe_size_rows(self, size_type size_rows)
+    cpdef void set_row_index_stride(self, size_type stride)
+
+cdef class OrcWriterOptionsBuilder:
+    cdef orc_writer_options_builder c_obj
+    cdef Table table
+    cdef SinkInfo sink
+    cpdef OrcWriterOptionsBuilder compression(self, compression_type comp)
+    cpdef OrcWriterOptionsBuilder enable_statistics(self, statistics_freq val)
+    cpdef OrcWriterOptionsBuilder key_value_metadata(self, dict kvm)
+    cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta)
+    cpdef OrcWriterOptions build(self)
+
+cpdef void write_orc(OrcWriterOptions options)
+
+cdef class OrcChunkedWriter:
+    cdef unique_ptr[orc_chunked_writer] c_obj
+    cpdef void close(self)
+    cpdef void write(self, Table table)
+
+cdef class ChunkedOrcWriterOptions:
+    cdef chunked_orc_writer_options c_obj
+    cdef SinkInfo sink
+    cpdef void set_stripe_size_bytes(self, size_t size_bytes)
+    cpdef void set_stripe_size_rows(self, size_type size_rows)
+    cpdef void set_row_index_stride(self, size_type stride)
+
+cdef class ChunkedOrcWriterOptionsBuilder:
+    cdef chunked_orc_writer_options_builder c_obj
+    cdef SinkInfo sink
+    cpdef ChunkedOrcWriterOptionsBuilder compression(self, compression_type comp)
+    cpdef ChunkedOrcWriterOptionsBuilder enable_statistics(self, statistics_freq val)
+    cpdef ChunkedOrcWriterOptionsBuilder key_value_metadata(
+        self, dict kvm
+    )
+    cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta)
+    cpdef ChunkedOrcWriterOptions build(self)
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi
index 4cf87f1a832..516f97981e9 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pyi
+++ b/python/pylibcudf/pylibcudf/io/orc.pyi
@@ -1,8 +1,16 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from typing import Any
+from typing import Any, Self
 
-from pylibcudf.io.types import SourceInfo, TableWithMetadata
+from pylibcudf.io.types import (
+    CompressionType,
+    SinkInfo,
+    SourceInfo,
+    StatisticsFreq,
+    TableInputMetadata,
+    TableWithMetadata,
+)
+from pylibcudf.table import Table
 from pylibcudf.types import DataType
 
 def read_orc(
@@ -39,3 +47,42 @@ class ParsedOrcStatistics:
 def read_parsed_orc_statistics(
     source_info: SourceInfo,
 ) -> ParsedOrcStatistics: ...
+
+class OrcWriterOptions:
+    def __init__(self): ...
+    def set_stripe_size_bytes(self, size_bytes: int) -> None: ...
+    def set_stripe_size_rows(self, size_rows: int) -> None: ...
+    def set_row_index_stride(self, stride: int) -> None: ...
+    @staticmethod
+    def builder(sink: SinkInfo, table: Table) -> OrcWriterOptionsBuilder: ...
+
+class OrcWriterOptionsBuilder:
+    def __init__(self): ...
+    def compression(self, comp: CompressionType) -> Self: ...
+    def enable_statistics(self, val: StatisticsFreq) -> Self: ...
+    def key_value_metadata(self, kvm: dict[str, str]) -> Self: ...
+    def metadata(self, meta: TableWithMetadata) -> Self: ...
+    def build(self) -> OrcWriterOptions: ...
+
+def write_orc(options: OrcWriterOptions) -> None: ...
+
+class OrcChunkedWriter:
+    def __init__(self): ...
+    def close(self) -> None: ...
+    def write(self, table: Table) -> None: ...
+
+class ChunkedOrcWriterOptions:
+    def __init__(self): ...
+    def set_stripe_size_bytes(self, size_bytes: int) -> None: ...
+    def set_stripe_size_rows(self, size_rows: int) -> None: ...
+    def set_row_index_stride(self, stride: int) -> None: ...
+    @staticmethod
+    def builder(sink: SinkInfo) -> ChunkedOrcWriterOptionsBuilder: ...
+
+class ChunkedOrcWriterOptionsBuilder:
+    def __init__(self): ...
+    def compression(self, comp: CompressionType) -> Self: ...
+    def enable_statistics(self, val: StatisticsFreq) -> Self: ...
+    def key_value_metadata(self, kvm: dict[str, str]) -> Self: ...
+    def metadata(self, meta: TableInputMetadata) -> Self: ...
+    def build(self) -> ChunkedOrcWriterOptions: ...
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx
index 4270f5b4f95..63eab4a9634 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pyx
+++ b/python/pylibcudf/pylibcudf/io/orc.pyx
@@ -6,10 +6,11 @@ from libcpp.vector cimport vector
 
 import datetime
 
-from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata, SinkInfo
 from pylibcudf.libcudf.io.orc cimport (
     orc_reader_options,
     read_orc as cpp_read_orc,
+    write_orc as cpp_write_orc,
 )
 from pylibcudf.libcudf.io.orc_metadata cimport (
     binary_statistics,
@@ -29,12 +30,27 @@ from pylibcudf.libcudf.io.types cimport table_with_metadata
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.types cimport DataType
 from pylibcudf.variant cimport get_if, holds_alternative
+from pylibcudf.libcudf.io.types cimport (
+    compression_type,
+    statistics_freq,
+)
+from pylibcudf.libcudf.io.orc cimport (
+    orc_chunked_writer,
+    orc_writer_options,
+    chunked_orc_writer_options,
+)
 
 __all__ = [
     "OrcColumnStatistics",
     "ParsedOrcStatistics",
     "read_orc",
     "read_parsed_orc_statistics",
+    "write_orc",
+    "OrcWriterOptions",
+    "OrcWriterOptionsBuilder",
+    "OrcChunkedWriter",
+    "ChunkedOrcWriterOptions",
+    "ChunkedOrcWriterOptionsBuilder",
 ]
 
 cdef class OrcColumnStatistics:
@@ -310,3 +326,398 @@ cpdef ParsedOrcStatistics read_parsed_orc_statistics(
         cpp_read_parsed_orc_statistics(source_info.c_obj)
     )
     return ParsedOrcStatistics.from_libcudf(parsed)
+
+
+cdef class OrcWriterOptions:
+    cpdef void set_stripe_size_bytes(self, size_t size_bytes):
+        """
+        Sets the maximum stripe size, in bytes.
+
+        For details, see :cpp:func:`cudf::io::orc_writer_options::set_stripe_size_bytes`
+
+        Parameters
+        ----------
+        size_bytes: size_t
+            Sets the maximum stripe size, in bytes.
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_stripe_size_bytes(size_bytes)
+
+    cpdef void set_stripe_size_rows(self, size_type size_rows):
+        """
+        Sets the maximum stripe size, in rows.
+
+        If the stripe size is smaller than the row group size,
+        row group size will be reduced to match the stripe size.
+
+        For details, see :cpp:func:`cudf::io::orc_writer_options::set_stripe_size_rows`
+
+        Parameters
+        ----------
+        size_rows: size_type
+            Maximum stripe size, in rows to be set
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_stripe_size_rows(size_rows)
+
+    cpdef void set_row_index_stride(self, size_type stride):
+        """
+        Sets the row index stride.
+
+        Rounded down to a multiple of 8.
+
+        For details, see :cpp:func:`cudf::io::orc_writer_options::set_row_index_stride`
+
+        Parameters
+        ----------
+        stride: size_type
+            Row index stride to be set
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_row_index_stride(stride)
+
+    @staticmethod
+    def builder(SinkInfo sink, Table table):
+        """
+        Create builder to create OrcWriterOptions.
+
+        For details, see :cpp:func:`cudf::io::orc_writer_options::builder`
+
+        Parameters
+        ----------
+        sink: SinkInfo
+            The sink used for writer output
+        table: Table
+            Table to be written to output
+
+        Returns
+        -------
+        OrcWriterOptionsBuilder
+        """
+        cdef OrcWriterOptionsBuilder orc_builder = OrcWriterOptionsBuilder.__new__(
+            OrcWriterOptionsBuilder
+        )
+        orc_builder.c_obj = orc_writer_options.builder(sink.c_obj, table.view())
+        orc_builder.table = table
+        orc_builder.sink = sink
+        return orc_builder
+
+
+cdef class OrcWriterOptionsBuilder:
+    cpdef OrcWriterOptionsBuilder compression(self, compression_type comp):
+        """
+        Sets compression type.
+
+        For details, see :cpp:func:`cudf::io::orc_writer_options_builder::compression`
+
+        Parameters
+        ----------
+        comp: CompressionType
+            The compression type to use
+
+        Returns
+        -------
+        OrcWriterOptionsBuilder
+        """
+        self.c_obj.compression(comp)
+        return self
+
+    cpdef OrcWriterOptionsBuilder enable_statistics(self, statistics_freq val):
+        """
+        Choose granularity of column statistics to be written.
+
+        For details, see :cpp:func:`enable_statistics`
+
+        Parameters
+        ----------
+        val: StatisticsFreq
+            Level of statistics collection
+
+        Returns
+        -------
+        OrcWriterOptionsBuilder
+        """
+        self.c_obj.enable_statistics(val)
+        return self
+
+    cpdef OrcWriterOptionsBuilder key_value_metadata(self, dict kvm):
+        """
+        Sets Key-Value footer metadata.
+
+        Parameters
+        ----------
+        kvm: dict
+            Key-Value footer metadata
+
+        Returns
+        -------
+        OrcWriterOptionsBuilder
+        """
+        self.c_obj.key_value_metadata(
+            {key.encode(): value.encode() for key, value in kvm.items()}
+        )
+        return self
+
+    cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta):
+        """
+        Sets associated metadata.
+
+        For details, see :cpp:func:`cudf::io::orc_writer_options_builder::metadata`
+
+        Parameters
+        ----------
+        meta: TableInputMetadata
+            Associated metadata
+
+        Returns
+        -------
+        OrcWriterOptionsBuilder
+        """
+        self.c_obj.metadata(meta.c_obj)
+        return self
+
+    cpdef OrcWriterOptions build(self):
+        """Moves the ORC writer options builder"""
+        cdef OrcWriterOptions orc_options = OrcWriterOptions.__new__(
+            OrcWriterOptions
+        )
+        orc_options.c_obj = move(self.c_obj.build())
+        orc_options.table = self.table
+        orc_options.sink = self.sink
+        return orc_options
+
+
+cpdef void write_orc(OrcWriterOptions options):
+    """
+    Write to ORC format.
+
+    The table to write, output paths, and options are encapsulated
+    by the `options` object.
+
+    For details, see :cpp:func:`write_orc`.
+
+    Parameters
+    ----------
+    options: OrcWriterOptions
+        Settings for controlling writing behavior
+
+    Returns
+    -------
+    None
+    """
+    with nogil:
+        cpp_write_orc(move(options.c_obj))
+
+
+cdef class OrcChunkedWriter:
+    cpdef void close(self):
+        """
+        Closes the chunked ORC writer.
+
+        Returns
+        -------
+        None
+        """
+        with nogil:
+            self.c_obj.get()[0].close()
+
+    cpdef void write(self, Table table):
+        """
+        Writes table to output.
+
+        Parameters
+        ----------
+        table: Table
+            Table that needs to be written
+
+        Returns
+        -------
+        None
+        """
+        with nogil:
+            self.c_obj.get()[0].write(table.view())
+
+    @staticmethod
+    def from_options(ChunkedOrcWriterOptions options):
+        """
+        Creates a chunked ORC writer from options
+
+        Parameters
+        ----------
+        options: ChunkedOrcWriterOptions
+            Settings for controlling writing behavior
+
+        Returns
+        -------
+        OrcChunkedWriter
+        """
+        cdef OrcChunkedWriter orc_writer = OrcChunkedWriter.__new__(
+            OrcChunkedWriter
+        )
+        orc_writer.c_obj.reset(new orc_chunked_writer(options.c_obj))
+        return orc_writer
+
+
+cdef class ChunkedOrcWriterOptions:
+    cpdef void set_stripe_size_bytes(self, size_t size_bytes):
+        """
+        Sets the maximum stripe size, in bytes.
+
+        Parameters
+        ----------
+        size_bytes: size_t
+            Sets the maximum stripe size, in bytes.
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_stripe_size_bytes(size_bytes)
+
+    cpdef void set_stripe_size_rows(self, size_type size_rows):
+        """
+        Sets the maximum stripe size, in rows.
+
+        If the stripe size is smaller than the row group size,
+        row group size will be reduced to match the stripe size.
+
+        Parameters
+        ----------
+        size_rows: size_type
+            Maximum stripe size, in rows to be set
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_stripe_size_rows(size_rows)
+
+    cpdef void set_row_index_stride(self, size_type stride):
+        """
+        Sets the row index stride.
+
+        Rounded down to a multiple of 8.
+
+        Parameters
+        ----------
+        stride: size_type
+            Row index stride to be set
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_row_index_stride(stride)
+
+    @staticmethod
+    def builder(SinkInfo sink):
+        """
+        Create builder to create ChunkedOrcWriterOptions.
+
+        Parameters
+        ----------
+        sink: SinkInfo
+            The sink used for writer output
+            Unlike OrcWriterOptions.builder, no table is passed here;
+            tables are supplied later via OrcChunkedWriter.write.
+
+        Returns
+        -------
+        ChunkedOrcWriterOptionsBuilder
+        """
+        cdef ChunkedOrcWriterOptionsBuilder orc_builder = \
+            ChunkedOrcWriterOptionsBuilder.__new__(
+                ChunkedOrcWriterOptionsBuilder
+            )
+        orc_builder.c_obj = chunked_orc_writer_options.builder(sink.c_obj)
+        orc_builder.sink = sink
+        return orc_builder
+
+
+cdef class ChunkedOrcWriterOptionsBuilder:
+    cpdef ChunkedOrcWriterOptionsBuilder compression(self, compression_type comp):
+        """
+        Sets compression type.
+
+        Parameters
+        ----------
+        comp: CompressionType
+            The compression type to use
+
+        Returns
+        -------
+        ChunkedOrcWriterOptionsBuilder
+        """
+        self.c_obj.compression(comp)
+        return self
+
+    cpdef ChunkedOrcWriterOptionsBuilder enable_statistics(self, statistics_freq val):
+        """
+        Choose granularity of column statistics to be written.
+
+        Parameters
+        ----------
+        val: StatisticsFreq
+            Level of statistics collection
+
+        Returns
+        -------
+        ChunkedOrcWriterOptionsBuilder
+        """
+        self.c_obj.enable_statistics(val)
+        return self
+
+    cpdef ChunkedOrcWriterOptionsBuilder key_value_metadata(
+        self,
+        dict kvm
+    ):
+        """
+        Sets Key-Value footer metadata.
+
+        Parameters
+        ----------
+        kvm: dict
+            Key-Value footer metadata
+
+        Returns
+        -------
+        ChunkedOrcWriterOptionsBuilder
+        """
+        self.c_obj.key_value_metadata(
+            {key.encode(): value.encode() for key, value in kvm.items()}
+        )
+        return self
+
+    cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta):
+        """
+        Sets associated metadata.
+
+        Parameters
+        ----------
+        meta: TableInputMetadata
+            Associated metadata
+
+        Returns
+        -------
+        ChunkedOrcWriterOptionsBuilder
+        """
+        self.c_obj.metadata(meta.c_obj)
+        return self
+
+    cpdef ChunkedOrcWriterOptions build(self):
+        """Create a ChunkedOrcWriterOptions object"""
+        cdef ChunkedOrcWriterOptions orc_options = ChunkedOrcWriterOptions.__new__(
+            ChunkedOrcWriterOptions
+        )
+        orc_options.c_obj = move(self.c_obj.build())
+        orc_options.sink = self.sink
+        return orc_options
diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd
index 90b43cf0ff5..a1f3b17936c 100644
--- a/python/pylibcudf/pylibcudf/io/types.pxd
+++ b/python/pylibcudf/pylibcudf/io/types.pxd
@@ -3,6 +3,7 @@ from libc.stdint cimport uint8_t, int32_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
+from libcpp cimport bool
 from pylibcudf.libcudf.io.data_sink cimport data_sink
 from pylibcudf.libcudf.io.types cimport (
     column_encoding,
@@ -22,16 +23,16 @@ from pylibcudf.libcudf.io.types cimport (
 )
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.table cimport Table
-
+from pylibcudf.libcudf.types cimport size_type
 
 cdef class PartitionInfo:
     cdef partition_info c_obj
 
 cdef class ColumnInMetadata:
-    cdef column_in_metadata c_obj
+    cdef column_in_metadata* c_obj
+    cdef TableInputMetadata owner
 
-    @staticmethod
-    cdef ColumnInMetadata from_metadata(column_in_metadata metadata)
+    cdef TableInputMetadata table
 
     cpdef ColumnInMetadata set_name(self, str name)
 
@@ -43,7 +44,7 @@ cdef class ColumnInMetadata:
 
     cpdef ColumnInMetadata set_int96_timestamps(self, bool req)
 
-    cpdef ColumnInMetadata set_decimal_precision(self, uint8_t req)
+    cpdef ColumnInMetadata set_decimal_precision(self, uint8_t precision)
 
     cpdef ColumnInMetadata child(self, size_type i)
 
@@ -57,8 +58,14 @@ cdef class ColumnInMetadata:
 
     cpdef str get_name(self)
 
+    @staticmethod
+    cdef ColumnInMetadata from_libcudf(
+        column_in_metadata* metadata, TableInputMetadata owner
+    )
+
 cdef class TableInputMetadata:
     cdef table_input_metadata c_obj
+    cdef list column_metadata
 
 cdef class TableWithMetadata:
     cdef public Table tbl
diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi
index 04f276cfeee..a3a559219ff 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyi
+++ b/python/pylibcudf/pylibcudf/io/types.pyi
@@ -3,7 +3,7 @@ import io
 import os
 from collections.abc import Mapping
 from enum import IntEnum
-from typing import Any, Literal, TypeAlias, overload
+from typing import Any, Literal, Self, TypeAlias, overload
 
 from pylibcudf.column import Column
 from pylibcudf.io.datasource import Datasource
@@ -66,16 +66,16 @@ class TableInputMetadata:
     def __init__(self, table: Table): ...
 
 class ColumnInMetadata:
-    def set_name(self, name: str) -> ColumnInMetadata: ...
-    def set_nullability(self, nullable: bool) -> ColumnInMetadata: ...
-    def set_list_column_as_map(self) -> ColumnInMetadata: ...
-    def set_int96_timestamps(self, req: bool) -> ColumnInMetadata: ...
-    def set_decimal_precision(self, precision: int) -> ColumnInMetadata: ...
-    def child(self, i: int) -> ColumnInMetadata: ...
-    def set_output_as_binary(self, binary: bool) -> ColumnInMetadata: ...
-    def set_type_length(self, type_length: int) -> ColumnInMetadata: ...
-    def set_skip_compression(self, skip: bool) -> ColumnInMetadata: ...
-    def set_encoding(self, encoding: ColumnEncoding) -> ColumnInMetadata: ...
+    def set_name(self, name: str) -> Self: ...
+    def set_nullability(self, nullable: bool) -> Self: ...
+    def set_list_column_as_map(self) -> Self: ...
+    def set_int96_timestamps(self, req: bool) -> Self: ...
+    def set_decimal_precision(self, precision: int) -> Self: ...
+    def child(self, i: int) -> Self: ...
+    def set_output_as_binary(self, binary: bool) -> Self: ...
+    def set_type_length(self, type_length: int) -> Self: ...
+    def set_skip_compression(self, skip: bool) -> Self: ...
+    def set_encoding(self, encoding: ColumnEncoding) -> Self: ...
     def get_name(self) -> str: ...
 
 class TableWithMetadata:
diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
index 460ab6844c3..a2155829f2c 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -2,7 +2,6 @@
 
 from cpython.buffer cimport PyBUF_READ
 from cpython.memoryview cimport PyMemoryView_FromMemory
-from libc.stdint cimport uint8_t, int32_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
@@ -20,6 +19,8 @@ from pylibcudf.libcudf.io.types cimport (
     source_info,
     table_input_metadata,
     table_with_metadata,
+    column_in_metadata,
+    table_input_metadata,
 )
 from pylibcudf.libcudf.types cimport size_type
 
@@ -38,9 +39,14 @@ from pylibcudf.libcudf.io.types import (
     quote_style as QuoteStyle,  # no-cython-lint
     statistics_freq as StatisticsFreq, # no-cython-lint
 )
+from cython.operator cimport dereference
+from pylibcudf.libcudf.types cimport size_type
+from cython.operator cimport dereference
+from pylibcudf.libcudf.types cimport size_type
 
 __all__ = [
     "ColumnEncoding",
+    "ColumnInMetadata",
     "CompressionType",
     "DictionaryPolicy",
     "JSONRecoveryMode",
@@ -74,18 +80,30 @@ cdef class ColumnInMetadata:
     Metadata for a column
     """
 
+    def __init__(self):
+        raise ValueError(
+            "ColumnInMetadata should not be constructed directly. "
+            "Use one of the factories."
+        )
+
     @staticmethod
-    cdef ColumnInMetadata from_metadata(column_in_metadata metadata):
+    cdef ColumnInMetadata from_libcudf(
+        column_in_metadata* metadata, TableInputMetadata owner
+    ):
         """
-        Construct a ColumnInMetadata.
+        A Python representation of `column_in_metadata`.
 
         Parameters
         ----------
-        metadata : column_in_metadata
-        """
-        cdef ColumnInMetadata col_metadata = ColumnInMetadata.__new__(ColumnInMetadata)
-        col_metadata.c_obj = metadata
-        return col_metadata
+        metadata : column_in_metadata*
+            Raw pointer to C++ metadata.
+        owner : TableInputMetadata
+            Owning table input metadata that manages lifetime of the raw pointer.
+        """
+        cdef ColumnInMetadata out = ColumnInMetadata.__new__(ColumnInMetadata)
+        out.c_obj = metadata
+        out.owner = owner
+        return out
 
     cpdef ColumnInMetadata set_name(self, str name):
         """
@@ -100,7 +118,7 @@ cdef class ColumnInMetadata:
         -------
         Self
         """
-        self.c_obj.set_name(name.encode())
+        dereference(self.c_obj).set_name(name.encode())
         return self
 
     cpdef ColumnInMetadata set_nullability(self, bool nullable):
@@ -116,7 +134,7 @@ cdef class ColumnInMetadata:
         -------
         Self
         """
-        self.c_obj.set_nullability(nullable)
+        dereference(self.c_obj).set_nullability(nullable)
         return self
 
     cpdef ColumnInMetadata set_list_column_as_map(self):
@@ -128,7 +146,7 @@ cdef class ColumnInMetadata:
         -------
         Self
         """
-        self.c_obj.set_list_column_as_map()
+        dereference(self.c_obj).set_list_column_as_map()
         return self
 
     cpdef ColumnInMetadata set_int96_timestamps(self, bool req):
@@ -145,7 +163,7 @@ cdef class ColumnInMetadata:
         -------
         Self
         """
-        self.c_obj.set_int96_timestamps(req)
+        dereference(self.c_obj).set_int96_timestamps(req)
         return self
 
     cpdef ColumnInMetadata set_decimal_precision(self, uint8_t precision):
@@ -162,7 +180,7 @@ cdef class ColumnInMetadata:
         -------
         Self
         """
-        self.c_obj.set_decimal_precision(precision)
+        dereference(self.c_obj).set_decimal_precision(precision)
         return self
 
     cpdef ColumnInMetadata child(self, size_type i):
@@ -178,7 +196,8 @@ cdef class ColumnInMetadata:
         -------
         ColumnInMetadata
         """
-        return ColumnInMetadata.from_metadata(self.c_obj.child(i))
+        cdef column_in_metadata* child_c_obj = &dereference(self.c_obj).child(i)
+        return ColumnInMetadata.from_libcudf(child_c_obj, self.owner)
 
     cpdef ColumnInMetadata set_output_as_binary(self, bool binary):
         """
@@ -193,7 +212,7 @@ cdef class ColumnInMetadata:
         -------
         Self
         """
-        self.c_obj.set_output_as_binary(binary)
+        dereference(self.c_obj).set_output_as_binary(binary)
         return self
 
     cpdef ColumnInMetadata set_type_length(self, int32_t type_length):
@@ -209,7 +228,7 @@ cdef class ColumnInMetadata:
         -------
         Self
         """
-        self.c_obj.set_type_length(type_length)
+        dereference(self.c_obj).set_type_length(type_length)
         return self
 
     cpdef ColumnInMetadata set_skip_compression(self, bool skip):
@@ -226,7 +245,7 @@ cdef class ColumnInMetadata:
         -------
         Self
         """
-        self.c_obj.set_skip_compression(skip)
+        dereference(self.c_obj).set_skip_compression(skip)
         return self
 
     cpdef ColumnInMetadata set_encoding(self, column_encoding encoding):
@@ -243,7 +262,7 @@ cdef class ColumnInMetadata:
         -------
         ColumnInMetadata
         """
-        self.c_obj.set_encoding(encoding)
+        dereference(self.c_obj).set_encoding(encoding)
         return self
 
     cpdef str get_name(self):
@@ -255,7 +274,7 @@ cdef class ColumnInMetadata:
         str
             The name of this column
         """
-        return self.c_obj.get_name().decode()
+        return dereference(self.c_obj).get_name().decode()
 
 
 cdef class TableInputMetadata:
@@ -269,6 +288,10 @@ cdef class TableInputMetadata:
     """
     def __init__(self, Table table):
         self.c_obj = table_input_metadata(table.view())
+        self.column_metadata = [
+            ColumnInMetadata.from_libcudf(&self.c_obj.column_metadata[i], self)
+            for i in range(self.c_obj.column_metadata.size())
+        ]
 
 
 cdef class TableWithMetadata:
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
index 5ed660ba6cf..2557e40c935 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
+
 import pyarrow as pa
 import pytest
 from utils import _convert_types, assert_table_and_meta_eq, make_source
@@ -52,3 +53,64 @@ def test_read_orc_basic(
     )
 
     assert_table_and_meta_eq(pa_table, res, check_field_nullability=False)
+
+
+@pytest.mark.parametrize(
+    "compression",
+    [
+        plc.io.types.CompressionType.NONE,
+        plc.io.types.CompressionType.SNAPPY,
+    ],
+)
+@pytest.mark.parametrize(
+    "statistics",
+    [
+        plc.io.types.StatisticsFreq.STATISTICS_NONE,
+        plc.io.types.StatisticsFreq.STATISTICS_COLUMN,
+    ],
+)
+@pytest.mark.parametrize("stripe_size_bytes", [None, 65536])
+@pytest.mark.parametrize("stripe_size_rows", [None, 512])
+@pytest.mark.parametrize("row_index_stride", [None, 512])
+def test_roundtrip_pa_table(
+    compression,
+    statistics,
+    stripe_size_bytes,
+    stripe_size_rows,
+    row_index_stride,
+    tmp_path,
+):
+    pa_table = pa.table({"a": [1.0, 2.0, None], "b": [True, None, False]})
+    plc_table = plc.interop.from_arrow(pa_table)
+
+    tmpfile_name = tmp_path / "test.orc"
+
+    sink = plc.io.SinkInfo([str(tmpfile_name)])
+
+    tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+    user_data = {"a": "", "b": ""}
+    options = (
+        plc.io.orc.OrcWriterOptions.builder(sink, plc_table)
+        .metadata(tbl_meta)
+        .key_value_metadata(user_data)
+        .compression(compression)
+        .enable_statistics(statistics)
+        .build()
+    )
+    if stripe_size_bytes is not None:
+        options.set_stripe_size_bytes(stripe_size_bytes)
+    if stripe_size_rows is not None:
+        options.set_stripe_size_rows(stripe_size_rows)
+    if row_index_stride is not None:
+        options.set_row_index_stride(row_index_stride)
+
+    plc.io.orc.write_orc(options)
+
+    read_table = pa.orc.read_table(str(tmpfile_name))
+
+    res = plc.io.types.TableWithMetadata(
+        plc.interop.from_arrow(read_table),
+        [(name, []) for name in pa_table.schema.names],
+    )
+
+    assert_table_and_meta_eq(pa_table, res, check_field_nullability=False)
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_types.py b/python/pylibcudf/pylibcudf/tests/io/test_types.py
new file mode 100644
index 00000000000..a7642556bf2
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/io/test_types.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import gc
+import weakref
+
+import pyarrow as pa
+
+import pylibcudf as plc
+
+
+def test_gc_with_table_and_column_input_metadata():
+    class Foo(plc.io.types.TableInputMetadata):
+        def __del__(self):
+            pass
+
+    pa_table = pa.table(
+        {"a": pa.array([1, 2, 3]), "b": pa.array(["a", "b", "c"])}
+    )
+    plc_table = plc.interop.from_arrow(pa_table)
+
+    tbl_meta = Foo(plc_table)
+    weak_tbl_meta = weakref.ref(tbl_meta)
+
+    del tbl_meta
+
+    gc.collect()
+
+    assert weak_tbl_meta() is None