From 618a93fc99ccb916177cb03429c69c8bbd5639b3 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 8 Oct 2024 17:24:45 -0400
Subject: [PATCH 01/24] Migrate nvtext jaccard API to pylibcudf (#17007)

Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/17007
---
 .../api_docs/pylibcudf/nvtext/index.rst       |  1 +
 .../api_docs/pylibcudf/nvtext/jaccard.rst     |  6 +++
 python/cudf/cudf/_lib/nvtext/jaccard.pyx      | 33 ++++---------
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |  2 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |  3 +-
 python/pylibcudf/pylibcudf/nvtext/__init__.py |  3 +-
 python/pylibcudf/pylibcudf/nvtext/jaccard.pxd |  7 +++
 python/pylibcudf/pylibcudf/nvtext/jaccard.pyx | 47 +++++++++++++++++++
 .../pylibcudf/tests/test_nvtext_jaccard.py    | 37 +++++++++++++++
 9 files changed, 111 insertions(+), 28 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index 2e03b589c8b..6300f77d686 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -6,3 +6,4 @@ nvtext
 
    edit_distance
    generate_ngrams
+   jaccard
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
new file mode 100644
index 00000000000..ea59657c25e
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
@@ -0,0 +1,6 @@
+=======
+jaccard
+=======
+
+.. automodule:: pylibcudf.nvtext.jaccard
+    :members:
diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
index 0ebf7c281e3..c964d0206b7 100644
--- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx
+++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
@@ -2,33 +2,16 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.jaccard cimport (
-    jaccard_index as cpp_jaccard_index,
-)
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
 
+from pylibcudf import nvtext
+
 
 @acquire_spill_lock()
 def jaccard_index(Column input1, Column input2, int width):
-    cdef column_view c_input1 = input1.view()
-    cdef column_view c_input2 = input2.view()
-    cdef size_type c_width = width
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_jaccard_index(
-                c_input1,
-                c_input2,
-                c_width
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.jaccard.jaccard_index(
+        input1.to_pylibcudf(mode="read"),
+        input2.to_pylibcudf(mode="read"),
+        width,
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index eb5617a1da6..9913e1fbadb 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources edit_distance.pyx generate_ngrams.pyx)
+set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx)
 set(linked_libraries cudf::cudf)
 
 rapids_cython_create_modules(
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index 7f5fa2b9925..5f1762b1e3d 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport edit_distance, generate_ngrams
+from . cimport edit_distance, generate_ngrams, jaccard
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
+    "jaccard",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index a66ce984745..1c0ddb1e5a4 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import edit_distance, generate_ngrams
+from . import edit_distance, generate_ngrams, jaccard
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
+    "jaccard",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
new file mode 100644
index 00000000000..a4d4a15335b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column jaccard_index(Column input1, Column input2, size_type width)
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
new file mode 100644
index 00000000000..9334d7ce751
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
@@ -0,0 +1,47 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.jaccard cimport (
+    jaccard_index as cpp_jaccard_index,
+)
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column jaccard_index(Column input1, Column input2, size_type width):
+    """
+    Returns the Jaccard similarity between individual rows in two strings columns.
+
+    For details, see :cpp:func:`jaccard_index`
+
+    Parameters
+    ----------
+    input1 : Column
+        Input strings column
+    input2 : Column
+        Input strings column
+    width : size_type
+        The ngram number to generate
+
+    Returns
+    -------
+    Column
+        Index calculation values
+    """
+    cdef column_view c_input1 = input1.view()
+    cdef column_view c_input2 = input2.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_jaccard_index(
+                c_input1,
+                c_input2,
+                width
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
new file mode 100644
index 00000000000..d5a168426b1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture(scope="module")
+def input_data():
+    input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"]
+    input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"]
+    return pa.array(input1), pa.array(input2)
+
+
+@pytest.mark.parametrize("width", [2, 3])
+def test_jaccard_index(input_data, width):
+    def get_tokens(s, width):
+        return [s[i : i + width] for i in range(len(s) - width + 1)]
+
+    def jaccard_index(s1, s2, width):
+        x = set(get_tokens(s1, width))
+        y = set(get_tokens(s2, width))
+        return len(x & y) / len(x | y)
+
+    input1, input2 = input_data
+    result = plc.nvtext.jaccard.jaccard_index(
+        plc.interop.from_arrow(input1), plc.interop.from_arrow(input2), width
+    )
+    expected = pa.array(
+        [
+            jaccard_index(s1.as_py(), s2.as_py(), width)
+            for s1, s2 in zip(input1, input2)
+        ],
+        type=pa.float32(),
+    )
+    assert_column_eq(result, expected)

From 349ba5d37789938a34c1ad75eb5eb57f1db85b2c Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Tue, 8 Oct 2024 17:06:10 -0500
Subject: [PATCH 02/24] make conda installs in CI stricter (#17013)

Contributes to https://github.com/rapidsai/build-planning/issues/106

Proposes specifying the RAPIDS version in `conda install` calls in CI that
install CI artifacts, to reduce the risk of CI jobs picking up artifacts
from other releases.
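A minimal sketch of the pinning pattern this applies (package name illustrative; `RAPIDS_VERSION_MAJOR_MINOR` comes from the `rapids-version-major-minor` helper already used in these scripts):

```shell
# Before: an unpinned install may resolve artifacts from another RAPIDS release
rapids-mamba-retry install --channel "${CPP_CHANNEL}" libcudf

# After: pin every CI-artifact install to the version under test
RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
rapids-mamba-retry install --channel "${CPP_CHANNEL}" \
  "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}"
```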
Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17013
---
 ci/build_docs.sh         | 8 +++++---
 ci/test_cpp_common.sh    | 7 ++++++-
 ci/test_java.sh          | 4 +++-
 ci/test_notebooks.sh     | 5 ++++-
 ci/test_python_common.sh | 5 ++++-
 ci/test_python_other.sh  | 6 +++++-
 6 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index c67d127e635..dae6ac46757 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -3,8 +3,7 @@
 
 set -euo pipefail
 
-export RAPIDS_VERSION="$(rapids-version)"
-export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
 export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR"
 
 rapids-logger "Create test conda environment"
@@ -29,7 +28,10 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  libcudf pylibcudf cudf dask-cudf
+  "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \
+  "pylibcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \
+  "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \
+  "dask-cudf=${RAPIDS_VERSION_MAJOR_MINOR}"
 
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh
index f5a8de543f6..e8f6e9388f4 100755
--- a/ci/test_cpp_common.sh
+++ b/ci/test_cpp_common.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+
 rapids-logger "Generate C++ testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
@@ -31,7 +33,10 @@ rapids-print-env
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
-  libcudf libcudf_kafka libcudf-tests libcudf-example
+  "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \
+  "libcudf_kafka=${RAPIDS_VERSION_MAJOR_MINOR}" \
+  "libcudf-tests=${RAPIDS_VERSION_MAJOR_MINOR}" \
+  "libcudf-example=${RAPIDS_VERSION_MAJOR_MINOR}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi
diff --git a/ci/test_java.sh b/ci/test_java.sh
index 629ad11014a..9b7b2e48dd6 100755
--- a/ci/test_java.sh
+++ b/ci/test_java.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+
 rapids-logger "Generate Java testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
@@ -30,7 +32,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
-  libcudf
+  "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi
diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh
index da9478ce25d..3e0712a0691 100755
--- a/ci/test_notebooks.sh
+++ b/ci/test_notebooks.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+
 rapids-logger "Generate notebook testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
@@ -30,7 +32,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  cudf libcudf
+  "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \
+  "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}"
 
 NBTEST="$(realpath "$(dirname "$0")/utils/nbtest.sh")"
 pushd notebooks
diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh
index dc70661a17a..81e82908eb4 100755
--- a/ci/test_python_common.sh
+++ b/ci/test_python_common.sh
@@ -7,6 +7,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+
 rapids-logger "Generate Python testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
@@ -38,4 +40,5 @@ rapids-print-env
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  cudf libcudf
+  "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \
+  "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}"
diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh
index 67c97ad29a5..eee1d54083f 100755
--- a/ci/test_python_other.sh
+++ b/ci/test_python_other.sh
@@ -7,10 +7,14 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../
 # Common setup steps shared by Python test jobs
 source ./ci/test_python_common.sh test_python_other
 
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  dask-cudf cudf_kafka custreamz
+  "dask-cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \
+  "cudf_kafka=${RAPIDS_VERSION_MAJOR_MINOR}" \
+  "custreamz=${RAPIDS_VERSION_MAJOR_MINOR}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi

From 5b931aca22a06734332963577a91e6af90bb6a68 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 8 Oct 2024 14:35:13 -1000
Subject: [PATCH 03/24] Add string.convert.convert_urls APIs to pylibcudf
 (#17003)

Contributes to https://github.com/rapidsai/cudf/issues/15162

I also believe the C++ docstrings were incorrect (the encode and decode
descriptions were swapped), but that could use a second look.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - https://github.com/brandon-b-miller
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17003
---
 .../cudf/strings/convert/convert_urls.hpp     |  4 +-
 .../_lib/strings/convert/convert_urls.pyx     | 36 +++--------
 .../libcudf/strings/convert/convert_urls.pxd  |  4 +-
 .../pylibcudf/strings/convert/CMakeLists.txt  |  2 +-
 .../pylibcudf/strings/convert/__init__.pxd    |  1 +
 .../pylibcudf/strings/convert/__init__.py     |  1 +
 .../strings/convert/convert_urls.pxd          |  8 +++
 .../strings/convert/convert_urls.pyx          | 63 +++++++++++++++++++
 .../tests/test_string_convert_urls.py         | 36 +++++++++++
 9 files changed, 121 insertions(+), 34 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd
 create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py

diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp
index d6e87f9d543..febc63d8779 100644
--- a/cpp/include/cudf/strings/convert/convert_urls.hpp
+++ b/cpp/include/cudf/strings/convert/convert_urls.hpp
@@ -28,7 +28,7 @@ namespace strings {
  */
 
 /**
- * @brief Decodes each string using URL encoding.
+ * @brief Encodes each string using URL encoding.
  *
  * Converts mostly non-ascii characters and control characters into UTF-8 hex code-points
  * prefixed with '%'. For example, the space character must be converted to characters '%20' where
@@ -49,7 +49,7 @@ std::unique_ptr<column> url_encode(
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
- * @brief Encodes each string using URL encoding.
+ * @brief Decodes each string using URL encoding.
  *
  * Converts all character sequences starting with '%' into character code-points
  * interpreting the 2 following characters as hex values to create the code-point.
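A usage sketch of the new pylibcudf API added below (values mirror the test file in this patch; the round trip back to the input is expected for strings encoded with no safe characters):

```python
import pyarrow as pa
import pylibcudf as plc

arr = plc.interop.from_arrow(pa.array(["/home/nfs"]))
encoded = plc.strings.convert.convert_urls.url_encode(arr)      # -> "%2Fhome%2Fnfs"
decoded = plc.strings.convert.convert_urls.url_decode(encoded)  # -> "/home/nfs"
```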
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
index e52116d6247..d5c2f771970 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
@@ -1,17 +1,9 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
+import pylibcudf as plc
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.convert.convert_urls cimport (
-    url_decode as cpp_url_decode,
-    url_encode as cpp_url_encode,
-)
-
 from cudf._lib.column cimport Column
 
 
@@ -28,17 +20,10 @@ def url_decode(Column source_strings):
     -------
     URL decoded string column
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_url_decode(
-            source_view
-        ))
-
-    return Column.from_unique_ptr(
-        move(c_result)
+    plc_column = plc.strings.convert.convert_urls.url_decode(
+        source_strings.to_pylibcudf(mode="read")
     )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -57,14 +42,7 @@ def url_encode(Column source_strings):
     -------
     URL encoded string column
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_url_encode(
-            source_view
-        ))
-
-    return Column.from_unique_ptr(
-        move(c_result)
+    plc_column = plc.strings.convert.convert_urls.url_encode(
+        source_strings.to_pylibcudf(mode="read")
     )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd
index 5c07b698454..cb319ad143b 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd
@@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view
 cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \
         "cudf::strings" nogil:
     cdef unique_ptr[column] url_encode(
-        column_view input_col) except +
+        column_view input) except +
 
     cdef unique_ptr[column] url_decode(
-        column_view input_col) except +
+        column_view input) except +
diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
index eb0d6ee6999..41aeb72039b 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx
-    convert_fixed_point.pyx convert_ipv4.pyx
+    convert_fixed_point.pyx convert_ipv4.pyx convert_urls.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
index 431beed8e5d..b4b0b521e39 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
@@ -5,4 +5,5 @@ from . cimport (
     convert_durations,
     convert_fixed_point,
     convert_ipv4,
+    convert_urls,
 )
diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py
index a601b562c2e..409620fce45 100644
--- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py
@@ -5,4 +5,5 @@ from . import (
     convert_durations,
     convert_fixed_point,
     convert_ipv4,
+    convert_urls,
 )
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd
new file mode 100644
index 00000000000..da05ce93426
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+
+
+cpdef Column url_encode(Column Input)
+
+cpdef Column url_decode(Column Input)
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx
new file mode 100644
index 00000000000..a5e080e53b7
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx
@@ -0,0 +1,63 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls
+
+
+cpdef Column url_encode(Column input):
+    """
+    Encodes each string using URL encoding.
+
+    For details, see :cpp:func:`cudf::strings::url_encode`
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation.
+
+    Returns
+    -------
+    Column
+        New strings column.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_urls.url_encode(
+                input.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column url_decode(Column input):
+    """
+    Decodes each string using URL encoding.
+
+    For details, see :cpp:func:`cudf::strings::url_decode`
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation.
+
+    Returns
+    -------
+    Column
+        New strings column.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_convert_urls.url_decode(
+                input.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
new file mode 100644
index 00000000000..fee8c3fb8f6
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import urllib
+
+import pyarrow as pa
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_url_encode():
+    data = ["/home/nfs", None]
+    arr = pa.array(data)
+    result = plc.strings.convert.convert_urls.url_encode(
+        plc.interop.from_arrow(arr)
+    )
+    expected = pa.array(
+        [
+            urllib.parse.quote(url, safe="") if isinstance(url, str) else url
+            for url in data
+        ]
+    )
+    assert_column_eq(result, expected)
+
+
+def test_url_decode():
+    data = ["%2Fhome%2fnfs", None]
+    arr = pa.array(data)
+    result = plc.strings.convert.convert_urls.url_decode(
+        plc.interop.from_arrow(arr)
+    )
+    expected = pa.array(
+        [
+            urllib.parse.unquote(url) if isinstance(url, str) else url
+            for url in data
+        ]
+    )
+    assert_column_eq(result, expected)

From ded4dd2acbf2c5933765853eab56f4d37599c909 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Tue, 8 Oct 2024 18:02:14 -0700
Subject: [PATCH 04/24] Add pinning for pyarrow in wheels (#17018)

We have recently observed a number of seg faults in our Python tests. From
some investigation, the error comes from the import of pyarrow loading the
bundled libarrow.so, and in particular when that library runs a jemalloc
function `background_thread_entry`. We have observed similar (but not
identical) errors in the past that have to do with as-yet unsolved problems
in the way that arrow handles multi-threaded environments.

The error is currently only observed on arm runners and with pyarrow 17.0.0.
In my tests the error is highly sensitive to everything from import order to
unrelated code segments, suggesting a race condition, some form of memory
corruption, or perhaps symbol resolution errors at runtime. As a result, I
have had limited success in drilling down further into specific causes,
especially since attempts to rebuild libarrow.so also squash the error and I
therefore cannot use debug symbols.

From some offline discussion we decided that avoiding the problematic version
is a sufficient fix for now. Due to the sensitivity, I am simply skipping
17.0.0 in this PR. I suspect that future builds of pyarrow will also usually
not exhibit this bug (although it may recur occasionally on specific versions
of pyarrow). Therefore, rather than lowering the upper bound I would prefer
to allow us to float and see if and when this problem reappears.

Since our DFG+RBB combination for wheel builds does not yet support any
matrix entry other than `cuda`, I'm using environment markers to specify the
constraint rather than a matrix entry in dependencies.yaml.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17018
---
 dependencies.yaml                                 | 11 ++++++++++-
 python/cudf/pyproject.toml                        |  3 ++-
 python/cudf_polars/tests/expressions/test_agg.py |  2 +-
 python/pylibcudf/pyproject.toml                   |  3 ++-
 4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 3561b22965d..ca17917c905 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -421,9 +421,18 @@ dependencies:
       - cython>=3.0.3
   pyarrow_run:
     common:
-      - output_types: [conda, requirements, pyproject]
+      - output_types: [conda]
         packages:
          - pyarrow>=14.0.0,<18.0.0a0
+      - output_types: [requirements, pyproject]
+        packages:
+          # pyarrow 17.0.0 wheels have a subtle issue around threading that
+          # can cause segmentation faults around imports on arm. It appears to
+          # be highly dependent on the exact build configuration, so we'll just
+          # avoid 17.0.0 for now unless we observe similar issues in future
+          # releases as well.
+          - pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'
+          - pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'
 cuda_version:
   specific:
     - output_types: conda
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index c0776fd0de6..feab04ffadc 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -30,7 +30,8 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.4dev0",
     "ptxcompiler",
-    "pyarrow>=14.0.0,<18.0.0a0",
+    "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'",
+    "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'",
     "pylibcudf==24.12.*,>=0.0.0a0",
     "rich",
     "rmm==24.12.*,>=0.0.0a0",
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index 56055f4c6c2..3001a61101a 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -93,7 +93,7 @@ def test_bool_agg(agg, request):
     expr = getattr(pl.col("a"), agg)()
     q = df.select(expr)
 
-    assert_gpu_result_equal(q)
+    assert_gpu_result_equal(q, check_exact=False)
 
 
 @pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs)
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index be65142850f..c9a685de3e9 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -22,7 +22,8 @@ dependencies = [
     "libcudf==24.12.*,>=0.0.0a0",
     "nvtx>=0.2.1",
     "packaging",
-    "pyarrow>=14.0.0,<18.0.0a0",
+    "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'",
+    "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'",
     "rmm==24.12.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

From a6853f4b3832b5338a4d0cd9d0b93c7bcd1ce884 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Tue, 8 Oct 2024 23:03:03 -0500
Subject: [PATCH 05/24] Refactor `histogram` reduction using
 `cuco::static_set::insert_and_find` (#16485)

Refactors `histogram` reduce and groupby aggregations using
`cuco::static_set::insert_and_find`.

Speed improvement results
[here](https://github.com/rapidsai/cudf/pull/16485#issuecomment-2394855796)
and [here](https://github.com/rapidsai/cudf/pull/16485#issuecomment-2394865692).
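The core device-side pattern is a single `insert_and_find` followed by an atomic add (a simplified sketch of the lambda in the `histogram.cu` diff below; `set_ref` is the `cuco::static_set` device reference built with the `insert_and_find` operator, and `counts` is the zero-initialized results array):

```cpp
// insert_and_find returns an iterator to the first-inserted (canonical) equal
// row, so every duplicate row accumulates into the same output slot.
auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> count_ref{
  counts[*inserted_idx_ptr]};
count_ref.fetch_add(histogram_count_type{1}, cuda::std::memory_order_relaxed);
```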
Authors:
  - Srinivas Yadav (https://github.com/srinivasyadav18)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16485
---
 cpp/benchmarks/CMakeLists.txt              |  10 +-
 cpp/benchmarks/groupby/group_histogram.cpp |  90 ++++++++++
 cpp/benchmarks/reduction/histogram.cpp     |  68 +++++++
 .../cudf/detail/hash_reduce_by_row.cuh     | 169 ------------------
 cpp/src/reductions/histogram.cu            | 164 +++++++----------
 5 files changed, 231 insertions(+), 270 deletions(-)
 create mode 100644 cpp/benchmarks/groupby/group_histogram.cpp
 create mode 100644 cpp/benchmarks/reduction/histogram.cpp
 delete mode 100644 cpp/include/cudf/detail/hash_reduce_by_row.cuh

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index b8a53cd8bd9..b0f75b25975 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -245,6 +245,7 @@ ConfigureNVBench(
   REDUCTION_NVBENCH
   reduction/anyall.cpp
   reduction/dictionary.cpp
+  reduction/histogram.cpp
   reduction/minmax.cpp
   reduction/rank.cpp
   reduction/reduce.cpp
@@ -270,8 +271,13 @@ ConfigureBench(
 )
 
 ConfigureNVBench(
-  GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp
-  groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp
+  GROUPBY_NVBENCH
+  groupby/group_histogram.cpp
+  groupby/group_max.cpp
+  groupby/group_max_multithreaded.cpp
+  groupby/group_nunique.cpp
+  groupby/group_rank.cpp
+  groupby/group_struct_keys.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp
new file mode 100644
index 00000000000..cd7f9f298af
--- /dev/null
+++ b/cpp/benchmarks/groupby/group_histogram.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/groupby.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void groupby_histogram_helper(nvbench::state& state,
+                              cudf::size_type num_rows,
+                              cudf::size_type cardinality,
+                              double null_probability)
+{
+  auto const keys = [&] {
+    data_profile const profile =
+      data_profile_builder()
+        .cardinality(cardinality)
+        .no_validity()
+        .distribution(cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, num_rows);
+    return create_random_column(cudf::type_to_id<int64_t>(), row_count{num_rows}, profile);
+  }();
+
+  auto const values = [&] {
+    auto builder = data_profile_builder().cardinality(0).distribution(
+      cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows);
+    if (null_probability > 0) {
+      builder.null_probability(null_probability);
+    } else {
+      builder.no_validity();
+    }
+    return create_random_column(
+      cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
+  }();
+
+  // Vector of 1 request
+  std::vector<cudf::groupby::aggregation_request> requests(1);
+  requests.back().values = values->view();
+  requests.back().aggregations.push_back(
+    cudf::make_histogram_aggregation<cudf::groupby_aggregation>());
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto gb_obj      = cudf::groupby::groupby(cudf::table_view({keys->view()}));
+    auto const result = gb_obj.aggregate(requests);
+  });
+
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time, "rows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+template <typename Type>
+void bench_groupby_histogram(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const cardinality      = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const num_rows         = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const null_probability = state.get_float64("null_probability");
+
+  if (cardinality > num_rows) {
+    state.skip("cardinality > num_rows");
+    return;
+  }
+
+  groupby_histogram_helper<Type>(state, num_rows, cardinality, null_probability);
+}
+
+NVBENCH_BENCH_TYPES(bench_groupby_histogram,
+                    NVBENCH_TYPE_AXES(nvbench::type_list))
+  .set_name("groupby_histogram")
+  .add_float64_axis("null_probability", {0, 0.1, 0.9})
+  .add_int64_axis("cardinality", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000})
+  .add_int64_axis("num_rows", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000});
diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp
new file mode 100644
index 00000000000..d0925de5c87
--- /dev/null
+++ b/cpp/benchmarks/reduction/histogram.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "cudf/aggregation.hpp" +#include "cudf/detail/aggregation/aggregation.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +template +static void nvbench_reduction_histogram(nvbench::state& state, nvbench::type_list) +{ + auto const dtype = cudf::type_to_id(); + + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } + + data_profile const profile = data_profile_builder() + .null_probability(null_probability) + .cardinality(cardinality) + .distribution(dtype, distribution_id::UNIFORM, 0, num_rows); + + auto const input = create_random_column(dtype, row_count{num_rows}, profile); + auto agg = cudf::make_histogram_aggregation(); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::reduce(*input, *agg, input->type(), stream_view); + }); + + state.add_element_count(input->size()); +} + +using data_type = nvbench::type_list; + +NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type)) + .set_name("histogram") + .add_float64_axis("null_probability", {0.1}) + .add_int64_axis("cardinality", + {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000}) + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh deleted file mode 100644 index 7de79b31bc7..00000000000 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -namespace cudf::detail { - -using hash_map_type = cuco::legacy:: - static_map>; - -/** - * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are - * rows that compared equal. - * - * TODO: We need to switch to use `static_reduction_map` when it is ready - * (https://github.com/NVIDIA/cuCollections/pull/98). - */ -template -struct reduce_by_row_fn_base { - protected: - MapView const d_map; - KeyHasher const d_hasher; - KeyEqual const d_equal; - OutputType* const d_output; - - reduce_by_row_fn_base(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - OutputType* const d_output) - : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output} - { - } - - /** - * @brief Return a pointer to the output array at the given index. 
-   *
-   * @param idx The access index
-   * @return A pointer to the given index in the output array
-   */
-  __device__ OutputType* get_output_ptr(size_type const idx) const
-  {
-    auto const iter = d_map.find(idx, d_hasher, d_equal);
-
-    if (iter != d_map.end()) {
-      // Only one (undetermined) index value of the duplicate rows could be inserted into the map.
-      // As such, looking up for all indices of duplicate rows always returns the same value.
-      auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed);
-
-      // All duplicate rows will have concurrent access to this same output slot.
-      return &d_output[inserted_idx];
-    } else {
-      // All input `idx` values have been inserted into the map before.
-      // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if
-      // `d_equal(idx, idx) == false`.
-      // Such situations are due to comparing nulls or NaNs which are considered as always unequal.
-      // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct
-      // output slot.
-      return &d_output[idx];
-    }
-  }
-};
-
-/**
- * @brief Perform a reduction on groups of rows that are compared equal.
- *
- * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared
- * equal. A hash table is used to find groups of equal rows.
- *
- * At the beginning of the operation, the entire output array is filled with a value given by
- * the `init` parameter. Then, the reduction result for each row group is written into the output
- * array at the index of an unspecified row in the group.
- *
- * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a
- *         reduction functor derived from `reduce_by_row_fn_base`
- * @tparam OutputType Type of the reduction results
- * @param map The auxiliary map to perform reduction
- * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
- *        comparisons
- * @param num_rows The number of all input rows
- * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
- * @param has_nested_columns Indicates whether the input table has any nested columns
- * @param nulls_equal Flag to specify whether null elements should be considered as equal
- * @param nans_equal Flag to specify whether NaN values in floating point column should be
- *        considered equal.
- * @param init The initial value for reduction of each row group
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned vector
- * @return A device_uvector containing the reduction results
- */
-template <typename ReduceFuncBuilder, typename OutputType>
-rmm::device_uvector<OutputType> hash_reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  ReduceFuncBuilder func_builder,
-  OutputType init,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  auto const map_dview  = map.get_device_view();
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = row_hasher.device_hasher(has_nulls);
-  auto const row_comp   = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto reduction_results = rmm::device_uvector<OutputType>(num_rows, stream, mr);
-  thrust::uninitialized_fill(
-    rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init);
-
-  auto const reduce_by_row = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
-    } else {
-      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
-    }
-  };
-
-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    reduce_by_row(nan_equal_comparator{});
-  } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    reduce_by_row(nan_unequal_comparator{});
-  }
-
-  return reduction_results;
-}
-
-}  // namespace cudf::detail
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 362b5f74c46..b40b2b6dd2e 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -15,18 +15,24 @@
  */
 
 #include
+#include
 #include
-#include
 #include
 #include
 #include
+#include
 #include
+#include
+
+#include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 
 #include
 
@@ -34,61 +40,12 @@ namespace cudf::reduction::detail {
 namespace {
 
+// A CUDA Cooperative Group of 1 thread for the hash set for histogram
+auto constexpr DEFAULT_HISTOGRAM_CG_SIZE = 1;
+
 // Always use 64-bit signed integer for storing count.
 using histogram_count_type = int64_t;
 
-/**
- * @brief The functor to accumulate the frequency of each distinct rows in the input table.
- */
-template <typename MapView, typename KeyHasher, typename KeyEqual, typename CountType>
-struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, CountType> {
-  CountType const* d_partial_output;
-
-  reduce_fn(MapView const& d_map,
-            KeyHasher const& d_hasher,
-            KeyEqual const& d_equal,
-            CountType* const d_output,
-            CountType const* const d_partial_output)
-    : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, CountType>{d_map,
-                                                                                   d_hasher,
-                                                                                   d_equal,
-                                                                                   d_output},
-      d_partial_output{d_partial_output}
-  {
-  }
-
-  // Count the number of rows in each group of rows that are compared equal.
-  __device__ void operator()(size_type const idx) const
-  {
-    auto const increment = d_partial_output ? d_partial_output[idx] : CountType{1};
-    auto const count =
-      cuda::atomic_ref<CountType, cuda::thread_scope_device>(*this->get_output_ptr(idx));
-    count.fetch_add(increment, cuda::std::memory_order_relaxed);
-  }
-};
-
-/**
- * @brief The builder to construct an instance of `reduce_fn` functor.
- */
-template <typename CountType>
-struct reduce_func_builder {
-  CountType const* const d_partial_output;
-
-  reduce_func_builder(CountType const* const d_partial_output) : d_partial_output{d_partial_output}
-  {
-  }
-
-  template <typename MapView, typename KeyHasher, typename KeyEqual>
-  auto build(MapView const& d_map,
-             KeyHasher const& d_hasher,
-             KeyEqual const& d_equal,
-             CountType* const d_output)
-  {
-    return reduce_fn<MapView, KeyHasher, KeyEqual, CountType>{
-      d_map, d_hasher, d_equal, d_output, d_partial_output};
-  }
-};
-
 /**
  * @brief Specialized functor to check for not-zero of the second component of the input.
  */
@@ -163,14 +120,6 @@ compute_row_frequencies(table_view const& input,
                "Nested types are not yet supported in histogram aggregation.",
                std::invalid_argument);
 
-  auto map = cudf::detail::hash_map_type{
-    compute_hash_table_size(input.num_rows()),
-    cuco::empty_key{-1},
-    cuco::empty_value{std::numeric_limits<size_type>::min()},
-
-    cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream},
-    stream.value()};
-
   auto const preprocessed_input =
     cudf::experimental::row::hash::preprocessed_table::create(input, stream);
   auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
@@ -179,51 +128,68 @@ compute_row_frequencies(table_view const& input,
   auto const key_hasher = row_hasher.device_hasher(has_nulls);
   auto const row_comp   = cudf::experimental::row::equality::self_comparator(preprocessed_input);
 
-  auto const pair_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0},
-    cuda::proclaim_return_type<cuco::pair<size_type, size_type>>(
-      [] __device__(size_type const i) { return cuco::make_pair(i, i); }));
-
   // Always compare NaNs as equal.
   using nan_equal_comparator =
    cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
   auto const value_comp = nan_equal_comparator{};
+  // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns
+  auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
+
+  using row_hash =
+    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
+                                                     cudf::nullate::DYNAMIC>;
+
+  size_t const num_rows = input.num_rows();
+
+  // Construct a vector to store reduced counts and init to zero
+  rmm::device_uvector<histogram_count_type> reduction_results(num_rows, stream, mr);
+  thrust::uninitialized_fill(rmm::exec_policy_nosync(stream),
+                             reduction_results.begin(),
+                             reduction_results.end(),
+                             histogram_count_type{0});
+
+  // Construct a hash set
+  auto row_set = cuco::static_set{
+    cuco::extent{num_rows},
+    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,
+    cuco::empty_key{-1},
+    key_equal,
+    cuco::linear_probing<DEFAULT_HISTOGRAM_CG_SIZE, row_hash>{key_hasher},
+    {},  // thread scope
+    {},  // storage
+    cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream},
+    stream.value()};
 
-  if (has_nested_columns) {
-    auto const key_equal = row_comp.equal_to<true>(has_nulls, null_equality::EQUAL, value_comp);
-    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-  } else {
-    auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
-    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-  }
-
-  // Gather the indices of distinct rows.
-  auto distinct_indices = std::make_unique<rmm::device_uvector<size_type>>(
-    static_cast<size_type>(map.get_size()), stream, mr);
-
-  // Store the number of occurrences of each distinct row.
-  auto distinct_counts = make_numeric_column(data_type{type_to_id<histogram_count_type>()},
-                                             static_cast<size_type>(map.get_size()),
-                                             mask_state::UNALLOCATED,
-                                             stream,
-                                             mr);
+  // Device-accessible reference to the hash set with `insert_and_find` operator
+  auto row_set_ref = row_set.ref(cuco::op::insert_and_find);
 
   // Compute frequencies (aka distinct counts) for the input rows.
   // Note that we consider null and NaNs as always equal.
-  auto const reduction_results = cudf::detail::hash_reduce_by_row(
-    map,
-    preprocessed_input,
-    input.num_rows(),
-    has_nulls,
-    has_nested_columns,
-    null_equality::EQUAL,
-    nan_equality::ALL_EQUAL,
-    reduce_func_builder<histogram_count_type>{
-      partial_counts ? partial_counts.value().begin() : nullptr},
-    histogram_count_type{0},
-    stream,
-    cudf::get_current_device_resource_ref());
-
+  thrust::for_each(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(num_rows),
+    [set_ref = row_set_ref,
+     increments =
+       partial_counts.has_value() ? partial_counts.value().begin() : nullptr,
+     counts = reduction_results.begin()] __device__(auto const idx) mutable {
+      auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
+      cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> count_ref{
+        counts[*inserted_idx_ptr]};
+      auto const increment = increments ? increments[idx] : histogram_count_type{1};
+      count_ref.fetch_add(increment, cuda::std::memory_order_relaxed);
+    });
+
+  // Set-size is the number of distinct (inserted) rows
+  auto const set_size = row_set.size(stream);
+
+  // Vector of distinct indices
+  auto distinct_indices = std::make_unique<rmm::device_uvector<size_type>>(set_size, stream, mr);
+  // Column of distinct counts
+  auto distinct_counts = make_numeric_column(
+    data_type{type_to_id<histogram_count_type>()}, set_size, mask_state::UNALLOCATED, stream, mr);
+
+  // Copy row indices and counts to the output if counts are non-zero
   auto const input_it = thrust::make_zip_iterator(
    thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
   auto const output_it = thrust::make_zip_iterator(thrust::make_tuple(
    distinct_indices->begin(), distinct_counts->mutable_view().begin<histogram_count_type>()));
 
   // Reduction results above are either group sizes of equal rows, or `0`.
   // The final output is non-zero group sizes only.
   thrust::copy_if(
-    rmm::exec_policy(stream), input_it, input_it + input.num_rows(), output_it, is_not_zero{});
+    rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{});
 
   return {std::move(distinct_indices), std::move(distinct_counts)};
 }

From bfac5e5d9b2c10718d2f0f925b4f2c9f62d8fea1 Mon Sep 17 00:00:00 2001
From: Peixin
Date: Wed, 9 Oct 2024 13:56:10 +0800
Subject: [PATCH 06/24] Disable kvikio remote I/O to avoid openssl dependencies
 in JNI build (#17026)

This is the same issue as https://github.com/NVIDIA/spark-rapids-jni/issues/2475,
caused by https://github.com/rapidsai/kvikio/pull/464.

Ports the fix from https://github.com/NVIDIA/spark-rapids-jni/pull/2476; verified locally.

Authors:
  - Peixin (https://github.com/pxLi)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17026
---
 java/ci/build-in-docker.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh
index 5a429bdc739..4b5379cf0f1 100755
--- a/java/ci/build-in-docker.sh
+++ b/java/ci/build-in-docker.sh
@@ -64,7 +64,8 @@ cmake .. -G"${CMAKE_GENERATOR}" \
-G"${CMAKE_GENERATOR}" \ -DBUILD_TESTS=$BUILD_CPP_TESTS \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS \ -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \ - -DBUILD_SHARED_LIBS=OFF + -DBUILD_SHARED_LIBS=OFF \ + -DKvikIO_REMOTE_SUPPORT=OFF if [[ -z "${PARALLEL_LEVEL}" ]]; then cmake --build . From 319a53327ac7c921a78979a1f23c5caf7171129d Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 9 Oct 2024 09:38:30 -0400 Subject: [PATCH 07/24] Update Changelog [skip ci] --- CHANGELOG.md | 296 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 296 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2a7c337675..7a75b2a95a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,299 @@ +# cudf 24.10.00 (9 Oct 2024) + +## 🚨 Breaking Changes + +- Whitespace normalization of nested column coerced as string column in JSONL inputs ([#16759](https://github.com/rapidsai/cudf/pull/16759)) [@shrshi](https://github.com/shrshi) +- Add libcudf wrappers around current_device_resource functions. ([#16679](https://github.com/rapidsai/cudf/pull/16679)) [@harrism](https://github.com/harrism) +- Fix empty cluster handling in tdigest merge ([#16675](https://github.com/rapidsai/cudf/pull/16675)) [@jihoonson](https://github.com/jihoonson) +- Remove java ColumnView.copyWithBooleanColumnAsValidity ([#16660](https://github.com/rapidsai/cudf/pull/16660)) [@revans2](https://github.com/revans2) +- Support reading multiple PQ sources with mismatching nullability for columns ([#16639](https://github.com/rapidsai/cudf/pull/16639)) [@mhaseeb123](https://github.com/mhaseeb123) +- Remove arrow_io_source ([#16607](https://github.com/rapidsai/cudf/pull/16607)) [@vyasr](https://github.com/vyasr) +- Remove legacy Arrow interop APIs ([#16590](https://github.com/rapidsai/cudf/pull/16590)) [@vyasr](https://github.com/vyasr) +- Remove NativeFile support from cudf Python ([#16589](https://github.com/rapidsai/cudf/pull/16589)) [@vyasr](https://github.com/vyasr) +- Revert "Make proxy NumPy arrays pass isinstance check in `cudf.pandas`" ([#16586](https://github.com/rapidsai/cudf/pull/16586)) [@Matt711](https://github.com/Matt711) +- Align public utility function signatures with pandas 2.x ([#16565](https://github.com/rapidsai/cudf/pull/16565)) [@mroeschke](https://github.com/mroeschke) +- Disallow cudf.Index accepting column in favor of ._from_column ([#16549](https://github.com/rapidsai/cudf/pull/16549)) [@mroeschke](https://github.com/mroeschke) +- Refactor dictionary encoding in PQ writer to migrate to the new `cuco::static_map` ([#16541](https://github.com/rapidsai/cudf/pull/16541)) [@mhaseeb123](https://github.com/mhaseeb123) +- Change IPv4 convert APIs to support UINT32 instead of INT64 ([#16489](https://github.com/rapidsai/cudf/pull/16489)) [@davidwendt](https://github.com/davidwendt) +- enable list to be forced as string in JSON reader. 
+- Disallow cudf.Series to accept column in favor of `._from_column` ([#16454](https://github.com/rapidsai/cudf/pull/16454)) [@mroeschke](https://github.com/mroeschke)
+- Align groupby APIs with pandas 2.x ([#16403](https://github.com/rapidsai/cudf/pull/16403)) [@mroeschke](https://github.com/mroeschke)
+- Align misc DataFrame and MultiIndex methods with pandas 2.x ([#16402](https://github.com/rapidsai/cudf/pull/16402)) [@mroeschke](https://github.com/mroeschke)
+- Align Index APIs with pandas 2.x ([#16361](https://github.com/rapidsai/cudf/pull/16361)) [@mroeschke](https://github.com/mroeschke)
+- Add `stream` param to stream compaction APIs ([#16295](https://github.com/rapidsai/cudf/pull/16295)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+
+## 🐛 Bug Fixes
+
+- Add license to the pylibcudf wheel ([#16976](https://github.com/rapidsai/cudf/pull/16976)) [@raydouglass](https://github.com/raydouglass)
+- Parse newline as whitespace character while tokenizing JSONL inputs with non-newline delimiter ([#16950](https://github.com/rapidsai/cudf/pull/16950)) [@shrshi](https://github.com/shrshi)
+- Add dask-cudf workaround for missing `rename_axis` support in cudf ([#16899](https://github.com/rapidsai/cudf/pull/16899)) [@rjzamora](https://github.com/rjzamora)
+- Update oldest deps for `pyarrow` & `numpy` ([#16883](https://github.com/rapidsai/cudf/pull/16883)) [@galipremsagar](https://github.com/galipremsagar)
+- Update labeler for pylibcudf ([#16868](https://github.com/rapidsai/cudf/pull/16868)) [@vyasr](https://github.com/vyasr)
+- Revert "Refactor mixed_semi_join using cuco::static_set" ([#16855](https://github.com/rapidsai/cudf/pull/16855)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Fix metadata after implicit array conversion from Dask cuDF ([#16842](https://github.com/rapidsai/cudf/pull/16842)) [@rjzamora](https://github.com/rjzamora)
+- Add cudf.pandas dependencies.yaml to update-version.sh ([#16840](https://github.com/rapidsai/cudf/pull/16840)) [@raydouglass](https://github.com/raydouglass)
+- Use cupy 12.2.0 as oldest dependency pinning on CUDA 12 ARM ([#16808](https://github.com/rapidsai/cudf/pull/16808)) [@bdice](https://github.com/bdice)
+- Revert "Fix empty cluster handling in tdigest merge (#16675)" ([#16800](https://github.com/rapidsai/cudf/pull/16800)) [@jihoonson](https://github.com/jihoonson)
+- Intentionally leak thread_local CUDA resources to avoid crash (part 1) ([#16787](https://github.com/rapidsai/cudf/pull/16787)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu)
+- Fix `cov`/`corr` bug in dask-cudf ([#16786](https://github.com/rapidsai/cudf/pull/16786)) [@rjzamora](https://github.com/rjzamora)
+- Fix slice_strings wide strings logic with multi-byte characters ([#16777](https://github.com/rapidsai/cudf/pull/16777)) [@davidwendt](https://github.com/davidwendt)
+- Fix nvbench output for sha512 ([#16773](https://github.com/rapidsai/cudf/pull/16773)) [@davidwendt](https://github.com/davidwendt)
+- Allow read_csv(header=None) to return int column labels in `mode.pandas_compatible` ([#16769](https://github.com/rapidsai/cudf/pull/16769)) [@mroeschke](https://github.com/mroeschke)
+- Whitespace normalization of nested column coerced as string column in JSONL inputs ([#16759](https://github.com/rapidsai/cudf/pull/16759)) [@shrshi](https://github.com/shrshi)
+- Fix DataFrame.drop(columns=cudf.Series/Index, axis=1) ([#16712](https://github.com/rapidsai/cudf/pull/16712)) [@mroeschke](https://github.com/mroeschke)
+- Use merge base when calculating changed files ([#16709](https://github.com/rapidsai/cudf/pull/16709)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Ensure we pass the has_nulls tparam to mixed_join kernels ([#16708](https://github.com/rapidsai/cudf/pull/16708)) [@abellina](https://github.com/abellina)
+- Add boost-devel to Java CI Docker image ([#16707](https://github.com/rapidsai/cudf/pull/16707)) [@jlowe](https://github.com/jlowe)
+- [BUG] Add gpu node type to cudf-pandas 3rd-party integration nightly CI job ([#16704](https://github.com/rapidsai/cudf/pull/16704)) [@Matt711](https://github.com/Matt711)
+- Fix typo in column_factories.hpp comment from 'depth 1' to 'depth 2' ([#16700](https://github.com/rapidsai/cudf/pull/16700)) [@a-hirota](https://github.com/a-hirota)
+- Fix Series.to_frame(name=None) setting a None name ([#16698](https://github.com/rapidsai/cudf/pull/16698)) [@mroeschke](https://github.com/mroeschke)
+- Disable gtests/ERROR_TEST during compute-sanitizer memcheck test ([#16691](https://github.com/rapidsai/cudf/pull/16691)) [@davidwendt](https://github.com/davidwendt)
+- Enable batched multi-source reading of JSONL files with large records ([#16687](https://github.com/rapidsai/cudf/pull/16687)) [@shrshi](https://github.com/shrshi)
+- Handle `ordered` parameter in `CategoricalIndex.__repr__` ([#16683](https://github.com/rapidsai/cudf/pull/16683)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix loc/iloc.__setitem__[:, loc] with non cupy types ([#16677](https://github.com/rapidsai/cudf/pull/16677)) [@mroeschke](https://github.com/mroeschke)
+- Fix empty cluster handling in tdigest merge ([#16675](https://github.com/rapidsai/cudf/pull/16675)) [@jihoonson](https://github.com/jihoonson)
+- Fix `cudf::rank` not getting enough params ([#16666](https://github.com/rapidsai/cudf/pull/16666)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Fix slowdown in `CategoricalIndex.__repr__` ([#16665](https://github.com/rapidsai/cudf/pull/16665)) [@galipremsagar](https://github.com/galipremsagar)
+- Remove java ColumnView.copyWithBooleanColumnAsValidity ([#16660](https://github.com/rapidsai/cudf/pull/16660)) [@revans2](https://github.com/revans2)
+- Fix slowdown in DataFrame repr in jupyter notebook ([#16656](https://github.com/rapidsai/cudf/pull/16656)) [@galipremsagar](https://github.com/galipremsagar)
+- Preserve Series name in duplicated method. ([#16655](https://github.com/rapidsai/cudf/pull/16655)) [@bdice](https://github.com/bdice)
+- Fix interval_range right child non-zero offset ([#16651](https://github.com/rapidsai/cudf/pull/16651)) [@mroeschke](https://github.com/mroeschke)
+- fix libcudf wheel publishing, make package-type explicit in wheel publishing ([#16650](https://github.com/rapidsai/cudf/pull/16650)) [@jameslamb](https://github.com/jameslamb)
+- Revert "Hide all gtest symbols in cudftestutil (#16546)" ([#16644](https://github.com/rapidsai/cudf/pull/16644)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix integer overflow in indexalator pointer logic ([#16643](https://github.com/rapidsai/cudf/pull/16643)) [@davidwendt](https://github.com/davidwendt)
+- Allow for binops between two differently sized DecimalDtypes ([#16638](https://github.com/rapidsai/cudf/pull/16638)) [@mroeschke](https://github.com/mroeschke)
+- Move pragma once in rolling/jit/operation.hpp. ([#16636](https://github.com/rapidsai/cudf/pull/16636)) [@bdice](https://github.com/bdice)
([#16636](https://github.com/rapidsai/cudf/pull/16636)) [@bdice](https://github.com/bdice) +- Fix overflow bug in low-memory JSON reader ([#16632](https://github.com/rapidsai/cudf/pull/16632)) [@shrshi](https://github.com/shrshi) +- Add the missing `num_aggregations` axis for `groupby_max_cardinality` ([#16630](https://github.com/rapidsai/cudf/pull/16630)) [@PointKernel](https://github.com/PointKernel) +- Fix strings::detail::copy_range when target contains nulls ([#16626](https://github.com/rapidsai/cudf/pull/16626)) [@davidwendt](https://github.com/davidwendt) +- Fix function parameters with common dependency modified during their evaluation ([#16620](https://github.com/rapidsai/cudf/pull/16620)) [@ttnghia](https://github.com/ttnghia) +- bug-fix: Don't enable the CUDA language if testing was requested when finding cudf ([#16615](https://github.com/rapidsai/cudf/pull/16615)) [@cryos](https://github.com/cryos) +- bug-fix: cudf/io/json.hpp use after move ([#16609](https://github.com/rapidsai/cudf/pull/16609)) [@NicolasDenoyelle](https://github.com/NicolasDenoyelle) +- Remove CUDA whole compilation ODR violations ([#16603](https://github.com/rapidsai/cudf/pull/16603)) [@robertmaynard](https://github.com/robertmaynard) +- MAINT: Adapt to numpy hiding flagsobject away ([#16593](https://github.com/rapidsai/cudf/pull/16593)) [@seberg](https://github.com/seberg) +- Revert "Make proxy NumPy arrays pass isinstance check in `cudf.pandas`" ([#16586](https://github.com/rapidsai/cudf/pull/16586)) [@Matt711](https://github.com/Matt711) +- Switch python version to `3.10` in `cudf.pandas` pandas test scripts ([#16559](https://github.com/rapidsai/cudf/pull/16559)) [@galipremsagar](https://github.com/galipremsagar) +- Hide all gtest symbols in cudftestutil ([#16546](https://github.com/rapidsai/cudf/pull/16546)) [@robertmaynard](https://github.com/robertmaynard) +- Update the java code to properly deal with lists being returned as strings ([#16536](https://github.com/rapidsai/cudf/pull/16536)) [@revans2](https://github.com/revans2) +- Register `read_parquet` and `read_csv` with dask-expr ([#16535](https://github.com/rapidsai/cudf/pull/16535)) [@rjzamora](https://github.com/rjzamora) +- Change cudf::empty_like to not include offsets for empty strings columns ([#16529](https://github.com/rapidsai/cudf/pull/16529)) [@davidwendt](https://github.com/davidwendt) +- Fix DataFrame reductions with median returning scalar instead of Series ([#16527](https://github.com/rapidsai/cudf/pull/16527)) [@mroeschke](https://github.com/mroeschke) +- Allow DataFrame.sort_values(by=) to select an index level ([#16519](https://github.com/rapidsai/cudf/pull/16519)) [@mroeschke](https://github.com/mroeschke) +- Fix `date_range(start, end, freq)` when end-start is divisible by freq ([#16516](https://github.com/rapidsai/cudf/pull/16516)) [@mroeschke](https://github.com/mroeschke) +- Preserve array name in MultiIndex.from_arrays ([#16515](https://github.com/rapidsai/cudf/pull/16515)) [@mroeschke](https://github.com/mroeschke) +- Disallow indexing by selecting duplicate labels ([#16514](https://github.com/rapidsai/cudf/pull/16514)) [@mroeschke](https://github.com/mroeschke) +- Fix `.replace(Index, Index)` raising a TypeError ([#16513](https://github.com/rapidsai/cudf/pull/16513)) [@mroeschke](https://github.com/mroeschke) +- Check index bounds in compact protocol reader. 
([#16493](https://github.com/rapidsai/cudf/pull/16493)) [@bdice](https://github.com/bdice) +- Fix build failures with GCC 13 ([#16488](https://github.com/rapidsai/cudf/pull/16488)) [@PointKernel](https://github.com/PointKernel) +- Fix all-empty input column for strings split APIs ([#16466](https://github.com/rapidsai/cudf/pull/16466)) [@davidwendt](https://github.com/davidwendt) +- Fix segmented-sort overlapped input/output indices ([#16463](https://github.com/rapidsai/cudf/pull/16463)) [@davidwendt](https://github.com/davidwendt) +- Fix merge conflict for auto merge 16447 ([#16449](https://github.com/rapidsai/cudf/pull/16449)) [@davidwendt](https://github.com/davidwendt) + +## 📖 Documentation + +- Fix links in Dask cuDF documentation ([#16929](https://github.com/rapidsai/cudf/pull/16929)) [@rjzamora](https://github.com/rjzamora) +- Improve aggregation documentation ([#16822](https://github.com/rapidsai/cudf/pull/16822)) [@PointKernel](https://github.com/PointKernel) +- Add best practices page to Dask cuDF docs ([#16821](https://github.com/rapidsai/cudf/pull/16821)) [@rjzamora](https://github.com/rjzamora) +- [DOC] Update Pylibcudf doc strings ([#16810](https://github.com/rapidsai/cudf/pull/16810)) [@Matt711](https://github.com/Matt711) +- Recommending `miniforge` for conda install ([#16782](https://github.com/rapidsai/cudf/pull/16782)) [@mmccarty](https://github.com/mmccarty) +- Add labeling pylibcudf doc pages ([#16779](https://github.com/rapidsai/cudf/pull/16779)) [@mroeschke](https://github.com/mroeschke) +- Migrate dask-cudf README improvements to dask-cudf sphinx docs ([#16765](https://github.com/rapidsai/cudf/pull/16765)) [@rjzamora](https://github.com/rjzamora) +- [DOC] Remove out of date section from cudf.pandas docs ([#16697](https://github.com/rapidsai/cudf/pull/16697)) [@Matt711](https://github.com/Matt711) +- Add performance tips to cudf.pandas FAQ. ([#16693](https://github.com/rapidsai/cudf/pull/16693)) [@bdice](https://github.com/bdice) +- Update documentation for Dask cuDF ([#16671](https://github.com/rapidsai/cudf/pull/16671)) [@rjzamora](https://github.com/rjzamora) +- Add missing pylibcudf strings docs ([#16471](https://github.com/rapidsai/cudf/pull/16471)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- DOC: Refresh pylibcudf guide ([#15856](https://github.com/rapidsai/cudf/pull/15856)) [@lithomas1](https://github.com/lithomas1) + +## 🚀 New Features + +- Build `cudf-polars` with `build.sh` ([#16898](https://github.com/rapidsai/cudf/pull/16898)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add polars to "all" dependency list. ([#16875](https://github.com/rapidsai/cudf/pull/16875)) [@bdice](https://github.com/bdice) +- nvCOMP GZIP integration ([#16770](https://github.com/rapidsai/cudf/pull/16770)) [@vuule](https://github.com/vuule) +- [FEA] Add support for `cudf.NamedAgg` ([#16744](https://github.com/rapidsai/cudf/pull/16744)) [@Matt711](https://github.com/Matt711) +- Add experimental `filesystem="arrow"` support in `dask_cudf.read_parquet` ([#16684](https://github.com/rapidsai/cudf/pull/16684)) [@rjzamora](https://github.com/rjzamora) +- Relax Arrow pin ([#16681](https://github.com/rapidsai/cudf/pull/16681)) [@vyasr](https://github.com/vyasr) +- Add libcudf wrappers around current_device_resource functions. 
([#16679](https://github.com/rapidsai/cudf/pull/16679)) [@harrism](https://github.com/harrism) +- Move NDS-H examples into benchmarks ([#16663](https://github.com/rapidsai/cudf/pull/16663)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- [FEA] Add third-party library integration testing of cudf.pandas to cudf ([#16645](https://github.com/rapidsai/cudf/pull/16645)) [@Matt711](https://github.com/Matt711) +- Make isinstance check pass for proxy ndarrays ([#16601](https://github.com/rapidsai/cudf/pull/16601)) [@Matt711](https://github.com/Matt711) +- [FEA] Add an environment variable to fail on fallback in `cudf.pandas` ([#16562](https://github.com/rapidsai/cudf/pull/16562)) [@Matt711](https://github.com/Matt711) +- [FEA] Add support for `cudf.unique` ([#16554](https://github.com/rapidsai/cudf/pull/16554)) [@Matt711](https://github.com/Matt711) +- [FEA] Support named aggregations in `df.groupby().agg()` ([#16528](https://github.com/rapidsai/cudf/pull/16528)) [@Matt711](https://github.com/Matt711) +- Change IPv4 convert APIs to support UINT32 instead of INT64 ([#16489](https://github.com/rapidsai/cudf/pull/16489)) [@davidwendt](https://github.com/davidwendt) +- enable list to be forced as string in JSON reader. ([#16472](https://github.com/rapidsai/cudf/pull/16472)) [@karthikeyann](https://github.com/karthikeyann) +- Remove cuDF dependency from pylibcudf column from_device tests ([#16441](https://github.com/rapidsai/cudf/pull/16441)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Enable cudf.pandas REPL and -c command support ([#16428](https://github.com/rapidsai/cudf/pull/16428)) [@bdice](https://github.com/bdice) +- Setup pylibcudf package ([#16299](https://github.com/rapidsai/cudf/pull/16299)) [@lithomas1](https://github.com/lithomas1) +- Add a libcudf/thrust-based TPC-H derived datagen ([#16294](https://github.com/rapidsai/cudf/pull/16294)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Make proxy NumPy arrays pass isinstance check in `cudf.pandas` ([#16286](https://github.com/rapidsai/cudf/pull/16286)) [@Matt711](https://github.com/Matt711) +- Add skiprows and nrows to parquet reader ([#16214](https://github.com/rapidsai/cudf/pull/16214)) [@lithomas1](https://github.com/lithomas1) +- Upgrade to nvcomp 4.0.1 ([#16076](https://github.com/rapidsai/cudf/pull/16076)) [@vuule](https://github.com/vuule) +- Migrate ORC reader to pylibcudf ([#16042](https://github.com/rapidsai/cudf/pull/16042)) [@lithomas1](https://github.com/lithomas1) +- JSON reader validation of values ([#15968](https://github.com/rapidsai/cudf/pull/15968)) [@karthikeyann](https://github.com/karthikeyann) +- Implement exposed null mask APIs in pylibcudf ([#15908](https://github.com/rapidsai/cudf/pull/15908)) [@charlesbluca](https://github.com/charlesbluca) +- Word-based nvtext::minhash function ([#15368](https://github.com/rapidsai/cudf/pull/15368)) [@davidwendt](https://github.com/davidwendt) + ## 🛠️ Improvements + +- Make tests deterministic ([#16910](https://github.com/rapidsai/cudf/pull/16910)) [@galipremsagar](https://github.com/galipremsagar) +- Update update-version.sh to use packaging lib ([#16891](https://github.com/rapidsai/cudf/pull/16891)) [@AyodeAwe](https://github.com/AyodeAwe) +- Pin polars for 24.10 and update polars test suite xfail list ([#16886](https://github.com/rapidsai/cudf/pull/16886)) [@wence-](https://github.com/wence-) +- Add in support for setting delim when parsing JSON through java (#16867) ([#16880](https://github.com/rapidsai/cudf/pull/16880))
[@revans2](https://github.com/revans2) +- Remove unnecessary flag from build.sh ([#16879](https://github.com/rapidsai/cudf/pull/16879)) [@vyasr](https://github.com/vyasr) +- Ignore numba warning specific to ARM runners ([#16872](https://github.com/rapidsai/cudf/pull/16872)) [@galipremsagar](https://github.com/galipremsagar) +- Display deltas for `cudf.pandas` test summary ([#16864](https://github.com/rapidsai/cudf/pull/16864)) [@galipremsagar](https://github.com/galipremsagar) +- Switch to using native `traceback` ([#16851](https://github.com/rapidsai/cudf/pull/16851)) [@galipremsagar](https://github.com/galipremsagar) +- JSON tree algorithm code reorg ([#16836](https://github.com/rapidsai/cudf/pull/16836)) [@karthikeyann](https://github.com/karthikeyann) +- Add string.repeats API to pylibcudf ([#16834](https://github.com/rapidsai/cudf/pull/16834)) [@mroeschke](https://github.com/mroeschke) +- Use CI workflow branch 'branch-24.10' again ([#16832](https://github.com/rapidsai/cudf/pull/16832)) [@jameslamb](https://github.com/jameslamb) +- Rename the NDS-H benchmark binaries ([#16831](https://github.com/rapidsai/cudf/pull/16831)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Add string.findall APIs to pylibcudf ([#16825](https://github.com/rapidsai/cudf/pull/16825)) [@mroeschke](https://github.com/mroeschke) +- Add string.extract APIs to pylibcudf ([#16823](https://github.com/rapidsai/cudf/pull/16823)) [@mroeschke](https://github.com/mroeschke) +- use get-pr-info from nv-gha-runners ([#16819](https://github.com/rapidsai/cudf/pull/16819)) [@AyodeAwe](https://github.com/AyodeAwe) +- Add string.contains APIs to pylibcudf ([#16814](https://github.com/rapidsai/cudf/pull/16814)) [@mroeschke](https://github.com/mroeschke) +- Forward-merge branch-24.08 to branch-24.10 ([#16813](https://github.com/rapidsai/cudf/pull/16813)) [@bdice](https://github.com/bdice) +- Add io_type axis with default `PINNED_BUFFER` to nvbench PQ multithreaded reader ([#16809](https://github.com/rapidsai/cudf/pull/16809)) [@mhaseeb123](https://github.com/mhaseeb123) +- Update fmt (to 11.0.2) and spdlog (to 1.14.1). ([#16806](https://github.com/rapidsai/cudf/pull/16806)) [@jameslamb](https://github.com/jameslamb) +- Add ability to set parquet row group max #rows and #bytes in java ([#16805](https://github.com/rapidsai/cudf/pull/16805)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Add in option for Java JSON APIs to do column pruning in CUDF ([#16796](https://github.com/rapidsai/cudf/pull/16796)) [@revans2](https://github.com/revans2) +- Support drop_first in get_dummies ([#16795](https://github.com/rapidsai/cudf/pull/16795)) [@mroeschke](https://github.com/mroeschke) +- Exposed stream-ordering to join API ([#16793](https://github.com/rapidsai/cudf/pull/16793)) [@lamarrr](https://github.com/lamarrr) +- Add string.attributes APIs to pylibcudf ([#16785](https://github.com/rapidsai/cudf/pull/16785)) [@mroeschke](https://github.com/mroeschke) +- Java: Make ColumnVector.fromViewWithContiguousAllocation public ([#16784](https://github.com/rapidsai/cudf/pull/16784)) [@jlowe](https://github.com/jlowe) +- Add partitioning APIs to pylibcudf ([#16781](https://github.com/rapidsai/cudf/pull/16781)) [@mroeschke](https://github.com/mroeschke) +- Optimization of tdigest merge aggregation. 
([#16780](https://github.com/rapidsai/cudf/pull/16780)) [@nvdbaranec](https://github.com/nvdbaranec) +- use libkvikio wheels in wheel builds ([#16778](https://github.com/rapidsai/cudf/pull/16778)) [@jameslamb](https://github.com/jameslamb) +- Exposed stream-ordering to datetime API ([#16774](https://github.com/rapidsai/cudf/pull/16774)) [@lamarrr](https://github.com/lamarrr) +- Add io/timezone APIs to pylibcudf ([#16771](https://github.com/rapidsai/cudf/pull/16771)) [@mroeschke](https://github.com/mroeschke) +- Remove `MultiIndex._poplevel` inplace implementation. ([#16767](https://github.com/rapidsai/cudf/pull/16767)) [@mroeschke](https://github.com/mroeschke) +- allow pandas patch version to float in cudf-pandas unit tests ([#16763](https://github.com/rapidsai/cudf/pull/16763)) [@jameslamb](https://github.com/jameslamb) +- Simplify the nvCOMP adapter ([#16762](https://github.com/rapidsai/cudf/pull/16762)) [@vuule](https://github.com/vuule) +- Add labeling APIs to pylibcudf ([#16761](https://github.com/rapidsai/cudf/pull/16761)) [@mroeschke](https://github.com/mroeschke) +- Add transform APIs to pylibcudf ([#16760](https://github.com/rapidsai/cudf/pull/16760)) [@mroeschke](https://github.com/mroeschke) +- Add a benchmark to study Parquet reader's performance for wide tables ([#16751](https://github.com/rapidsai/cudf/pull/16751)) [@mhaseeb123](https://github.com/mhaseeb123) +- Change the Parquet writer's `default_row_group_size_bytes` from 128MB to inf ([#16750](https://github.com/rapidsai/cudf/pull/16750)) [@mhaseeb123](https://github.com/mhaseeb123) +- Add transpose API to pylibcudf ([#16749](https://github.com/rapidsai/cudf/pull/16749)) [@mroeschke](https://github.com/mroeschke) +- Add support for Python 3.12, update Kafka dependencies to 2.5.x ([#16745](https://github.com/rapidsai/cudf/pull/16745)) [@jameslamb](https://github.com/jameslamb) +- Generate GPU vs CPU usage metrics per pytest file in pandas testsuite for `cudf.pandas` ([#16739](https://github.com/rapidsai/cudf/pull/16739)) [@galipremsagar](https://github.com/galipremsagar) +- Refactor cudf pandas integration tests CI ([#16728](https://github.com/rapidsai/cudf/pull/16728)) [@Matt711](https://github.com/Matt711) +- Remove ERROR_TEST gtest from libcudf ([#16722](https://github.com/rapidsai/cudf/pull/16722)) [@davidwendt](https://github.com/davidwendt) +- Use Series._from_column more consistently to avoid validation ([#16716](https://github.com/rapidsai/cudf/pull/16716)) [@mroeschke](https://github.com/mroeschke) +- remove some unnecessary libcudf nightly builds ([#16714](https://github.com/rapidsai/cudf/pull/16714)) [@jameslamb](https://github.com/jameslamb) +- Remove xfail from torch-cudf.pandas integration test ([#16705](https://github.com/rapidsai/cudf/pull/16705)) [@Matt711](https://github.com/Matt711) +- Add return type annotations to MultiIndex ([#16696](https://github.com/rapidsai/cudf/pull/16696)) [@mroeschke](https://github.com/mroeschke) +- Add type annotations to Index classes, utilize _from_column more ([#16695](https://github.com/rapidsai/cudf/pull/16695)) [@mroeschke](https://github.com/mroeschke) +- Have interval_range use IntervalIndex.from_breaks, remove column_empty_same_mask ([#16694](https://github.com/rapidsai/cudf/pull/16694)) [@mroeschke](https://github.com/mroeschke) +- Increase timeouts for couple of tests ([#16692](https://github.com/rapidsai/cudf/pull/16692)) [@galipremsagar](https://github.com/galipremsagar) +- Replace raw device_memory_resource pointer in pylibcudf Cython 
([#16674](https://github.com/rapidsai/cudf/pull/16674)) [@harrism](https://github.com/harrism) +- switch from typing.Callable to collections.abc.Callable ([#16670](https://github.com/rapidsai/cudf/pull/16670)) [@jameslamb](https://github.com/jameslamb) +- Update rapidsai/pre-commit-hooks ([#16669](https://github.com/rapidsai/cudf/pull/16669)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Multi-file and Parquet-aware prefetching from remote storage ([#16657](https://github.com/rapidsai/cudf/pull/16657)) [@rjzamora](https://github.com/rjzamora) +- Access Frame attributes instead of ColumnAccessor attributes when available ([#16652](https://github.com/rapidsai/cudf/pull/16652)) [@mroeschke](https://github.com/mroeschke) +- Use non-mangled type names in nvbench output ([#16649](https://github.com/rapidsai/cudf/pull/16649)) [@davidwendt](https://github.com/davidwendt) +- Add pylibcudf build dir in build.sh for `clean` ([#16648](https://github.com/rapidsai/cudf/pull/16648)) [@galipremsagar](https://github.com/galipremsagar) +- Prune workflows based on changed files ([#16642](https://github.com/rapidsai/cudf/pull/16642)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Remove arrow dependency ([#16640](https://github.com/rapidsai/cudf/pull/16640)) [@vyasr](https://github.com/vyasr) +- Support reading multiple PQ sources with mismatching nullability for columns ([#16639](https://github.com/rapidsai/cudf/pull/16639)) [@mhaseeb123](https://github.com/mhaseeb123) +- Drop Python 3.9 support ([#16637](https://github.com/rapidsai/cudf/pull/16637)) [@jameslamb](https://github.com/jameslamb) +- Support DecimalDtype meta in dask_cudf ([#16634](https://github.com/rapidsai/cudf/pull/16634)) [@mroeschke](https://github.com/mroeschke) +- Add `num_multiprocessors` utility ([#16628](https://github.com/rapidsai/cudf/pull/16628)) [@PointKernel](https://github.com/PointKernel) +- Annotate `ColumnAccessor._data` labels as `Hashable` ([#16623](https://github.com/rapidsai/cudf/pull/16623)) [@mroeschke](https://github.com/mroeschke) +- Remove build_categorical_column in favor of CategoricalColumn constructor ([#16617](https://github.com/rapidsai/cudf/pull/16617)) [@mroeschke](https://github.com/mroeschke) +- Move apply_boolean_mask benchmark to nvbench ([#16616](https://github.com/rapidsai/cudf/pull/16616)) [@davidwendt](https://github.com/davidwendt) +- Revise `get_reader_filepath_or_buffer` to handle a list of data sources ([#16613](https://github.com/rapidsai/cudf/pull/16613)) [@rjzamora](https://github.com/rjzamora) +- do not install cudf in cudf_polars wheel tests ([#16612](https://github.com/rapidsai/cudf/pull/16612)) [@jameslamb](https://github.com/jameslamb) +- remove streamz git dependency, standardize build dependency names, consolidate some dependency lists ([#16611](https://github.com/rapidsai/cudf/pull/16611)) [@jameslamb](https://github.com/jameslamb) +- Fix C++ and Cython io types ([#16610](https://github.com/rapidsai/cudf/pull/16610)) [@vyasr](https://github.com/vyasr) +- Remove arrow_io_source ([#16607](https://github.com/rapidsai/cudf/pull/16607)) [@vyasr](https://github.com/vyasr) +- Remove thrust::optional from expression evaluator ([#16604](https://github.com/rapidsai/cudf/pull/16604)) [@bdice](https://github.com/bdice) +- Add stricter typing and validation to ColumnAccessor ([#16602](https://github.com/rapidsai/cudf/pull/16602)) [@mroeschke](https://github.com/mroeschke) +- make more use of YAML anchors in dependencies.yaml ([#16597](https://github.com/rapidsai/cudf/pull/16597)) 
[@jameslamb](https://github.com/jameslamb) +- Enable testing `cudf.pandas` unit tests for all minor versions of pandas ([#16595](https://github.com/rapidsai/cudf/pull/16595)) [@galipremsagar](https://github.com/galipremsagar) +- Extend the Parquet writer's dictionary encoding benchmark. ([#16591](https://github.com/rapidsai/cudf/pull/16591)) [@mhaseeb123](https://github.com/mhaseeb123) +- Remove legacy Arrow interop APIs ([#16590](https://github.com/rapidsai/cudf/pull/16590)) [@vyasr](https://github.com/vyasr) +- Remove NativeFile support from cudf Python ([#16589](https://github.com/rapidsai/cudf/pull/16589)) [@vyasr](https://github.com/vyasr) +- Add build job for pylibcudf ([#16587](https://github.com/rapidsai/cudf/pull/16587)) [@vyasr](https://github.com/vyasr) +- Add `public` qualifier for some member functions in Java class `Schema` ([#16583](https://github.com/rapidsai/cudf/pull/16583)) [@ttnghia](https://github.com/ttnghia) +- Enable gtests previously disabled for compute-sanitizer bug ([#16581](https://github.com/rapidsai/cudf/pull/16581)) [@davidwendt](https://github.com/davidwendt) +- [FEA] Add filesystem argument to `cudf.read_parquet` ([#16577](https://github.com/rapidsai/cudf/pull/16577)) [@rjzamora](https://github.com/rjzamora) +- Ensure size is always passed to NumericalColumn ([#16576](https://github.com/rapidsai/cudf/pull/16576)) [@mroeschke](https://github.com/mroeschke) +- standardize and consolidate wheel installations in testing scripts ([#16575](https://github.com/rapidsai/cudf/pull/16575)) [@jameslamb](https://github.com/jameslamb) +- Performance improvement for strings::slice for wide strings ([#16574](https://github.com/rapidsai/cudf/pull/16574)) [@davidwendt](https://github.com/davidwendt) +- Add `ToCudfBackend` expression to dask-cudf ([#16573](https://github.com/rapidsai/cudf/pull/16573)) [@rjzamora](https://github.com/rjzamora) +- CI: Test against old versions of key dependencies ([#16570](https://github.com/rapidsai/cudf/pull/16570)) [@seberg](https://github.com/seberg) +- Replace `NativeFile` dependency in dask-cudf Parquet reader ([#16569](https://github.com/rapidsai/cudf/pull/16569)) [@rjzamora](https://github.com/rjzamora) +- Align public utility function signatures with pandas 2.x ([#16565](https://github.com/rapidsai/cudf/pull/16565)) [@mroeschke](https://github.com/mroeschke) +- Move libcudf reduction google-benchmarks to nvbench ([#16564](https://github.com/rapidsai/cudf/pull/16564)) [@davidwendt](https://github.com/davidwendt) +- Rework strings::slice benchmark to use nvbench ([#16563](https://github.com/rapidsai/cudf/pull/16563)) [@davidwendt](https://github.com/davidwendt) +- Reenable arrow tests ([#16556](https://github.com/rapidsai/cudf/pull/16556)) [@vyasr](https://github.com/vyasr) +- Clean up reshaping ops ([#16553](https://github.com/rapidsai/cudf/pull/16553)) [@mroeschke](https://github.com/mroeschke) +- Disallow cudf.Index accepting column in favor of ._from_column ([#16549](https://github.com/rapidsai/cudf/pull/16549)) [@mroeschke](https://github.com/mroeschke) +- Rewrite remaining Python Arrow interop conversions using the C Data Interface ([#16548](https://github.com/rapidsai/cudf/pull/16548)) [@vyasr](https://github.com/vyasr) +- [REVIEW] JSON host tree algorithms ([#16545](https://github.com/rapidsai/cudf/pull/16545)) [@shrshi](https://github.com/shrshi) +- Refactor dictionary encoding in PQ writer to migrate to the new `cuco::static_map` ([#16541](https://github.com/rapidsai/cudf/pull/16541)) 
[@mhaseeb123](https://github.com/mhaseeb123) +- Remove hardcoded versions from workflows. ([#16540](https://github.com/rapidsai/cudf/pull/16540)) [@bdice](https://github.com/bdice) +- Ensure comparisons with pyints and integer series always succeed ([#16532](https://github.com/rapidsai/cudf/pull/16532)) [@seberg](https://github.com/seberg) +- Remove unneeded output size parameter from internal count_matches utility ([#16531](https://github.com/rapidsai/cudf/pull/16531)) [@davidwendt](https://github.com/davidwendt) +- Remove invalid column_view usage in string-scalar-to-column function ([#16530](https://github.com/rapidsai/cudf/pull/16530)) [@davidwendt](https://github.com/davidwendt) +- Raise NotImplementedError for Series.rename that's not a scalar ([#16525](https://github.com/rapidsai/cudf/pull/16525)) [@mroeschke](https://github.com/mroeschke) +- Remove deprecated public APIs from libcudf ([#16524](https://github.com/rapidsai/cudf/pull/16524)) [@davidwendt](https://github.com/davidwendt) +- Return Interval object in pandas compat mode for IntervalIndex reductions ([#16523](https://github.com/rapidsai/cudf/pull/16523)) [@mroeschke](https://github.com/mroeschke) +- Update json normalization to take device_buffer ([#16520](https://github.com/rapidsai/cudf/pull/16520)) [@karthikeyann](https://github.com/karthikeyann) +- Rework cudf::io::text::byte_range_info class member functions ([#16518](https://github.com/rapidsai/cudf/pull/16518)) [@davidwendt](https://github.com/davidwendt) +- Remove unneeded pair-iterator benchmark ([#16511](https://github.com/rapidsai/cudf/pull/16511)) [@davidwendt](https://github.com/davidwendt) +- Update pre-commit hooks ([#16510](https://github.com/rapidsai/cudf/pull/16510)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Improve update-version.sh ([#16506](https://github.com/rapidsai/cudf/pull/16506)) [@bdice](https://github.com/bdice) +- Use tool.scikit-build.cmake.version, set scikit-build-core minimum-version ([#16503](https://github.com/rapidsai/cudf/pull/16503)) [@jameslamb](https://github.com/jameslamb) +- Pass batch size to JSON reader using environment variable ([#16502](https://github.com/rapidsai/cudf/pull/16502)) [@shrshi](https://github.com/shrshi) +- Remove a deprecated multibyte_split API ([#16501](https://github.com/rapidsai/cudf/pull/16501)) [@davidwendt](https://github.com/davidwendt) +- Add interop example for `arrow::StringViewArray` to `cudf::column` ([#16498](https://github.com/rapidsai/cudf/pull/16498)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Add keep option to distinct nvbench ([#16497](https://github.com/rapidsai/cudf/pull/16497)) [@bdice](https://github.com/bdice) +- Use more idomatic cudf APIs in dask_cudf meta generation ([#16487](https://github.com/rapidsai/cudf/pull/16487)) [@mroeschke](https://github.com/mroeschke) +- Fix typo in dispatch_row_equal. 
([#16473](https://github.com/rapidsai/cudf/pull/16473)) [@bdice](https://github.com/bdice) +- Use explicit construction of column subclass instead of `build_column` when type is known ([#16470](https://github.com/rapidsai/cudf/pull/16470)) [@mroeschke](https://github.com/mroeschke) +- Move exception handler into pylibcudf from cudf ([#16468](https://github.com/rapidsai/cudf/pull/16468)) [@lithomas1](https://github.com/lithomas1) +- Make StructColumn.__init__ strict ([#16467](https://github.com/rapidsai/cudf/pull/16467)) [@mroeschke](https://github.com/mroeschke) +- Make ListColumn.__init__ strict ([#16465](https://github.com/rapidsai/cudf/pull/16465)) [@mroeschke](https://github.com/mroeschke) +- Make Timedelta/DatetimeColumn.__init__ strict ([#16464](https://github.com/rapidsai/cudf/pull/16464)) [@mroeschke](https://github.com/mroeschke) +- Make NumericalColumn.__init__ strict ([#16457](https://github.com/rapidsai/cudf/pull/16457)) [@mroeschke](https://github.com/mroeschke) +- Make CategoricalColumn.__init__ strict ([#16456](https://github.com/rapidsai/cudf/pull/16456)) [@mroeschke](https://github.com/mroeschke) +- Disallow cudf.Series to accept column in favor of `._from_column` ([#16454](https://github.com/rapidsai/cudf/pull/16454)) [@mroeschke](https://github.com/mroeschke) +- Expose `stream` param in transform APIs ([#16452](https://github.com/rapidsai/cudf/pull/16452)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Add upper bound pin for polars ([#16442](https://github.com/rapidsai/cudf/pull/16442)) [@wence-](https://github.com/wence-) +- Make (Indexed)Frame.__init__ require data (and index) ([#16430](https://github.com/rapidsai/cudf/pull/16430)) [@mroeschke](https://github.com/mroeschke) +- Add Java APIs to copy column data to host asynchronously ([#16429](https://github.com/rapidsai/cudf/pull/16429)) [@jlowe](https://github.com/jlowe) +- Update docs of the TPC-H derived examples ([#16423](https://github.com/rapidsai/cudf/pull/16423)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Use RMM adaptor constructors instead of factories. 
([#16414](https://github.com/rapidsai/cudf/pull/16414)) [@bdice](https://github.com/bdice) +- Align ewm APIs with pandas 2.x ([#16413](https://github.com/rapidsai/cudf/pull/16413)) [@mroeschke](https://github.com/mroeschke) +- Remove checking for specific tests in memcheck script ([#16412](https://github.com/rapidsai/cudf/pull/16412)) [@davidwendt](https://github.com/davidwendt) +- Add stream parameter to reshape APIs ([#16410](https://github.com/rapidsai/cudf/pull/16410)) [@davidwendt](https://github.com/davidwendt) +- Align groupby APIs with pandas 2.x ([#16403](https://github.com/rapidsai/cudf/pull/16403)) [@mroeschke](https://github.com/mroeschke) +- Align misc DataFrame and MultiIndex methods with pandas 2.x ([#16402](https://github.com/rapidsai/cudf/pull/16402)) [@mroeschke](https://github.com/mroeschke) +- update some branch references in GitHub Actions configs ([#16397](https://github.com/rapidsai/cudf/pull/16397)) [@jameslamb](https://github.com/jameslamb) +- Support reading matching projected and filter cols from Parquet files with otherwise mismatched schemas ([#16394](https://github.com/rapidsai/cudf/pull/16394)) [@mhaseeb123](https://github.com/mhaseeb123) +- Merge branch-24.08 into branch-24.10 ([#16393](https://github.com/rapidsai/cudf/pull/16393)) [@jameslamb](https://github.com/jameslamb) +- Add query 10 to the TPC-H suite ([#16392](https://github.com/rapidsai/cudf/pull/16392)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Use `make_host_vector` instead of `make_std_vector` to facilitate pinned memory optimizations ([#16386](https://github.com/rapidsai/cudf/pull/16386)) [@vuule](https://github.com/vuule) +- Fix some issues with deprecated / removed cccl facilities ([#16377](https://github.com/rapidsai/cudf/pull/16377)) [@miscco](https://github.com/miscco) +- Align IntervalIndex APIs with pandas 2.x ([#16371](https://github.com/rapidsai/cudf/pull/16371)) [@mroeschke](https://github.com/mroeschke) +- Align CategoricalIndex APIs with pandas 2.x ([#16369](https://github.com/rapidsai/cudf/pull/16369)) [@mroeschke](https://github.com/mroeschke) +- Align TimedeltaIndex APIs with pandas 2.x ([#16368](https://github.com/rapidsai/cudf/pull/16368)) [@mroeschke](https://github.com/mroeschke) +- Align DatetimeIndex APIs with pandas 2.x ([#16367](https://github.com/rapidsai/cudf/pull/16367)) [@mroeschke](https://github.com/mroeschke) +- fix [tool.setuptools] reference in custreamz config ([#16365](https://github.com/rapidsai/cudf/pull/16365)) [@jameslamb](https://github.com/jameslamb) +- Align Index APIs with pandas 2.x ([#16361](https://github.com/rapidsai/cudf/pull/16361)) [@mroeschke](https://github.com/mroeschke) +- Rebuild for & Support NumPy 2 ([#16300](https://github.com/rapidsai/cudf/pull/16300)) [@jakirkham](https://github.com/jakirkham) +- Add `stream` param to stream compaction APIs ([#16295](https://github.com/rapidsai/cudf/pull/16295)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Added batch memset to memset data and validity buffers in parquet reader ([#16281](https://github.com/rapidsai/cudf/pull/16281)) [@sdrp713](https://github.com/sdrp713) +- Deduplicate decimal32/decimal64 to decimal128 conversion function ([#16236](https://github.com/rapidsai/cudf/pull/16236)) [@mhaseeb123](https://github.com/mhaseeb123) +- Refactor mixed_semi_join using cuco::static_set ([#16230](https://github.com/rapidsai/cudf/pull/16230)) [@srinivasyadav18](https://github.com/srinivasyadav18) +- Improve performance of hash_character_ngrams using warp-per-string kernel 
([#16212](https://github.com/rapidsai/cudf/pull/16212)) [@davidwendt](https://github.com/davidwendt) +- Add environment variable to log cudf.pandas fallback calls ([#16161](https://github.com/rapidsai/cudf/pull/16161)) [@mroeschke](https://github.com/mroeschke) +- Add libcudf example with large strings ([#15983](https://github.com/rapidsai/cudf/pull/15983)) [@davidwendt](https://github.com/davidwendt) +- JSON tree algorithms refactor I: CSR data structure for column tree ([#15979](https://github.com/rapidsai/cudf/pull/15979)) [@shrshi](https://github.com/shrshi) +- Support multiple new-line characters in regex APIs ([#15961](https://github.com/rapidsai/cudf/pull/15961)) [@davidwendt](https://github.com/davidwendt) +- adding wheel build for libcudf ([#15483](https://github.com/rapidsai/cudf/pull/15483)) [@msarahan](https://github.com/msarahan) +- Replace usages of `thrust::optional` with `std::optional` ([#15091](https://github.com/rapidsai/cudf/pull/15091)) [@miscco](https://github.com/miscco) + # cudf 24.08.00 (7 Aug 2024) ## 🚨 Breaking Changes From dfdae599622841bf3f4d523c01eee3ae1fe933f0 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 9 Oct 2024 14:02:28 -0400 Subject: [PATCH 08/24] Use std::optional for host types (#17015) cuda::std::optional shouldn't be used for host types such as `std::vector` as it requires the constructors of the `T` types to be host+device. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17015 --- .../io/parquet/compact_protocol_reader.cpp | 8 +-- cpp/src/io/parquet/parquet.hpp | 64 +++++++++---------- cpp/src/io/parquet/parquet_gpu.hpp | 14 ++-- cpp/src/io/parquet/predicate_pushdown.cpp | 6 +- cpp/src/io/parquet/reader_impl.cpp | 2 +- cpp/src/io/parquet/reader_impl_chunking.cu | 8 +-- cpp/src/io/parquet/reader_impl_helpers.cpp | 6 +- cpp/src/io/parquet/writer_impl.cu | 8 +-- cpp/tests/io/parquet_common.cpp | 2 +- cpp/tests/io/parquet_common.hpp | 2 +- 10 files changed, 59 insertions(+), 61 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 312a5243687..d276e946a51 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -309,10 +309,10 @@ class parquet_field_struct : public parquet_field { template class parquet_field_union_struct : public parquet_field { E& enum_val; - cuda::std::optional& val; // union structs are always wrapped in std::optional + std::optional& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, cuda::std::optional& v) + parquet_field_union_struct(int f, E& ev, std::optional& v) : parquet_field(f), enum_val(ev), val(v) { } @@ -439,10 +439,10 @@ class parquet_field_struct_blob : public parquet_field { */ template class parquet_field_optional : public parquet_field { - cuda::std::optional& val; + std::optional& val; public: - parquet_field_optional(int f, cuda::std::optional& v) : parquet_field(f), val(v) {} + parquet_field_optional(int f, std::optional& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 7c985643887..2851ef67a65 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,8 +20,6 
@@ #include -#include - #include #include #include @@ -94,10 +92,10 @@ struct LogicalType { BSON }; Type type; - cuda::std::optional decimal_type; - cuda::std::optional time_type; - cuda::std::optional timestamp_type; - cuda::std::optional int_type; + std::optional decimal_type; + std::optional time_type; + std::optional timestamp_type; + std::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -178,21 +176,21 @@ struct SchemaElement { // 5: nested fields int32_t num_children = 0; // 6: DEPRECATED: record the original type before conversion to parquet type - cuda::std::optional converted_type; + std::optional converted_type; // 7: DEPRECATED: record the scale for DECIMAL converted type int32_t decimal_scale = 0; // 8: DEPRECATED: record the precision for DECIMAL converted type int32_t decimal_precision = 0; // 9: save field_id from original schema - cuda::std::optional field_id; + std::optional field_id; // 10: replaces converted type - cuda::std::optional logical_type; + std::optional logical_type; // extra cudf specific fields bool output_as_byte_array = false; // cudf type determined from arrow:schema - cuda::std::optional arrow_type; + std::optional arrow_type; // The following fields are filled in later during schema initialization int max_definition_level = 0; @@ -258,21 +256,21 @@ struct SchemaElement { */ struct Statistics { // deprecated max value in signed comparison order - cuda::std::optional> max; + std::optional> max; // deprecated min value in signed comparison order - cuda::std::optional> min; + std::optional> min; // count of null values in the column - cuda::std::optional null_count; + std::optional null_count; // count of distinct values occurring - cuda::std::optional distinct_count; + std::optional distinct_count; // max value for column determined by ColumnOrder - cuda::std::optional> max_value; + std::optional> max_value; // min value for column determined by ColumnOrder - cuda::std::optional> min_value; + std::optional> min_value; // If true, max_value is the actual maximum value for a column - cuda::std::optional is_max_value_exact; + std::optional is_max_value_exact; // If true, min_value is the actual minimum value for a column - cuda::std::optional is_min_value_exact; + std::optional is_min_value_exact; }; /** @@ -281,7 +279,7 @@ struct Statistics { struct SizeStatistics { // Number of variable-width bytes stored for the page/chunk. Should not be set for anything // but the BYTE_ARRAY physical type. - cuda::std::optional unencoded_byte_array_data_bytes; + std::optional unencoded_byte_array_data_bytes; /** * When present, there is expected to be one element corresponding to each * repetition (i.e. size=max repetition_level+1) where each element @@ -290,14 +288,14 @@ struct SizeStatistics { * * This value should not be written if max_repetition_level is 0. */ - cuda::std::optional> repetition_level_histogram; + std::optional> repetition_level_histogram; /** * Same as repetition_level_histogram except for definition levels. * * This value should not be written if max_definition_level is 0 or 1. */ - cuda::std::optional> definition_level_histogram; + std::optional> definition_level_histogram; }; /** @@ -318,7 +316,7 @@ struct OffsetIndex { std::vector page_locations; // per-page size info. see description of the same field in SizeStatistics. only present for // columns with a BYTE_ARRAY physical type. 
- cuda::std::optional> unencoded_byte_array_data_bytes; + std::optional> unencoded_byte_array_data_bytes; }; /** @@ -329,11 +327,11 @@ struct ColumnIndex { std::vector> min_values; // lower bound for values in each page std::vector> max_values; // upper bound for values in each page BoundaryOrder boundary_order = - BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered - cuda::std::optional> null_counts; // Optional count of null values per page + BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered + std::optional> null_counts; // Optional count of null values per page // Repetition/definition level histograms for the column chunk - cuda::std::optional> repetition_level_histogram; - cuda::std::optional> definition_level_histogram; + std::optional> repetition_level_histogram; + std::optional> definition_level_histogram; }; /** @@ -383,11 +381,11 @@ struct ColumnChunkMetaData { Statistics statistics; // Set of all encodings used for pages in this column chunk. This information can be used to // determine if all data pages are dictionary encoded for example. - cuda::std::optional> encoding_stats; + std::optional> encoding_stats; // Optional statistics to help estimate total memory when converted to in-memory representations. // The histograms contained in these statistics can also be useful in some cases for more // fine-grained nullability/list length filter pushdown. - cuda::std::optional size_statistics; + std::optional size_statistics; }; /** @@ -429,13 +427,13 @@ struct RowGroup { int64_t num_rows = 0; // If set, specifies a sort ordering of the rows in this RowGroup. // The sorting columns can be a subset of all the columns. - cuda::std::optional> sorting_columns; + std::optional> sorting_columns; // Byte offset from beginning of file to first page (data or dictionary) in this row group - cuda::std::optional file_offset; + std::optional file_offset; // Total byte size of all compressed (and potentially encrypted) column data in this row group - cuda::std::optional total_compressed_size; + std::optional total_compressed_size; // Row group ordinal in the file - cuda::std::optional ordinal; + std::optional ordinal; }; /** @@ -460,7 +458,7 @@ struct FileMetaData { std::vector row_groups; std::vector key_value_metadata; std::string created_by = ""; - cuda::std::optional> column_orders; + std::optional> column_orders; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a8ba3a969ce..4f6d41a97da 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -395,7 +395,7 @@ struct ColumnChunkDesc { uint8_t def_level_bits_, uint8_t rep_level_bits_, Compression codec_, - cuda::std::optional logical_type_, + std::optional logical_type_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, @@ -441,12 +441,12 @@ struct ColumnChunkDesc { int32_t num_data_pages{}; // number of data pages int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - Compression codec{}; // compressed codec enum - cuda::std::optional logical_type{}; // logical type + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base 
pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + Compression codec{}; // compressed codec enum + std::optional logical_type{}; // logical type int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index b90ca36c8c7..f0a0bc0b51b 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -152,7 +152,7 @@ struct stats_caster { } void set_index(size_type index, - cuda::std::optional> const& binary_value, + std::optional> const& binary_value, Type const type) { if (binary_value.has_value()) { @@ -234,8 +234,8 @@ struct stats_caster { max.set_index(stats_idx, max_value, colchunk.meta_data.type); } else { // Marking it null, if column present in row group - min.set_index(stats_idx, cuda::std::nullopt, {}); - max.set_index(stats_idx, cuda::std::nullopt, {}); + min.set_index(stats_idx, std::nullopt, {}); + max.set_index(stats_idx, std::nullopt, {}); } stats_idx++; } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 1b69ccb7742..f0865c715bc 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -38,7 +38,7 @@ namespace { // be treated as a string. Currently the only logical type that has special handling is DECIMAL. // Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which // for now would also be treated as a string). -inline bool is_treat_fixed_length_as_string(cuda::std::optional const& logical_type) +inline bool is_treat_fixed_length_as_string(std::optional const& logical_type) { if (!logical_type.has_value()) { return true; } return logical_type->type != LogicalType::DECIMAL; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index c588fedb85c..27312a4da89 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -371,11 +371,11 @@ int64_t find_next_split(int64_t cur_pos, * * @return A tuple of Parquet clock rate and Parquet decimal type. */ -[[nodiscard]] std::tuple> conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - cuda::std::optional logical_type) + std::optional logical_type) { int32_t const clock_rate = is_chrono(data_type{column_type_id}) ? 
to_clockrate(timestamp_type_id) : 0; @@ -386,11 +386,11 @@ int64_t find_next_split(int64_t cur_pos, // if decimal but not outputting as float or decimal, then convert to no logical type if (column_type_id != type_id::FLOAT64 and not cudf::is_fixed_point(data_type{column_type_id})) { - return std::make_tuple(clock_rate, cuda::std::nullopt); + return {clock_rate, std::nullopt}; } } - return std::make_tuple(clock_rate, std::move(logical_type)); + return {clock_rate, std::move(logical_type)}; } /** diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 6d566b5815e..a6562d33de2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf; namespace { -cuda::std::optional converted_to_logical_type(SchemaElement const& schema) +std::optional converted_to_logical_type(SchemaElement const& schema) { if (schema.converted_type.has_value()) { switch (schema.converted_type.value()) { @@ -66,7 +66,7 @@ cuda::std::optional converted_to_logical_type(SchemaElement const& default: return LogicalType{LogicalType::UNDEFINED}; } } - return cuda::std::nullopt; + return std::nullopt; } } // namespace @@ -246,7 +246,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = cuda::std::nullopt; + struct_elem.converted_type = std::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ec05f35d405..190f13eb688 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -186,7 +186,7 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - cuda::std::optional> column_orders = cuda::std::nullopt; + std::optional> column_orders = std::nullopt; }; namespace { @@ -472,7 +472,7 @@ struct leaf_schema_fn { std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value @@ -750,7 +750,7 @@ std::vector construct_parquet_schema_tree( col_schema.type = Type::BYTE_ARRAY; } - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); @@ -2795,7 +2795,7 @@ std::unique_ptr> writer::merge_row_group_metadata( // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 for (auto& se : md.schema) { if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { - se.logical_type = cuda::std::nullopt; + se.logical_type = std::nullopt; } } diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index 6141a40bc95..a1b8677eac8 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -744,7 +744,7 @@ int32_t compare(T& v1, T& v2) int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype) + std::optional const& ctype) { auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index bd1579eaa1b..c90b81ed27a 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -172,7 +172,7 @@ std::pair create_parquet_typed_with_stats(std::string int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype); + std::optional const& ctype); void expect_compression_stats_empty(std::shared_ptr stats); From bd51a25ea6fdab6ab11e95e2c8192ed7eee43e75 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:05:05 -0400 Subject: [PATCH 09/24] [DOC] Document limitation using `cudf.pandas` proxy arrays (#16955) When instantiating a `cudf.pandas` proxy array, a DtoH transfer occurs so that the data buffer is set correctly. We do this because functions which utilize NumPy's C API can utilize the data buffer directly instead of going through `__array__`. This PR documents this limitation. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16955 --- docs/cudf/source/cudf_pandas/faq.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 34b657488c1..5024747227e 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -181,6 +181,32 @@ There are a few known limitations that you should be aware of: ``` - `cudf.pandas` (and cuDF in general) is only compatible with pandas 2. Version 24.02 of cudf was the last to support pandas 1.5.x. +- In order for `cudf.pandas` to produce a proxy array that ducktypes as a NumPy + array, we create a proxy type that actually subclasses `numpy.ndarray`. We can + verify this with an isinstance check. + + ```python + %load_ext cudf.pandas + import pandas as pd + import numpy as np + + arr = pd.Series([1, 1, 2]).unique() # returns a proxy array + isinstance(arr, np.ndarray) # returns True, where arr is a proxy array + ``` + Because the proxy type ducktypes as a NumPy array, NumPy functions may attempt to + access internal members, such as the [data buffer](https://numpy.org/doc/stable/dev/internals.html#internal-organization-of-numpy-arrays), via the NumPy C API. + However, our proxy mechanism is designed to proxy function calls at the Python + level, which is incompatible with these types of accesses. 
To handle these + situations, we perform an eager device-to-host (DtoH) copy, which sets the data + buffer correctly but incurs the cost of extra time when creating the proxy array. + In the previous example, creating `arr` performed this kind of implicit DtoH transfer. + + With this approach, we also get compatibility with third-party libraries like `torch`. + + ```python + import torch + x = torch.from_numpy(arr) + ``` ## Can I force running on the CPU? From c7b51195c675af47d0f3dd69c04d0fcc6920eca5 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 9 Oct 2024 15:17:32 -0700 Subject: [PATCH 10/24] Fix `host_span` constructor to correctly copy `is_device_accessible` (#17020) One of the `host_span` constructors was not updated when we added `is_device_accessible`, so the value was not assigned. This PR fixes this simple error and adds tests that check that this property is correctly set when creating `host_span`s. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17020 --- cpp/include/cudf/utilities/span.hpp | 2 +- .../utilities_tests/pinned_memory_tests.cpp | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 914731ea417..f3e1a61d075 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -288,7 +288,7 @@ struct host_span : public cudf::detail::span_base, // NOLINT void>* = nullptr> constexpr host_span(host_span const& other) noexcept - : base(other.data(), other.size()) + : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()} { } diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index ae7c6fa8b8c..1e1e21fe18a 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -125,3 +126,22 @@ TEST_F(PinnedMemoryTest, MakeHostVector) EXPECT_FALSE(vec.get_allocator().is_device_accessible()); } } + +TEST_F(PinnedMemoryTest, HostSpan) +{ + auto test_ctors = [](auto&& vec) { + auto const is_vec_device_accessible = vec.get_allocator().is_device_accessible(); + // Test conversion from a vector + auto const span = cudf::host_span{vec}; + EXPECT_EQ(span.is_device_accessible(), is_vec_device_accessible); + // Test conversion from host_span with different type + auto const span_converted = cudf::host_span{span}; + EXPECT_EQ(span_converted.is_device_accessible(), is_vec_device_accessible); + }; + + cudf::set_allocate_host_as_pinned_threshold(7); + for (int i = 1; i < 10; i++) { + // some iterations will use pinned memory, some will not + test_ctors(cudf::detail::make_host_vector(i, cudf::get_default_stream())); + } +} From 3791c8a9d1aeb7474bb9ef324a089a569183406c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 9 Oct 2024 13:45:02 -1000 Subject: [PATCH 11/24] Add string.convert_floats APIs to pylibcudf (#16990) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16990 --- python/cudf/cudf/_lib/string_casting.pyx | 34 ++----
.../_lib/strings/convert/convert_floats.pyx | 24 ++--- .../strings/convert/convert_floats.pxd | 6 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 1 + .../pylibcudf/strings/convert/__init__.py | 1 + .../strings/convert/convert_floats.pxd | 11 ++ .../strings/convert/convert_floats.pyx | 101 ++++++++++++++++++ .../tests/test_string_convert_floats.py | 33 ++++++ 9 files changed, 165 insertions(+), 48 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index d9595f4ab0a..93b67bd4c9d 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -10,10 +10,6 @@ from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - from_floats as cpp_from_floats, - to_floats as cpp_to_floats, -) from pylibcudf.libcudf.strings.convert.convert_integers cimport ( from_integers as cpp_from_integers, hex_to_integers as cpp_hex_to_integers, @@ -33,32 +29,18 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type def floating_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_floats( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.from_floats( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def string_to_floating(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_floats.to_floats( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type) ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_floats( - input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def dtos(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx index 7965b588703..5da6e3f10cc 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - is_float as cpp_is_float, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def is_float(Column source_strings): @@ -20,12 +13,7 @@ def is_float(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have floats. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.is_float( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd index f4fc4674506..a45c7f9979e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd @@ -9,12 +9,12 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_floats( - column_view input_col, + column_view strings, data_type output_type) except + cdef unique_ptr[column] from_floats( - column_view input_col) except + + column_view floats) except + cdef unique_ptr[column] is_float( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 41aeb72039b..7b228c06a18 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx - convert_fixed_point.pyx convert_ipv4.pyx convert_urls.pyx + convert_fixed_point.pyx convert_floats.pyx convert_ipv4.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index b4b0b521e39..be6145384ad 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -4,6 +4,7 @@ from . cimport ( convert_datetime, convert_durations, convert_fixed_point, + convert_floats, convert_ipv4, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index 409620fce45..7c94387282b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -4,6 +4,7 @@ convert_datetime, convert_durations, convert_fixed_point, + convert_floats, convert_ipv4, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd new file mode 100644 index 00000000000..1284ff552aa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type) + +cpdef Column from_floats(Column floats) + +cpdef Column is_float(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..8081aadb085 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -0,0 +1,101 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_floats as cpp_convert_floats, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type): + """ + Returns a new numeric column by parsing float values from each string + in the provided strings column. + + For details, see cpp:func:`cudf::strings::to_floats` + + Parameters + ---------- + strings : Column + Strings instance for this operation. + + output_type : DataType + Type of float numeric column to return. + + Returns + ------- + Column + New column with floats converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.to_floats( + strings.view(), + output_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_floats(Column floats): + """ + Returns a new strings column converting the float values from the + provided column into strings. + + For details, see cpp:func:`cudf::strings::from_floats` + + Parameters + ---------- + floats : Column + Numeric column to convert. + + Returns + ------- + Column + New strings column with floats as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.from_floats( + floats.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_float(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to floats. + + For details, see cpp:func:`cudf::strings::is_float` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.is_float( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py new file mode 100644 index 00000000000..e9918fab559 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
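+#
+# A rough usage sketch of the bindings under test (illustrative only; the
+# target DataType is built via pyarrow interop, as in test_to_floats below):
+#
+#   col = plc.interop.from_arrow(pa.array(["-1.23", "1", None]))
+#   floats = plc.strings.convert.convert_floats.to_floats(
+#       col, plc.interop.from_arrow(pa.float64())
+#   )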
+ +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_floats(): + typ = pa.float32() + arr = pa.array(["-1.23", "1", None]) + result = plc.strings.convert.convert_floats.to_floats( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_floats(): + arr = pa.array([-1.23, 1, None]) + result = plc.strings.convert.convert_floats.from_floats( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["-1.23", "1.0", None]) + assert_column_eq(result, expected) + + +def test_is_float(): + arr = pa.array(["-1.23", "1", "1.2.3", "A", None]) + result = plc.strings.convert.convert_floats.is_float( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) From 31423d056c45bd6352f0c611ed5e63423b09b954 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 9 Oct 2024 21:01:30 -0400 Subject: [PATCH 12/24] Update all rmm imports to use pylibrmm/librmm (#16913) This PR updates all the RMM imports to use pylibrmm/librmm now that `rmm._lib` is deprecated . It should be merged after [rmm/1676](https://github.com/rapidsai/rmm/pull/1676). Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/16913 --- docs/cudf/source/conf.py | 5 ++++- python/cudf/cudf/_lib/column.pxd | 2 +- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/_lib/copying.pyx | 2 +- python/cudf/cudf/_lib/scalar.pxd | 2 +- python/cudf/cudf/_lib/strings_udf.pyx | 3 ++- python/cudf/cudf/core/buffer/buffer.py | 2 +- python/cudf/cudf/core/buffer/spillable_buffer.py | 4 ++-- python/cudf/cudf/core/udf/strings_typing.py | 2 +- python/pylibcudf/pylibcudf/column.pyx | 2 +- python/pylibcudf/pylibcudf/join.pyx | 2 +- python/pylibcudf/pylibcudf/libcudf/column/column.pxd | 2 +- .../pylibcudf/libcudf/column/column_factories.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/concatenate.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/copying.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/join.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/null_mask.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/transform.pxd | 2 +- python/pylibcudf/pylibcudf/null_mask.pxd | 2 +- python/pylibcudf/pylibcudf/null_mask.pyx | 7 ++++--- python/pylibcudf/pylibcudf/scalar.pxd | 2 +- python/pylibcudf/pylibcudf/scalar.pyx | 2 +- python/pylibcudf/pylibcudf/transform.pyx | 3 ++- 25 files changed, 34 insertions(+), 28 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 95813907bf4..ecf619ddc44 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -342,7 +342,10 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - "DeviceBuffer": ("rmm._lib.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), + # TODO: Replace the first entry in a follow-up with rmm.pylibrmm.device_buffer.DeviceBuffer + # when the RMM objects inventory is generated from branch-24.12. 
The RMM objects inventory + # can be accessed here : https://docs.rapids.ai/api/rmm/nightly/objects.inv + "DeviceBuffer": ("rmm.DeviceBuffer", "rmm.DeviceBuffer"), } diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8ceea4920e2..8b1d16f0d85 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.column.column_view cimport ( mutable_column_view, ) from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef class Column: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 99e4c21df8a..065655505b8 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -28,7 +28,7 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.types cimport ( dtype_from_column_view, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 49714091f46..30353c4be6c 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -8,7 +8,7 @@ from libcpp.memory cimport make_shared, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer import pylibcudf diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 27095ca02d4..0f9820ed1db 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource cdef class DeviceScalar: diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index 78fc9f08bd8..dd2fafbe07f 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -23,7 +23,8 @@ from pylibcudf.libcudf.strings_udf cimport ( to_string_view_array as cpp_to_string_view_array, udf_string, ) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.column cimport Column diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 32ae8c5ee53..caff019f575 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -284,7 +284,7 @@ def memoryview( """Read-only access to the buffer through host memory.""" size = self._size if size is None else size host_buf = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self.get_ptr(mode="read") + offset, host_buf ) return memoryview(host_buf).toreadonly() diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 4c9e524ee05..b40c56c9a6b 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -207,7 +207,7 @@ def spill(self, target: str = "cpu") -> None: domain="cudf_python-spill", ): 
host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr, host_mem ) self._ptr_desc["memoryview"] = host_mem @@ -352,7 +352,7 @@ def memoryview( else: assert self._ptr_desc["type"] == "gpu" ret = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr + offset, ret ) return ret diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 43604ab21a7..a0cbe7ada19 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -99,7 +99,7 @@ def prepare_args(self, ty, val, **kwargs): ty.dtype, (StringView, UDFString) ): return types.uint64, val.ptr if isinstance( - val, rmm._lib.device_buffer.DeviceBuffer + val, rmm.pylibrmm.device_buffer.DeviceBuffer ) else val.get_ptr(mode="read") else: return ty, val diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index a37a12fc7e1..03808f0b664 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_factories cimport make_column_from_scalar from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 25664286f19..b019ed8f099 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport null_equality -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer from .column cimport Column from .table cimport Table diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd index 7a369701bbd..76f35cbba71 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column_view cimport ( ) from pylibcudf.libcudf.types cimport data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd index f1a326bcd40..b2388858127 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.types cimport ( type_id, ) -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index 92f5a185a54..a09b6c01392 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column 
cimport column, column_view from pylibcudf.libcudf.table.table cimport table, table_view from pylibcudf.libcudf.utilities.host_span cimport host_span -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index cadac6a0022..6de9c4382d3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -6,7 +6,7 @@ from libcpp.vector cimport vector from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd index 4d4a4ba9b89..e6e719d6436 100644 --- a/python/pylibcudf/pylibcudf/libcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd @@ -16,7 +16,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index 6f6c145b23c..21033a0284e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport null_equality, size_type -from rmm._lib.device_uvector cimport device_uvector +from rmm.librmm.device_uvector cimport device_uvector ctypedef unique_ptr[device_uvector[size_type]] gather_map_type ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd index 5f582091b06..27af4a3bdb1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd index 0c8fe1060ac..2eca043e451 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd index 
38298a7c1f1..d21510bd731 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd index ab5c0080312..9bdfaee2842 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/null_mask.pxd @@ -2,7 +2,7 @@ from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 5bdde06f21f..aae39987dac 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -6,7 +6,8 @@ from libcpp.utility cimport move from pylibcudf.libcudf cimport null_mask as cpp_null_mask from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint @@ -31,8 +32,8 @@ cpdef DeviceBuffer copy_bitmask(Column col): Returns ------- rmm.DeviceBuffer - A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty ``DeviceBuffer`` - if ``col`` is not nullable + A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty + ``DeviceBuffer`` if ``col`` is not nullable """ cdef device_buffer db diff --git a/python/pylibcudf/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd index 8664dfa4b7e..a273647c98d 100644 --- a/python/pylibcudf/pylibcudf/scalar.pxd +++ b/python/pylibcudf/pylibcudf/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 3e20938af0c..d4888a62ad1 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -6,7 +6,7 @@ from libcpp.utility cimport move from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.scalar.scalar_factories cimport make_empty_scalar_like -from rmm._lib.memory_resource cimport get_current_device_resource +from rmm.pylibrmm.memory_resource cimport get_current_device_resource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index de425a27c15..74134caeb78 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -9,7 +9,8 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from 
rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column from .gpumemoryview cimport gpumemoryview From 7173b52fce25937bb69e22a083a5de4655078fa1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 10 Oct 2024 08:48:05 -0400 Subject: [PATCH 13/24] Fix regex parsing logic handling of nested quantifiers (#16798) Fixes the libcudf regex parsing logic when handling nested fixed quantifiers. The logic handles fixed quantifiers by simple repeating the previous instruction. If the previous item is a group (capture or non-capture) that group may also contain an internal fixed quantifier as well. Found while working on #16730 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16798 --- cpp/src/strings/regex/regcomp.cpp | 40 +++++++++++++++++++--------- cpp/tests/strings/contains_tests.cpp | 14 ++++++++++ cpp/tests/strings/extract_tests.cpp | 16 ++++++++++- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 51c6e765edd..775a2580f60 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -716,13 +716,13 @@ class regex_parser { if (item.type != COUNTED && item.type != COUNTED_LAZY) { out.push_back(item); if (item.type == LBRA || item.type == LBRA_NC) { - lbra_stack.push(index); + lbra_stack.push(out.size() - 1); repeat_start_index = -1; } else if (item.type == RBRA) { repeat_start_index = lbra_stack.top(); lbra_stack.pop(); } else if ((item.type & ITEM_MASK) != OPERATOR_MASK) { - repeat_start_index = index; + repeat_start_index = out.size() - 1; } } else { // item is of type COUNTED or COUNTED_LAZY @@ -731,26 +731,39 @@ class regex_parser { CUDF_EXPECTS(repeat_start_index >= 0, "regex: invalid counted quantifier location"); // range of affected item(s) to repeat - auto const begin = in.begin() + repeat_start_index; - auto const end = in.begin() + index; + auto const begin = out.begin() + repeat_start_index; + auto const end = out.end(); + // count range values auto const n = item.d.count.n; // minimum count auto const m = item.d.count.m; // maximum count - assert(n >= 0 && "invalid repeat count value n"); // zero-repeat edge-case: need to erase the previous items - if (n == 0) { out.erase(out.end() - (index - repeat_start_index), out.end()); } - - // minimum repeats (n) - for (int j = 1; j < n; j++) { - out.insert(out.end(), begin, end); + if (n == 0 && m == 0) { out.erase(begin, end); } + + std::vector repeat_copy(begin, end); + // special handling for quantified capture groups + if ((n > 1) && (*begin).type == LBRA) { + (*begin).type = LBRA_NC; // change first one to non-capture + // add intermediate groups as non-capture + std::vector ncg_copy(begin, end); + for (int j = 1; j < (n - 1); j++) { + out.insert(out.end(), ncg_copy.begin(), ncg_copy.end()); + } + // add the last entry as a regular capture-group + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); + } else { + // minimum repeats (n) + for (int j = 1; j < n; j++) { + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); + } } // optional maximum repeats (m) if (m >= 0) { for (int j = n; j < m; j++) { out.emplace_back(LBRA_NC, 0); - out.insert(out.end(), begin, end); + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); } for 
(int j = n; j < m; j++) { out.emplace_back(RBRA, 0); @@ -760,8 +773,9 @@ class regex_parser { // infinite repeats if (n > 0) { // append '+' after last repetition out.emplace_back(item.type == COUNTED ? PLUS : PLUS_LAZY, 0); - } else { // copy it once then append '*' - out.insert(out.end(), begin, end); + } else { + // copy it once then append '*' + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); out.emplace_back(item.type == COUNTED ? STAR : STAR_LAZY, 0); } } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index bdfd38267e6..216ddfce5f1 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -474,6 +474,20 @@ TEST_F(StringsContainsTests, FixedQuantifier) } } +TEST_F(StringsContainsTests, NestedQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", + "0000 AAAA 9999 BBBB 8888", + "7777 6666 4444 3333", + "12345 3333 4444 1111 ABCD"}); + auto sv = cudf::strings_column_view(input); + auto pattern = std::string(R"((\d{4}\s){4})"); + cudf::test::fixed_width_column_wrapper expected({true, false, false, true}); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, QuantifierErrors) { EXPECT_THROW(cudf::strings::regex_program::create("^+"), cudf::logic_error); diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 61246fb098d..7e0338f1bf4 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -240,6 +239,21 @@ TEST_F(StringsExtractTests, SpecialNewLines) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); } +TEST_F(StringsExtractTests, NestedQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", + "0000 AAAA 9999 BBBB 8888", + "7777 6666 4444 3333", + "12345 3333 4444 1111 ABCD"}); + auto sv = cudf::strings_column_view(input); + auto pattern = std::string(R"((\d{4}\s){4})"); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::extract(sv, *prog); + // fixed quantifier on capture group only honors the last group + auto expected = cudf::test::strings_column_wrapper({"4444 ", "", "", "1111 "}, {1, 0, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; From 69b0f661ff2fc4c12bb0fe696e556f6b3224b381 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Oct 2024 08:38:11 -1000 Subject: [PATCH 14/24] Add string.convert.convert_lists APIs to pylibcudf (#16997) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16997 --- .../strings/convert/convert_booleans.rst | 6 ++ .../strings/convert/convert_datetime.rst | 6 ++ .../strings/convert/convert_durations.rst | 6 ++ .../strings/convert/convert_fixed_point.rst | 6 ++ .../strings/convert/convert_floats.rst | 6 ++ .../strings/convert/convert_ipv4.rst | 6 ++ .../strings/convert/convert_lists.rst | 6 ++ .../strings/convert/convert_urls.rst | 6 ++ 
.../pylibcudf/strings/convert/index.rst | 14 ++++ .../api_docs/pylibcudf/strings/index.rst | 6 ++ .../_lib/strings/convert/convert_lists.pyx | 32 ++------- .../libcudf/strings/convert/convert_lists.pxd | 2 +- .../pylibcudf/strings/convert/CMakeLists.txt | 5 +- .../pylibcudf/strings/convert/__init__.pxd | 1 + .../pylibcudf/strings/convert/__init__.py | 1 + .../strings/convert/convert_fixed_point.pyx | 6 +- .../strings/convert/convert_lists.pxd | 11 +++ .../strings/convert/convert_lists.pyx | 72 +++++++++++++++++++ .../tests/test_string_convert_lists.py | 21 ++++++ 19 files changed, 187 insertions(+), 32 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst new file mode 100644 index 00000000000..de62221456f --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst @@ -0,0 +1,6 @@ +================ +convert_booleans +================ + +.. automodule:: pylibcudf.strings.convert.convert_booleans + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst new file mode 100644 index 00000000000..fc5d5204ab3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst @@ -0,0 +1,6 @@ +================ +convert_datetime +================ + +.. automodule:: pylibcudf.strings.convert.convert_datetime + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst new file mode 100644 index 00000000000..e80b0c15a61 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst @@ -0,0 +1,6 @@ +================= +convert_durations +================= + +.. 
automodule:: pylibcudf.strings.convert.convert_durations + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst new file mode 100644 index 00000000000..16d971a6849 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst @@ -0,0 +1,6 @@ +=================== +convert_fixed_point +=================== + +.. automodule:: pylibcudf.strings.convert.convert_fixed_point + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst new file mode 100644 index 00000000000..9ae4004cea9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst @@ -0,0 +1,6 @@ +============== +convert_floats +============== + +.. automodule:: pylibcudf.strings.convert.convert_floats + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst new file mode 100644 index 00000000000..4ead8677a69 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst @@ -0,0 +1,6 @@ +============ +convert_ipv4 +============ + +.. automodule:: pylibcudf.strings.convert.convert_ipv4 + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst new file mode 100644 index 00000000000..33a719a42e1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst @@ -0,0 +1,6 @@ +============= +convert_lists +============= + +.. automodule:: pylibcudf.strings.convert.convert_lists + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst new file mode 100644 index 00000000000..f20d95e0cdd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst @@ -0,0 +1,6 @@ +============ +convert_urls +============ + +.. automodule:: pylibcudf.strings.convert.convert_urls + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst new file mode 100644 index 00000000000..fa05cb7d786 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst @@ -0,0 +1,14 @@ +convert +======= + +.. toctree:: + :maxdepth: 1 + + convert_booleans + convert_datetime + convert_durations + convert_fixed_point + convert_floats + convert_ipv4 + convert_lists + convert_urls diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 48dc8a13c3e..65dc5d2d1c3 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -21,3 +21,9 @@ strings split strip wrap + +.. 
toctree:: + :maxdepth: 2 + :caption: Subpackages + + convert/index.rst diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx index 73aebf8ab35..3a2cb4bd5c7 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx @@ -1,23 +1,13 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_lists cimport ( - format_list_column as cpp_format_list_column, -) - from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar -from cudf._lib.scalar cimport DeviceScalar - @acquire_spill_lock() def format_list_column(Column source_list, Column separators): @@ -34,19 +24,9 @@ def format_list_column(Column source_list, Column separators): ------- Formatted strings column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_list.view() - cdef column_view separators_view = separators.view() - # Use 'None' as null-replacement string - cdef DeviceScalar str_na_rep = as_device_scalar("None") - cdef const string_scalar* string_scalar_na_rep = ( - str_na_rep.get_raw_ptr()) - - with nogil: - c_result = move(cpp_format_list_column( - source_view, string_scalar_na_rep[0], separators_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_lists.format_list_column( + source_list.to_pylibcudf(mode="read"), + as_device_scalar("None").c_value, + separators.to_pylibcudf(mode="read"), ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd index 109111568d8..6e1ecd30539 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd @@ -9,6 +9,6 @@ cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] format_list_column( - column_view input_col, + column_view input, string_scalar na_rep, column_view separators) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 7b228c06a18..846070870b1 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx - convert_fixed_point.pyx convert_floats.pyx convert_ipv4.pyx convert_urls.pyx +set(cython_sources + convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx + convert_floats.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index be6145384ad..799532d72c6 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -6,5 +6,6 @@ from . cimport ( convert_fixed_point, convert_floats, convert_ipv4, + convert_lists, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index 7c94387282b..deb2d8ab74b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -6,5 +6,6 @@ convert_fixed_point, convert_floats, convert_ipv4, + convert_lists, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 40dadf6f967..60a8fca8baf 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -15,7 +15,7 @@ cpdef Column to_fixed_point(Column input, DataType output_type): Returns a new fixed-point column parsing decimal values from the provided strings column. - For details, see :cpp:details:`cudf::strings::to_fixed_point` + For details, see :cpp:func:`cudf::strings::to_fixed_point` Parameters ---------- @@ -47,7 +47,7 @@ cpdef Column from_fixed_point(Column input): Returns a new strings column converting the fixed-point values into a strings column. - For details, see :cpp:details:`cudf::strings::from_fixed_point` + For details, see :cpp:func:`cudf::strings::from_fixed_point` Parameters ---------- @@ -75,7 +75,7 @@ cpdef Column is_fixed_point(Column input, DataType decimal_type=None): Returns a boolean column identifying strings in which all characters are valid for conversion to fixed-point. - For details, see :cpp:details:`cudf::strings::is_fixed_point` + For details, see :cpp:func:`cudf::strings::is_fixed_point` Parameters ---------- diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd new file mode 100644 index 00000000000..1ba4272afa2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column format_list_column( + Column input, + Scalar na_rep=*, + Column separators=* +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx new file mode 100644 index 00000000000..3fbc08a9ab5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
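+#
+# A rough usage sketch (illustrative values, mirroring the test added in this
+# PR; with the defaults, elements are joined with "," inside "[" and "]", and
+# null rows render via na_rep):
+#
+#   import pyarrow as pa
+#   import pylibcudf as plc
+#   lists = plc.interop.from_arrow(pa.array([["1", "A"], None]))
+#   out = plc.strings.convert.convert_lists.format_list_column(lists)
+#   # -> ["[1,A]", ""]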
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.column_factories cimport make_empty_column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings.convert cimport ( + convert_lists as cpp_convert_lists, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.types cimport type_id + +from cython.operator import dereference + + +cpdef Column format_list_column( + Column input, + Scalar na_rep=None, + Column separators=None +): + """ + Convert a list column of strings into a formatted strings column. + + For details, see :cpp:func`cudf::strings::format_list_column` + + Parameters + ---------- + input : Column + Lists column to format + + na_rep : Scalar + Replacement string for null elements. + Default, empty string + + separators : Column + Strings to use for enclosing list components and separating elements. + Default, ``,``, ``[``, ``]`` + + Returns + ------- + Column + New strings column + """ + cdef unique_ptr[column] c_result + + if na_rep is None: + na_rep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cdef const string_scalar* c_na_rep = ( + na_rep.c_obj.get() + ) + + if separators is None: + separators = make_empty_column(type_id.STRING) + + with nogil: + c_result = move( + cpp_convert_lists.format_list_column( + input.view(), + dereference(c_na_rep), + separators.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py new file mode 100644 index 00000000000..8591732b39e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
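+#
+# Parametrizes format_list_column over explicit and default na_rep and
+# separators; with the defaults, a null list row formats as the empty string.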
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.mark.parametrize("na_rep", [None, pa.scalar("")]) +@pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])]) +def test_format_list_column(na_rep, separators): + arr = pa.array([["1", "A"], None]) + result = plc.strings.convert.convert_lists.format_list_column( + plc.interop.from_arrow(arr), + na_rep if na_rep is None else plc.interop.from_arrow(na_rep), + separators + if separators is None + else plc.interop.from_arrow(separators), + ) + expected = pa.array(["[1,A]", ""]) + assert_column_eq(result, expected) From 7d49df7d8a1a49de628181d81ef87b186b1ea594 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:52:17 -1000 Subject: [PATCH 15/24] Add json APIs to pylibcudf (#17025) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17025 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/json.rst | 6 + python/cudf/cudf/_lib/strings/__init__.py | 2 +- python/cudf/cudf/_lib/strings/json.pyx | 80 ++------- python/cudf/cudf/core/column/string.py | 3 +- python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 2 + python/pylibcudf/pylibcudf/__init__.py | 2 + python/pylibcudf/pylibcudf/json.pxd | 16 ++ python/pylibcudf/pylibcudf/json.pyx | 154 ++++++++++++++++++ .../pylibcudf/libcudf/{strings => }/json.pxd | 0 .../pylibcudf/pylibcudf/strings/__init__.pxd | 4 + .../pylibcudf/pylibcudf/strings/__init__.py | 3 + python/pylibcudf/pylibcudf/tests/test_json.py | 42 +++++ python/pylibcudf/pyproject.toml | 3 +- 15 files changed, 246 insertions(+), 73 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst create mode 100644 python/pylibcudf/pylibcudf/json.pxd create mode 100644 python/pylibcudf/pylibcudf/json.pyx rename python/pylibcudf/pylibcudf/libcudf/{strings => }/json.pxd (100%) create mode 100644 python/pylibcudf/pylibcudf/tests/test_json.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 052479d6720..62e14a67ee5 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf. groupby interop join + json labeling lists merge diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst new file mode 100644 index 00000000000..bb38d179a57 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst @@ -0,0 +1,6 @@ +==== +json +==== + +.. 
automodule:: pylibcudf.json + :members: diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index e712937f816..ffa5e603408 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -72,7 +72,7 @@ ) from cudf._lib.strings.find_multiple import find_multiple from cudf._lib.strings.findall import find_re, findall -from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object +from cudf._lib.strings.json import get_json_object from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx index c9b0bba088d..374a104635a 100644 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ b/python/cudf/cudf/_lib/strings/json.pyx @@ -1,84 +1,26 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc +from pylibcudf.json cimport GetJsonObjectOptions from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.json cimport ( - get_json_object as cpp_get_json_object, - get_json_object_options, -) - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar @acquire_spill_lock() def get_json_object( - Column col, object py_json_path, GetJsonObjectOptions options): + Column col, + object py_json_path, + GetJsonObjectOptions options +): """ Apply a JSONPath string to all rows in an input column of json strings. 
""" - cdef unique_ptr[column] c_result - - cdef column_view col_view = col.view() - cdef DeviceScalar json_path = py_json_path.device_value - - cdef const string_scalar* scalar_json_path = ( - json_path.get_raw_ptr() + plc_column = plc.json.get_json_object( + col.to_pylibcudf(mode="read"), + py_json_path.device_value.c_value, + options ) - - with nogil: - c_result = move(cpp_get_json_object( - col_view, - scalar_json_path[0], - options.options, - )) - - return Column.from_unique_ptr(move(c_result)) - - -cdef class GetJsonObjectOptions: - cdef get_json_object_options options - - def __init__( - self, - *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False - ): - self.options.set_allow_single_quotes(allow_single_quotes) - self.options.set_strip_quotes_from_single_strings( - strip_quotes_from_single_strings - ) - self.options.set_missing_fields_as_nulls(missing_fields_as_nulls) - - @property - def allow_single_quotes(self): - return self.options.get_allow_single_quotes() - - @property - def strip_quotes_from_single_strings(self): - return self.options.get_strip_quotes_from_single_strings() - - @property - def missing_fields_as_nulls(self): - return self.options.get_missing_fields_as_nulls() - - @allow_single_quotes.setter - def allow_single_quotes(self, val): - self.options.set_allow_single_quotes(val) - - @strip_quotes_from_single_strings.setter - def strip_quotes_from_single_strings(self, val): - self.options.set_strip_quotes_from_single_strings(val) - - @missing_fields_as_nulls.setter - def missing_fields_as_nulls(self, val): - self.options.set_missing_fields_as_nulls(val) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b50e23bd52e..45d1a8b087b 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2385,8 +2385,7 @@ def get_json_object( 0 [\n { "category": "reference",\n ... dtype: object """ - - options = libstrings.GetJsonObjectOptions( + options = plc.json.GetJsonObjectOptions( allow_single_quotes=allow_single_quotes, strip_quotes_from_single_strings=( strip_quotes_from_single_strings diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index 1d72eacac12..2854d7c42ac 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -27,6 +27,7 @@ set(cython_sources groupby.pyx interop.pyx join.pyx + json.pyx labeling.pyx lists.pyx merge.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index b98b37fe0fd..79c2f0c5455 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -13,6 +13,7 @@ from . 
cimport ( filling, groupby, join, + json, labeling, lists, merge, @@ -60,6 +61,7 @@ __all__ = [ "gpumemoryview", "groupby", "join", + "json", "lists", "merge", "null_mask", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 304f27be340..88e72418cda 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -24,6 +24,7 @@ interop, io, join, + json, labeling, lists, merge, @@ -73,6 +74,7 @@ "interop", "io", "join", + "json", "labeling", "lists", "merge", diff --git a/python/pylibcudf/pylibcudf/json.pxd b/python/pylibcudf/pylibcudf/json.pxd new file mode 100644 index 00000000000..87a87349b8a --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.json cimport get_json_object_options +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + cdef get_json_object_options options + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=* +) diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx new file mode 100644 index 00000000000..4a8d11068f9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -0,0 +1,154 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf cimport json as cpp_json +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + """Settings for ``get_json_object()``""" + def __init__( + self, + *, + allow_single_quotes=False, + strip_quotes_from_single_strings=True, + missing_fields_as_nulls=False + ): + self.set_allow_single_quotes(allow_single_quotes) + self.set_strip_quotes_from_single_strings( + strip_quotes_from_single_strings + ) + self.set_missing_fields_as_nulls(missing_fields_as_nulls) + + def get_allow_single_quotes(self): + """ + Returns true/false depending on whether single-quotes for representing strings + are allowed. + + Returns + ------- + bool + true if single-quotes are allowed, false otherwise. + """ + return self.options.get_allow_single_quotes() + + def get_strip_quotes_from_single_strings(self): + """ + Returns true/false depending on whether individually returned string values have + their quotes stripped. + + Returns + ------- + bool + true if individually returned string values have their quotes stripped. + """ + return self.options.get_strip_quotes_from_single_strings() + + def get_missing_fields_as_nulls(self): + """ + Whether a field not contained by an object is to be interpreted as null. + + Returns + ------- + bool + true if missing fields are interpreted as null. + """ + return self.options.get_missing_fields_as_nulls() + + def set_allow_single_quotes(self, bool val): + """ + Set whether single-quotes for strings are allowed. + + Parameters + ---------- + val : bool + Whether to allow single quotes + + Returns + ------- + None + """ + self.options.set_allow_single_quotes(val) + + def set_strip_quotes_from_single_strings(self, bool val): + """ + Set whether individually returned string values have their quotes stripped. + + Parameters + ---------- + val : bool + Whether to strip quotes from single strings. 
+ + Returns + ------- + None + """ + self.options.set_strip_quotes_from_single_strings(val) + + def set_missing_fields_as_nulls(self, bool val): + """ + Set whether missing fields are interpreted as null. + + Parameters + ---------- + val : bool + Whether to treat missing fields as nulls. + + Returns + ------- + None + """ + self.options.set_missing_fields_as_nulls(val) + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=None +): + """ + Apply a JSONPath string to all rows in an input strings column. + + For details, see :cpp:func:`cudf::get_json_object` + + Parameters + ---------- + col : Column + The input strings column. Each row must contain a valid json string. + + json_path : Scalar + The JSONPath string to be applied to each row. + + options : GetJsonObjectOptions + Options for controlling the behavior of the function. + + Returns + ------- + Column + New strings column containing the retrieved json object strings. + """ + cdef unique_ptr[column] c_result + cdef string_scalar* c_json_path = ( + json_path.c_obj.get() + ) + if options is None: + options = GetJsonObjectOptions() + + cdef cpp_json.get_json_object_options c_options = options.options + + with nogil: + c_result = move( + cpp_json.get_json_object( + col.view(), + dereference(c_json_path), + c_options + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd b/python/pylibcudf/pylibcudf/libcudf/json.pxd similarity index 100% rename from python/pylibcudf/pylibcudf/libcudf/strings/json.pxd rename to python/pylibcudf/pylibcudf/libcudf/json.pxd diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 187ef113073..e45048a500f 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -14,6 +14,7 @@ from . cimport ( padding, regex_flags, regex_program, + repeat, replace, side_type, slice, @@ -33,9 +34,12 @@ __all__ = [ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", "slice", "strip", diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 6033cea0625..c6253d94b40 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -34,9 +34,12 @@ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", "slice", "strip", diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py new file mode 100644 index 00000000000..3d2955211f8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_json.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
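+#
+# Exercises get_json_object across every combination of the three
+# GetJsonObjectOptions flags; the JSONPath "$.foo.bar" selects the nested
+# list from each input document.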
+import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def plc_col(): +    arr = pa.array( +        ['{"foo": {"bar": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]', None] +    ) +    return plc.interop.from_arrow(arr) + + +@pytest.fixture(scope="module") +def json_path(): +    slr = pa.scalar("$.foo.bar") +    return plc.interop.from_arrow(slr) + + +@pytest.mark.parametrize("allow_single_quotes", [True, False]) +@pytest.mark.parametrize("strip_quotes_from_single_strings", [True, False]) +@pytest.mark.parametrize("missing_fields_as_nulls", [True, False]) +def test_get_json_object( +    plc_col, +    json_path, +    allow_single_quotes, +    strip_quotes_from_single_strings, +    missing_fields_as_nulls, +): +    result = plc.json.get_json_object( +        plc_col, +        json_path, +        plc.json.GetJsonObjectOptions( +            allow_single_quotes=allow_single_quotes, +            strip_quotes_from_single_strings=strip_quotes_from_single_strings, +            missing_fields_as_nulls=missing_fields_as_nulls, +        ), +    ) +    expected = pa.array(['[{"a": 1, "b": 2}, {"a": 3, "b": 4}]', None]) +    assert_column_eq(result, expected) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index c9a685de3e9..ea5b3065896 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -97,7 +97,8 @@ skip = [ ] [tool.pytest.ini_options] -addopts = "--tb=native --strict-config --strict-markers" +# --import-mode=importlib because two test_json.py files exist and the tests directory is not a structured module +addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib" empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", From 097778e82a6b580bfa91d7941379d33acc324c60 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:25:27 -1000 Subject: [PATCH 16/24] Move pylibcudf/libcudf/wrappers/decimals to pylibcudf/libcudf/fixed_point (#17048) Contributes to https://github.com/rapidsai/cudf/issues/15162 I don't think there are any types in this file that need to be exposed on the Python side; they're just used internally in pylibcudf. Also moves this to `libcudf/fixed_point`, matching the libcudf location more closely. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17048 --- .../pylibcudf/libcudf/fixed_point/__init__.pxd | 0 .../libcudf/fixed_point/fixed_point.pxd | 8 ++++++++ .../pylibcudf/libcudf/scalar/scalar.pxd | 2 +- .../pylibcudf/libcudf/wrappers/decimals.pxd | 16 ---------------- 4 files changed, 9 insertions(+), 17 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd create mode 100644 python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd delete mode 100644 python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd diff --git a/python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd new file mode 100644 index 00000000000..e55574020f4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION.
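+# Note: only numeric::scale_type is declared below; per this PR's description,
+# the fixed-point types are used internally by pylibcudf and are not meant to
+# be exposed on the Python side.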
+ + from libc.stdint cimport int32_t + + cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: +    cdef cppclass scale_type: +        scale_type(int32_t) diff --git a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd index 4b40a8a26f6..a51413669c5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd @@ -4,9 +4,9 @@ from libc.stdint cimport int32_t, int64_t from libcpp cimport bool from libcpp.string cimport string from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.fixed_point.fixed_point cimport scale_type from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport data_type -from pylibcudf.libcudf.wrappers.decimals cimport scale_type cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd deleted file mode 100644 index 558299501d6..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t, int64_t -from pylibcudf.libcudf.types cimport int128 - - -cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: -    # cython type stub to help resolve to numeric::decimal64 -    ctypedef int64_t decimal64 -    # cython type stub to help resolve to numeric::decimal32 -    ctypedef int64_t decimal32 -    # cython type stub to help resolve to numeric::decimal128 -    ctypedef int128 decimal128 - -    cdef cppclass scale_type: -        scale_type(int32_t) From 1436cac9de8b450a32e71d5b779503e9a29edaa6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:26:44 -1000 Subject: [PATCH 17/24] Remove unneeded pylibcudf.libcudf.wrappers.duration usage in cudf (#17010) Contributes to https://github.com/rapidsai/cudf/issues/15162 ~I just assumed since the associated libcudf files just publicly expose C types, we just want to match the namespacing when importing from pylibcudf (avoid importing from `pylibcudf.libcudf`) and not necessarily expose a Python equivalent?~ ~Let me know if I am misunderstanding how to expose these types.~ https://github.com/rapidsai/cudf/pull/17010#issuecomment-2403658378 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17010 --- python/cudf/cudf/_lib/scalar.pyx | 96 +------------------------------- 1 file changed, 1 insertion(+), 95 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0dde91316fb..56712402919 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -6,7 +6,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from libc.stdint cimport int64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -25,25 +24,7 @@ cimport pylibcudf.libcudf.types as libcudf_types # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar).
from pylibcudf cimport Scalar as plc_Scalar -from pylibcudf.libcudf.scalar.scalar cimport ( - duration_scalar, - list_scalar, - scalar, - struct_scalar, - timestamp_scalar, -) -from pylibcudf.libcudf.wrappers.durations cimport ( - duration_ms, - duration_ns, - duration_s, - duration_us, -) -from pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_ns, - timestamp_s, - timestamp_us, -) +from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id @@ -284,62 +265,6 @@ cdef class DeviceScalar: ] -# TODO: Currently the only uses of this function and the one below are in -# _create_proxy_nat_scalar. See if that code path can be simplified to excise -# or at least simplify these implementations. -cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "datetime64[s]": - s.reset( - new timestamp_scalar[timestamp_s](np.int64(value), valid) - ) - elif dtype == "datetime64[ms]": - s.reset( - new timestamp_scalar[timestamp_ms](np.int64(value), valid) - ) - elif dtype == "datetime64[us]": - s.reset( - new timestamp_scalar[timestamp_us](np.int64(value), valid) - ) - elif dtype == "datetime64[ns]": - s.reset( - new timestamp_scalar[timestamp_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - -cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "timedelta64[s]": - s.reset( - new duration_scalar[duration_s](np.int64(value), valid) - ) - elif dtype == "timedelta64[ms]": - s.reset( - new duration_scalar[duration_ms](np.int64(value), valid) - ) - elif dtype == "timedelta64[us]": - s.reset( - new duration_scalar[duration_us](np.int64(value), valid) - ) - elif dtype == "timedelta64[ns]": - s.reset( - new duration_scalar[duration_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - - def as_device_scalar(val, dtype=None): if isinstance(val, (cudf.Scalar, DeviceScalar)): if dtype == val.dtype or dtype is None: @@ -361,22 +286,3 @@ def _is_null_host_scalar(slr): return True else: return False - - -def _create_proxy_nat_scalar(dtype): - cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar) - - dtype = cudf.dtype(dtype) - if dtype.char in 'mM': - nat = dtype.type('NaT').astype(dtype) - if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - return result - else: - raise TypeError('NAT only valid for datetime and timedelta') From 89a6fe575f2ac0caa661dd51f87b37dba07507a7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 11 Oct 2024 08:59:20 -0500 Subject: [PATCH 18/24] make conda installs in CI stricter (part 2) (#17042) Follow-up to #17013 Changes relative to that PR: * switches to pinning CI conda installs to the output of `rapids-version` (`{major}.{minor}.{patch}`) instead of `rapids-version-major-minor` (`{major}.{minor}`), to get a bit more protection in the presence of hotfix releases * restores some exporting of variables needed for docs builds I made some mistakes in https://github.com/rapidsai/cudf/pull/17013#discussion_r1792317422. 
Missed that this project's Doxygen setup is expecting to find `RAPIDS_VERSION` and `RAPIDS_VERSION_MAJOR_MINOR` defined in the environment. https://github.com/rapidsai/cudf/blob/7173b52fce25937bb69e22a083a5de4655078fa1/cpp/doxygen/Doxyfile#L41 https://github.com/rapidsai/cudf/blob/7173b52fce25937bb69e22a083a5de4655078fa1/cpp/doxygen/Doxyfile#L2229 Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17042 --- ci/build_docs.sh | 16 ++++++++-------- ci/test_cpp_common.sh | 10 +++++----- ci/test_java.sh | 4 ++-- ci/test_notebooks.sh | 6 +++--- ci/test_python_common.sh | 6 +++--- ci/test_python_other.sh | 8 ++++---- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index dae6ac46757..4290d013fe4 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -3,8 +3,8 @@ set -euo pipefail -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" +export RAPIDS_VERSION="$(rapids-version)" +export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh @@ -28,16 +28,16 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "pylibcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "dask-cudf=${RAPIDS_VERSION_MAJOR_MINOR}" + "libcudf=${RAPIDS_VERSION}" \ + "pylibcudf=${RAPIDS_VERSION}" \ + "cudf=${RAPIDS_VERSION}" \ + "dask-cudf=${RAPIDS_VERSION}" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" pushd cpp/doxygen -aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_NUMBER}/rmm.tag . || echo "Failed to download rmm Doxygen tag" +aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag" doxygen Doxyfile mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html" mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html" @@ -57,4 +57,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html" mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" popd -rapids-upload-docs +RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh index e8f6e9388f4..8cd78eb11c2 100755 --- a/ci/test_cpp_common.sh +++ b/ci/test_cpp_common.sh @@ -5,7 +5,7 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-logger "Generate C++ testing dependencies" @@ -33,10 +33,10 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "libcudf_kafka=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "libcudf-tests=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "libcudf-example=${RAPIDS_VERSION_MAJOR_MINOR}" + "libcudf=${RAPIDS_VERSION}" \ + "libcudf_kafka=${RAPIDS_VERSION}" \ + "libcudf-tests=${RAPIDS_VERSION}" \ + "libcudf-example=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_java.sh b/ci/test_java.sh index 9b7b2e48dd6..7f1aa633afc 100755 --- a/ci/test_java.sh +++ b/ci/test_java.sh @@ -5,7 +5,7 @@ set -euo pipefail . 
/opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-logger "Generate Java testing dependencies" @@ -32,7 +32,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" + "libcudf=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index 3e0712a0691..4197dc5617f 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -5,7 +5,7 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-logger "Generate notebook testing dependencies" @@ -32,8 +32,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" + "cudf=${RAPIDS_VERSION}" \ + "libcudf=${RAPIDS_VERSION}" NBTEST="$(realpath "$(dirname "$0")/utils/nbtest.sh")" pushd notebooks diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index 81e82908eb4..4327bfff3af 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -7,7 +7,7 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-logger "Generate Python testing dependencies" @@ -40,5 +40,5 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" + "cudf=${RAPIDS_VERSION}" \ + "libcudf=${RAPIDS_VERSION}" diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index eee1d54083f..21a59fa1494 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -7,14 +7,14 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ # Common setup steps shared by Python test jobs source ./ci/test_python_common.sh test_python_other -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "dask-cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "cudf_kafka=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "custreamz=${RAPIDS_VERSION_MAJOR_MINOR}" + "dask-cudf=${RAPIDS_VERSION}" \ + "cudf_kafka=${RAPIDS_VERSION}" \ + "custreamz=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi From 7cf0a1b4c741f6cc3e4599d41d614e1c046f8a13 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 11 Oct 2024 16:46:53 +0200 Subject: [PATCH 19/24] Pylibcudf: pack and unpack (#17012) Adding python bindings to [`cudf::pack()`](https://docs.rapids.ai/api/libcudf/legacy/group__copy__split#ga86716e7ec841541deb6edc7e91fcb9e4), [`cudf::unpack()`](https://docs.rapids.ai/api/libcudf/legacy/group__copy__split#ga1d62a18c2e6f087a92289c63693762cc), and [`cudf::packed_columns`](https://docs.rapids.ai/api/libcudf/legacy/structcudf_1_1packed__columns). This is the first step to support serialization of cudf.polars' IR. cc. @wence- @rjzamora # Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17012 --- python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 2 + python/pylibcudf/pylibcudf/__init__.py | 2 + .../pylibcudf/pylibcudf/contiguous_split.pxd | 20 ++ .../pylibcudf/pylibcudf/contiguous_split.pyx | 198 ++++++++++++++++++ .../pylibcudf/libcudf/contiguous_split.pxd | 5 + .../pylibcudf/tests/test_contiguous_split.py | 49 +++++ 7 files changed, 277 insertions(+) create mode 100644 python/pylibcudf/pylibcudf/contiguous_split.pxd create mode 100644 python/pylibcudf/pylibcudf/contiguous_split.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_contiguous_split.py diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index 2854d7c42ac..15dd2b4c34f 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -17,6 +17,7 @@ set(cython_sources binaryop.pyx column.pyx column_factories.pyx + contiguous_split.pyx concatenate.pyx copying.pyx datetime.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index 79c2f0c5455..aa67b4b1149 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -6,6 +6,7 @@ from . cimport ( binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, @@ -52,6 +53,7 @@ __all__ = [ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 88e72418cda..4033062b7e2 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -15,6 +15,7 @@ binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, @@ -63,6 +64,7 @@ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd new file mode 100644 index 00000000000..2a10cb5b3d5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf.contiguous_split cimport packed_columns + +from .gpumemoryview cimport gpumemoryview +from .table cimport Table + + +cdef class PackedColumns: + cdef unique_ptr[packed_columns] c_obj + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data) + +cpdef PackedColumns pack(Table input) + +cpdef Table unpack(PackedColumns input) + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx new file mode 100644 index 00000000000..ed926a3fcc0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -0,0 +1,198 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
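+# HostBuffer below implements the Python buffer protocol so the packed
+# metadata can be handed out as a zero-copy memoryview; the device allocation
+# is surfaced as a gpumemoryview over an rmm DeviceBuffer (see release()).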
+ +from cython.operator cimport dereference +from libc.stdint cimport uint8_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.libcudf.contiguous_split cimport ( + pack as cpp_pack, + packed_columns, + unpack as cpp_unpack, +) +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view + +from rmm.pylibrmm.device_buffer cimport DeviceBuffer + +from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .utils cimport int_to_void_ptr + + +cdef class HostBuffer: + """Owning host buffer that implements the buffer protocol""" + cdef unique_ptr[vector[uint8_t]] c_obj + cdef size_t nbytes + cdef Py_ssize_t[1] shape + cdef Py_ssize_t[1] strides + + @staticmethod + cdef HostBuffer from_unique_ptr( + unique_ptr[vector[uint8_t]] vec + ): + cdef HostBuffer out = HostBuffer() + out.c_obj = move(vec) + out.nbytes = dereference(out.c_obj).size() + out.shape[0] = out.nbytes + out.strides[0] = 1 + return out + + def __getbuffer__(self, Py_buffer *buffer, int flags): + buffer.buf = dereference(self.c_obj).data() + buffer.format = NULL # byte + buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = self.nbytes + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass + +cdef class PackedColumns: + """Column data in a serialized format. + + Contains data from an array of columns in two contiguous buffers: + one on host, which contains table metadata and one on device which + contains the table data. + + For details, see :cpp:class:`cudf::packed_columns`. + """ + def __init__(self): + raise ValueError( + "PackedColumns should not be constructed directly. " + "Use one of the factories." + ) + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data): + """Create a Python PackedColumns from a libcudf packed_columns.""" + cdef PackedColumns out = PackedColumns.__new__(PackedColumns) + out.c_obj = move(data) + return out + + def release(self): + """Releases and returns the underlying serialized metadata and gpu data. + + The ownership of the memory are transferred to the returned buffers. After + this call, `self` is empty. + + Returns + ------- + memoryview (of a HostBuffer) + The serialized metadata as contiguous host memory. + gpumemoryview (of a rmm.DeviceBuffer) + The serialized gpu data as contiguous device memory. + """ + if not (dereference(self.c_obj).metadata and dereference(self.c_obj).gpu_data): + raise ValueError("Cannot release empty PackedColumns") + + return ( + memoryview( + HostBuffer.from_unique_ptr(move(dereference(self.c_obj).metadata)) + ), + gpumemoryview( + DeviceBuffer.c_from_unique_ptr(move(dereference(self.c_obj).gpu_data)) + ) + ) + + +cpdef PackedColumns pack(Table input): + """Deep-copy a table into a serialized contiguous memory format. + + Later use `unpack` or `unpack_from_memoryviews` to unpack the serialized + data back into the table. + + Examples + -------- + >>> packed = pylibcudf.contiguous_split.pack(...) + >>> # Either unpack the whole `PackedColumns` at once. + >>> pylibcudf.contiguous_split.unpack(packed) + >>> # Or unpack the two serialized buffers in `PackedColumns`. + >>> metadata, gpu_data = packed.release() + >>> pylibcudf.contiguous_split.unpack_from_memoryviews(metadata, gpu_data) + + For details, see :cpp:func:`cudf::pack`. 
+ + Parameters + ---------- + input : Table + Table to pack. + + Returns + ------- + PackedColumns + The packed columns. + """ + return PackedColumns.from_libcudf( + make_unique[packed_columns](cpp_pack(input.view())) + ) + + +cpdef Table unpack(PackedColumns input): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Contrary to the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + input : PackedColumns + The packed columns to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + cdef table_view v = cpp_unpack(dereference(input.c_obj)) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see . + cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) + + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Contrary to the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + metadata : memoryview + The packed metadata to unpack. + gpu_data : gpumemoryview + The packed gpu_data to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + if metadata.nbytes == 0: + if gpu_data.__cuda_array_interface__["data"][0] != 0: + raise ValueError("Expected an empty gpu_data from unpacking an empty table") + return Table.from_libcudf(make_unique[table](table_view())) + + # Extract the raw data pointers + cdef const uint8_t[::1] _metadata = metadata + cdef const uint8_t* metadata_ptr = &_metadata[0] + cdef const uint8_t* gpu_data_ptr = int_to_void_ptr(gpu_data.ptr) + + cdef table_view v = cpp_unpack(metadata_ptr, gpu_data_ptr) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see . + cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) diff --git a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index 6de9c4382d3..12090af16cc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -26,3 +26,8 @@ cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: cdef packed_columns pack (const table_view& input) except + cdef table_view unpack (const packed_columns& input) except + + + cdef table_view unpack ( + const uint8_t* metadata, + const uint8_t* gpu_data + ) except + diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py new file mode 100644 index 00000000000..7a5c1664eed --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
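+# In sketch form, the round trip these tests exercise (names as in this PR):
+#
+#     packed = plc.contiguous_split.pack(plc_tbl)
+#     metadata, gpu_data = packed.release()  # host buffer + device buffer
+#     out = plc.contiguous_split.unpack_from_memoryviews(metadata, gpu_data)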
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_table_eq + +param_pyarrow_tables = [ + pa.table([]), + pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), + pa.table({"a": [1, 2, 3]}), + pa.table({"a": [1], "b": [2], "c": [3]}), + pa.table({"a": ["a", "bb", "ccc"]}), + pa.table({"a": [1, 2, None], "b": [None, 3, 4]}), + pa.table( + { + "a": [["a", "b"], ["cde"]], + "b": [ + {"alpha": [1, 2], "beta": None}, + {"alpha": [3, 4], "beta": 5}, + ], + } + ), +] + + +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) +def test_pack_and_unpack(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + res = plc.contiguous_split.unpack(packed) + assert_table_eq(arrow_tbl, res) + + +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) +def test_pack_and_unpack_from_memoryviews(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + metadata, gpudata = packed.release() + + with pytest.raises(ValueError, match="Cannot release empty"): + packed.release() + + del packed # `metadata` and `gpudata` will survive + + res = plc.contiguous_split.unpack_from_memoryviews(metadata, gpudata) + assert_table_eq(arrow_tbl, res) From 66a94c3d025931b50b08a8a7bdda3363904dbef4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 08:41:47 -0700 Subject: [PATCH 20/24] Replace deprecated cuco APIs with updated versions (#17052) This PR replaces the deprecated cuco APIs with the new ones, ensuring the code is up to date with the latest API changes. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17052 --- cpp/src/io/parquet/chunk_dict.cu | 4 ++-- cpp/src/join/mixed_join_kernels_semi.cu | 2 +- cpp/src/join/mixed_join_semi.cu | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 17ccb73c0a8..1a2a9eac17d 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -84,7 +84,7 @@ struct map_insert_fn { storage_ref}; // Create a map ref with `cuco::insert` operator - auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); + auto map_insert_ref = hash_map_ref.rebind_operators(cuco::insert); auto const t = threadIdx.x; // Create atomic refs to the current chunk's num_dict_entries and uniq_data_size @@ -186,7 +186,7 @@ struct map_find_fn { storage_ref}; // Create a map ref with `cuco::find` operator - auto const map_find_ref = hash_map_ref.with_operators(cuco::find); + auto const map_find_ref = hash_map_ref.rebind_operators(cuco::find); auto const t = threadIdx.x; // Note: Adjust the following loop to use `cg::tiles` if needed in the future. 
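// For reference, the cuco ref-API renames applied across this PR (collected
// from the hunks in this patch; the rebind_* forms are the non-deprecated
// spellings):
//   ref.with_operators(...)     -> ref.rebind_operators(...)
//   ref.with_key_eq(...)        -> ref.rebind_key_eq(...)
//   ref.with_hash_function(...) -> ref.rebind_hash_function(...)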
diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index bd8c80652a0..a4ec97af235 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -67,7 +67,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) evaluator, thread_intermediate_storage, swap_tables, equality_probe}; // Create set ref with the new equality comparator - auto const set_ref_equality = set_ref.with_key_eq(equality); + auto const set_ref_equality = set_ref.rebind_key_eq(equality); // Total number of rows to query the set auto const outer_num_rows = left_table.num_rows(); diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 83a55eca50f..62ba558b0bd 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -184,7 +184,8 @@ std::unique_ptr> mixed_join_semi( auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); + hash_set_ref_type const row_set_ref = + row_set.ref(cuco::contains).rebind_hash_function(hash_probe); // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); From 349010e75f94488c6385f773bbca872ccc5f34b6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 08:42:12 -0700 Subject: [PATCH 21/24] Remove unused hash helper functions (#17056) This PR removes unused hash detail implementations. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17056 --- .../cudf/hashing/detail/helper_functions.cuh | 194 ------------------ 1 file changed, 194 deletions(-) diff --git a/cpp/include/cudf/hashing/detail/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh index 3489fdeccee..ea1accc62a4 100644 --- a/cpp/include/cudf/hashing/detail/helper_functions.cuh +++ b/cpp/include/cudf/hashing/detail/helper_functions.cuh @@ -47,197 +47,3 @@ inline size_t compute_hash_table_size(cudf::size_type num_keys_to_insert, return hash_table_size; } - -template -__forceinline__ __device__ pair_type load_pair_vectorized(pair_type const* __restrict__ const ptr) -{ - if (sizeof(uint4) == sizeof(pair_type)) { - union pair_type2vec_type { - uint4 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0, 0, 0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(uint2) == sizeof(pair_type)) { - union pair_type2vec_type { - uint2 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(int) == sizeof(pair_type)) { - union pair_type2vec_type { - int vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(short) == sizeof(pair_type)) { - union pair_type2vec_type { - short vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else { - return *ptr; - } -} - -template -__forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ const ptr, - 
pair_type const val) -{ - if (sizeof(uint4) == sizeof(pair_type)) { - union pair_type2vec_type { - uint4 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0, 0, 0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(uint2) == sizeof(pair_type)) { - union pair_type2vec_type { - uint2 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(int) == sizeof(pair_type)) { - union pair_type2vec_type { - int vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(short) == sizeof(pair_type)) { - union pair_type2vec_type { - short vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else { - *ptr = val; - } -} - -template -CUDF_KERNEL void init_hashtbl(value_type* __restrict__ const hashtbl_values, - size_type const n, - key_type const key_val, - elem_type const elem_val) -{ - size_type const idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - store_pair_vectorized(hashtbl_values + idx, thrust::make_pair(key_val, elem_val)); - } -} - -template -struct equal_to { - using result_type = bool; - using first_argument_type = T; - using second_argument_type = T; - __forceinline__ __host__ __device__ constexpr bool operator()( - first_argument_type const& lhs, second_argument_type const& rhs) const - { - return lhs == rhs; - } -}; - -template -class cycle_iterator_adapter { - public: - using value_type = typename std::iterator_traits::value_type; - using difference_type = typename std::iterator_traits::difference_type; - using pointer = typename std::iterator_traits::pointer; - using reference = typename std::iterator_traits::reference; - using iterator_type = Iterator; - - cycle_iterator_adapter() = delete; - - __host__ __device__ explicit cycle_iterator_adapter(iterator_type const& begin, - iterator_type const& end, - iterator_type const& current) - : m_begin(begin), m_end(end), m_current(current) - { - } - - __host__ __device__ cycle_iterator_adapter& operator++() - { - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return *this; - } - - __host__ __device__ cycle_iterator_adapter const& operator++() const - { - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return *this; - } - - __host__ __device__ cycle_iterator_adapter& operator++(int) - { - cycle_iterator_adapter old(m_begin, m_end, m_current); - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return old; - } - - __host__ __device__ cycle_iterator_adapter const& operator++(int) const - { - cycle_iterator_adapter old(m_begin, m_end, m_current); - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return old; - } - - __host__ __device__ bool equal(cycle_iterator_adapter const& other) const - { - return m_current == other.m_current && m_begin == other.m_begin && m_end == other.m_end; - } - - __host__ __device__ reference& operator*() { return *m_current; } - - __host__ __device__ reference const& operator*() const { return *m_current; } - - __host__ __device__ const pointer operator->() const { return m_current.operator->(); } - - __host__ __device__ pointer operator->() { return m_current; } - - private: - iterator_type m_current; - 
iterator_type m_begin; - iterator_type m_end; -}; - -template -__host__ __device__ bool operator==(cycle_iterator_adapter const& lhs, - cycle_iterator_adapter const& rhs) -{ - return lhs.equal(rhs); -} - -template -__host__ __device__ bool operator!=(cycle_iterator_adapter const& lhs, - cycle_iterator_adapter const& rhs) -{ - return !lhs.equal(rhs); -} From 891e5aa7bf00355dea2b10906cebbe02f9ba25f5 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Fri, 11 Oct 2024 12:18:18 -0400 Subject: [PATCH 22/24] Organize parquet reader mukernel non-nullable code, introduce manual block scans (#16830) This is a collection of a few small optimizations and tweaks for the parquet reader fixed-width mukernels (flat & nested, lists not implemented yet). The benchmark changes are negligible, this is mainly cleanup and code in preparation for the upcoming list mukernel. 1) If not reading the whole page (chunked reads) exit sooner 2) By having each thread keep track of the current valid_count (and not saving-to or reading-from the nesting_info until the end), we don't need to synchronize the block threads as frequently, so these extra syncs are removed. 3) For (non-list) nested columns that aren't nullable, we don't need to loop over the whole nesting depth; only the last level of nesting is used. After removing this loop, the non-nullable code for nested and flat hierarchies is identical, so they're extracted and consolidated into a new function. 4) When doing block scans in the parquet reader we also need to know the per-warp results of the scan. Because cub doesn't return those, we then do an additional warp-wide ballot that is unnecessary. This introduces code that does a block scan manually, saving the intermediate results. However using this code in the flat & nested kernels uses 8 more registers, so it isn't used yet. 5) By doing an exclusive-scan instead of an inclusive-scan, we don't need the extra "- 1's" that were everywhere. Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/16830 --- cpp/src/io/parquet/decode_fixed.cu | 389 +++++++++++++++++------------ 1 file changed, 235 insertions(+), 154 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 8a866141c4b..4522ea7fe56 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,6 +24,59 @@ namespace cudf::io::parquet::detail { namespace { +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for +// lists. 
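+// Worked illustration of the exclusive-scan contract used below: for one warp
+// with thread bits {1, 0, 1, 1}, thread_count_within_warp comes out as
+// {0, 1, 1, 2} and warp_count as 3, so downstream indexing needs none of the
+// "- 1" adjustments the old inclusive-scan code carried.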
+struct block_scan_results { + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) +{ + int const t = threadIdx.x; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + + uint32_t warp_bits = ballot(thread_bit); + scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); +} + +template +__device__ static void scan_block_exclusive_sum(uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results) +{ + // Compute # warps + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + // Compute the warp-wide results + results.warp_bits = warp_bits; + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + + // Share the warp counts amongst the block threads + __shared__ int warp_counts[num_warps]; + if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } + __syncthreads(); + + // Compute block-wide results + results.block_count = 0; + results.thread_count_within_block = results.thread_count_within_warp; + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + } +} + template __device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) @@ -194,7 +247,7 @@ struct decode_fixed_width_split_values_func { } }; -template +template static __device__ int gpuUpdateValidityAndRowIndicesNested( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -211,29 +264,28 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const row_index_lower_bound = s->row_index_lower_bound; - int const max_depth = s->col.max_nesting_depth - 1; + int const max_depth = s->col.max_nesting_depth - 1; + auto& max_depth_ni = s->nesting_info[max_depth]; + int max_depth_valid_count = max_depth_ni.valid_count; + __syncthreads(); while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 
1 : -1; - } + // definition level + int d = 1; + if (t >= batch_size) { + d = -1; + } else if (def) { + d = static_cast(def[rolling_index(value_count + t)]); } - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store @@ -242,90 +294,75 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; - int is_valid; - if constexpr (nullable) { - is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } + int const is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; // thread and block validity count + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int warp_null_count = 0; - if (write_start >= 0 && ni.valid_map != nullptr) { - int const valid_map_offset = ni.valid_map_offset; - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = - (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = cudf::detail::warp_size - - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. 
- size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. + int warp_null_count = 0; + if (ni.valid_map != nullptr) { + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const valid_map_offset = ni.valid_map_offset; + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. 
+ size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // if this is valid and we're at the leaf, output dst_pos - __syncthreads(); // handle modification of ni.value_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + if (d_idx == max_depth) { + if (is_valid) { + int const dst_pos = value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + // update stuff + max_depth_valid_count += block_valid_count; } - __syncthreads(); // handle modification of ni.value_count from below - // update stuff - if (t == 0) { ni.valid_count += block_valid_count; } - } + } // end depth loop value_count += block_value_count; - } + } // end loop if (t == 0) { // update valid value count for decoding and total # of values we've processed - s->nz_count = s->nesting_info[max_depth].valid_count; - s->input_value_count = value_count; - s->input_row_count = value_count; + max_depth_ni.valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - __syncthreads(); - return s->nesting_info[max_depth].valid_count; + return max_depth_valid_count; } -template +template static __device__ int gpuUpdateValidityAndRowIndicesFlat( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -351,83 +388,67 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 1 : -1; - } - } - - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // use definition level & row bounds to determine if is valid int is_valid; - if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; + if (t >= batch_size) { + is_valid = 0; + } else if (def) { + int const def_level = + static_cast(def[rolling_index(value_count + t)]); + is_valid = ((def_level > 0) && in_row_bounds) ? 
1 : 0; } else { is_valid = in_row_bounds; } // thread and block validity count + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } - - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); + uint32_t const warp_validity_mask = ballot(is_valid); + + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
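+    // __ffs is 1-based (0 when no bit is set), so write_start below is the
+    // first in-bounds lane, or -1 when this warp stores nothing; the store is
+    // guarded on write_start >= 0.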
+ int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + int warp_null_count = 0; + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // output offset if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -448,6 +469,70 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +template +static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // cap by last row so that we don't process any rows past what we want to output. 
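+  // In this non-nullable path every in-row-bounds value is valid by
+  // construction, so valid counts simply mirror value counts and no
+  // definition levels or block scans are needed.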
+ int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); + int const row_index_lower_bound = s->row_index_lower_bound; + + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; + + int const max_depth = s->col.max_nesting_depth - 1; + auto& ni = s->nesting_info[max_depth]; + int valid_count = ni.valid_count; + + __syncthreads(); + + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); + + int const thread_value_count = t; + int const block_value_count = batch_size; + + // compute our row index, whether we're in row bounds, and validity + int const row_index = thread_value_count + value_count; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + int const is_valid = in_row_bounds; + int const thread_valid_count = thread_value_count; + int const block_valid_count = block_value_count; + + // if this is valid and we're at the leaf, output dst_pos + if (is_valid) { + // for non-list types, the value count is always the same across + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + + // update stuff + value_count += block_value_count; + valid_count += block_valid_count; + } // end loop + + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + ni.valid_count = valid_count; + ni.value_count = value_count; + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; + } + + return valid_count; +} + // is the page marked nullable or not __device__ inline bool is_nullable(page_state_s* s) { @@ -605,7 +690,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) int valid_count = 0; // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { + // For chunked reads we may not process all of the rows on the page; if not stop early + int last_row = s->first_row + s->num_rows; + while ((s->error == 0) && (processed_count < s->page.num_input_values) && + (s->input_row_count <= last_row)) { int next_valid_count; // only need to process definition levels if this is a nullable column @@ -614,10 +702,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) __syncthreads(); if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNested( + next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); } } @@ -626,15 +714,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
     else {
       processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-
-      if constexpr (has_nesting_t) {
-        next_valid_count =
-          gpuUpdateValidityAndRowIndicesNested(
-            processed_count, s, sb, nullptr, t);
-      } else {
-        next_valid_count = gpuUpdateValidityAndRowIndicesFlat(
-          processed_count, s, sb, nullptr, t);
-      }
+      next_valid_count =
+        gpuUpdateValidityAndRowIndicesNonNullable<decode_block_size_t>(processed_count, s, sb, t);
     }
     __syncthreads();

From 0b840bb0deeffffba8875f5a49395b13334f4f98 Mon Sep 17 00:00:00 2001
From: Hirota Akio <33370421+a-hirota@users.noreply.github.com>
Date: Sat, 12 Oct 2024 02:04:52 +0900
Subject: [PATCH 23/24] docs: change 'CSV' to 'csv' in python/custreamz/README.md
 to match kafka.py (#17041)

This PR corrects a typo in the `python/custreamz/README.md` file by changing
the uppercase `'CSV'` to lowercase `'csv'`. This change aligns the
documentation with the `message_format` options defined in
`python/custreamz/custreamz/kafka.py`, ensuring consistency across the
codebase.

Authors:
  - Hirota Akio (https://github.com/a-hirota)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17041
---
 python/custreamz/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/custreamz/README.md b/python/custreamz/README.md
index 8da17ef09dc..e81fc35c544 100644
--- a/python/custreamz/README.md
+++ b/python/custreamz/README.md
@@ -26,7 +26,7 @@ tips_df = consumer.read_gdf(topic="custreamz_tips",
                             partition=0,
                             start=0,
                             end=10000,
-                            message_format="CSV")
+                            message_format="csv")
 print(tips_df.head())
 tips_df['tip_percentage'] = tips_df['tip'] / tips_df['total_bill'] * 100

From b8f3e2100cff86cb48d23200b8250ecfc8714433 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Fri, 11 Oct 2024 12:23:37 -0500
Subject: [PATCH 24/24] Reorganize `cudf_polars` expression code (#17014)

This PR seeks to break up `expr.py` into a less unwieldy monolith.
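Since `expr.py` now only re-exports the relocated classes (see the import block
in the diff below), existing imports keep working unchanged. A minimal sketch
of the equivalence, assuming a working `cudf_polars` installation:

```python
# Both paths name the same class object: the old monolith path is now just a
# re-export of the definition that lives in the new submodule.
from cudf_polars.dsl.expr import BinOp
from cudf_polars.dsl.expressions.binaryop import BinOp as RelocatedBinOp

assert BinOp is RelocatedBinOp
```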
Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17014
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 1826 +----------------
 .../cudf_polars/dsl/expressions/__init__.py   |    8 +
 .../dsl/expressions/aggregation.py            |  229 +++
 .../cudf_polars/dsl/expressions/base.py       |  334 +++
 .../cudf_polars/dsl/expressions/binaryop.py   |  135 ++
 .../cudf_polars/dsl/expressions/boolean.py    |  269 +++
 .../cudf_polars/dsl/expressions/datetime.py   |  132 ++
 .../cudf_polars/dsl/expressions/literal.py    |   88 +
 .../cudf_polars/dsl/expressions/rolling.py    |   40 +
 .../cudf_polars/dsl/expressions/selection.py  |   91 +
 .../cudf_polars/dsl/expressions/sorting.py    |   97 +
 .../cudf_polars/dsl/expressions/string.py     |  283 +++
 .../cudf_polars/dsl/expressions/ternary.py    |   53 +
 .../cudf_polars/dsl/expressions/unary.py      |  328 +++
 14 files changed, 2108 insertions(+), 1805 deletions(-)
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/__init__.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/base.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/literal.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/selection.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/sorting.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/string.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/ternary.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/unary.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index f7775ceb238..e748ec16f14 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -15,33 +15,30 @@
 from __future__ import annotations

-import enum
-from enum import IntEnum
-from functools import partial, reduce
-from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple
-
-import pyarrow as pa
-import pyarrow.compute as pc
-import pylibcudf as plc
-
-from polars.exceptions import InvalidOperationError
-from polars.polars import _expr_nodes as pl_expr
-
-from cudf_polars.containers import Column
-from cudf_polars.utils import dtypes, sorting
-
-if TYPE_CHECKING:
-    from collections.abc import Mapping, Sequence
-
-    import polars as pl
-    import polars.type_aliases as pl_types
-
-    from cudf_polars.containers import DataFrame
+from cudf_polars.dsl.expressions.aggregation import Agg
+from cudf_polars.dsl.expressions.base import (
+    AggInfo,
+    Col,
+    Expr,
+    NamedExpr,
+)
+from cudf_polars.dsl.expressions.binaryop import BinOp
+from cudf_polars.dsl.expressions.boolean import BooleanFunction
+from cudf_polars.dsl.expressions.datetime import TemporalFunction
+from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
+from cudf_polars.dsl.expressions.rolling import GroupedRollingWindow, RollingWindow
+from cudf_polars.dsl.expressions.selection import Filter, Gather
+from cudf_polars.dsl.expressions.sorting import Sort, SortBy
+from cudf_polars.dsl.expressions.string import
StringFunction +from cudf_polars.dsl.expressions.ternary import Ternary +from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction __all__ = [ "Expr", "NamedExpr", "Literal", + "LiteralColumn", + "Len", "Col", "BooleanFunction", "StringFunction", @@ -54,1789 +51,8 @@ "GroupedRollingWindow", "Cast", "Agg", + "AggInfo", "Ternary", "BinOp", + "UnaryFunction", ] - - -class ExecutionContext(IntEnum): - FRAME = enum.auto() - GROUPBY = enum.auto() - ROLLING = enum.auto() - - -class AggInfo(NamedTuple): - requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] - - -class Expr: - """ - An abstract expression object. - - This contains a (potentially empty) tuple of child expressions, - along with non-child data. For uniform reconstruction and - implementation of hashing and equality schemes, child classes need - to provide a certain amount of metadata when they are defined. - Specifically, the ``_non_child`` attribute must list, in-order, - the names of the slots that are passed to the constructor. The - constructor must take arguments in the order ``(*_non_child, - *children).`` - """ - - __slots__ = ("dtype", "_hash_value", "_repr_value") - dtype: plc.DataType - """Data type of the expression.""" - _hash_value: int - """Caching slot for the hash of the expression.""" - _repr_value: str - """Caching slot for repr of the expression.""" - children: tuple[Expr, ...] = () - """Children of the expression.""" - _non_child: ClassVar[tuple[str, ...]] = ("dtype",) - """Names of non-child data (not Exprs) for reconstruction.""" - - # Constructor must take arguments in order (*_non_child, *children) - def __init__(self, dtype: plc.DataType) -> None: - self.dtype = dtype - - def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence: - return (*(getattr(self, attr) for attr in self._non_child), *children) - - def get_hash(self) -> int: - """ - Return the hash of this expr. - - Override this in subclasses, rather than __hash__. - - Returns - ------- - The integer hash value. - """ - return hash((type(self), self._ctor_arguments(self.children))) - - def __hash__(self) -> int: - """Hash of an expression with caching.""" - try: - return self._hash_value - except AttributeError: - self._hash_value = self.get_hash() - return self._hash_value - - def is_equal(self, other: Any) -> bool: - """ - Equality of two expressions. - - Override this in subclasses, rather than __eq__. - - Parameter - --------- - other - object to compare to - - Returns - ------- - True if the two expressions are equal, false otherwise. 
- """ - if type(self) is not type(other): - return False # pragma: no cover; __eq__ trips first - return self._ctor_arguments(self.children) == other._ctor_arguments( - other.children - ) - - def __eq__(self, other: Any) -> bool: - """Equality of expressions.""" - if type(self) is not type(other) or hash(self) != hash(other): - return False - else: - return self.is_equal(other) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def __repr__(self) -> str: - """String representation of an expression with caching.""" - try: - return self._repr_value - except AttributeError: - args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) - self._repr_value = f"{type(self).__name__}({args})" - return self._repr_value - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame that will provide columns. - context - What context are we performing this evaluation in? - mapping - Substitution mapping from expressions to Columns, used to - override the evaluation of a given expression if we're - performing a simple rewritten evaluation. - - Notes - ----- - Do not call this function directly, but rather - :meth:`evaluate` which handles the mapping lookups. - - Returns - ------- - Column representing the evaluation of the expression. - - Raises - ------ - NotImplementedError - If we couldn't evaluate the expression. Ideally all these - are returned during translation to the IR, but for now we - are not perfect. - """ - raise NotImplementedError( - f"Evaluation of expression {type(self).__name__}" - ) # pragma: no cover; translation of unimplemented nodes trips first - - def evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame that will provide columns. - context - What context are we performing this evaluation in? - mapping - Substitution mapping from expressions to Columns, used to - override the evaluation of a given expression if we're - performing a simple rewritten evaluation. - - Notes - ----- - Individual subclasses should implement :meth:`do_evaluate`, - this method provides logic to handle lookups in the - substitution mapping. - - Returns - ------- - Column representing the evaluation of the expression. - - Raises - ------ - NotImplementedError - If we couldn't evaluate the expression. Ideally all these - are returned during translation to the IR, but for now we - are not perfect. - """ - if mapping is None: - return self.do_evaluate(df, context=context, mapping=mapping) - try: - return mapping[self] - except KeyError: - return self.do_evaluate(df, context=context, mapping=mapping) - - def collect_agg(self, *, depth: int) -> AggInfo: - """ - Collect information about aggregations in groupbys. - - Parameters - ---------- - depth - The depth of aggregating (reduction or sampling) - expressions we are currently at. - - Returns - ------- - Aggregation info describing the expression to aggregate in the - groupby. - - Raises - ------ - NotImplementedError - If we can't currently perform the aggregation request, for - example nested aggregations like ``a.max().min()``. 
- """ - raise NotImplementedError( - f"Collecting aggregation info for {type(self).__name__}" - ) # pragma: no cover; check_agg trips first - - -class NamedExpr: - # NamedExpr does not inherit from Expr since it does not appear - # when evaluating expressions themselves, only when constructing - # named return values in dataframe (IR) nodes. - __slots__ = ("name", "value") - value: Expr - name: str - - def __init__(self, name: str, value: Expr) -> None: - self.name = name - self.value = value - - def __hash__(self) -> int: - """Hash of the expression.""" - return hash((type(self), self.name, self.value)) - - def __repr__(self) -> str: - """Repr of the expression.""" - return f"NamedExpr({self.name}, {self.value})" - - def __eq__(self, other: Any) -> bool: - """Equality of two expressions.""" - return ( - type(self) is type(other) - and self.name == other.name - and self.value == other.value - ) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame providing context - context - Execution context - mapping - Substitution mapping - - Returns - ------- - Evaluated Column with name attached. - - See Also - -------- - :meth:`Expr.evaluate` for details, this function just adds the - name to a column produced from an expression. - """ - return self.value.evaluate(df, context=context, mapping=mapping).rename( - self.name - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return self.value.collect_agg(depth=depth) - - -class Literal(Expr): - __slots__ = ("value",) - _non_child = ("dtype", "value") - value: pa.Scalar[Any] - children: tuple[()] - - def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: - super().__init__(dtype) - assert value.type == plc.interop.to_arrow(dtype) - self.value = value - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # datatype of pyarrow scalar is correct by construction. - return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([]) - - -class LiteralColumn(Expr): - __slots__ = ("value",) - _non_child = ("dtype", "value") - value: pa.Array[Any, Any] - children: tuple[()] - - def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: - super().__init__(dtype) - data = value.to_arrow() - self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) - - def get_hash(self) -> int: - """Compute a hash of the column.""" - # This is stricter than necessary, but we only need this hash - # for identity in groupby replacements so it's OK. And this - # way we avoid doing potentially expensive compute. 
- return hash((type(self), self.dtype, id(self.value))) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # datatype of pyarrow array is correct by construction. - return Column(plc.interop.from_arrow(self.value)) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([]) - - -class Col(Expr): - __slots__ = ("name",) - _non_child = ("dtype", "name") - name: str - children: tuple[()] - - def __init__(self, dtype: plc.DataType, name: str) -> None: - self.dtype = dtype - self.name = name - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # Deliberately remove the name here so that we guarantee - # evaluation of the IR produces names. - return df.column_map[self.name].rename(None) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - - -class Len(Expr): - children: tuple[()] - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: polars returns a uint, not an int for count - return AggInfo( - [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)] - ) - - -class BooleanFunction(Expr): - __slots__ = ("name", "options", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.BooleanFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - if self.name == pl_expr.BooleanFunction.IsIn and not all( - c.dtype == self.children[0].dtype for c in self.children - ): - # TODO: If polars IR doesn't put the casts in, we need to - # mimic the supertype promotion rules. 
- raise NotImplementedError("IsIn doesn't support supertype casting") - - @staticmethod - def _distinct( - column: Column, - *, - keep: plc.stream_compaction.DuplicateKeepOption, - source_value: plc.Scalar, - target_value: plc.Scalar, - ) -> Column: - table = plc.Table([column.obj]) - indices = plc.stream_compaction.distinct_indices( - table, - keep, - # TODO: polars doesn't expose options for these - plc.types.NullEquality.EQUAL, - plc.types.NanEquality.ALL_EQUAL, - ) - return Column( - plc.copying.scatter( - [source_value], - indices, - plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), - ).columns()[0] - ) - - _BETWEEN_OPS: ClassVar[ - dict[ - pl_types.ClosedInterval, - tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], - ] - ] = { - "none": ( - plc.binaryop.BinaryOperator.GREATER, - plc.binaryop.BinaryOperator.LESS, - ), - "left": ( - plc.binaryop.BinaryOperator.GREATER_EQUAL, - plc.binaryop.BinaryOperator.LESS, - ), - "right": ( - plc.binaryop.BinaryOperator.GREATER, - plc.binaryop.BinaryOperator.LESS_EQUAL, - ), - "both": ( - plc.binaryop.BinaryOperator.GREATER_EQUAL, - plc.binaryop.BinaryOperator.LESS_EQUAL, - ), - } - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if self.name in ( - pl_expr.BooleanFunction.IsFinite, - pl_expr.BooleanFunction.IsInfinite, - ): - # Avoid evaluating the child if the dtype tells us it's unnecessary. - (child,) = self.children - is_finite = self.name == pl_expr.BooleanFunction.IsFinite - if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): - value = plc.interop.from_arrow( - pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) - ) - return Column(plc.Column.from_scalar(value, df.num_rows)) - needles = child.evaluate(df, context=context, mapping=mapping) - to_search = [-float("inf"), float("inf")] - if is_finite: - # NaN is neither finite not infinite - to_search.append(float("nan")) - haystack = plc.interop.from_arrow( - pa.array( - to_search, - type=plc.interop.to_arrow(needles.obj.type()), - ) - ) - result = plc.search.contains(haystack, needles.obj) - if is_finite: - result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT) - return Column(result) - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - # Kleene logic for Any (OR) and All (AND) if ignore_nulls is - # False - if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): - (ignore_nulls,) = self.options - (column,) = columns - is_any = self.name == pl_expr.BooleanFunction.Any - agg = plc.aggregation.any() if is_any else plc.aggregation.all() - result = plc.reduce.reduce(column.obj, agg, self.dtype) - if not ignore_nulls and column.obj.null_count() > 0: - # Truth tables - # Any All - # | F U T | F U T - # --+------ --+------ - # F | F U T F | F F F - # U | U U T U | F U U - # T | T T T T | F U T - # - # If the input null count was non-zero, we must - # post-process the result to insert the correct value. 
- h_result = plc.interop.to_arrow(result).as_py() - if is_any and not h_result or not is_any and h_result: - # Any All - # False || Null => Null True && Null => Null - return Column(plc.Column.all_null_like(column.obj, 1)) - return Column(plc.Column.from_scalar(result, 1)) - if self.name == pl_expr.BooleanFunction.IsNull: - (column,) = columns - return Column(plc.unary.is_null(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNotNull: - (column,) = columns - return Column(plc.unary.is_valid(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNan: - (column,) = columns - return Column( - plc.unary.is_nan(column.obj).with_mask( - column.obj.null_mask(), column.obj.null_count() - ) - ) - elif self.name == pl_expr.BooleanFunction.IsNotNan: - (column,) = columns - return Column( - plc.unary.is_not_nan(column.obj).with_mask( - column.obj.null_mask(), column.obj.null_count() - ) - ) - elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsLastDistinct: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsUnique: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsDuplicated: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.AllHorizontal: - return Column( - reduce( - partial( - plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - output_type=self.dtype, - ), - (c.obj for c in columns), - ) - ) - elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - return Column( - reduce( - partial( - plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - output_type=self.dtype, - ), - (c.obj for c in columns), - ) - ) - elif self.name == pl_expr.BooleanFunction.IsIn: - needles, haystack = columns - return Column(plc.search.contains(haystack.obj, needles.obj)) - elif self.name == pl_expr.BooleanFunction.Not: - (column,) = columns - return Column( - plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) - ) - else: - raise NotImplementedError( - f"BooleanFunction {self.name}" - ) # pragma: no cover; handled by init raising - - -class StringFunction(Expr): - __slots__ = ("name", "options", "children", "_regex_program") - _non_child = ("dtype", "name", "options") - children: 
tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.StringFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - self._validate_input() - - def _validate_input(self): - if self.name not in ( - pl_expr.StringFunction.Contains, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Replace, - pl_expr.StringFunction.ReplaceMany, - pl_expr.StringFunction.Slice, - pl_expr.StringFunction.Strptime, - pl_expr.StringFunction.StartsWith, - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - pl_expr.StringFunction.Uppercase, - ): - raise NotImplementedError(f"String function {self.name}") - if self.name == pl_expr.StringFunction.Contains: - literal, strict = self.options - if not literal: - if not strict: - raise NotImplementedError( - "f{strict=} is not supported for regex contains" - ) - if not isinstance(self.children[1], Literal): - raise NotImplementedError( - "Regex contains only supports a scalar pattern" - ) - pattern = self.children[1].value.as_py() - try: - self._regex_program = plc.strings.regex_program.RegexProgram.create( - pattern, - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - except RuntimeError as e: - raise NotImplementedError( - f"Unsupported regex {pattern} for GPU engine." - ) from e - elif self.name == pl_expr.StringFunction.Replace: - _, literal = self.options - if not literal: - raise NotImplementedError("literal=False is not supported for replace") - if not all(isinstance(expr, Literal) for expr in self.children[1:]): - raise NotImplementedError("replace only supports scalar target") - target = self.children[1] - if target.value == pa.scalar("", type=pa.string()): - raise NotImplementedError( - "libcudf replace does not support empty strings" - ) - elif self.name == pl_expr.StringFunction.ReplaceMany: - (ascii_case_insensitive,) = self.options - if ascii_case_insensitive: - raise NotImplementedError( - "ascii_case_insensitive not implemented for replace_many" - ) - if not all( - isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] - ): - raise NotImplementedError("replace_many only supports literal inputs") - target = self.children[1] - if pc.any(pc.equal(target.value, "")).as_py(): - raise NotImplementedError( - "libcudf replace_many is implemented differently from polars " - "for empty strings" - ) - elif self.name == pl_expr.StringFunction.Slice: - if not all(isinstance(child, Literal) for child in self.children[1:]): - raise NotImplementedError( - "Slice only supports literal start and stop values" - ) - elif self.name == pl_expr.StringFunction.Strptime: - format, _, exact, cache = self.options - if cache: - raise NotImplementedError("Strptime cache is a CPU feature") - if format is None: - raise NotImplementedError("Strptime format is required") - if not exact: - raise NotImplementedError("Strptime does not support exact=False") - elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - }: - if not isinstance(self.children[1], Literal): - raise NotImplementedError( - "strip operations only support scalar patterns" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this 
expression given a dataframe for context.""" - if self.name == pl_expr.StringFunction.Contains: - child, arg = self.children - column = child.evaluate(df, context=context, mapping=mapping) - - literal, _ = self.options - if literal: - pat = arg.evaluate(df, context=context, mapping=mapping) - pattern = ( - pat.obj_scalar - if pat.is_scalar and pat.obj.size() != column.obj.size() - else pat.obj - ) - return Column(plc.strings.find.contains(column.obj, pattern)) - else: - return Column( - plc.strings.contains.contains_re(column.obj, self._regex_program) - ) - elif self.name == pl_expr.StringFunction.Slice: - child, expr_offset, expr_length = self.children - assert isinstance(expr_offset, Literal) - assert isinstance(expr_length, Literal) - - column = child.evaluate(df, context=context, mapping=mapping) - # libcudf slices via [start,stop). - # polars slices with offset + length where start == offset - # stop = start + length. Negative values for start look backward - # from the last element of the string. If the end index would be - # below zero, an empty string is returned. - # Do this maths on the host - start = expr_offset.value.as_py() - length = expr_length.value.as_py() - - if length == 0: - stop = start - else: - # No length indicates a scan to the end - # The libcudf equivalent is a null stop - stop = start + length if length else None - if length and start < 0 and length >= -start: - stop = None - return Column( - plc.strings.slice.slice_strings( - column.obj, - plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), - plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), - ) - ) - elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - }: - column, chars = ( - c.evaluate(df, context=context, mapping=mapping) for c in self.children - ) - if self.name == pl_expr.StringFunction.StripCharsStart: - side = plc.strings.SideType.LEFT - elif self.name == pl_expr.StringFunction.StripCharsEnd: - side = plc.strings.SideType.RIGHT - else: - side = plc.strings.SideType.BOTH - return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) - - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - if self.name == pl_expr.StringFunction.Lowercase: - (column,) = columns - return Column(plc.strings.case.to_lower(column.obj)) - elif self.name == pl_expr.StringFunction.Uppercase: - (column,) = columns - return Column(plc.strings.case.to_upper(column.obj)) - elif self.name == pl_expr.StringFunction.EndsWith: - column, suffix = columns - return Column( - plc.strings.find.ends_with( - column.obj, - suffix.obj_scalar - if column.obj.size() != suffix.obj.size() and suffix.is_scalar - else suffix.obj, - ) - ) - elif self.name == pl_expr.StringFunction.StartsWith: - column, prefix = columns - return Column( - plc.strings.find.starts_with( - column.obj, - prefix.obj_scalar - if column.obj.size() != prefix.obj.size() and prefix.is_scalar - else prefix.obj, - ) - ) - elif self.name == pl_expr.StringFunction.Strptime: - # TODO: ignores ambiguous - format, strict, exact, cache = self.options - col = self.children[0].evaluate(df, context=context, mapping=mapping) - - is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format - ) - - if strict: - if not plc.interop.to_arrow( - plc.reduce.reduce( - is_timestamps, - plc.aggregation.all(), - plc.DataType(plc.TypeId.BOOL8), - ) - ).as_py(): - raise InvalidOperationError("conversion from `str` 
failed.") - else: - not_timestamps = plc.unary.unary_operation( - is_timestamps, plc.unary.UnaryOperator.NOT - ) - - null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) - res = plc.copying.boolean_mask_scatter( - [null], plc.Table([col.obj]), not_timestamps - ) - return Column( - plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format - ) - ) - elif self.name == pl_expr.StringFunction.Replace: - column, target, repl = columns - n, _ = self.options - return Column( - plc.strings.replace.replace( - column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n - ) - ) - elif self.name == pl_expr.StringFunction.ReplaceMany: - column, target, repl = columns - return Column( - plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) - ) - raise NotImplementedError( - f"StringFunction {self.name}" - ) # pragma: no cover; handled by init raising - - -class TemporalFunction(Expr): - __slots__ = ("name", "options", "children") - _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { - pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, - pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, - pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, - pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, - pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR, - pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE, - pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND, - pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, - pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, - pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, - } - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] 
- - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.TemporalFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - if self.name not in self._COMPONENT_MAP: - raise NotImplementedError(f"Temporal function {self.name}") - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - (column,) = columns - if self.name == pl_expr.TemporalFunction.Microsecond: - millis = plc.datetime.extract_datetime_component( - column.obj, plc.datetime.DatetimeComponent.MILLISECOND - ) - micros = plc.datetime.extract_datetime_component( - column.obj, plc.datetime.DatetimeComponent.MICROSECOND - ) - millis_as_micros = plc.binaryop.binary_operation( - millis, - plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.DataType(plc.TypeId.INT32), - ) - total_micros = plc.binaryop.binary_operation( - micros, - millis_as_micros, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - return Column(total_micros) - elif self.name == pl_expr.TemporalFunction.Nanosecond: - millis = plc.datetime.extract_datetime_component( - column.obj, plc.datetime.DatetimeComponent.MILLISECOND - ) - micros = plc.datetime.extract_datetime_component( - column.obj, plc.datetime.DatetimeComponent.MICROSECOND - ) - nanos = plc.datetime.extract_datetime_component( - column.obj, plc.datetime.DatetimeComponent.NANOSECOND - ) - millis_as_nanos = plc.binaryop.binary_operation( - millis, - plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.types.DataType(plc.types.TypeId.INT32), - ) - micros_as_nanos = plc.binaryop.binary_operation( - micros, - plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.types.DataType(plc.types.TypeId.INT32), - ) - total_nanos = plc.binaryop.binary_operation( - nanos, - millis_as_nanos, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - total_nanos = plc.binaryop.binary_operation( - total_nanos, - micros_as_nanos, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - return Column(total_nanos) - - return Column( - plc.datetime.extract_datetime_component( - column.obj, - self._COMPONENT_MAP[self.name], - ) - ) - - -class UnaryFunction(Expr): - __slots__ = ("name", "options", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] 
- - # Note: log, and pow are handled via translation to binops - _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { - "sin": plc.unary.UnaryOperator.SIN, - "cos": plc.unary.UnaryOperator.COS, - "tan": plc.unary.UnaryOperator.TAN, - "arcsin": plc.unary.UnaryOperator.ARCSIN, - "arccos": plc.unary.UnaryOperator.ARCCOS, - "arctan": plc.unary.UnaryOperator.ARCTAN, - "sinh": plc.unary.UnaryOperator.SINH, - "cosh": plc.unary.UnaryOperator.COSH, - "tanh": plc.unary.UnaryOperator.TANH, - "arcsinh": plc.unary.UnaryOperator.ARCSINH, - "arccosh": plc.unary.UnaryOperator.ARCCOSH, - "arctanh": plc.unary.UnaryOperator.ARCTANH, - "exp": plc.unary.UnaryOperator.EXP, - "sqrt": plc.unary.UnaryOperator.SQRT, - "cbrt": plc.unary.UnaryOperator.CBRT, - "ceil": plc.unary.UnaryOperator.CEIL, - "floor": plc.unary.UnaryOperator.FLOOR, - "abs": plc.unary.UnaryOperator.ABS, - "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, - "not": plc.unary.UnaryOperator.NOT, - } - _supported_misc_fns = frozenset( - { - "drop_nulls", - "fill_null", - "mask_nans", - "round", - "set_sorted", - "unique", - } - ) - _supported_cum_aggs = frozenset( - { - "cum_min", - "cum_max", - "cum_prod", - "cum_sum", - } - ) - _supported_fns = frozenset().union( - _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() - ) - - def __init__( - self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr - ) -> None: - super().__init__(dtype) - self.name = name - self.options = options - self.children = children - - if self.name not in UnaryFunction._supported_fns: - raise NotImplementedError(f"Unary function {name=}") - if self.name in UnaryFunction._supported_cum_aggs: - (reverse,) = self.options - if reverse: - raise NotImplementedError( - "reverse=True is not supported for cumulative aggregations" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if self.name == "mask_nans": - (child,) = self.children - return child.evaluate(df, context=context, mapping=mapping).mask_nans() - if self.name == "round": - (decimal_places,) = self.options - (values,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - return Column( - plc.round.round( - values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP - ) - ).sorted_like(values) - elif self.name == "unique": - (maintain_order,) = self.options - (values,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - # Only one column, so keep_any is the same as keep_first - # for stable distinct - keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY - if values.is_sorted: - maintain_order = True - result = plc.stream_compaction.unique( - plc.Table([values.obj]), - [0], - keep, - plc.types.NullEquality.EQUAL, - ) - else: - distinct = ( - plc.stream_compaction.stable_distinct - if maintain_order - else plc.stream_compaction.distinct - ) - result = distinct( - plc.Table([values.obj]), - [0], - keep, - plc.types.NullEquality.EQUAL, - plc.types.NanEquality.ALL_EQUAL, - ) - (column,) = result.columns() - if maintain_order: - return Column(column).sorted_like(values) - return Column(column) - elif self.name == "set_sorted": - (column,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - (asc,) = self.options - order = ( - plc.types.Order.ASCENDING - if asc == "ascending" - 
else plc.types.Order.DESCENDING - ) - null_order = plc.types.NullOrder.BEFORE - if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: - # PERF: This invokes four stream synchronisations! - has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() - has_nulls_last = not plc.copying.get_element( - column.obj, n - 1 - ).is_valid() - if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( - order == plc.types.Order.ASCENDING and has_nulls_last - ): - null_order = plc.types.NullOrder.AFTER - return column.set_sorted( - is_sorted=plc.types.Sorted.YES, - order=order, - null_order=null_order, - ) - elif self.name == "drop_nulls": - (column,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - return Column( - plc.stream_compaction.drop_nulls( - plc.Table([column.obj]), [0], 1 - ).columns()[0] - ) - elif self.name == "fill_null": - column = self.children[0].evaluate(df, context=context, mapping=mapping) - if isinstance(self.children[1], Literal): - arg = plc.interop.from_arrow(self.children[1].value) - else: - evaluated = self.children[1].evaluate( - df, context=context, mapping=mapping - ) - arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj - return Column(plc.replace.replace_nulls(column.obj, arg)) - elif self.name in self._OP_MAPPING: - column = self.children[0].evaluate(df, context=context, mapping=mapping) - if column.obj.type().id() != self.dtype.id(): - arg = plc.unary.cast(column.obj, self.dtype) - else: - arg = column.obj - return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) - elif self.name in UnaryFunction._supported_cum_aggs: - column = self.children[0].evaluate(df, context=context, mapping=mapping) - plc_col = column.obj - col_type = column.obj.type() - # cum_sum casts - # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention - # Bool -> UInt32 - # cum_prod casts integer dtypes < int64 and bool to int64 - # See: - # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs - if ( - self.name == "cum_sum" - and col_type.id() - in { - plc.types.TypeId.INT8, - plc.types.TypeId.UINT8, - plc.types.TypeId.INT16, - plc.types.TypeId.UINT16, - } - ) or ( - self.name == "cum_prod" - and plc.traits.is_integral(col_type) - and plc.types.size_of(col_type) <= 4 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.INT64) - ) - elif ( - self.name == "cum_sum" - and column.obj.type().id() == plc.types.TypeId.BOOL8 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.UINT32) - ) - if self.name == "cum_sum": - agg = plc.aggregation.sum() - elif self.name == "cum_prod": - agg = plc.aggregation.product() - elif self.name == "cum_min": - agg = plc.aggregation.min() - elif self.name == "cum_max": - agg = plc.aggregation.max() - - return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) - raise NotImplementedError( - f"Unimplemented unary function {self.name=}" - ) # pragma: no cover; init trips first - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: - raise NotImplementedError(f"{self.name} in groupby") - if depth == 1: - # inside aggregation, need to pre-evaluate, groupby - # construction has checked that we don't have nested aggs, - # so stop the recursion and return ourselves for pre-eval - return AggInfo([(self, plc.aggregation.collect_list(), 
self)]) - else: - (child,) = self.children - return child.collect_agg(depth=depth) - - -class Sort(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr] - - def __init__( - self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr - ) -> None: - super().__init__(dtype) - self.options = options - self.children = (column,) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) - (stable, nulls_last, descending) = self.options - order, null_order = sorting.sort_order( - [descending], nulls_last=[nulls_last], num_keys=1 - ) - do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort - table = do_sort(plc.Table([column.obj]), order, null_order) - return Column( - table.columns()[0], - is_sorted=plc.types.Sorted.YES, - order=order[0], - null_order=null_order[0], - ) - - -class SortBy(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - options: tuple[bool, tuple[bool], tuple[bool]], - column: Expr, - *by: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.children = (column, *by) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - column, *by = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - (stable, nulls_last, descending) = self.options - order, null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(by) - ) - do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key - table = do_sort( - plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order - ) - return Column(table.columns()[0]) - - -class Gather(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr] - - def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: - super().__init__(dtype) - self.children = (values, indices) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - values, indices = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - lo, hi = plc.reduce.minmax(indices.obj) - lo = plc.interop.to_arrow(lo).as_py() - hi = plc.interop.to_arrow(hi).as_py() - n = df.num_rows - if hi >= n or lo < -n: - raise ValueError("gather indices are out of bounds") - if indices.obj.null_count(): - bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY - obj = plc.replace.replace_nulls( - indices.obj, - plc.interop.from_arrow( - pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) - ), - ) - else: - bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK - obj = indices.obj - table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) - return Column(table.columns()[0]) - - -class Filter(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr] - - def __init__(self, 
dtype: plc.DataType, values: Expr, indices: Expr): - super().__init__(dtype) - self.children = (values, indices) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - values, mask = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - table = plc.stream_compaction.apply_boolean_mask( - plc.Table([values.obj]), mask.obj - ) - return Column(table.columns()[0]).sorted_like(values) - - -class RollingWindow(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr] - - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: - super().__init__(dtype) - self.options = options - self.children = (agg,) - raise NotImplementedError("Rolling window not implemented") - - -class GroupedRollingWindow(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr, ...] - - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: - super().__init__(dtype) - self.options = options - self.children = (agg, *by) - raise NotImplementedError("Grouped rolling window not implemented") - - -class Cast(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr] - - def __init__(self, dtype: plc.DataType, value: Expr) -> None: - super().__init__(dtype) - self.children = (value,) - if not dtypes.can_cast(value.dtype, self.dtype): - raise NotImplementedError( - f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) - return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented filter - (child,) = self.children - return child.collect_agg(depth=depth) - - -class Agg(Expr): - __slots__ = ("name", "options", "op", "request", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] 
- - def __init__( - self, dtype: plc.DataType, name: str, options: Any, *children: Expr - ) -> None: - super().__init__(dtype) - self.name = name - self.options = options - self.children = children - if name not in Agg._SUPPORTED: - raise NotImplementedError( - f"Unsupported aggregation {name=}" - ) # pragma: no cover; all valid aggs are supported - # TODO: nan handling in groupby case - if name == "min": - req = plc.aggregation.min() - elif name == "max": - req = plc.aggregation.max() - elif name == "median": - req = plc.aggregation.median() - elif name == "n_unique": - # TODO: datatype of result - req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) - elif name == "first" or name == "last": - req = None - elif name == "mean": - req = plc.aggregation.mean() - elif name == "sum": - req = plc.aggregation.sum() - elif name == "std": - # TODO: handle nans - req = plc.aggregation.std(ddof=options) - elif name == "var": - # TODO: handle nans - req = plc.aggregation.variance(ddof=options) - elif name == "count": - req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) - elif name == "quantile": - _, quantile = self.children - if not isinstance(quantile, Literal): - raise NotImplementedError("Only support literal quantile values") - req = plc.aggregation.quantile( - quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] - ) - else: - raise NotImplementedError( - f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" - ) # pragma: no cover - self.request = req - op = getattr(self, f"_{name}", None) - if op is None: - op = partial(self._reduce, request=req) - elif name in {"min", "max"}: - op = partial(op, propagate_nans=options) - elif name in {"count", "first", "last"}: - pass - else: - raise NotImplementedError( - f"Unreachable, supported agg {name=} has no implementation" - ) # pragma: no cover - self.op = op - - _SUPPORTED: ClassVar[frozenset[str]] = frozenset( - [ - "min", - "max", - "median", - "n_unique", - "first", - "last", - "mean", - "sum", - "count", - "std", - "var", - "quantile", - ] - ) - - interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { - "nearest": plc.types.Interpolation.NEAREST, - "higher": plc.types.Interpolation.HIGHER, - "lower": plc.types.Interpolation.LOWER, - "midpoint": plc.types.Interpolation.MIDPOINT, - "linear": plc.types.Interpolation.LINEAR, - } - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if depth >= 1: - raise NotImplementedError( - "Nested aggregations in groupby" - ) # pragma: no cover; check_agg trips first - if (isminmax := self.name in {"min", "max"}) and self.options: - raise NotImplementedError("Nan propagation in groupby for min/max") - (child,) = self.children - ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests - request = self.request - # These are handled specially here because we don't set up the - # request for the whole-frame agg because we can avoid a - # reduce for these. 
- if self.name == "first": - request = plc.aggregation.nth_element( - 0, null_handling=plc.types.NullPolicy.INCLUDE - ) - elif self.name == "last": - request = plc.aggregation.nth_element( - -1, null_handling=plc.types.NullPolicy.INCLUDE - ) - if request is None: - raise NotImplementedError( - f"Aggregation {self.name} in groupby" - ) # pragma: no cover; __init__ trips first - if isminmax and plc.traits.is_floating_point(self.dtype): - assert expr is not None - # Ignore nans in these groupby aggs, do this by masking - # nans in the input - expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, request, self)]) - - def _reduce( - self, column: Column, *, request: plc.aggregation.Aggregation - ) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, request, self.dtype), - 1, - ) - ) - - def _count(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar( - column.obj.size() - column.obj.null_count(), - type=plc.interop.to_arrow(self.dtype), - ), - ), - 1, - ) - ) - - def _min(self, column: Column, *, propagate_nans: bool) -> Column: - if propagate_nans and column.nan_count > 0: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - if column.nan_count > 0: - column = column.mask_nans() - return self._reduce(column, request=plc.aggregation.min()) - - def _max(self, column: Column, *, propagate_nans: bool) -> Column: - if propagate_nans and column.nan_count > 0: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - if column.nan_count > 0: - column = column.mask_nans() - return self._reduce(column, request=plc.aggregation.max()) - - def _first(self, column: Column) -> Column: - return Column(plc.copying.slice(column.obj, [0, 1])[0]) - - def _last(self, column: Column) -> Column: - n = column.obj.size() - return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if context is not ExecutionContext.FRAME: - raise NotImplementedError( - f"Agg in context {context}" - ) # pragma: no cover; unreachable - - # Aggregations like quantiles may have additional children that were - # preprocessed into pylibcudf requests. 
- child = self.children[0] - return self.op(child.evaluate(df, context=context, mapping=mapping)) - - -class Ternary(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr, Expr] - - def __init__( - self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr - ) -> None: - super().__init__(dtype) - self.children = (when, then, otherwise) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - when, then, otherwise = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - then_obj = then.obj_scalar if then.is_scalar else then.obj - otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj - return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) - - -class BinOp(Expr): - __slots__ = ("op", "children") - _non_child = ("dtype", "op") - children: tuple[Expr, Expr] - - def __init__( - self, - dtype: plc.DataType, - op: plc.binaryop.BinaryOperator, - left: Expr, - right: Expr, - ) -> None: - super().__init__(dtype) - if plc.traits.is_boolean(self.dtype): - # For boolean output types, bitand and bitor implement - # boolean logic, so translate. bitxor also does, but the - # default behaviour is correct. - op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) - self.op = op - self.children = (left, right) - if not plc.binaryop.is_supported_operation( - self.dtype, left.dtype, right.dtype, op - ): - raise NotImplementedError( - f"Operation {op.name} not supported " - f"for types {left.dtype.id().name} and {right.dtype.id().name} " - f"with output type {self.dtype.id().name}" - ) - - _BOOL_KLEENE_MAPPING: ClassVar[ - dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] - ] = { - plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - } - - _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { - pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, - pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, - pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, - pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, - pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, - pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, - pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, - pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, - pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, - pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, - pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, - pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, - pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, - pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, - pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, - pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, - pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, - pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, - pl_expr.Operator.LogicalAnd: 
plc.binaryop.BinaryOperator.LOGICAL_AND, - pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, - } - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - left, right = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - lop = left.obj - rop = right.obj - if left.obj.size() != right.obj.size(): - if left.is_scalar: - lop = left.obj_scalar - elif right.is_scalar: - rop = right.obj_scalar - return Column( - plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if depth == 1: - # inside aggregation, need to pre-evaluate, - # groupby construction has checked that we don't have - # nested aggs, so stop the recursion and return ourselves - # for pre-eval - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - else: - left_info, right_info = ( - child.collect_agg(depth=depth) for child in self.children - ) - requests = [*left_info.requests, *right_info.requests] - # TODO: Hack, if there were no reductions inside this - # binary expression then we want to pre-evaluate and - # collect ourselves. Otherwise we want to collect the - # aggregations inside and post-evaluate. This is a bad way - # of checking that we are in case 1. - if all( - agg.kind() == plc.aggregation.Kind.COLLECT_LIST - for _, agg, _ in requests - ): - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - return AggInfo( - [*left_info.requests, *right_info.requests], - ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py b/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py new file mode 100644 index 00000000000..acbea129088 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Implementations of various expressions.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py new file mode 100644 index 00000000000..b8b18ec5039 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for aggregations.""" + +from __future__ import annotations + +from functools import partial +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + AggInfo, + ExecutionContext, + Expr, +) +from cudf_polars.dsl.expressions.literal import Literal +from cudf_polars.dsl.expressions.unary import UnaryFunction + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Agg"] + + +class Agg(Expr): + __slots__ = ("name", "options", "op", "request", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] 
+ + def __init__( + self, dtype: plc.DataType, name: str, options: Any, *children: Expr + ) -> None: + super().__init__(dtype) + self.name = name + self.options = options + self.children = children + if name not in Agg._SUPPORTED: + raise NotImplementedError( + f"Unsupported aggregation {name=}" + ) # pragma: no cover; all valid aggs are supported + # TODO: nan handling in groupby case + if name == "min": + req = plc.aggregation.min() + elif name == "max": + req = plc.aggregation.max() + elif name == "median": + req = plc.aggregation.median() + elif name == "n_unique": + # TODO: datatype of result + req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) + elif name == "first" or name == "last": + req = None + elif name == "mean": + req = plc.aggregation.mean() + elif name == "sum": + req = plc.aggregation.sum() + elif name == "std": + # TODO: handle nans + req = plc.aggregation.std(ddof=options) + elif name == "var": + # TODO: handle nans + req = plc.aggregation.variance(ddof=options) + elif name == "count": + req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + elif name == "quantile": + _, quantile = self.children + if not isinstance(quantile, Literal): + raise NotImplementedError("Only support literal quantile values") + req = plc.aggregation.quantile( + quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] + ) + else: + raise NotImplementedError( + f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" + ) # pragma: no cover + self.request = req + op = getattr(self, f"_{name}", None) + if op is None: + op = partial(self._reduce, request=req) + elif name in {"min", "max"}: + op = partial(op, propagate_nans=options) + elif name in {"count", "first", "last"}: + pass + else: + raise NotImplementedError( + f"Unreachable, supported agg {name=} has no implementation" + ) # pragma: no cover + self.op = op + + _SUPPORTED: ClassVar[frozenset[str]] = frozenset( + [ + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + "quantile", + ] + ) + + interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { + "nearest": plc.types.Interpolation.NEAREST, + "higher": plc.types.Interpolation.HIGHER, + "lower": plc.types.Interpolation.LOWER, + "midpoint": plc.types.Interpolation.MIDPOINT, + "linear": plc.types.Interpolation.LINEAR, + } + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth >= 1: + raise NotImplementedError( + "Nested aggregations in groupby" + ) # pragma: no cover; check_agg trips first + if (isminmax := self.name in {"min", "max"}) and self.options: + raise NotImplementedError("Nan propagation in groupby for min/max") + (child,) = self.children + ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests + request = self.request + # These are handled specially here because we don't set up the + # request for the whole-frame agg because we can avoid a + # reduce for these. 
+ if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: + raise NotImplementedError( + f"Aggregation {self.name} in groupby" + ) # pragma: no cover; __init__ trips first + if isminmax and plc.traits.is_floating_point(self.dtype): + assert expr is not None + # Ignore nans in these groupby aggs, do this by masking + # nans in the input + expr = UnaryFunction(self.dtype, "mask_nans", (), expr) + return AggInfo([(expr, request, self)]) + + def _reduce( + self, column: Column, *, request: plc.aggregation.Aggregation + ) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, request, self.dtype), + 1, + ) + ) + + def _count(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar( + column.obj.size() - column.obj.null_count(), + type=plc.interop.to_arrow(self.dtype), + ), + ), + 1, + ) + ) + + def _min(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.min()) + + def _max(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.max()) + + def _first(self, column: Column) -> Column: + return Column(plc.copying.slice(column.obj, [0, 1])[0]) + + def _last(self, column: Column) -> Column: + n = column.obj.size() + return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if context is not ExecutionContext.FRAME: + raise NotImplementedError( + f"Agg in context {context}" + ) # pragma: no cover; unreachable + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] + return self.op(child.evaluate(df, context=context, mapping=mapping)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py new file mode 100644 index 00000000000..8d021b0231d --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -0,0 +1,334 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
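For orientation, a minimal sketch (illustrative, not part of the patch) of the whole-frame reduction path that Agg._reduce above drives: a pylibcudf reduction produces a device scalar, which is then broadcast back to a one-row column.

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array([1.0, 2.0, None, 4.0]))
    dtype = plc.DataType(plc.TypeId.FLOAT64)
    scalar = plc.reduce.reduce(col, plc.aggregation.min(), dtype)  # nulls skipped
    result = plc.Column.from_scalar(scalar, 1)  # one-row column wrapping 1.0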
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""Base and common classes for expression DSL nodes."""
+
+from __future__ import annotations
+
+import enum
+from enum import IntEnum
+from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple
+
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence
+
+    from cudf_polars.containers import Column, DataFrame
+
+__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext"]
+
+
+class AggInfo(NamedTuple):
+    requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]]
+
+
+class ExecutionContext(IntEnum):
+    FRAME = enum.auto()
+    GROUPBY = enum.auto()
+    ROLLING = enum.auto()
+
+
+class Expr:
+    """
+    An abstract expression object.
+
+    This contains a (potentially empty) tuple of child expressions,
+    along with non-child data. For uniform reconstruction and
+    implementation of hashing and equality schemes, child classes need
+    to provide a certain amount of metadata when they are defined.
+    Specifically, the ``_non_child`` attribute must list, in order,
+    the names of the slots that are passed to the constructor. The
+    constructor must take arguments in the order ``(*_non_child,
+    *children)``.
+    """
+
+    __slots__ = ("dtype", "_hash_value", "_repr_value")
+    dtype: plc.DataType
+    """Data type of the expression."""
+    _hash_value: int
+    """Caching slot for the hash of the expression."""
+    _repr_value: str
+    """Caching slot for repr of the expression."""
+    children: tuple[Expr, ...] = ()
+    """Children of the expression."""
+    _non_child: ClassVar[tuple[str, ...]] = ("dtype",)
+    """Names of non-child data (not Exprs) for reconstruction."""
+
+    # Constructor must take arguments in order (*_non_child, *children)
+    def __init__(self, dtype: plc.DataType) -> None:
+        self.dtype = dtype
+
+    def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence:
+        return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+    def get_hash(self) -> int:
+        """
+        Return the hash of this expr.
+
+        Override this in subclasses, rather than __hash__.
+
+        Returns
+        -------
+        The integer hash value.
+        """
+        return hash((type(self), self._ctor_arguments(self.children)))
+
+    def __hash__(self) -> int:
+        """Hash of an expression with caching."""
+        try:
+            return self._hash_value
+        except AttributeError:
+            self._hash_value = self.get_hash()
+            return self._hash_value
+
+    def is_equal(self, other: Any) -> bool:
+        """
+        Equality of two expressions.
+
+        Override this in subclasses, rather than __eq__.
+
+        Parameters
+        ----------
+        other
+            object to compare to
+
+        Returns
+        -------
+        True if the two expressions are equal, false otherwise.
+ """ + if type(self) is not type(other): + return False # pragma: no cover; __eq__ trips first + return self._ctor_arguments(self.children) == other._ctor_arguments( + other.children + ) + + def __eq__(self, other: Any) -> bool: + """Equality of expressions.""" + if type(self) is not type(other) or hash(self) != hash(other): + return False + else: + return self.is_equal(other) + + def __ne__(self, other: Any) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def __repr__(self) -> str: + """String representation of an expression with caching.""" + try: + return self._repr_value + except AttributeError: + args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) + self._repr_value = f"{type(self).__name__}({args})" + return self._repr_value + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Do not call this function directly, but rather + :meth:`evaluate` which handles the mapping lookups. + + Returns + ------- + Column representing the evaluation of the expression. + + Raises + ------ + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. + """ + raise NotImplementedError( + f"Evaluation of expression {type(self).__name__}" + ) # pragma: no cover; translation of unimplemented nodes trips first + + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Individual subclasses should implement :meth:`do_evaluate`, + this method provides logic to handle lookups in the + substitution mapping. + + Returns + ------- + Column representing the evaluation of the expression. + + Raises + ------ + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. + """ + if mapping is None: + return self.do_evaluate(df, context=context, mapping=mapping) + try: + return mapping[self] + except KeyError: + return self.do_evaluate(df, context=context, mapping=mapping) + + def collect_agg(self, *, depth: int) -> AggInfo: + """ + Collect information about aggregations in groupbys. + + Parameters + ---------- + depth + The depth of aggregating (reduction or sampling) + expressions we are currently at. + + Returns + ------- + Aggregation info describing the expression to aggregate in the + groupby. + + Raises + ------ + NotImplementedError + If we can't currently perform the aggregation request, for + example nested aggregations like ``a.max().min()``. 
+ """ + raise NotImplementedError( + f"Collecting aggregation info for {type(self).__name__}" + ) # pragma: no cover; check_agg trips first + + +class NamedExpr: + # NamedExpr does not inherit from Expr since it does not appear + # when evaluating expressions themselves, only when constructing + # named return values in dataframe (IR) nodes. + __slots__ = ("name", "value") + value: Expr + name: str + + def __init__(self, name: str, value: Expr) -> None: + self.name = name + self.value = value + + def __hash__(self) -> int: + """Hash of the expression.""" + return hash((type(self), self.name, self.value)) + + def __repr__(self) -> str: + """Repr of the expression.""" + return f"NamedExpr({self.name}, {self.value})" + + def __eq__(self, other: Any) -> bool: + """Equality of two expressions.""" + return ( + type(self) is type(other) + and self.name == other.name + and self.value == other.value + ) + + def __ne__(self, other: Any) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame providing context + context + Execution context + mapping + Substitution mapping + + Returns + ------- + Evaluated Column with name attached. + + See Also + -------- + :meth:`Expr.evaluate` for details, this function just adds the + name to a column produced from an expression. + """ + return self.value.evaluate(df, context=context, mapping=mapping).rename( + self.name + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return self.value.collect_agg(depth=depth) + + +class Col(Expr): + __slots__ = ("name",) + _non_child = ("dtype", "name") + name: str + children: tuple[()] + + def __init__(self, dtype: plc.DataType, name: str) -> None: + self.dtype = dtype + self.name = name + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # Deliberately remove the name here so that we guarantee + # evaluation of the IR produces names. + return df.column_map[self.name].rename(None) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([(self, plc.aggregation.collect_list(), self)]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py new file mode 100644 index 00000000000..19baae3611d --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
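A small usage sketch of the evaluation protocol defined in base.py above (illustrative, not part of the patch): evaluate consults the substitution mapping first and only falls back to do_evaluate. Here df and precomputed are hypothetical stand-ins for a cudf_polars DataFrame and a pre-evaluated Column.

    import pylibcudf as plc

    expr = Col(plc.DataType(plc.TypeId.INT64), "a")
    out = expr.evaluate(df)                               # reads df.column_map["a"], name stripped
    out = expr.evaluate(df, mapping={expr: precomputed})  # short-circuits to the override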
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""BinaryOp DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, ClassVar + +import pylibcudf as plc + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["BinOp"] + + +class BinOp(Expr): + __slots__ = ("op", "children") + _non_child = ("dtype", "op") + children: tuple[Expr, Expr] + + def __init__( + self, + dtype: plc.DataType, + op: plc.binaryop.BinaryOperator, + left: Expr, + right: Expr, + ) -> None: + super().__init__(dtype) + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. + op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) + self.op = op + self.children = (left, right) + if not plc.binaryop.is_supported_operation( + self.dtype, left.dtype, right.dtype, op + ): + raise NotImplementedError( + f"Operation {op.name} not supported " + f"for types {left.dtype.id().name} and {right.dtype.id().name} " + f"with output type {self.dtype.id().name}" + ) + + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { + pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, + pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, + pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, + pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, + pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, + pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, + pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, + pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, + pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, + pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, + pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, + pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, + pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, + pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, + pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, + pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, + pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, + pl_expr.Operator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, + pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, + } + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + left, right = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + 
lop = left.obj + rop = right.obj + if left.obj.size() != right.obj.size(): + if left.is_scalar: + lop = left.obj_scalar + elif right.is_scalar: + rop = right.obj_scalar + return Column( + plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, + # groupby construction has checked that we don't have + # nested aggs, so stop the recursion and return ourselves + # for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + left_info, right_info = ( + child.collect_agg(depth=depth) for child in self.children + ) + requests = [*left_info.requests, *right_info.requests] + # TODO: Hack, if there were no reductions inside this + # binary expression then we want to pre-evaluate and + # collect ourselves. Otherwise we want to collect the + # aggregations inside and post-evaluate. This is a bad way + # of checking that we are in case 1. + if all( + agg.kind() == plc.aggregation.Kind.COLLECT_LIST + for _, agg, _ in requests + ): + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + return AggInfo( + [*left_info.requests, *right_info.requests], + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py new file mode 100644 index 00000000000..ff9973a47d5 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -0,0 +1,269 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Boolean DSL nodes.""" + +from __future__ import annotations + +from functools import partial, reduce +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa +import pylibcudf as plc + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + ExecutionContext, + Expr, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + import polars.type_aliases as pl_types + + from cudf_polars.containers import DataFrame + +__all__ = ["BooleanFunction"] + + +class BooleanFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.BooleanFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + if self.name == pl_expr.BooleanFunction.IsIn and not all( + c.dtype == self.children[0].dtype for c in self.children + ): + # TODO: If polars IR doesn't put the casts in, we need to + # mimic the supertype promotion rules. 
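    # (Illustrative aside, not part of the patch.) The Kleene translation in
    # BinOp.__init__ above: for boolean output types, polars' And/Or arrive as
    # bitwise operators and are remapped to libcudf's null-aware logical ops so
    # that nulls propagate with Kleene semantics.
    op = plc.binaryop.BinaryOperator.BITWISE_AND
    op = BinOp._BOOL_KLEENE_MAPPING.get(op, op)
    assert op is plc.binaryop.BinaryOperator.NULL_LOGICAL_AND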
+            raise NotImplementedError("IsIn doesn't support supertype casting")
+
+    @staticmethod
+    def _distinct(
+        column: Column,
+        *,
+        keep: plc.stream_compaction.DuplicateKeepOption,
+        source_value: plc.Scalar,
+        target_value: plc.Scalar,
+    ) -> Column:
+        table = plc.Table([column.obj])
+        indices = plc.stream_compaction.distinct_indices(
+            table,
+            keep,
+            # TODO: polars doesn't expose options for these
+            plc.types.NullEquality.EQUAL,
+            plc.types.NanEquality.ALL_EQUAL,
+        )
+        return Column(
+            plc.copying.scatter(
+                [source_value],
+                indices,
+                plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]),
+            ).columns()[0]
+        )
+
+    _BETWEEN_OPS: ClassVar[
+        dict[
+            pl_types.ClosedInterval,
+            tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator],
+        ]
+    ] = {
+        "none": (
+            plc.binaryop.BinaryOperator.GREATER,
+            plc.binaryop.BinaryOperator.LESS,
+        ),
+        "left": (
+            plc.binaryop.BinaryOperator.GREATER_EQUAL,
+            plc.binaryop.BinaryOperator.LESS,
+        ),
+        "right": (
+            plc.binaryop.BinaryOperator.GREATER,
+            plc.binaryop.BinaryOperator.LESS_EQUAL,
+        ),
+        "both": (
+            plc.binaryop.BinaryOperator.GREATER_EQUAL,
+            plc.binaryop.BinaryOperator.LESS_EQUAL,
+        ),
+    }
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        if self.name in (
+            pl_expr.BooleanFunction.IsFinite,
+            pl_expr.BooleanFunction.IsInfinite,
+        ):
+            # Avoid evaluating the child if the dtype tells us it's unnecessary.
+            (child,) = self.children
+            is_finite = self.name == pl_expr.BooleanFunction.IsFinite
+            if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
+                value = plc.interop.from_arrow(
+                    pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
+                )
+                return Column(plc.Column.from_scalar(value, df.num_rows))
+            needles = child.evaluate(df, context=context, mapping=mapping)
+            to_search = [-float("inf"), float("inf")]
+            if is_finite:
+                # NaN is neither finite nor infinite
+                to_search.append(float("nan"))
+            haystack = plc.interop.from_arrow(
+                pa.array(
+                    to_search,
+                    type=plc.interop.to_arrow(needles.obj.type()),
+                )
+            )
+            result = plc.search.contains(haystack, needles.obj)
+            if is_finite:
+                result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
+            return Column(result)
+        columns = [
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        ]
+        # Kleene logic for Any (OR) and All (AND) if ignore_nulls is
+        # False
+        if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All):
+            (ignore_nulls,) = self.options
+            (column,) = columns
+            is_any = self.name == pl_expr.BooleanFunction.Any
+            agg = plc.aggregation.any() if is_any else plc.aggregation.all()
+            result = plc.reduce.reduce(column.obj, agg, self.dtype)
+            if not ignore_nulls and column.obj.null_count() > 0:
+                # Truth tables
+                #     Any         All
+                #   | F U T     | F U T
+                # --+------   --+------
+                # F | F U T   F | F F F
+                # U | U U T   U | F U U
+                # T | T T T   T | F U T
+                #
+                # If the input null count was non-zero, we must
+                # post-process the result to insert the correct value.
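    # (Illustrative aside, not part of the patch; pa and plc as imported in
    # this file.) Concretely, for Any over [False, None]: the reduction skips
    # nulls and yields False, but the truth table above says
    # False || Null => Null, so the host-side check below swaps in an
    # all-null one-row column.
    col = plc.interop.from_arrow(pa.array([False, None]))
    result = plc.reduce.reduce(col, plc.aggregation.any(), plc.DataType(plc.TypeId.BOOL8))
    plc.interop.to_arrow(result).as_py()  # False -- needs the Kleene fix-up
    plc.Column.all_null_like(col, 1)      # what is returned instead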
+ h_result = plc.interop.to_arrow(result).as_py() + if is_any and not h_result or not is_any and h_result: + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) + if self.name == pl_expr.BooleanFunction.IsNull: + (column,) = columns + return Column(plc.unary.is_null(column.obj)) + elif self.name == pl_expr.BooleanFunction.IsNotNull: + (column,) = columns + return Column(plc.unary.is_valid(column.obj)) + elif self.name == pl_expr.BooleanFunction.IsNan: + (column,) = columns + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) + elif self.name == pl_expr.BooleanFunction.IsNotNan: + (column,) = columns + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) + elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsLastDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsUnique: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsDuplicated: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.AllHorizontal: + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + output_type=self.dtype, + ), + (c.obj for c in columns), + ) + ) + elif self.name == pl_expr.BooleanFunction.AnyHorizontal: + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + output_type=self.dtype, + ), + (c.obj for c in columns), + ) + ) + elif self.name == pl_expr.BooleanFunction.IsIn: + needles, haystack = columns + return Column(plc.search.contains(haystack.obj, needles.obj)) + elif self.name == pl_expr.BooleanFunction.Not: + (column,) = columns + return Column( + plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) + ) + else: + raise NotImplementedError( + f"BooleanFunction {self.name}" + ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py new file mode 100644 
index 00000000000..f752a23b628
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -0,0 +1,132 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""DSL nodes for datetime operations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar
+
+import pyarrow as pa
+import pylibcudf as plc
+
+from polars.polars import _expr_nodes as pl_expr
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from cudf_polars.containers import DataFrame
+
+__all__ = ["TemporalFunction"]
+
+
+class TemporalFunction(Expr):
+    __slots__ = ("name", "options", "children")
+    _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent]] = {
+        pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR,
+        pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH,
+        pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY,
+        pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY,
+        pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR,
+        pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE,
+        pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND,
+        pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND,
+        pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND,
+        pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
+    }
+    _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        name: pl_expr.TemporalFunction,
+        options: tuple[Any, ...],
+        *children: Expr,
+    ) -> None:
+        super().__init__(dtype)
+        self.options = options
+        self.name = name
+        self.children = children
+        if self.name not in self._COMPONENT_MAP:
+            raise NotImplementedError(f"Temporal function {self.name}")
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        columns = [
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        ]
+        (column,) = columns
+        if self.name == pl_expr.TemporalFunction.Microsecond:
+            millis = plc.datetime.extract_datetime_component(
+                column.obj, plc.datetime.DatetimeComponent.MILLISECOND
+            )
+            micros = plc.datetime.extract_datetime_component(
+                column.obj, plc.datetime.DatetimeComponent.MICROSECOND
+            )
+            millis_as_micros = plc.binaryop.binary_operation(
+                millis,
+                plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())),
+                plc.binaryop.BinaryOperator.MUL,
+                plc.DataType(plc.TypeId.INT32),
+            )
+            total_micros = plc.binaryop.binary_operation(
+                micros,
+                millis_as_micros,
+                plc.binaryop.BinaryOperator.ADD,
+                plc.types.DataType(plc.types.TypeId.INT32),
+            )
+            return Column(total_micros)
+        elif self.name == pl_expr.TemporalFunction.Nanosecond:
+            millis = plc.datetime.extract_datetime_component(
+                column.obj, plc.datetime.DatetimeComponent.MILLISECOND
+            )
+            micros = plc.datetime.extract_datetime_component(
+                column.obj, plc.datetime.DatetimeComponent.MICROSECOND
+            )
+            nanos = plc.datetime.extract_datetime_component(
+                column.obj, plc.datetime.DatetimeComponent.NANOSECOND
+            )
+            millis_as_nanos = 
plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py new file mode 100644 index 00000000000..562a2255033 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Literal DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr +from cudf_polars.utils import dtypes + +if TYPE_CHECKING: + from collections.abc import Mapping + + import pyarrow as pa + + import polars as pl + + from cudf_polars.containers import DataFrame + +__all__ = ["Literal", "LiteralColumn"] + + +class Literal(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Scalar[Any] + children: tuple[()] + + def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: + super().__init__(dtype) + assert value.type == plc.interop.to_arrow(dtype) + self.value = value + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow scalar is correct by construction. + return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + + +class LiteralColumn(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Array[Any, Any] + children: tuple[()] + + def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: + super().__init__(dtype) + data = value.to_arrow() + self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + + def get_hash(self) -> int: + """Compute a hash of the column.""" + # This is stricter than necessary, but we only need this hash + # for identity in groupby replacements so it's OK. And this + # way we avoid doing potentially expensive compute. 
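    # (Illustrative aside, not part of the patch; pa as imported in this file.)
    # Identity hashing in practice: two value-equal pyarrow arrays are still
    # distinct keys, which is all the groupby-replacement machinery needs.
    a = pa.array([1, 2, 3])
    b = pa.array([1, 2, 3])
    a.equals(b)      # True: same values...
    id(a) == id(b)   # False: ...distinct objects, hence distinct hash keys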
+ return hash((type(self), self.dtype, id(self.value))) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow array is correct by construction. + return Column(plc.interop.from_arrow(self.value)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py new file mode 100644 index 00000000000..f7dcc3c542c --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Rolling DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from cudf_polars.dsl.expressions.base import Expr + +if TYPE_CHECKING: + import pylibcudf as plc + +__all__ = ["RollingWindow", "GroupedRollingWindow"] + + +class RollingWindow(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + children: tuple[Expr] + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: + super().__init__(dtype) + self.options = options + self.children = (agg,) + raise NotImplementedError("Rolling window not implemented") + + +class GroupedRollingWindow(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + children: tuple[Expr, ...] + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: + super().__init__(dtype) + self.options = options + self.children = (agg, *by) + raise NotImplementedError("Grouped rolling window not implemented") diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py new file mode 100644 index 00000000000..a7a3e68a28c --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
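A minimal sketch (illustrative, not part of the patch) of the two literal evaluation paths in literal.py above: Literal broadcasts a device scalar to a one-row column, while LiteralColumn converts its whole pyarrow array.

    import pyarrow as pa
    import pylibcudf as plc

    one_row = plc.Column.from_scalar(plc.interop.from_arrow(pa.scalar(1.5)), 1)
    whole_array = plc.interop.from_arrow(pa.array([1, 2, 3]))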
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for selection operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Gather", "Filter"] + + +class Gather(Expr): + __slots__ = ("children",) + _non_child = ("dtype",) + children: tuple[Expr, Expr] + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: + super().__init__(dtype) + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, indices = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + lo, hi = plc.reduce.minmax(indices.obj) + lo = plc.interop.to_arrow(lo).as_py() + hi = plc.interop.to_arrow(hi).as_py() + n = df.num_rows + if hi >= n or lo < -n: + raise ValueError("gather indices are out of bounds") + if indices.obj.null_count(): + bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY + obj = plc.replace.replace_nulls( + indices.obj, + plc.interop.from_arrow( + pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) + ), + ) + else: + bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK + obj = indices.obj + table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) + return Column(table.columns()[0]) + + +class Filter(Expr): + __slots__ = ("children",) + _non_child = ("dtype",) + children: tuple[Expr, Expr] + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + super().__init__(dtype) + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, mask = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + table = plc.stream_compaction.apply_boolean_mask( + plc.Table([values.obj]), mask.obj + ) + return Column(table.columns()[0]).sorted_like(values) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py new file mode 100644 index 00000000000..861b73ce6a0 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
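A sketch (illustrative, not part of the patch) of the null-index handling in Gather.do_evaluate above: null indices are replaced by the out-of-bounds sentinel n, then gathered with NULLIFY so those rows come back null.

    import pyarrow as pa
    import pylibcudf as plc

    values = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
    indices = plc.interop.from_arrow(pa.array([0, None, 2], type=pa.int32()))
    n = 3  # values.size()
    filled = plc.replace.replace_nulls(
        indices, plc.interop.from_arrow(pa.scalar(n, type=pa.int32()))
    )
    out = plc.copying.gather(
        plc.Table([values]), filled, plc.copying.OutOfBoundsPolicy.NULLIFY
    )  # single string column: ["a", None, "c"]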
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Sorting DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr +from cudf_polars.utils import sorting + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Sort", "SortBy"] + + +class Sort(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + children: tuple[Expr] + + def __init__( + self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr + ) -> None: + super().__init__(dtype) + self.options = options + self.children = (column,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + [descending], nulls_last=[nulls_last], num_keys=1 + ) + do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort + table = do_sort(plc.Table([column.obj]), order, null_order) + return Column( + table.columns()[0], + is_sorted=plc.types.Sorted.YES, + order=order[0], + null_order=null_order[0], + ) + + +class SortBy(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + children: tuple[Expr, ...] + + def __init__( + self, + dtype: plc.DataType, + options: tuple[bool, tuple[bool], tuple[bool]], + column: Expr, + *by: Expr, + ) -> None: + super().__init__(dtype) + self.options = options + self.children = (column, *by) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column, *by = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + table = do_sort( + plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order + ) + return Column(table.columns()[0]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py new file mode 100644 index 00000000000..6669669aadc --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -0,0 +1,283 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
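For the Sort and SortBy nodes above, a small sketch (illustrative, not part of the patch) of how a polars options tuple becomes libcudf sort keys via the sorting.sort_order helper used in the diff; the resulting order/null_order values also travel with the output Column as sortedness metadata, so downstream nodes can skip re-sorting.

    stable, nulls_last, descending = True, True, False  # a polars options tuple
    order, null_order = sorting.sort_order(
        [descending], nulls_last=[nulls_last], num_keys=1
    )
    do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort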
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""DSL nodes for string operations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+
+from polars.exceptions import InvalidOperationError
+from polars.polars import _expr_nodes as pl_expr
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from cudf_polars.containers import DataFrame
+
+__all__ = ["StringFunction"]
+
+
+class StringFunction(Expr):
+    __slots__ = ("name", "options", "children", "_regex_program")
+    _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        name: pl_expr.StringFunction,
+        options: tuple[Any, ...],
+        *children: Expr,
+    ) -> None:
+        super().__init__(dtype)
+        self.options = options
+        self.name = name
+        self.children = children
+        self._validate_input()
+
+    def _validate_input(self):
+        if self.name not in (
+            pl_expr.StringFunction.Contains,
+            pl_expr.StringFunction.EndsWith,
+            pl_expr.StringFunction.Lowercase,
+            pl_expr.StringFunction.Replace,
+            pl_expr.StringFunction.ReplaceMany,
+            pl_expr.StringFunction.Slice,
+            pl_expr.StringFunction.Strptime,
+            pl_expr.StringFunction.StartsWith,
+            pl_expr.StringFunction.StripChars,
+            pl_expr.StringFunction.StripCharsStart,
+            pl_expr.StringFunction.StripCharsEnd,
+            pl_expr.StringFunction.Uppercase,
+        ):
+            raise NotImplementedError(f"String function {self.name}")
+        if self.name == pl_expr.StringFunction.Contains:
+            literal, strict = self.options
+            if not literal:
+                if not strict:
+                    raise NotImplementedError(
+                        f"{strict=} is not supported for regex contains"
+                    )
+                if not isinstance(self.children[1], Literal):
+                    raise NotImplementedError(
+                        "Regex contains only supports a scalar pattern"
+                    )
+                pattern = self.children[1].value.as_py()
+                try:
+                    self._regex_program = plc.strings.regex_program.RegexProgram.create(
+                        pattern,
+                        flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
+                    )
+                except RuntimeError as e:
+                    raise NotImplementedError(
+                        f"Unsupported regex {pattern} for GPU engine."
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) + elif self.name == pl_expr.StringFunction.Slice: + if not all(isinstance(child, Literal) for child in self.children[1:]): + raise NotImplementedError( + "Slice only supports literal start and stop values" + ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == pl_expr.StringFunction.Contains: + child, arg = self.children + column = child.evaluate(df, context=context, mapping=mapping) + + literal, _ = self.options + if literal: + pat = arg.evaluate(df, context=context, mapping=mapping) + pattern = ( + pat.obj_scalar + if pat.is_scalar and pat.obj.size() != column.obj.size() + else pat.obj + ) + return Column(plc.strings.find.contains(column.obj, pattern)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) + elif self.name == pl_expr.StringFunction.Slice: + child, expr_offset, expr_length = self.children + assert isinstance(expr_offset, Literal) + assert isinstance(expr_length, Literal) + + column = child.evaluate(df, context=context, mapping=mapping) + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Negative values for start look backward + # from the last element of the string. If the end index would be + # below zero, an empty string is returned. 
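    # (Illustrative aside, not part of the patch.) Worked example of the
    # arithmetic below for "abcdef" with polars slice(-3, 2):
    start, length = -3, 2
    stop = start + length if length else None      # -1
    if length and start < 0 and length >= -start:
        stop = None                                # not taken for this input
    # libcudf then slices [start, stop) = [-3, -1) -> "de", matching polars.
    # For slice(-2, 5), length >= -start, so stop becomes None -> "ef".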
+ # Do this maths on the host + start = expr_offset.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # No length indicates a scan to the end + # The libcudf equivalent is a null stop + stop = start + length if length else None + if length and start < 0 and length >= -start: + stop = None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), + ) + ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.StringFunction.Lowercase: + (column,) = columns + return Column(plc.strings.case.to_lower(column.obj)) + elif self.name == pl_expr.StringFunction.Uppercase: + (column,) = columns + return Column(plc.strings.case.to_upper(column.obj)) + elif self.name == pl_expr.StringFunction.EndsWith: + column, suffix = columns + return Column( + plc.strings.find.ends_with( + column.obj, + suffix.obj_scalar + if column.obj.size() != suffix.obj.size() and suffix.is_scalar + else suffix.obj, + ) + ) + elif self.name == pl_expr.StringFunction.StartsWith: + column, prefix = columns + return Column( + plc.strings.find.starts_with( + column.obj, + prefix.obj_scalar + if column.obj.size() != prefix.obj.size() and prefix.is_scalar + else prefix.obj, + ) + ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) + raise NotImplementedError( + f"StringFunction {self.name}" + ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py new file mode 100644 index 00000000000..c7d7a802ded 
--- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for ternary operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + ExecutionContext, + Expr, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + + +__all__ = ["Ternary"] + + +class Ternary(Expr): + __slots__ = ("children",) + _non_child = ("dtype",) + children: tuple[Expr, Expr, Expr] + + def __init__( + self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr + ) -> None: + super().__init__(dtype) + self.children = (when, then, otherwise) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + when, then, otherwise = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + then_obj = then.obj_scalar if then.is_scalar else then.obj + otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj + return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py new file mode 100644 index 00000000000..3d4d15be1ce --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -0,0 +1,328 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
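A minimal sketch (illustrative, not part of the patch) of the branch selection in Ternary.do_evaluate above; copy_if_else accepts column/column as well as column/scalar operands, which is why scalar branches are passed as obj_scalar.

    import pyarrow as pa
    import pylibcudf as plc

    when = plc.interop.from_arrow(pa.array([True, False, True]))
    then = plc.interop.from_arrow(pa.array([1, 2, 3]))
    otherwise = plc.interop.from_arrow(pa.scalar(0, type=pa.int64()))
    out = plc.copying.copy_if_else(then, otherwise, when)  # [1, 0, 3]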
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
new file mode 100644
index 00000000000..3d4d15be1ce
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
@@ -0,0 +1,328 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+"""DSL nodes for unary operations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar
+
+import pyarrow as pa
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.literal import Literal
+from cudf_polars.utils import dtypes
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from cudf_polars.containers import DataFrame
+
+__all__ = ["Cast", "UnaryFunction", "Len"]
+
+
+class Cast(Expr):
+    """Class representing a cast of an expression."""
+
+    __slots__ = ("children",)
+    _non_child = ("dtype",)
+    children: tuple[Expr]
+
+    def __init__(self, dtype: plc.DataType, value: Expr) -> None:
+        super().__init__(dtype)
+        self.children = (value,)
+        if not dtypes.can_cast(value.dtype, self.dtype):
+            raise NotImplementedError(
+                f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}"
+            )
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        (child,) = self.children
+        column = child.evaluate(df, context=context, mapping=mapping)
+        return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column)
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        # TODO: Could do with sort-based groupby and segmented filter
+        (child,) = self.children
+        return child.collect_agg(depth=depth)
+
+
+class Len(Expr):
+    """Class representing the length of an expression."""
+
+    children: tuple[()]
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        return Column(
+            plc.Column.from_scalar(
+                plc.interop.from_arrow(
+                    pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype))
+                ),
+                1,
+            )
+        )
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        # TODO: polars returns a uint, not an int for count
+        return AggInfo(
+            [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)]
+        )
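`Len` above materialises the frame's row count as a one-row column of the requested dtype, which the wider evaluator can then broadcast. A standalone sketch using only calls that appear in this file; the 42-row frame and UINT32 result dtype are hypothetical:

    import pyarrow as pa
    import pylibcudf as plc

    # Build the one-row count column the way Len.do_evaluate does.
    dtype = plc.DataType(plc.TypeId.UINT32)
    count = plc.Column.from_scalar(
        plc.interop.from_arrow(pa.scalar(42, type=plc.interop.to_arrow(dtype))),
        1,
    )
    assert count.size() == 1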
+
+
+class UnaryFunction(Expr):
+    """Class representing unary functions of an expression."""
+
+    __slots__ = ("name", "options", "children")
+    _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
+
+    # Note: log and pow are handled via translation to binops
+    _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = {
+        "sin": plc.unary.UnaryOperator.SIN,
+        "cos": plc.unary.UnaryOperator.COS,
+        "tan": plc.unary.UnaryOperator.TAN,
+        "arcsin": plc.unary.UnaryOperator.ARCSIN,
+        "arccos": plc.unary.UnaryOperator.ARCCOS,
+        "arctan": plc.unary.UnaryOperator.ARCTAN,
+        "sinh": plc.unary.UnaryOperator.SINH,
+        "cosh": plc.unary.UnaryOperator.COSH,
+        "tanh": plc.unary.UnaryOperator.TANH,
+        "arcsinh": plc.unary.UnaryOperator.ARCSINH,
+        "arccosh": plc.unary.UnaryOperator.ARCCOSH,
+        "arctanh": plc.unary.UnaryOperator.ARCTANH,
+        "exp": plc.unary.UnaryOperator.EXP,
+        "sqrt": plc.unary.UnaryOperator.SQRT,
+        "cbrt": plc.unary.UnaryOperator.CBRT,
+        "ceil": plc.unary.UnaryOperator.CEIL,
+        "floor": plc.unary.UnaryOperator.FLOOR,
+        "abs": plc.unary.UnaryOperator.ABS,
+        "bit_invert": plc.unary.UnaryOperator.BIT_INVERT,
+        "not": plc.unary.UnaryOperator.NOT,
+    }
+    _supported_misc_fns = frozenset(
+        {
+            "drop_nulls",
+            "fill_null",
+            "mask_nans",
+            "round",
+            "set_sorted",
+            "unique",
+        }
+    )
+    _supported_cum_aggs = frozenset(
+        {
+            "cum_min",
+            "cum_max",
+            "cum_prod",
+            "cum_sum",
+        }
+    )
+    _supported_fns = frozenset().union(
+        _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys()
+    )
+
+    def __init__(
+        self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr
+    ) -> None:
+        super().__init__(dtype)
+        self.name = name
+        self.options = options
+        self.children = children
+
+        if self.name not in UnaryFunction._supported_fns:
+            raise NotImplementedError(f"Unary function {name=}")
+        if self.name in UnaryFunction._supported_cum_aggs:
+            (reverse,) = self.options
+            if reverse:
+                raise NotImplementedError(
+                    "reverse=True is not supported for cumulative aggregations"
+                )
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        if self.name == "mask_nans":
+            (child,) = self.children
+            return child.evaluate(df, context=context, mapping=mapping).mask_nans()
+        if self.name == "round":
+            (decimal_places,) = self.options
+            (values,) = (
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            )
+            return Column(
+                plc.round.round(
+                    values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP
+                )
+            ).sorted_like(values)
+        elif self.name == "unique":
+            (maintain_order,) = self.options
+            (values,) = (
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            )
+            # Only one column, so keep_any is the same as keep_first
+            # for stable distinct
+            keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY
+            if values.is_sorted:
+                maintain_order = True
+                result = plc.stream_compaction.unique(
+                    plc.Table([values.obj]),
+                    [0],
+                    keep,
+                    plc.types.NullEquality.EQUAL,
+                )
+            else:
+                distinct = (
+                    plc.stream_compaction.stable_distinct
+                    if maintain_order
+                    else plc.stream_compaction.distinct
+                )
+                result = distinct(
+                    plc.Table([values.obj]),
+                    [0],
+                    keep,
+                    plc.types.NullEquality.EQUAL,
+                    plc.types.NanEquality.ALL_EQUAL,
+                )
+            (column,) = result.columns()
+            if maintain_order:
+                return Column(column).sorted_like(values)
+            return Column(column)
+        elif self.name == "set_sorted":
+            (column,) = (
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            )
+            (asc,) = self.options
+            order = (
+                plc.types.Order.ASCENDING
+                if asc == "ascending"
+
else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: + # PERF: This invokes four stream synchronisations! + has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() + has_nulls_last = not plc.copying.get_element( + column.obj, n - 1 + ).is_valid() + if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( + order == plc.types.Order.ASCENDING and has_nulls_last + ): + null_order = plc.types.NullOrder.AFTER + return column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) + elif self.name == "drop_nulls": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.stream_compaction.drop_nulls( + plc.Table([column.obj]), [0], 1 + ).columns()[0] + ) + elif self.name == "fill_null": + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if isinstance(self.children[1], Literal): + arg = plc.interop.from_arrow(self.children[1].value) + else: + evaluated = self.children[1].evaluate( + df, context=context, mapping=mapping + ) + arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj + return Column(plc.replace.replace_nulls(column.obj, arg)) + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + plc.types.TypeId.UINT16, + } + ) or ( + self.name == "cum_prod" + and plc.traits.is_integral(col_type) + and plc.types.size_of(col_type) <= 4 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.INT64) + ) + elif ( + self.name == "cum_sum" + and column.obj.type().id() == plc.types.TypeId.BOOL8 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.UINT32) + ) + if self.name == "cum_sum": + agg = plc.aggregation.sum() + elif self.name == "cum_prod": + agg = plc.aggregation.product() + elif self.name == "cum_min": + agg = plc.aggregation.min() + elif self.name == "cum_max": + agg = plc.aggregation.max() + + return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: + raise NotImplementedError(f"{self.name} in groupby") + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), 
self)]) + else: + (child,) = self.children + return child.collect_agg(depth=depth)
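The dtype promotions in `UnaryFunction`'s cumulative-aggregation branch mirror polars' host behaviour (see the cum_agg.rs link above). A sketch of just the promotion rule, using only calls that appear in the hunk; the helper name is illustrative:

    import pylibcudf as plc

    _SMALL_INTS = {
        plc.types.TypeId.INT8,
        plc.types.TypeId.UINT8,
        plc.types.TypeId.INT16,
        plc.types.TypeId.UINT16,
    }

    def scan_dtype(name: str, col_type: plc.DataType) -> plc.DataType:
        """Accumulator dtype a cumulative aggregation scans in."""
        # cum_sum widens small ints to INT64 so the running sum cannot
        # overflow, and counts booleans into a UINT32 accumulator.
        if name == "cum_sum" and col_type.id() in _SMALL_INTS:
            return plc.types.DataType(plc.types.TypeId.INT64)
        if name == "cum_sum" and col_type.id() == plc.types.TypeId.BOOL8:
            return plc.types.DataType(plc.types.TypeId.UINT32)
        # cum_prod widens integral types that are 4 bytes or narrower.
        if (
            name == "cum_prod"
            and plc.traits.is_integral(col_type)
            and plc.types.size_of(col_type) <= 4
        ):
            return plc.types.DataType(plc.types.TypeId.INT64)
        return col_type

    int8 = plc.types.DataType(plc.types.TypeId.INT8)
    assert scan_dtype("cum_sum", int8).id() == plc.types.TypeId.INT64
    assert scan_dtype("cum_max", int8).id() == plc.types.TypeId.INT8  # unchanged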
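Lastly, a usage-level sanity check that exercises several of the new nodes end to end; `engine="gpu"` assumes a cudf-polars build is available, and the default CPU engine produces the same frame:

    import polars as pl

    q = pl.LazyFrame({"a": [-1, None, 2, None]}).select(
        pl.col("a").fill_null(0).cum_sum().alias("running"),
        pl.when(pl.col("a").is_null()).then(0).otherwise(pl.col("a")).alias("masked"),
    )
    # Expected lowering: UnaryFunction("fill_null"), a cum_sum scan, and a
    # Ternary (copy_if_else) respectively.
    print(q.collect())  # or q.collect(engine="gpu") where available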