From f364fdcd44540b6d5403f1d08acbebfff4e78bd4 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Fri, 19 Jul 2024 14:56:13 -0400 Subject: [PATCH 01/72] DOC v24.10 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 6 +-- .devcontainer/cuda11.8-pip/devcontainer.json | 6 +-- .../cuda12.5-conda/devcontainer.json | 6 +-- .devcontainer/cuda12.5-pip/devcontainer.json | 6 +-- .github/workflows/build.yaml | 20 ++++----- .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 44 +++++++++---------- .github/workflows/test.yaml | 22 +++++----- README.md | 2 +- VERSION | 2 +- ci/test_wheel_cudf_polars.sh | 2 +- .../all_cuda-118_arch-x86_64.yaml | 10 ++--- .../all_cuda-125_arch-x86_64.yaml | 10 ++--- cpp/examples/versions.cmake | 2 +- dependencies.yaml | 32 +++++++------- java/ci/README.md | 4 +- java/pom.xml | 2 +- python/cudf/pyproject.toml | 4 +- python/cudf_kafka/pyproject.toml | 2 +- python/cudf_polars/docs/overview.md | 2 +- python/cudf_polars/pyproject.toml | 2 +- python/custreamz/pyproject.toml | 4 +- python/dask_cudf/pyproject.toml | 6 +-- 23 files changed, 99 insertions(+), 99 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 8423fe21c29..7a1361e52c5 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 4945d6cf753..64d7cd54130 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index fadce01d060..c1924243506 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 026eb540952..beab2940176 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2e5959338b0..2fc39c06fad 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: 
upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -101,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -111,7 +111,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index 5a937b2f362..cf0c2b377dd 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: nightly 
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d5dfc9e1ff5..c2e7f64f952 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -34,41 +34,41 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: pull-request 
script: "ci/test_python_cudf.sh" @@ -76,14 +76,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -93,7 +93,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -103,7 +103,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -113,7 +113,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -123,21 +123,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: 
wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-tests-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -157,7 +157,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -166,7 +166,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -174,7 +174,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -185,7 +185,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -194,7 +194,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: pull-request @@ -204,7 +204,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit 
tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 36c9088d93c..9feea050b19 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index 1ab6a2d7457..fd8b0365807 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.08 python=3.11 cuda-version=12.5 + cudf=24.10 python=3.11 cuda-version=12.5 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index ec8489fda92..7c7ba04436f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.08.00 +24.10.00 diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 900acd5d473..cc9f5788685 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -10,7 +10,7 @@ set -eou pipefail # files in cudf_polars/pylibcudf", rather than "are there changes # between upstream and this branch which touch cudf_polars/pylibcudf" # TODO: is the target branch exposed anywhere in an environment variable? 
-if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; +if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; then HAS_CHANGES=1 else diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b8d73a01f96..b1a1cc3c68e 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -43,10 +43,10 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.8.*,>=0.0.0a0 +- libkvikio==24.10.*,>=0.0.0a0 - libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.*,>=0.0.0a0 +- librmm==24.10.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -77,9 +77,9 @@ dependencies: - python>=3.9,<3.12 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - rich -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 3f5fae49cbb..1017b11779c 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -27,7 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -42,10 +42,10 @@ dependencies: - libarrow==16.1.0.* - libcufile-dev - libcurand-dev -- libkvikio==24.8.*,>=0.0.0a0 +- libkvikio==24.10.*,>=0.0.0a0 - libparquet==16.1.0.* - 
librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.*,>=0.0.0a0 +- librmm==24.10.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -75,9 +75,9 @@ dependencies: - python>=3.9,<3.12 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - rich -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 144b3d3721b..44493011673 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -12,4 +12,4 @@ # the License. # ============================================================================= -set(CUDF_TAG branch-24.08) +set(CUDF_TAG branch-24.10) diff --git a/dependencies.yaml b/dependencies.yaml index a19574b7658..a90ac64387b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -287,8 +287,8 @@ dependencies: - output_types: conda packages: - fmt>=10.1.1,<11 - - librmm==24.8.*,>=0.0.0a0 - - libkvikio==24.8.*,>=0.0.0a0 + - librmm==24.10.*,>=0.0.0a0 + - libkvikio==24.10.*,>=0.0.0a0 - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==3.0.6 @@ -329,7 +329,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_conda rmm==24.8.*,>=0.0.0a0 + - &rmm_conda rmm==24.10.*,>=0.0.0a0 - pip - pip: - git+https://github.com/python-streamz/streamz.git@master @@ -345,10 +345,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: &build_python_packages_cu12 - - rmm-cu12==24.8.*,>=0.0.0a0 + - rmm-cu12==24.10.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: &build_python_packages_cu11 - - rmm-cu11==24.8.*,>=0.0.0a0 + - rmm-cu11==24.10.*,>=0.0.0a0 - {matrix: null, packages: [*rmm_conda] } libarrow_build: common: @@ -505,7 +505,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.8.*,>=0.0.0a0 + - dask-cuda==24.10.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -597,11 +597,11 @@ 
dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - rmm-cu12==24.8.*,>=0.0.0a0 + - rmm-cu12==24.10.*,>=0.0.0a0 - pynvjitlink-cu12>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - rmm-cu11==24.8.*,>=0.0.0a0 + - rmm-cu11==24.10.*,>=0.0.0a0 - cubinlinker-cu11 - ptxcompiler-cu11 - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda]} @@ -614,7 +614,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==24.8.*,>=0.0.0a0 + - rapids-dask-dependency==24.10.*,>=0.0.0a0 run_custreamz: common: - output_types: conda @@ -700,13 +700,13 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==24.8.*,>=0.0.0a0 + - dask-cuda==24.10.*,>=0.0.0a0 - *numba depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.8.*,>=0.0.0a0 + - &cudf_conda cudf==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -718,16 +718,16 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf-cu12==24.8.*,>=0.0.0a0 + - cudf-cu12==24.10.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - cudf-cu11==24.8.*,>=0.0.0a0 + - cudf-cu11==24.10.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_conda]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_conda cudf_kafka==24.8.*,>=0.0.0a0 + - &cudf_kafka_conda cudf_kafka==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -739,10 +739,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cudf_kafka-cu12==24.8.*,>=0.0.0a0 + - cudf_kafka-cu12==24.10.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - cudf_kafka-cu11==24.8.*,>=0.0.0a0 + - cudf_kafka-cu11==24.10.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_kafka_conda]} depends_on_cupy: common: diff --git a/java/ci/README.md b/java/ci/README.md index 
49481efab6b..ccb9efb50b6 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.08 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.10 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-24.08.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-24.10.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index 70230e6bc71..9694e741f16 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT cudfjni diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index dcb33b1fc1a..da57622dec7 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "ptxcompiler", "pyarrow>=16.1.0,<16.2.0a0", "rich", - "rmm==24.8.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -126,7 +126,7 @@ requires = [ "ninja", "numpy==1.23.*", "pyarrow==16.1.0.*", - "rmm==24.8.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[tool.scikit-build] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index badfdf06d15..bff1a9b8493 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 874bb849747..6cd36136bf8 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -8,7 +8,7 @@ You will need: preferred configuration. Or else, use [rustup](https://www.rust-lang.org/tools/install) 2. A [cudf development - environment](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment). + environment](https://github.com/rapidsai/cudf/blob/branch-24.10/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. > ![NOTE] These instructions will get simpler as we merge code in. diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 0b559f7a8e9..393a7510c89 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", "polars>=1.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 7b99e041b54..59ce15ac4ef 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -20,8 +20,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", - "cudf==24.8.*,>=0.0.0a0", - "cudf_kafka==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", + "cudf_kafka==24.10.*,>=0.0.0a0", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 9b2e3a5a7b1..4968ff0b076 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -19,12 +19,12 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<2.0a0", "pandas>=2.0,<2.2.3dev0", - "rapids-dask-dependency==24.8.*,>=0.0.0a0", + "rapids-dask-dependency==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -45,7 +45,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==24.8.*,>=0.0.0a0", + "dask-cuda==24.10.*,>=0.0.0a0", "numba>=0.57", "pytest-cov", "pytest-xdist", From 29ce5c529ea9ea18edc32ab905f1ef076f266008 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 25 Jul 2024 01:29:41 +0200 Subject: [PATCH 02/72] Fix some issues with deprecated / removed cccl facilities (#16377) `cub::If` has been deprecated and should not be used. 
There is a better alternative in `cuda::std::conditional_t` `thrust::{binary, unary}_function` has been deprecated and does not serve a purpose similar to the removed `std::{binary, unary}_function` Rather than relying on the type aliases one should use the `std::invoke` machinery Authors: - Michael Schellenberger Costa (https://github.com/miscco) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Bernhard Manfred Gruber (https://github.com/bernhardmgruber) URL: https://github.com/rapidsai/cudf/pull/16377 --- cpp/benchmarks/common/generate_input.cu | 2 +- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/src/io/fst/agent_dfa.cuh | 2 +- cpp/src/reductions/minmax.cu | 3 +-- java/src/main/native/src/aggregation128_utils.cu | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 6df2cb44adc..0970003deb2 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -718,7 +718,7 @@ std::unique_ptr create_random_column(data_profi } template -struct clamp_down : public thrust::unary_function { +struct clamp_down { T max; clamp_down(T max) : max(max) {} __host__ __device__ T operator()(T x) const { return min(x, max); } diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index d3e9fc4974d..e8e95380815 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -518,7 +518,7 @@ struct column_gatherer_impl { * Positive indices are unchanged by this transformation. 
*/ template -struct index_converter : public thrust::unary_function { +struct index_converter { index_converter(size_type n_rows) : n_rows(n_rows) {} __device__ map_type operator()(map_type in) const { return ((in % n_rows) + n_rows) % n_rows; } diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index bc5b94e2718..0e70984b39c 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -791,7 +791,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL can_use_smem_cache; using DFASimulationCallbackWrapperT = - typename cub::If::Type; + cuda::std::conditional_t; // Stage 1: Compute the state-transition vector if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 2c1181972c5..6cb58786971 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -107,8 +107,7 @@ rmm::device_scalar reduce_device(InputIterator d_in, * respectively of the minimums and maximums of the input pairs. */ template -struct minmax_binary_op - : public thrust::binary_function, minmax_pair, minmax_pair> { +struct minmax_binary_op { __device__ minmax_pair operator()(minmax_pair const& lhs, minmax_pair const& rhs) const { return minmax_pair{thrust::min(lhs.min_val, rhs.min_val), diff --git a/java/src/main/native/src/aggregation128_utils.cu b/java/src/main/native/src/aggregation128_utils.cu index a32e7d27085..631df58b017 100644 --- a/java/src/main/native/src/aggregation128_utils.cu +++ b/java/src/main/native/src/aggregation128_utils.cu @@ -34,7 +34,7 @@ namespace { // Functor to reassemble a 128-bit value from four 64-bit chunks with overflow detection. 
-class chunk_assembler : public thrust::unary_function { +class chunk_assembler { public: chunk_assembler(bool* overflows, uint64_t const* chunks0, From 5a3399bec868f44d13c003f172c665919096d8e8 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 24 Jul 2024 19:26:12 -0500 Subject: [PATCH 03/72] fix [tool.setuptools] reference in custreamz config (#16365) Noticed this warning in logs from #16183 > _/python3.10/site-packages/setuptools/config/pyprojecttoml.py:70: _ToolsTypoInMetadata: Ignoring [tools.setuptools] in pyproject.toml, did you mean [tool.setuptools]?_ This fixes that. ## Notes for Reviewers Intentionally targeting this at 24.10. This misconfiguration has been in `custreamz` since the 23.04 release ([git blame link](https://github.com/rapidsai/cudf/blame/e6d412cba7c23df7ee500c28257ed9281cea49b9/python/custreamz/pyproject.toml#L60)). I think the only effect might be that some test files are included in wheels when we don't want to. I don't think the fix for it needs to be rushed into 24.08. I searched across RAPIDS in case this was copied from somewhere else... don't see any other instances of this typo that need to be fixed. 
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16365 --- python/custreamz/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 59ce15ac4ef..4be94aa3368 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -57,7 +57,7 @@ zip-safe = false [tool.setuptools.dynamic] version = {file = "custreamz/VERSION"} -[tools.setuptools.packages.find] +[tool.setuptools.packages.find] include = [ "custreamz", "custreamz.*", From 473dec55abd1a3d9d540c541443f831d18ebb532 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Fri, 26 Jul 2024 14:45:12 -0700 Subject: [PATCH 04/72] Add query 10 to the TPC-H suite (#16392) Adds Q10 to the TPC-H benchmark suite Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16392 --- cpp/examples/tpch/CMakeLists.txt | 4 + cpp/examples/tpch/q1.cpp | 2 +- cpp/examples/tpch/q10.cpp | 166 +++++++++++++++++++++++++++++++ cpp/examples/tpch/q5.cpp | 20 ++-- cpp/examples/tpch/q6.cpp | 2 +- 5 files changed, 182 insertions(+), 12 deletions(-) create mode 100644 cpp/examples/tpch/q10.cpp diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt index 1b91d07e148..373a6d72d56 100644 --- a/cpp/examples/tpch/CMakeLists.txt +++ b/cpp/examples/tpch/CMakeLists.txt @@ -30,3 +30,7 @@ target_compile_features(tpch_q6 PRIVATE cxx_std_17) add_executable(tpch_q9 q9.cpp) target_link_libraries(tpch_q9 PRIVATE cudf::cudf) target_compile_features(tpch_q9 PRIVATE cxx_std_17) + +add_executable(tpch_q10 q10.cpp) +target_link_libraries(tpch_q10 PRIVATE cudf::cudf) +target_compile_features(tpch_q10 PRIVATE cxx_std_17) diff --git a/cpp/examples/tpch/q1.cpp 
b/cpp/examples/tpch/q1.cpp index 1bdf039da4a..fe03320b888 100644 --- a/cpp/examples/tpch/q1.cpp +++ b/cpp/examples/tpch/q1.cpp @@ -124,7 +124,7 @@ int main(int argc, char const** argv) auto shipdate_upper = cudf::timestamp_scalar(days_since_epoch(1998, 9, 2), true); auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper); - auto lineitem_pred = std::make_unique( + auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); // Read out the `lineitem` table from parquet file diff --git a/cpp/examples/tpch/q10.cpp b/cpp/examples/tpch/q10.cpp new file mode 100644 index 00000000000..94da46f6930 --- /dev/null +++ b/cpp/examples/tpch/q10.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q10.cpp + * @brief Implement query 10 of the TPC-H benchmark. 
+ * + * create view customer as select * from '/tables/scale-1/customer.parquet'; + * create view orders as select * from '/tables/scale-1/orders.parquet'; + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * create view nation as select * from '/tables/scale-1/nation.parquet'; + * + * select + * c_custkey, + * c_name, + * sum(l_extendedprice * (1 - l_discount)) as revenue, + * c_acctbal, + * n_name, + * c_address, + * c_phone, + * c_comment + * from + * customer, + * orders, + * lineitem, + * nation + * where + * c_custkey = o_custkey + * and l_orderkey = o_orderkey + * and o_orderdate >= date '1993-10-01' + * and o_orderdate < date '1994-01-01' + * and l_returnflag = 'R' + * and c_nationkey = n_nationkey + * group by + * c_custkey, + * c_name, + * c_acctbal, + * c_phone, + * n_name, + * c_address, + * c_comment + * order by + * revenue desc; + */ + +/** + * @brief Calculate the revenue column + * + * @param extendedprice The extended price column + * @param discount The discount column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +[[nodiscard]] std::unique_ptr calc_revenue( + cudf::column_view const& extendedprice, + cudf::column_view const& discount, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto revenue = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + revenue_type, + stream, + mr); + return revenue; +} +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Define the column projection and filter predicate for the `orders` table + std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; + auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( + orders_cols.begin(), std::find(orders_cols.begin(), orders_cols.end(), "o_orderdate"))); + auto o_orderdate_lower = + cudf::timestamp_scalar(days_since_epoch(1993, 10, 1), true); + auto const o_orderdate_lower_limit = cudf::ast::literal(o_orderdate_lower); + auto const o_orderdate_pred_lower = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, o_orderdate_ref, o_orderdate_lower_limit); + auto o_orderdate_upper = + cudf::timestamp_scalar(days_since_epoch(1994, 1, 1), true); + auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); + auto const o_orderdate_pred_upper = + cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); + auto const orders_pred = std::make_unique( + cudf::ast::ast_operator::LOGICAL_AND, 
o_orderdate_pred_lower, o_orderdate_pred_upper); + + auto const l_returnflag_ref = cudf::ast::column_reference(3); + auto r_scalar = cudf::string_scalar("R"); + auto const r_literal = cudf::ast::literal(r_scalar); + auto const lineitem_pred = std::make_unique( + cudf::ast::ast_operator::EQUAL, l_returnflag_ref, r_literal); + + // Read out the tables from parquet files + // while pushing down the column projections and filter predicates + auto const customer = read_parquet( + args.dataset_dir + "/customer.parquet", + {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); + auto const orders = + read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); + auto const lineitem = + read_parquet(args.dataset_dir + "/lineitem.parquet", + {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, + std::move(lineitem_pred)); + auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_name", "n_nationkey"}); + + // Perform the joins + auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); + auto const join_b = apply_inner_join(lineitem, orders, {"l_orderkey"}, {"o_orderkey"}); + auto const joined_table = apply_inner_join(join_a, join_b, {"c_custkey"}, {"o_custkey"}); + + // Calculate and append the `revenue` column + auto revenue = + calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + (*joined_table).append(revenue, "revenue"); + + // Perform the groupby operation + auto const groupedby_table = apply_groupby( + joined_table, + groupby_context_t{ + {"c_custkey", "c_name", "c_acctbal", "c_phone", "n_name", "c_address", "c_comment"}, + { + {"revenue", {{cudf::aggregation::Kind::SUM, "revenue"}}}, + }}); + + // Perform the order by operation + auto const orderedby_table = + apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet 
file + orderedby_table->to_parquet("q10.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q5.cpp b/cpp/examples/tpch/q5.cpp index e56850b94d6..89396a6c968 100644 --- a/cpp/examples/tpch/q5.cpp +++ b/cpp/examples/tpch/q5.cpp @@ -44,14 +44,14 @@ * region * where * c_custkey = o_custkey - * and l_orderkey = o_orderkey - * and l_suppkey = s_suppkey - * and c_nationkey = s_nationkey - * and s_nationkey = n_nationkey - * and n_regionkey = r_regionkey - * and r_name = 'ASIA' - * and o_orderdate >= date '1994-01-01' - * and o_orderdate < date '1995-01-01' + * and l_orderkey = o_orderkey + * and l_suppkey = s_suppkey + * and c_nationkey = s_nationkey + * and s_nationkey = n_nationkey + * and n_regionkey = r_regionkey + * and r_name = 'ASIA' + * and o_orderdate >= date '1994-01-01' + * and o_orderdate < date '1995-01-01' * group by * n_name * order by @@ -109,7 +109,7 @@ int main(int argc, char const** argv) auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); auto const o_orderdate_pred_upper = cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); - auto orders_pred = std::make_unique( + auto const orders_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper); // Define the column projection and filter predicate for the `region` table @@ -118,7 +118,7 @@ int main(int argc, char const** argv) region_cols.begin(), std::find(region_cols.begin(), region_cols.end(), "r_name"))); auto r_name_value = cudf::string_scalar("ASIA"); auto const r_name_literal = cudf::ast::literal(r_name_value); - auto region_pred = std::make_unique( + auto const region_pred = std::make_unique( cudf::ast::ast_operator::EQUAL, r_name_ref, r_name_literal); // Read out the tables from parquet files diff --git a/cpp/examples/tpch/q6.cpp b/cpp/examples/tpch/q6.cpp index f11b3d6ab3b..405b2ac73ca 100644 --- a/cpp/examples/tpch/q6.cpp +++ b/cpp/examples/tpch/q6.cpp @@ -84,7 +84,7 @@ int 
main(int argc, char const** argv) cudf::ast::ast_operator::GREATER_EQUAL, shipdate_ref, shipdate_lower_literal); auto const shipdate_pred_b = cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal); - auto lineitem_pred = std::make_unique( + auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); auto lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); From 24997fda194d5b8af34048a8bf275830cabbff8c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:37:30 -0700 Subject: [PATCH 05/72] Deduplicate decimal32/decimal64 to decimal128 conversion function (#16236) Closes #16194 This PR deduplicates the `convert_data_to_decimal128` function from `to_arrow.cu`, `writer_impl.cu` and `to_arrow_device.cu` to a common location. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16236 --- cpp/CMakeLists.txt | 1 + .../interop/decimal_conversion_utilities.cu | 70 +++++++++++++++++ .../interop/decimal_conversion_utilities.cuh | 44 +++++++++++ cpp/src/interop/to_arrow.cu | 8 +- cpp/src/interop/to_arrow_device.cu | 5 +- cpp/src/interop/to_arrow_host.cu | 40 +--------- cpp/src/io/parquet/writer_impl.cu | 60 ++++----------- cpp/tests/interop/to_arrow_device_test.cpp | 77 +++++++++++++++++++ 8 files changed, 220 insertions(+), 85 deletions(-) create mode 100644 cpp/src/interop/decimal_conversion_utilities.cu create mode 100644 cpp/src/interop/decimal_conversion_utilities.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 95c509efc5b..310bc99b279 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -365,6 
+365,7 @@ add_library( src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/arrow_utilities.cpp + src/interop/decimal_conversion_utilities.cu src/interop/to_arrow.cu src/interop/to_arrow_device.cu src/interop/to_arrow_host.cu diff --git a/cpp/src/interop/decimal_conversion_utilities.cu b/cpp/src/interop/decimal_conversion_utilities.cu new file mode 100644 index 00000000000..2f81c754a30 --- /dev/null +++ b/cpp/src/interop/decimal_conversion_utilities.cu @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "decimal_conversion_utilities.cuh" + +#include +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace detail { + +template +std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + static_assert(std::is_same_v or std::is_same_v, + "Only int32 and int64 decimal types can be converted to decimal128."); + + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); + auto buf = std::make_unique(column.size() * sizeof(__int128_t), stream, mr); + + thrust::for_each(rmm::exec_policy_nosync(stream, mr), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.size()), + [in = column.begin(), + out = reinterpret_cast(buf->data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // the lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + + return buf; +} + +// Instantiate templates for int32_t and int64_t decimal types +template std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +template std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/interop/decimal_conversion_utilities.cuh b/cpp/src/interop/decimal_conversion_utilities.cuh new file mode 100644 index 00000000000..41263147404 --- /dev/null +++ b/cpp/src/interop/decimal_conversion_utilities.cuh @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include + +namespace cudf::detail { + +/** + * @brief Convert decimal32 and decimal64 numeric data to decimal128 and return the device vector + * + * @tparam DecimalType to convert from + * + * @param column A view of the input columns + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return A device vector containing the converted decimal128 data + */ +template +std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +} // namespace cudf::detail diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 6b163e3441e..3d41f856f4f 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -15,6 +15,7 @@ */ #include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include "detail/arrow_allocator.hpp" #include @@ -158,8 +159,11 @@ std::shared_ptr unsupported_decimals_to_arrow(column_view input, arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto buf = - detail::decimals_to_arrow(input, stream, rmm::mr::get_current_device_resource()); + auto buf = detail::convert_decimals_to_decimal128( + input, stream, rmm::mr::get_current_device_resource()); + + // Synchronize stream here 
to ensure the decimal128 buffer is ready. + stream.synchronize(); auto const buf_size_in_bytes = buf->size(); auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index 2eb9b912054..cea7cdebcba 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -15,6 +15,7 @@ */ #include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include #include @@ -141,7 +142,9 @@ int construct_decimals(cudf::column_view input, nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input)); - auto buf = detail::decimals_to_arrow(input, stream, mr); + auto buf = detail::convert_decimals_to_decimal128(input, stream, mr); + // Synchronize stream here to ensure the decimal128 buffer is ready. + stream.synchronize(); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get())); ArrowArrayMove(tmp.get(), out); diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index c9e53ebaab7..193b3a3b5a2 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -15,6 +15,7 @@ */ #include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include #include @@ -50,41 +51,6 @@ namespace cudf { namespace detail { -template -std::unique_ptr decimals_to_arrow(cudf::column_view input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); - auto buf = std::make_unique(input.size() * sizeof(__int128_t), stream, mr); - - auto count = thrust::counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream, mr), - count, - count + input.size(), - [in = input.begin(), - out = reinterpret_cast(buf->data()), - BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // the 
lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; - } - }); - - return buf; -} - -template std::unique_ptr decimals_to_arrow( - cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - -template std::unique_ptr decimals_to_arrow( - cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); - namespace { struct dispatch_to_arrow_host { @@ -156,7 +122,9 @@ struct dispatch_to_arrow_host { NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column)); NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); - auto buf = detail::decimals_to_arrow(column, stream, mr); + auto buf = detail::convert_decimals_to_decimal128(column, stream, mr); + // No need to synchronize stream here as populate_data_buffer uses the same stream to copy data + // to host. 
NANOARROW_RETURN_NOT_OK( populate_data_buffer(device_span<__int128_t const>( reinterpret_cast(buf->data()), column.size()), diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 2df71b77301..36a1d8377bf 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -22,6 +22,7 @@ #include "arrow_schema_writer.hpp" #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" +#include "interop/decimal_conversion_utilities.cuh" #include "io/comp/nvcomp_adapter.hpp" #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" @@ -1601,50 +1602,12 @@ size_t column_index_buffer_size(EncColumnChunk* ck, return ck->ck_stat_size * num_pages + column_index_truncate_length + padding + size_struct_size; } -/** - * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector - * - * @tparam DecimalType to convert from - * - * @param column A view of the input columns - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return A device vector containing the converted decimal128 data - */ -template -rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column, - rmm::cuda_stream_view stream) -{ - size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); - - rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); - - thrust::for_each(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(column.size()), - [in = column.begin(), - out = reinterpret_cast(d128_buffer.data()), - BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // The lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. 
- out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; - } - }); - - return d128_buffer; -} - /** * @brief Function to convert decimal32 and decimal64 columns to decimal128 data, * update the input table metadata, and return a new vector of column views. * * @param[in,out] table_meta The table metadata - * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers. + * @param[in,out] d128_buffers Buffers containing the converted decimal128 data. * @param input The input table * @param stream CUDA stream used for device memory operations and kernel launches * @@ -1652,7 +1615,7 @@ rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& co */ std::vector convert_decimal_columns_and_metadata( table_input_metadata& table_meta, - std::vector>& d128_vectors, + std::vector>& d128_buffers, table_view const& table, rmm::cuda_stream_view stream) { @@ -1673,28 +1636,30 @@ std::vector convert_decimal_columns_and_metadata( switch (column.type().id()) { case type_id::DECIMAL32: // Convert data to decimal128 type - d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128( + column, stream, rmm::mr::get_current_device_resource())); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new column view from the d128 data vector return {data_type{type_id::DECIMAL128, column.type().scale()}, column.size(), - d128_vectors.back().data(), + d128_buffers.back()->data(), column.null_mask(), column.null_count(), column.offset(), converted_children}; case type_id::DECIMAL64: // Convert data to decimal128 type - d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + 
d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128( + column, stream, rmm::mr::get_current_device_resource())); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new column view from the d128 data vector return {data_type{type_id::DECIMAL128, column.type().scale()}, column.size(), - d128_vectors.back().data(), + d128_buffers.back()->data(), column.null_mask(), column.null_count(), column.offset(), @@ -1722,6 +1687,9 @@ std::vector convert_decimal_columns_and_metadata( std::back_inserter(converted_column_views), [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); + // Synchronize stream here to ensure all decimal128 buffers are ready. + stream.synchronize(); + return converted_column_views; } @@ -1780,13 +1748,13 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, rmm::cuda_stream_view stream) { // Container to store decimal128 converted data if needed - std::vector> d128_vectors; + std::vector> d128_buffers; // Convert decimal32/decimal64 data to decimal128 if writing arrow schema // and initialize LinkedColVector auto vec = table_to_linked_columns( (write_arrow_schema) - ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)}) + ? 
table_view({convert_decimal_columns_and_metadata(table_meta, d128_buffers, input, stream)}) : input); auto schema_tree = construct_parquet_schema_tree( diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 77da4039103..51216a8512c 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -710,6 +710,83 @@ TEST_F(ToArrowDeviceTest, StructColumn) template using fp_wrapper = cudf::test::fixed_point_column_wrapper; +TEST_F(ToArrowDeviceTest, FixedPoint32Table) +{ + using namespace numeric; + + for (auto const scale : {6, 4, 2, 0, -1, -3, -5}) { + auto const expect_data = + std::vector{-1000, -1, -1, -1, 2400, 0, 0, 0, -3456, -1, -1, -1, + 4650, 0, 0, 0, 5154, 0, 0, 0, 6800, 0, 0, 0}; + auto col = fp_wrapper({-1000, 2400, -3456, 4650, 5154, 6800}, scale_type{scale}); + std::vector> table_cols; + table_cols.emplace_back(col.release()); + auto input = cudf::table(std::move(table_cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + + auto result_dev_data = std::make_unique>( + expect_data.size(), cudf::get_default_stream()); + cudaMemcpy(result_dev_data->data(), + expect_data.data(), + sizeof(int32_t) * expect_data.size(), + cudaMemcpyHostToDevice); + + cudf::get_default_stream().synchronize(); + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + 
ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + expected_array->children[0]->length = input.num_rows(); + NANOARROW_THROW_NOT_OK( + ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc)); + ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data = + const_cast(reinterpret_cast(input.view().column(0).null_mask())); + + auto data_ptr = reinterpret_cast(result_dev_data->data()); + NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator( + ArrowArrayBuffer(expected_array->children[0], 1), + ArrowBufferDeallocator( + [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) { + auto buf = + reinterpret_cast>*>(alloc->private_data); + delete buf; + }, + new std::unique_ptr>(std::move(result_dev_data))))); + ArrowArrayBuffer(expected_array->children[0], 1)->data = data_ptr; + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + auto got_arrow_array = cudf::to_arrow_device(input.view()); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + + got_arrow_array = cudf::to_arrow_device(std::move(input)); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + } +} + TEST_F(ToArrowDeviceTest, FixedPoint64Table) { using namespace numeric; From 18c1465b597284d8b558964cc0ca48de7da60a17 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 06:06:07 -1000 Subject: [PATCH 06/72] Align ewm APIs with pandas 2.x (#16413) These all currently are not implemented and raise a `NotImplementedError` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16413 --- python/cudf/cudf/core/window/ewm.py | 52 ++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index bb153d4b549..1203a840076 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -114,23 +114,57 @@ def __init__( self.adjust = adjust self.com = get_center_of_mass(com, span, halflife, alpha) - def mean(self): + def online(self, engine: str = "numba", engine_kwargs=None): + """ + Return an ``OnlineExponentialMovingWindow`` object to calculate + exponentially moving window aggregations in an online method. + + Currently not supported. + """ + raise NotImplementedError("online is currently not supported.") + + def mean( + self, numeric_only: bool = False, engine=None, engine_kwargs=None + ): """ Calculate the ewm (exponential weighted moment) mean. """ + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas." + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional and added for compatibility with pandas." 
+ ) return self._apply_agg("ewma") - def var(self, bias): - raise NotImplementedError("ewmvar not yet supported.") + def sum(self, numeric_only: bool = False, engine=None, engine_kwargs=None): + raise NotImplementedError("sum not yet supported.") - def std(self, bias): - raise NotImplementedError("ewmstd not yet supported.") + def var(self, bias: bool = False, numeric_only: bool = False): + raise NotImplementedError("var not yet supported.") - def corr(self, other): - raise NotImplementedError("ewmcorr not yet supported.") + def std(self, bias: bool = False, numeric_only: bool = False): + raise NotImplementedError("std not yet supported.") - def cov(self, other): - raise NotImplementedError("ewmcov not yet supported.") + def corr( + self, other, pairwise: bool | None = None, numeric_only: bool = False + ): + raise NotImplementedError("corr not yet supported.") + + def cov( + self, + other, + pairwise: bool | None = None, + bias: bool = False, + numeric_only: bool = False, + ): + raise NotImplementedError("cov not yet supported.") def _apply_agg_series(self, sr, agg_name): if not is_numeric_dtype(sr.dtype): From 58f47242fe04b1e25fd42e1e45e8c15417140777 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 06:09:21 -1000 Subject: [PATCH 07/72] Align groupby APIs with pandas 2.x (#16403) The following breaking APIs are affected: * `apply` * `transform` * `describe` The rest of the APIs are non-breaking and generally will raise a `NotImplementedError` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16403 --- .../source/user_guide/api_docs/groupby.rst | 3 +- python/cudf/cudf/core/groupby/groupby.py | 629 ++++++++++++++---- python/cudf/cudf/core/resample.py | 6 +- python/cudf/cudf/tests/test_groupby.py | 25 + 4 files changed, 514 insertions(+), 149 deletions(-) diff --git 
a/docs/cudf/source/user_guide/api_docs/groupby.rst b/docs/cudf/source/user_guide/api_docs/groupby.rst index 80811efa33f..ca29087cbf9 100644 --- a/docs/cudf/source/user_guide/api_docs/groupby.rst +++ b/docs/cudf/source/user_guide/api_docs/groupby.rst @@ -68,7 +68,6 @@ Computations / descriptive stats GroupBy.std GroupBy.sum GroupBy.var - GroupBy.corr GroupBy.cov The following methods are available in both ``SeriesGroupBy`` and @@ -81,6 +80,7 @@ application to columns of a specific data type. :toctree: api/ DataFrameGroupBy.bfill + DataFrameGroupBy.corr DataFrameGroupBy.count DataFrameGroupBy.cumcount DataFrameGroupBy.cummax @@ -102,5 +102,6 @@ The following methods are available only for ``SeriesGroupBy`` objects. .. autosummary:: :toctree: api/ + SeriesGroupBy.corr SeriesGroupBy.nunique SeriesGroupBy.unique diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 1646c5042fd..3cfbd1d736a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable +from typing import TYPE_CHECKING, Any, Iterable, Literal import cupy as cp import numpy as np @@ -306,6 +306,18 @@ def __iter__(self): grouped_values[offsets[i] : offsets[i + 1]], ) + def __len__(self) -> int: + return self.ngroups + + @property + def ngroups(self) -> int: + _, offsets, _, _ = self._grouped() + return len(offsets) - 1 + + @property + def ndim(self) -> int: + return self.obj.ndim + @property def dtypes(self): """ @@ -457,10 +469,20 @@ def size(self): ) @_performance_tracking - def cumcount(self): + def cumcount(self, ascending: bool = True): """ Return the cumulative count of keys in each group. + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. 
+ Currently not supported """ + if ascending is not True: + raise NotImplementedError( + "ascending is currently not implemented." + ) return ( cudf.Series( cudf.core.column.column_empty( @@ -527,7 +549,7 @@ def _groupby(self): ) @_performance_tracking - def agg(self, func): + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): """ Apply aggregation(s) to the groups. @@ -615,6 +637,22 @@ def agg(self, func): 1 1.5 1.75 2.0 2.0 2 3.0 3.00 1.0 1.0 """ + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if args: + raise NotImplementedError( + "Passing args to func is currently not supported." + ) + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) column_names, columns, normalized_aggs = self._normalize_aggs(func) orig_dtypes = tuple(c.dtype for c in columns) @@ -935,12 +973,13 @@ def tail(self, n: int = 5, *, preserve_order: bool = True): ) @_performance_tracking - def nth(self, n): + def nth(self, n, dropna: Literal["any", "all", None] = None): """ Return the nth row from each group. """ - - self.obj["__groupbynth_order__"] = range(0, len(self.obj)) + if dropna is not None: + raise NotImplementedError("dropna is not currently supported.") + self.obj["__groupbynth_order__"] = range(0, len(self.obj)) # type: ignore[index] # We perform another groupby here to have the grouping columns # be a part of dataframe columns. result = self.obj.groupby(self.grouping.keys).agg(lambda x: x.nth(n)) @@ -1423,13 +1462,13 @@ def _post_process_chunk_results( @_performance_tracking def apply( - self, function, *args, engine="auto", include_groups: bool = True + self, func, *args, engine="auto", include_groups: bool = True, **kwargs ): """Apply a python transformation function over the grouped chunk. 
Parameters ---------- - function : callable + func : callable The python transformation function that will be applied on the grouped chunk. args : tuple @@ -1452,6 +1491,9 @@ def apply( When True, will attempt to apply ``func`` to the groupings in the case that they are columns of the DataFrame. In the future, this will default to ``False``. + kwargs : dict + Optional keyword arguments to pass to the function. + Currently not supported Examples -------- @@ -1528,13 +1570,17 @@ def mult(df): dtype: int64 """ + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) if self.obj.empty: - if function in {"count", "size", "idxmin", "idxmax"}: + if func in {"count", "size", "idxmin", "idxmax"}: res = cudf.Series([], dtype="int64") else: res = self.obj.copy(deep=True) res.index = self.grouping.keys - if function in {"sum", "product"}: + if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. for name, col in res._data.items(): @@ -1542,20 +1588,20 @@ def mult(df): res._data[name] = col.astype("int") return res - if not callable(function): - raise TypeError(f"type {type(function)} is not callable") + if not callable(func): + raise TypeError(f"type {type(func)} is not callable") group_names, offsets, group_keys, grouped_values = self._grouped( include_groups=include_groups ) if engine == "auto": - if _can_be_jitted(grouped_values, function, args): + if _can_be_jitted(grouped_values, func, args): engine = "jit" else: engine = "cudf" if engine == "jit": result = self._jit_groupby_apply( - function, + func, group_names, offsets, group_keys, @@ -1564,7 +1610,7 @@ def mult(df): ) elif engine == "cudf": result = self._iterative_groupby_apply( - function, + func, group_names, offsets, group_keys, @@ -1744,12 +1790,14 @@ def _broadcast(self, values: cudf.Series) -> cudf.Series: return values @_performance_tracking - def transform(self, function): + def transform( + self, func, *args, 
engine=None, engine_kwargs=None, **kwargs + ): """Apply an aggregation, then broadcast the result to the group size. Parameters ---------- - function: str or callable + func: str or callable Aggregation to apply to each group. Note that the set of operations currently supported by `transform` is identical to that supported by the `agg` method. @@ -1778,18 +1826,35 @@ def transform(self, function): -------- agg """ - if not (isinstance(function, str) or callable(function)): + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if args: + raise NotImplementedError( + "Passing args to func is currently not supported." + ) + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) + + if not (isinstance(func, str) or callable(func)): raise TypeError( "Aggregation must be a named aggregation or a callable" ) try: - result = self.agg(function) + result = self.agg(func) except TypeError as e: raise NotImplementedError( "Currently, `transform()` supports only aggregations." ) from e # If the aggregation is a scan, don't broadcast - if libgroupby._is_all_scan_aggregate([[function]]): + if libgroupby._is_all_scan_aggregate([[func]]): if len(result) != len(self.obj): raise AssertionError( "Unexpected result length for scan transform" @@ -1824,7 +1889,7 @@ def func(x): return self.agg(func) @_performance_tracking - def describe(self, include=None, exclude=None): + def describe(self, percentiles=None, include=None, exclude=None): """ Generate descriptive statistics that summarizes the central tendency, dispersion and shape of a dataset's distribution, excluding NaN values. 
@@ -1833,6 +1898,10 @@ def describe(self, include=None, exclude=None): Parameters ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. + Currently not supported. + include: 'all', list-like of dtypes or None (default), optional list of data types to include in the result. Ignored for Series. @@ -1869,8 +1938,12 @@ def describe(self, include=None, exclude=None): 90 1 24.0 24.0 24.0 24.0 24.0 24.0 """ - if exclude is not None and include is not None: - raise NotImplementedError + if percentiles is not None: + raise NotImplementedError("percentiles is currently not supported") + if exclude is not None: + raise NotImplementedError("exclude is currently not supported") + if include is not None: + raise NotImplementedError("include is currently not supported") res = self.agg( [ @@ -1896,69 +1969,7 @@ def describe(self, include=None, exclude=None): return res @_performance_tracking - def corr(self, method="pearson", min_periods=1): - """ - Compute pairwise correlation of columns, excluding NA/null values. - - Parameters - ---------- - method: {"pearson", "kendall", "spearman"} or callable, - default "pearson". Currently only the pearson correlation - coefficient is supported. - - min_periods: int, optional - Minimum number of observations required per pair of columns - to have a valid result. - - Returns - ------- - DataFrame - Correlation matrix. - - Examples - -------- - >>> import cudf - >>> gdf = cudf.DataFrame({ - ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - ... 
"val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) - >>> gdf - id val1 val2 val3 - 0 a 5 4 4 - 1 a 4 5 5 - 2 a 6 6 6 - 3 b 4 1 1 - 4 b 8 2 2 - 5 b 7 9 9 - 6 c 4 8 8 - 7 c 5 5 5 - 8 c 2 1 1 - >>> gdf.groupby("id").corr(method="pearson") - val1 val2 val3 - id - a val1 1.000000 0.500000 0.500000 - val2 0.500000 1.000000 1.000000 - val3 0.500000 1.000000 1.000000 - b val1 1.000000 0.385727 0.385727 - val2 0.385727 1.000000 1.000000 - val3 0.385727 1.000000 1.000000 - c val1 1.000000 0.714575 0.714575 - val2 0.714575 1.000000 1.000000 - val3 0.714575 1.000000 1.000000 - """ - - if method.lower() not in ("pearson",): - raise NotImplementedError( - "Only pearson correlation is currently supported" - ) - - return self._cov_or_corr( - lambda x: x.corr(method, min_periods), "Correlation" - ) - - @_performance_tracking - def cov(self, min_periods=0, ddof=1): + def cov(self, min_periods=0, ddof=1, numeric_only: bool = False): """ Compute the pairwise covariance among the columns of a DataFrame, excluding NA/null values. @@ -2042,6 +2053,10 @@ def cov(self, min_periods=0, ddof=1): val2 3.833333 12.333333 12.333333 val3 3.833333 12.333333 12.333333 """ + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) return self._cov_or_corr( lambda x: x.cov(min_periods, ddof), "Covariance" @@ -2137,7 +2152,13 @@ def _cov_or_corr(self, func, method_name): return res @_performance_tracking - def var(self, ddof=1): + def var( + self, + ddof=1, + engine=None, + engine_kwargs=None, + numeric_only: bool = False, + ): """Compute the column-wise variance of the values in each group. Parameters @@ -2146,6 +2167,18 @@ def var(self, ddof=1): The delta degrees of freedom. N - ddof is the divisor used to normalize the variance. 
""" + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) def func(x): return getattr(x, "var")(ddof=ddof) @@ -2153,7 +2186,13 @@ def func(x): return self.agg(func) @_performance_tracking - def std(self, ddof=1): + def std( + self, + ddof=1, + engine=None, + engine_kwargs=None, + numeric_only: bool = False, + ): """Compute the column-wise std of the values in each group. Parameters @@ -2162,6 +2201,18 @@ def std(self, ddof=1): The delta degrees of freedom. N - ddof is the divisor used to normalize the standard deviation. """ + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) def func(x): return getattr(x, "std")(ddof=ddof) @@ -2169,7 +2220,9 @@ def func(x): return self.agg(func) @_performance_tracking - def quantile(self, q=0.5, interpolation="linear"): + def quantile( + self, q=0.5, interpolation="linear", numeric_only: bool = False + ): """Compute the column-wise quantiles of the values in each group. Parameters @@ -2179,7 +2232,14 @@ def quantile(self, q=0.5, interpolation="linear"): interpolation : {"linear", "lower", "higher", "midpoint", "nearest"} The interpolation method to use when the desired quantile lies between two data points. Defaults to "linear". + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. 
+ Currently not supported """ + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is not currently supported." + ) def func(x): return getattr(x, "quantile")(q=q, interpolation=interpolation) @@ -2333,7 +2393,14 @@ def fillna( ) @_performance_tracking - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift( + self, + periods=1, + freq=None, + axis=0, + fill_value=None, + suffix: str | None = None, + ): """ Shift each group by ``periods`` positions. @@ -2355,6 +2422,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): the list. The length of the list should match the number of columns shifted. Each value should match the data type of the column to fill. + suffix : str, optional + A string to add to each shifted column if there are multiple periods. + Ignored otherwise. + Currently not supported. Returns ------- @@ -2374,6 +2445,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") + if suffix is not None: + raise NotImplementedError("shift is not currently supported.") + values = self.grouping.values if is_list_like(fill_value): if len(fill_value) != len(values._data): @@ -2473,6 +2547,142 @@ def pct_change( shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 + def _mimic_pandas_order( + self, result: DataFrameOrSeries + ) -> DataFrameOrSeries: + """Given a groupby result from libcudf, reconstruct the row orders + matching that of pandas. This also adds appropriate indices. + """ + # TODO: copy metadata after this method is a common pattern, should + # merge in this method. + + # This function is used to reorder the results of scan-based + # groupbys which have the same output size as input size. 
+ # However, if the grouping key has NAs and dropna=True, the + # result coming back from libcudf has null_count few rows than + # the input, so we must produce an ordering from the full + # input range. + _, _, (ordering,) = self._groupby.groups( + [as_column(range(0, len(self.obj)))] + ) + if self._dropna and any( + c.has_nulls(include_nan=True) > 0 + for c in self.grouping._key_columns + ): + # Scan aggregations with null/nan keys put nulls in the + # corresponding output rows in pandas, to do that here + # expand the result by reindexing. + ri = cudf.RangeIndex(0, len(self.obj)) + result.index = cudf.Index(ordering) + # This reorders and expands + result = result.reindex(ri) + else: + # Just reorder according to the groupings + result = result.take(ordering.argsort()) + # Now produce the actual index we first thought of + result.index = self.obj.index + return result + + def ohlc(self): + """ + Compute open, high, low and close values of a group, excluding missing values. + + Currently not implemented. + """ + raise NotImplementedError("ohlc is currently not implemented") + + @property + def plot(self): + """ + Make plots of a grouped Series or DataFrame. + + Currently not implemented. + """ + raise NotImplementedError("plot is currently not implemented") + + def resample(self, rule, *args, include_groups: bool = True, **kwargs): + """ + Provide resampling when using a TimeGrouper. + + Currently not implemented. + """ + raise NotImplementedError("resample is currently not implemented") + + def take(self, indices): + """ + Return the elements in the given *positional* indices in each group. + + Currently not implemented. + """ + raise NotImplementedError("take is currently not implemented") + + def filter(self, func, dropna: bool = True, *args, **kwargs): + """ + Filter elements from groups that don't satisfy a criterion. + + Currently not implemented. 
+ """ + raise NotImplementedError("filter is currently not implemented") + + def expanding(self, *args, **kwargs): + """ + Return an expanding grouper, providing expanding + functionality per group. + + Currently not implemented. + """ + raise NotImplementedError("expanding is currently not implemented") + + def ewm(self, *args, **kwargs): + """ + Return an ewm grouper, providing ewm functionality per group. + + Currently not implemented. + """ + raise NotImplementedError("expanding is currently not implemented") + + def any(self, skipna: bool = True): + """ + Return True if any value in the group is truthful, else False. + + Currently not implemented. + """ + raise NotImplementedError("any is currently not implemented") + + def all(self, skipna: bool = True): + """ + Return True if all values in the group are truthful, else False. + + Currently not implemented. + """ + raise NotImplementedError("all is currently not implemented") + + +class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): + obj: "cudf.core.dataframe.DataFrame" + + _PROTECTED_KEYS = frozenset(("obj",)) + + def _reduce_numeric_only(self, op: str): + columns = list( + name + for name in self.obj._data.names + if ( + is_numeric_dtype(self.obj._data[name].dtype) + and name not in self.grouping.names + ) + ) + return self[columns].agg(op) + + def __getitem__(self, key): + return self.obj[key].groupby( + by=self.grouping.keys, + dropna=self._dropna, + sort=self._sort, + group_keys=self._group_keys, + as_index=self._as_index, + ) + def value_counts( self, subset=None, @@ -2637,68 +2847,112 @@ def value_counts( return result - def _mimic_pandas_order( - self, result: DataFrameOrSeries - ) -> DataFrameOrSeries: - """Given a groupby result from libcudf, reconstruct the row orders - matching that of pandas. This also adds appropriate indices. 
+ @_performance_tracking + def corr( + self, method="pearson", min_periods=1, numeric_only: bool = False + ): """ - # TODO: copy metadata after this method is a common pattern, should - # merge in this method. + Compute pairwise correlation of columns, excluding NA/null values. - # This function is used to reorder the results of scan-based - # groupbys which have the same output size as input size. - # However, if the grouping key has NAs and dropna=True, the - # result coming back from libcudf has null_count few rows than - # the input, so we must produce an ordering from the full - # input range. - _, _, (ordering,) = self._groupby.groups( - [as_column(range(0, len(self.obj)))] - ) - if self._dropna and any( - c.has_nulls(include_nan=True) > 0 - for c in self.grouping._key_columns - ): - # Scan aggregations with null/nan keys put nulls in the - # corresponding output rows in pandas, to do that here - # expand the result by reindexing. - ri = cudf.RangeIndex(0, len(self.obj)) - result.index = cudf.Index(ordering) - # This reorders and expands - result = result.reindex(ri) - else: - # Just reorder according to the groupings - result = result.take(ordering.argsort()) - # Now produce the actual index we first thought of - result.index = self.obj.index - return result + Parameters + ---------- + method: {"pearson", "kendall", "spearman"} or callable, + default "pearson". Currently only the pearson correlation + coefficient is supported. + min_periods: int, optional + Minimum number of observations required per pair of columns + to have a valid result. -class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): - obj: "cudf.core.dataframe.DataFrame" + Returns + ------- + DataFrame + Correlation matrix. - _PROTECTED_KEYS = frozenset(("obj",)) + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({ + ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... 
"val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").corr(method="pearson") + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 1.000000 0.385727 0.385727 + val2 0.385727 1.000000 1.000000 + val3 0.385727 1.000000 1.000000 + c val1 1.000000 0.714575 0.714575 + val2 0.714575 1.000000 1.000000 + val3 0.714575 1.000000 1.000000 + """ - def _reduce_numeric_only(self, op: str): - columns = list( - name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names + if method != "pearson": + raise NotImplementedError( + "Only pearson correlation is currently supported" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." ) - ) - return self[columns].agg(op) - def __getitem__(self, key): - return self.obj[key].groupby( - by=self.grouping.keys, - dropna=self._dropna, - sort=self._sort, - group_keys=self._group_keys, - as_index=self._as_index, + return self._cov_or_corr( + lambda x: x.corr(method, min_periods), "Correlation" ) + def hist( + self, + column=None, + by=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + ax=None, + sharex: bool = False, + sharey: bool = False, + figsize: tuple[float, float] | None = None, + layout: tuple[int, int] | None = None, + bins: int | abc.Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + raise NotImplementedError("hist is not currently implemented") + + def boxplot( + self, + subplots: bool = True, + column=None, + fontsize: int | None = None, + rot: int = 0, + grid: bool = True, + ax=None, + figsize: tuple[float, float] | None = None, + layout=None, 
+ sharex: bool = False, + sharey: bool = True, + backend=None, + **kwargs, + ): + raise NotImplementedError("boxplot is not currently implemented") + DataFrameGroupBy.__doc__ = groupby_doc_template.format(ret="") @@ -2706,8 +2960,10 @@ def __getitem__(self, key): class SeriesGroupBy(GroupBy): obj: "cudf.core.series.Series" - def agg(self, func): - result = super().agg(func) + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + result = super().agg( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) # downcast the result to a Series: if len(result._data): @@ -2722,14 +2978,95 @@ def agg(self, func): aggregate = agg - def apply(self, func, *args): - result = super().apply(func, *args) + def apply(self, func, *args, **kwargs): + result = super().apply(func, *args, **kwargs) # apply Series name to result result.name = self.obj.name return result + @property + def dtype(self) -> pd.Series: + raise NotImplementedError("dtype is currently not implemented.") + + def hist( + self, + by=None, + ax=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + figsize: tuple[float, float] | None = None, + bins: int | abc.Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + raise NotImplementedError("hist is currently not implemented.") + + @property + def is_monotonic_increasing(self) -> cudf.Series: + """ + Return whether each group's values are monotonically increasing. + + Currently not implemented + """ + raise NotImplementedError( + "is_monotonic_increasing is currently not implemented." + ) + + @property + def is_monotonic_decreasing(self) -> cudf.Series: + """ + Return whether each group's values are monotonically decreasing. + + Currently not implemented + """ + raise NotImplementedError( + "is_monotonic_decreasing is currently not implemented." 
+ ) + + def nlargest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> cudf.Series: + """ + Return the largest n elements. + + Currently not implemented + """ + raise NotImplementedError("nlargest is currently not implemented.") + + def nsmallest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> cudf.Series: + """ + Return the smallest n elements. + + Currently not implemented + """ + raise NotImplementedError("nsmallest is currently not implemented.") + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> cudf.Series | cudf.DataFrame: + raise NotImplementedError("value_counts is currently not implemented.") + + def corr( + self, + other: cudf.Series, + method: str = "pearson", + min_periods: int | None = None, + ) -> cudf.Series: + raise NotImplementedError("corr is currently not implemented.") + SeriesGroupBy.__doc__ = groupby_doc_template.format(ret="") diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 4e0c5bd86b9..715bbf89b15 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -43,8 +43,10 @@ def __init__(self, obj, by, axis=None, kind=None): by = _ResampleGrouping(obj, by) super().__init__(obj, by=by) - def agg(self, func): - result = super().agg(func) + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + result = super().agg( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) if len(self.grouping.bin_labels) != len(result): index = cudf.core.index.Index( self.grouping.bin_labels, name=self.grouping.names[0] diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 826a0e52f57..74f04c0584f 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3885,3 +3885,28 @@ def test_group_by_raises_category_error(op): with 
pytest.raises(TypeError): df.groupby(df.a).agg(op) + + +def test_ngroups(): + pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) + gdf = cudf.DataFrame.from_pandas(pdf) + + pgb = pdf.groupby("a") + ggb = gdf.groupby("a") + assert pgb.ngroups == ggb.ngroups + assert len(pgb) == len(ggb) + + +def test_ndim(): + pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) + gdf = cudf.DataFrame.from_pandas(pdf) + + pgb = pdf.groupby("a") + ggb = gdf.groupby("a") + assert pgb.ndim == ggb.ndim + + pser = pd.Series(range(3)) + gser = cudf.Series.from_pandas(pser) + pgb = pser.groupby([0, 0, 1]) + ggb = gser.groupby(cudf.Series([0, 0, 1])) + assert pgb.ndim == ggb.ndim From 6e7624d6b31c93b0547590929ac63ed8e3a48d24 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:06:51 -0400 Subject: [PATCH 08/72] Add stream parameter to reshape APIs (#16410) Adds `stream` parameter to reshape APIs: - `cudf::interleave_columns` - `cudf::tile` - `cudf::byte_cast` Found while working #15983 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16410 --- cpp/include/cudf/detail/reshape.hpp | 4 --- cpp/include/cudf/reshape.hpp | 17 ++++++---- cpp/src/reshape/byte_cast.cu | 11 ++----- cpp/src/reshape/interleave_columns.cu | 3 +- cpp/src/reshape/tile.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/reshape_test.cpp | 47 +++++++++++++++++++++++++++ 7 files changed, 65 insertions(+), 21 deletions(-) create mode 100644 cpp/tests/streams/reshape_test.cpp diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index 30f8b88b116..68a856373bf 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -28,8 +28,6 @@ namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::tile - * - * @param stream CUDA stream used for 
device memory operations and kernel launches */ std::unique_ptr tile(table_view const& input, size_type count, @@ -38,8 +36,6 @@ std::unique_ptr
tile(table_view const& input, /** * @copydoc cudf::interleave_columns - * - * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr interleave_columns(table_view const& input, rmm::cuda_stream_view, diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index a0a7fe694bb..07aaf6488ad 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -47,13 +47,14 @@ namespace CUDF_EXPORT cudf { * @throws cudf::logic_error if input contains no columns. * @throws cudf::logic_error if input columns dtypes are not identical. * - * @param[in] input Table containing columns to interleave - * @param[in] mr Device memory resource used to allocate the returned column's device memory - * + * @param input Table containing columns to interleave + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * @return The interleaved columns as a single column */ std::unique_ptr interleave_columns( table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -68,15 +69,17 @@ std::unique_ptr interleave_columns( * return = [[8, 4, 7, 8, 4, 7], [5, 2, 3, 5, 2, 3]] * ``` * - * @param[in] input Table containing rows to be repeated - * @param[in] count Number of times to tile "rows". Must be non-negative - * @param[in] mr Device memory resource used to allocate the returned table's device memory + * @param input Table containing rows to be repeated + * @param count Number of times to tile "rows". Must be non-negative + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory * * @return The table containing the tiled "rows" */ std::unique_ptr
tile( table_view const& input, size_type count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -95,6 +98,7 @@ enum class flip_endianness : bool { NO, YES }; * * @param input_column Column to be converted to lists of bytes * @param endian_configuration Whether to retain or flip the endianness of the elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @return The column containing the lists of bytes @@ -102,6 +106,7 @@ enum class flip_endianness : bool { NO, YES }; std::unique_ptr byte_cast( column_view const& input_column, flip_endianness endian_configuration, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 3dfa0b65814..2a03a5504c1 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -167,11 +167,6 @@ struct byte_list_conversion_fn byte_cast(column_view const& input, flip_endianness endian_configuration, rmm::cuda_stream_view stream, @@ -183,15 +178,13 @@ std::unique_ptr byte_cast(column_view const& input, } // namespace detail -/** - * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::device_async_resource_ref) - */ std::unique_ptr byte_cast(column_view const& input, flip_endianness endian_configuration, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::byte_cast(input, endian_configuration, cudf::get_default_stream(), mr); + return detail::byte_cast(input, endian_configuration, stream, mr); } } // namespace cudf diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 79124508b11..7473b6045af 100644 --- 
a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -264,10 +264,11 @@ std::unique_ptr interleave_columns(table_view const& input, } // namespace detail std::unique_ptr interleave_columns(table_view const& input, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::interleave_columns(input, cudf::get_default_stream(), mr); + return detail::interleave_columns(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index 29996aa2152..3d4fb73c000 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -64,10 +64,11 @@ std::unique_ptr
tile(table_view const& in, std::unique_ptr
tile(table_view const& in, size_type count, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::tile(in, count, cudf::get_default_stream(), mr); + return detail::tile(in, count, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 22827484f9a..4dffcb41ba2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -704,6 +704,7 @@ ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE test ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/reshape_test.cpp b/cpp/tests/streams/reshape_test.cpp new file mode 100644 index 00000000000..d7c5da91bca --- /dev/null +++ b/cpp/tests/streams/reshape_test.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +class ReshapeTest : public cudf::test::BaseFixture {}; + +TEST_F(ReshapeTest, InterleaveColumns) +{ + auto a = cudf::test::fixed_width_column_wrapper({0, 3, 6}); + auto b = cudf::test::fixed_width_column_wrapper({1, 4, 7}); + auto c = cudf::test::fixed_width_column_wrapper({2, 5, 8}); + cudf::table_view in(std::vector{a, b, c}); + cudf::interleave_columns(in, cudf::test::get_default_stream()); +} + +TEST_F(ReshapeTest, Tile) +{ + auto a = cudf::test::fixed_width_column_wrapper({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + cudf::tile(in, 2, cudf::test::get_default_stream()); +} + +TEST_F(ReshapeTest, ByteCast) +{ + auto a = cudf::test::fixed_width_column_wrapper({0, 100, -100, 1000, 1000}); + cudf::byte_cast(a, cudf::flip_endianness::YES, cudf::test::get_default_stream()); + cudf::byte_cast(a, cudf::flip_endianness::NO, cudf::test::get_default_stream()); +} From 35796057b64e258713d4d89ba368837d30a1a9c5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 08:33:23 -1000 Subject: [PATCH 09/72] Align misc DataFrame and MultiIndex methods with pandas 2.x (#16402) The API changes in this PR are mostly adding implementations or adding missing keyword argument (although they might not be implemented). 
The APIs affected are: * `DataFrame.insert` * `DataFrame.melt` * `DataFrame.merge` * `DataFrame.quantile` * `DataFrame.cov` * `DataFrame.corr` * `DataFrame.median` * `DataFrame.rolling` * `DataFrame.resample` * `DataFrame.dropna` * `MultiIndex.from_tuple` * `MultiIndex.from_frame` * `MultiIndex.from_product` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16402 --- python/cudf/cudf/core/dataframe.py | 106 +++++++++++++++++------- python/cudf/cudf/core/indexed_frame.py | 81 +++++++++++------- python/cudf/cudf/core/multiindex.py | 38 +++++++-- python/cudf/cudf/core/reshape.py | 3 + python/cudf/cudf/core/window/ewm.py | 23 +++-- python/cudf/cudf/core/window/rolling.py | 27 +++++- python/cudf/cudf/tests/test_dropna.py | 9 ++ 7 files changed, 211 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1d7136e61e3..6ea11fe9f64 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3215,26 +3215,37 @@ def reset_index( ) @_performance_tracking - def insert(self, loc, name, value, nan_as_null=no_default): + def insert( + self, + loc, + column, + value, + allow_duplicates: bool = False, + nan_as_null=no_default, + ): """Add a column to DataFrame at the index specified by loc. Parameters ---------- loc : int location to insert by index, cannot be greater then num columns + 1 - name : number or string - name or label of column to be inserted + column : number or string + column or label of column to be inserted value : Series or array-like nan_as_null : bool, Default None If ``None``/``True``, converts ``np.nan`` values to ``null`` values. If ``False``, leaves ``np.nan`` values as is. """ + if allow_duplicates is not False: + raise NotImplementedError( + "allow_duplicates is currently not implemented." 
+ ) if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") return self._insert( loc=loc, - name=name, + name=column, value=value, nan_as_null=nan_as_null, ignore_index=False, @@ -4097,7 +4108,15 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) @_performance_tracking - def melt(self, **kwargs): + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, + ignore_index: bool = True, + ): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -4124,23 +4143,30 @@ def melt(self, **kwargs): """ from cudf.core.reshape import melt - return melt(self, **kwargs) + return melt( + self, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ignore_index=ignore_index, + ) @_performance_tracking def merge( self, right, + how="inner", on=None, left_on=None, right_on=None, left_index=False, right_index=False, - how="inner", sort=False, - lsuffix=None, - rsuffix=None, - indicator=False, suffixes=("_x", "_y"), + indicator=False, + validate=None, ): """Merge GPU DataFrame objects by performing a database-style join operation by columns or indexes. @@ -4241,17 +4267,8 @@ def merge( raise NotImplementedError( "Only indicator=False is currently supported" ) - - if lsuffix or rsuffix: - raise ValueError( - "The lsuffix and rsuffix keywords have been replaced with the " - "``suffixes=`` keyword. " - "Please provide the following instead: \n\n" - " suffixes=('%s', '%s')" - % (lsuffix or "_x", rsuffix or "_y") - ) - else: - lsuffix, rsuffix = suffixes + if validate is not None: + raise NotImplementedError("validate is currently not supported.") lhs, rhs = self, right merge_cls = Merge @@ -5952,9 +5969,9 @@ def quantile( axis=0, numeric_only=True, interpolation=None, + method="single", columns=None, exact=True, - method="single", ): """ Return values at the given quantile. 
@@ -5980,14 +5997,14 @@ def quantile( * higher: `j`. * nearest: `i` or `j` whichever is nearest. * midpoint: (`i` + `j`) / 2. - columns : list of str - List of column names to include. - exact : boolean - Whether to use approximate or exact quantile algorithm. method : {'single', 'table'}, default `'single'` Whether to compute quantiles per-column ('single') or over all columns ('table'). When 'table', the only allowed interpolation methods are 'nearest', 'lower', and 'higher'. + columns : list of str + List of column names to include. + exact : boolean + Whether to use approximate or exact quantile algorithm. Returns ------- @@ -7309,25 +7326,47 @@ def unnamed_group_generator(): return result @_performance_tracking - def cov(self, **kwargs): + def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): """Compute the covariance matrix of a DataFrame. Parameters ---------- - **kwargs - Keyword arguments to be passed to cupy.cov + min_periods : int, optional + Minimum number of observations required per pair of columns to + have a valid result. + Currently not supported. + + ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + Currently not supported. Returns ------- cov : DataFrame """ - cov = cupy.cov(self.values, rowvar=False) + if min_periods is not None: + raise NotImplementedError( + "min_periods is currently not supported." + ) + + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." 
+ ) + + cov = cupy.cov(self.values, ddof=ddof, rowvar=False) cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) df._set_columns_like(self._data) return df - def corr(self, method="pearson", min_periods=None): + def corr( + self, method="pearson", min_periods=None, numeric_only: bool = False + ): """Compute the correlation matrix of a DataFrame. Parameters @@ -7357,6 +7396,11 @@ def corr(self, method="pearson", min_periods=None): if min_periods is not None: raise NotImplementedError("Unsupported argument 'min_periods'") + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + corr = cupy.corrcoef(values, rowvar=False) cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e14f8923c25..0678ebfdd81 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1495,9 +1495,7 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): **kwargs, ) - def median( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): + def median(self, axis=None, skipna=True, numeric_only=None, **kwargs): """ Return the median of the values for the requested axis. 
@@ -1857,7 +1855,16 @@ def mask( @_performance_tracking @copy_docstring(Rolling) def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None + self, + window, + min_periods=None, + center: bool = False, + win_type: str | None = None, + on=None, + axis=0, + closed: str | None = None, + step: int | None = None, + method: str = "single", ): return Rolling( self, @@ -1865,7 +1872,11 @@ def rolling( min_periods=min_periods, center=center, axis=axis, + on=on, win_type=win_type, + closed=closed, + step=step, + method=method, ) @copy_docstring(ExponentialMovingWindow) @@ -1880,6 +1891,7 @@ def ewm( ignore_na: bool = False, axis: int = 0, times: str | np.ndarray | None = None, + method: Literal["single", "table"] = "single", ): return ExponentialMovingWindow( self, @@ -1892,6 +1904,7 @@ def ewm( ignore_na=ignore_na, axis=axis, times=times, + method=method, ) @_performance_tracking @@ -3943,16 +3956,15 @@ def resample( self, rule, axis=0, - closed=None, - label=None, - convention="start", + closed: Literal["right", "left"] | None = None, + label: Literal["right", "left"] | None = None, + convention: Literal["start", "end", "s", "e"] = "start", kind=None, - loffset=None, - base=None, on=None, level=None, origin="start_day", offset=None, + group_keys: bool = False, ): """ Convert the frequency of ("resample") the given time series data. @@ -4090,26 +4102,27 @@ def resample( "deprecated and will be removed in a future version. 
", FutureWarning, ) - if (axis, convention, kind, loffset, base, origin, offset) != ( - 0, - "start", - None, - None, - None, - "start_day", - None, - ): - raise NotImplementedError( - "The following arguments are not " - "currently supported by resample:\n\n" - "- axis\n" - "- convention\n" - "- kind\n" - "- loffset\n" - "- base\n" - "- origin\n" - "- offset" + raise NotImplementedError("kind is currently not supported.") + if axis != 0: + warnings.warn( + "The 'axis' keyword in is " + "deprecated and will be removed in a future version. ", + FutureWarning, ) + raise NotImplementedError("axis is currently not supported.") + if convention != "start": + warnings.warn( + "The 'convention' keyword in is " + "deprecated and will be removed in a future version. ", + FutureWarning, + ) + raise NotImplementedError("convention is currently not supported.") + if origin != "start_day": + raise NotImplementedError("origin is currently not supported.") + if offset is not None: + raise NotImplementedError("offset is currently not supported.") + if group_keys is not False: + raise NotImplementedError("group_keys is currently not supported.") by = cudf.Grouper( key=on, freq=rule, closed=closed, label=label, level=level ) @@ -4120,7 +4133,13 @@ def resample( ) def dropna( - self, axis=0, how="any", thresh=None, subset=None, inplace=False + self, + axis=0, + how="any", + thresh=None, + subset=None, + inplace=False, + ignore_index: bool = False, ): """ Drop rows (or columns) containing nulls from a Column. @@ -4144,6 +4163,8 @@ def dropna( columns, subset is a list of rows to consider. inplace : bool, default False If True, do operation inplace and return None. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. 
Returns ------- @@ -4220,6 +4241,8 @@ def dropna( """ if axis == 0: result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) + if ignore_index: + result.index = RangeIndex(len(result)) else: result = self._drop_na_columns( how=how, subset=subset, thresh=thresh diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index dfc596bf279..0e1fddd7ed5 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -524,8 +524,10 @@ def codes(self): col.values for col in self._codes ) - def get_slice_bound(self, label, side, kind=None): - raise NotImplementedError() + def get_slice_bound(self, label, side): + raise NotImplementedError( + "get_slice_bound is not currently implemented." + ) @property # type: ignore @_performance_tracking @@ -1108,7 +1110,7 @@ def _concat(cls, objs): @classmethod @_performance_tracking - def from_tuples(cls, tuples, names=None): + def from_tuples(cls, tuples, sortorder: int | None = None, names=None): """ Convert list of tuples to MultiIndex. @@ -1116,6 +1118,9 @@ def from_tuples(cls, tuples, names=None): ---------- tuples : list / sequence of tuple-likes Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). names : list / sequence of str, optional Names for the levels in the index. 
@@ -1142,7 +1147,9 @@ def from_tuples(cls, tuples, names=None): names=['number', 'color']) """ # Use Pandas for handling Python host objects - pdi = pd.MultiIndex.from_tuples(tuples, names=names) + pdi = pd.MultiIndex.from_tuples( + tuples, sortorder=sortorder, names=names + ) return cls.from_pandas(pdi) @_performance_tracking @@ -1215,7 +1222,12 @@ def values(self): @classmethod @_performance_tracking - def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): + def from_frame( + cls, + df: pd.DataFrame | cudf.DataFrame, + sortorder: int | None = None, + names=None, + ): """ Make a MultiIndex from a DataFrame. @@ -1223,6 +1235,9 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): ---------- df : DataFrame DataFrame to be converted to MultiIndex. + sortorder : int, optional + Level of sortedness (must be lexicographically sorted by that + level). names : list-like, optional If no names are provided, use the column names, or tuple of column names if the columns is a MultiIndex. If a sequence, overwrite @@ -1273,11 +1288,13 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): else: source_data = df names = names if names is not None else source_data._column_names - return cls.from_arrays(source_data._columns, names=names) + return cls.from_arrays( + source_data._columns, sortorder=sortorder, names=names + ) @classmethod @_performance_tracking - def from_product(cls, arrays, names=None): + def from_product(cls, iterables, sortorder: int | None = None, names=None): """ Make a MultiIndex from the cartesian product of multiple iterables. @@ -1285,6 +1302,9 @@ def from_product(cls, arrays, names=None): ---------- iterables : list / sequence of iterables Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). names : list / sequence of str, optional Names for the levels in the index. 
If not explicitly provided, names will be inferred from the @@ -1314,7 +1334,9 @@ def from_product(cls, arrays, names=None): names=['number', 'color']) """ # Use Pandas for handling Python host objects - pdi = pd.MultiIndex.from_product(arrays, names=names) + pdi = pd.MultiIndex.from_product( + iterables, sortorder=sortorder, names=names + ) return cls.from_pandas(pdi) @classmethod diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index a542c5f5969..e7248977b1d 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -502,6 +502,7 @@ def melt( var_name=None, value_name="value", col_level=None, + ignore_index: bool = True, ): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -566,6 +567,8 @@ def melt( """ if col_level is not None: raise NotImplementedError("col_level != None is not supported yet.") + if ignore_index is not True: + raise NotImplementedError("ignore_index is currently not supported.") # Arg cleaning diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 1203a840076..ef0f6958aeb 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -1,7 +1,9 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. - from __future__ import annotations +import warnings +from typing import Literal + import numpy as np from cudf._lib.reduce import scan @@ -103,13 +105,24 @@ def __init__( ignore_na: bool = False, axis: int = 0, times: str | np.ndarray | None = None, + method: Literal["single", "table"] = "single", ): - if (min_periods, ignore_na, axis, times) != (0, False, 0, None): + if min_periods != 0: raise NotImplementedError( - "The parameters `min_periods`, `ignore_na`, " - "`axis`, and `times` are not yet supported." + "min_periods is currently not supported." 
) - + if ignore_na is not False: + raise NotImplementedError("ignore_na is currently not supported.") + if axis != 0: + warnings.warn( + "axis is deprecated with will be removed in a future version. " + "Transpose the DataFrame first instead." + ) + raise NotImplementedError("axis is currently not supported.") + if times is not None: + raise NotImplementedError("times is currently not supported.") + if method != "single": + raise NotImplementedError("method is currently not supported.") self.obj = obj self.adjust = adjust self.com = get_center_of_mass(com, span, halflife, alpha) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 29391c68471..043a41145e5 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION +from __future__ import annotations + +import warnings import numba import pandas as pd @@ -196,17 +199,26 @@ def __init__( obj, window, min_periods=None, - center=False, + center: bool = False, + win_type: str | None = None, + on=None, axis=0, - win_type=None, + closed: str | None = None, + step: int | None = None, + method: str = "single", ): self.obj = obj self.window = window self.min_periods = min_periods self.center = center self._normalize() - self.agg_params = {} + # for var & std only? + self.agg_params: dict[str, int] = {} if axis != 0: + warnings.warn( + "axis is deprecated with will be removed in a future version. " + "Transpose the DataFrame first instead." 
+ ) raise NotImplementedError("axis != 0 is not supported yet.") self.axis = axis @@ -217,6 +229,15 @@ def __init__( ) self.win_type = win_type + if on is not None: + raise NotImplementedError("on is currently not supported") + if closed not in (None, "right"): + raise NotImplementedError("closed is currently not supported") + if step is not None: + raise NotImplementedError("step is currently not supported") + if method != "single": + raise NotImplementedError("method is currently not supported") + def __getitem__(self, arg): if isinstance(arg, tuple): arg = list(arg) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index ed0cf0053ea..5b1ee0ffac6 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -284,3 +284,12 @@ def test_dropna_multiindex_2(data, how): got = gi.dropna(how) assert_eq(expect, got) + + +def test_ignore_index(): + pser = pd.Series([1, 2, np.nan], index=[2, 4, 1]) + gser = cudf.from_pandas(pser) + + result = pser.dropna(ignore_index=True) + expected = gser.dropna(ignore_index=True) + assert_eq(result, expected) From 743e16426c564d0ed0d7e3d9be5f67e4605c4f32 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 29 Jul 2024 14:19:43 -0500 Subject: [PATCH 10/72] update some branch references in GitHub Actions configs (#16397) Fixes some lingering references to `branch-24.08` in the `pr_issue_status_automation` CI workflow. This was missed when new branches were cut because that file ends in `.yml` and `update-version.sh` was only modifying files ending in `.yaml`. The corresponding `update-version.sh` changes were made in #16183 and are already on 24.10 thanks to forward mergers. 
https://github.com/rapidsai/cudf/blob/dc05a01f3fc0742c5fbbddd86a0f2007bfdc2050/ci/release/update-version.sh#L78 ## Notes for Reviewers I checked like this, and don't see any other missed references: ```shell git grep -E '24\.8|24\.08|0\.39' ``` Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/16397 --- .github/workflows/pr_issue_status_automation.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 8ca971dc28d..45e5191eb54 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: From 
f8eb63e499f94d583d715f5c1f5e6f234589be57 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 12:39:19 -1000 Subject: [PATCH 11/72] Align Index APIs with pandas 2.x (#16361) Similar to https://github.com/rapidsai/cudf/pull/16310, the follow APIs have been modified to adjust/add parameters * `to_flat_index` * `isin` * `unique` * `transpose` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16361 --- docs/cudf/source/conf.py | 5 ++++ python/cudf/cudf/core/_base_index.py | 25 ++++++++++++++++++-- python/cudf/cudf/core/index.py | 24 +++++++++++++++---- python/cudf/cudf/core/multiindex.py | 16 +++++++++++-- python/cudf/cudf/core/series.py | 8 ------- python/cudf/cudf/core/single_column_frame.py | 7 ++++++ python/cudf/cudf/tests/test_multiindex.py | 9 +++++++ 7 files changed, 78 insertions(+), 16 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index f544536fb31..7421d9be298 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -561,6 +561,11 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "ScalarLike"), ("py:class", "ParentType"), ("py:class", "ColumnLike"), + ("py:class", "ColumnLike"), + ("py:obj", "cudf.Index.transpose"), + ("py:obj", "cudf.Index.T"), + ("py:obj", "cudf.Index.to_flat_index"), + ("py:obj", "cudf.MultiIndex.to_flat_index"), # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 8fad82c5c46..c91514202c5 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -868,6 +868,24 @@ def to_numpy(self): """Convert to a numpy array.""" 
raise NotImplementedError + def to_flat_index(self) -> Self: + """ + Identity method. + + This is implemented for compatibility with subclass implementations + when chaining. + + Returns + ------- + pd.Index + Caller. + + See Also + -------- + MultiIndex.to_flat_index : Subclass implementation. + """ + return self + def any(self): """ Return whether any elements is True in Index. @@ -945,7 +963,7 @@ def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False): """ raise NotImplementedError - def isin(self, values): + def isin(self, values, level=None): """Return a boolean array where the index values are in values. Compute boolean array of whether each index value is found in @@ -956,6 +974,9 @@ def isin(self, values): ---------- values : set, list-like, Index Sought values. + level : str or int, optional + Name or position of the index level to use (if the index is a + `MultiIndex`). Returns ------- @@ -979,7 +1000,7 @@ def isin(self, values): # ColumnBase.isin). raise NotImplementedError - def unique(self): + def unique(self, level: int | None = None): """ Return unique values in the index. 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1c48b8f4f2d..156cb973a9a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -540,8 +540,12 @@ def memory_usage(self, deep: bool = False) -> int: ) return 0 - def unique(self) -> Self: + def unique(self, level: int | None = None) -> Self: # RangeIndex always has unique values + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) return self.copy() @_performance_tracking @@ -964,7 +968,11 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [] return as_column(i, dtype=size_type_dtype) - def isin(self, values): + def isin(self, values, level=None): + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " @@ -1616,12 +1624,20 @@ def append(self, other): return self._concat(to_concat) - def unique(self): + def unique(self, level: int | None = None) -> Self: + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) return cudf.core.index._index_from_data( {self.name: self._values.unique()}, name=self.name ) - def isin(self, values): + def isin(self, values, level=None): + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 0e1fddd7ed5..2788455aebf 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1156,6 +1156,15 @@ def from_tuples(cls, tuples, sortorder: int | None = None, names=None): def to_numpy(self): return self.values_host + def to_flat_index(self): + 
""" + Convert a MultiIndex to an Index of Tuples containing the level values. + + This is not currently implemented + """ + # TODO: Could implement as Index of ListDtype? + raise NotImplementedError("to_flat_index is not currently supported.") + @property # type: ignore @_performance_tracking def values_host(self): @@ -1734,8 +1743,11 @@ def fillna(self, value): return super().fillna(value=value) @_performance_tracking - def unique(self): - return self.drop_duplicates(keep="first") + def unique(self, level: int | None = None) -> Self | cudf.Index: + if level is None: + return self.drop_duplicates(keep="first") + else: + return self.get_level_values(level).unique() @_performance_tracking def nunique(self, dropna: bool = True) -> int: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8277ccf68fc..10ac1fdfc1e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2775,14 +2775,6 @@ def cov(self, other, min_periods=None, ddof: int | None = None): f"{other.dtype}" ) - @_performance_tracking - def transpose(self): - """Return the transpose, which is by definition self.""" - - return self - - T = property(transpose, doc=transpose.__doc__) - @_performance_tracking def duplicated(self, keep="first"): """ diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index b93528f9693..a5ff1223791 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -389,3 +389,10 @@ def where(self, cond, other=None, inplace=False): result = cudf._lib.copying.copy_if_else(input_col, other, cond) return _make_categorical_like(result, self_column) + + @_performance_tracking + def transpose(self): + """Return the transpose, which is by definition self.""" + return self + + T = property(transpose, doc=transpose.__doc__) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 
2c00d48266c..b7314a36e73 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2170,3 +2170,12 @@ def test_bool_raises(): lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]], rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]], ) + + +def test_unique_level(): + pd_mi = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 3, 2]]) + cudf_mi = cudf.MultiIndex.from_pandas(pd_mi) + + result = pd_mi.unique(level=1) + expected = cudf_mi.unique(level=1) + assert_eq(result, expected) From 368a34ca9fd7db1b6cfb6e7817978e3e4fcfb00b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 29 Jul 2024 20:05:17 -0500 Subject: [PATCH 12/72] Use RMM adaptor constructors instead of factories. (#16414) This PR uses RMM memory resource adaptor constructors instead of factory functions. With CTAD, we do not need the factory and can use the constructor directly. The factory will be deprecated in https://github.com/rapidsai/rmm/pull/1626. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Nghia Truong (https://github.com/ttnghia) - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) URL: https://github.com/rapidsai/cudf/pull/16414 --- cpp/benchmarks/fixture/benchmark_fixture.hpp | 2 +- .../cudf_test/stream_checking_resource_adaptor.hpp | 12 ------------ cpp/include/cudf_test/testing_main.hpp | 2 +- java/src/main/native/src/RmmJni.cpp | 7 ------- 4 files changed, 2 insertions(+), 21 deletions(-) diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index 8c8d6756b00..8900899f9be 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -107,7 +107,7 @@ class memory_stats_logger { public: memory_stats_logger() : existing_mr(rmm::mr::get_current_device_resource()), - statistics_mr(rmm::mr::make_statistics_adaptor(existing_mr)) + statistics_mr(rmm::mr::statistics_resource_adaptor(existing_mr)) { 
rmm::mr::set_current_device_resource(&statistics_mr); } diff --git a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp index 4f3c723d195..417bbb3d9ab 100644 --- a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp +++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp @@ -156,16 +156,4 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res // cudf::test::get_default_stream() is observed. }; -/** - * @brief Convenience factory to return a `stream_checking_resource_adaptor` around the - * upstream resource `upstream`. - * - * @param upstream Reference to the upstream resource - */ -inline stream_checking_resource_adaptor make_stream_checking_resource_adaptor( - rmm::device_async_resource_ref upstream, bool error_on_invalid_stream, bool check_default_stream) -{ - return stream_checking_resource_adaptor{upstream, error_on_invalid_stream, check_default_stream}; -} - } // namespace cudf::test diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 9866253a9f8..ed83ddabb00 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -183,7 +183,7 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); auto const error_on_invalid_stream = (stream_error_mode == "error"); auto const check_default_stream = (stream_mode == "new_cudf_default"); - auto adaptor = cudf::test::make_stream_checking_resource_adaptor( + auto adaptor = cudf::test::stream_checking_resource_adaptor( resource, error_on_invalid_stream, check_default_stream); if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { rmm::mr::set_current_device_resource(&adaptor); diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 5842a980fc4..09c04a77590 100644 --- 
a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -154,13 +154,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { } }; -template -tracking_resource_adaptor* make_tracking_adaptor(Upstream* upstream, - std::size_t size_alignment) -{ - return new tracking_resource_adaptor{upstream, size_alignment}; -} - /** * @brief An RMM device memory resource adaptor that delegates to the wrapped resource * for most operations but will call Java to handle certain situations (e.g.: allocation failure). From d1be0b6dc06fddd0b69fb69731281b16894cb132 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 15:12:38 -1000 Subject: [PATCH 13/72] Align CategoricalIndex APIs with pandas 2.x (#16369) Mostly exposing methods that were available on the CategoricalColumn Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16369 --- python/cudf/cudf/core/column/categorical.py | 130 +++++++++++--------- python/cudf/cudf/core/index.py | 116 +++++++++++++++++ python/cudf/cudf/tests/test_categorical.py | 56 +++++++++ 3 files changed, 247 insertions(+), 55 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9aaccca349d..9433a91b9c6 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -262,37 +262,10 @@ def add_categories(self, new_categories: Any) -> SeriesOrIndex | None: dtype: category Categories (2, int64): [1, 2] """ - old_categories = self._column.categories - new_categories = column.as_column( - new_categories, - dtype=old_categories.dtype if len(new_categories) == 0 else None, - ) - - if is_mixed_with_object_dtype(old_categories, new_categories): - raise TypeError( - f"cudf does 
not support adding categories with existing " - f"categories of dtype `{old_categories.dtype}` and new " - f"categories of dtype `{new_categories.dtype}`, please " - f"type-cast new_categories to the same type as " - f"existing categories." - ) - common_dtype = find_common_type( - [old_categories.dtype, new_categories.dtype] + return self._return_or_inplace( + self._column.add_categories(new_categories=new_categories) ) - new_categories = new_categories.astype(common_dtype) - old_categories = old_categories.astype(common_dtype) - - if old_categories.isin(new_categories).any(): - raise ValueError("new categories must not include old categories") - - new_categories = old_categories.append(new_categories) - out_col = self._column - if not out_col._categories_equal(new_categories): - out_col = out_col._set_categories(new_categories) - - return self._return_or_inplace(out_col) - def remove_categories( self, removals: Any, @@ -349,23 +322,9 @@ def remove_categories( dtype: category Categories (3, int64): [1, 2, 10] """ - - cats = self.categories.to_series() - removals = cudf.Series(removals, dtype=cats.dtype) - removals_mask = removals.isin(cats) - - # ensure all the removals are in the current categories - # list. 
If not, raise an error to match Pandas behavior - if not removals_mask.all(): - vals = removals[~removals_mask].to_numpy() - raise ValueError(f"removals must all be in old categories: {vals}") - - new_categories = cats[~cats.isin(removals)]._column - out_col = self._column - if not out_col._categories_equal(new_categories): - out_col = out_col._set_categories(new_categories) - - return self._return_or_inplace(out_col) + return self._return_or_inplace( + self._column.remove_categories(removals=removals) + ) def set_categories( self, @@ -1319,7 +1278,7 @@ def _set_categories( new_categories: Any, is_unique: bool = False, ordered: bool = False, - ) -> CategoricalColumn: + ) -> Self: """Returns a new CategoricalColumn with the categories set to the specified *new_categories*. @@ -1376,17 +1335,68 @@ def _set_categories( new_codes = df._data["new_codes"] # codes can't have masks, so take mask out before moving in - return column.build_categorical_column( - categories=new_cats, - codes=column.build_column( - new_codes.base_data, dtype=new_codes.dtype + return cast( + Self, + column.build_categorical_column( + categories=new_cats, + codes=column.build_column( + new_codes.base_data, dtype=new_codes.dtype + ), + mask=new_codes.base_mask, + size=new_codes.size, + offset=new_codes.offset, + ordered=ordered, ), - mask=new_codes.base_mask, - size=new_codes.size, - offset=new_codes.offset, - ordered=ordered, ) + def add_categories(self, new_categories: Any) -> Self: + old_categories = self.categories + new_categories = column.as_column( + new_categories, + dtype=old_categories.dtype if len(new_categories) == 0 else None, + ) + if is_mixed_with_object_dtype(old_categories, new_categories): + raise TypeError( + f"cudf does not support adding categories with existing " + f"categories of dtype `{old_categories.dtype}` and new " + f"categories of dtype `{new_categories.dtype}`, please " + f"type-cast new_categories to the same type as " + f"existing categories." 
+ ) + common_dtype = find_common_type( + [old_categories.dtype, new_categories.dtype] + ) + + new_categories = new_categories.astype(common_dtype) + old_categories = old_categories.astype(common_dtype) + + if old_categories.isin(new_categories).any(): + raise ValueError("new categories must not include old categories") + + new_categories = old_categories.append(new_categories) + if not self._categories_equal(new_categories): + return self._set_categories(new_categories) + return self + + def remove_categories( + self, + removals: Any, + ) -> Self: + removals = column.as_column(removals).astype(self.categories.dtype) + removals_mask = removals.isin(self.categories) + + # ensure all the removals are in the current categories + # list. If not, raise an error to match Pandas behavior + if not removals_mask.all(): + raise ValueError("removals must all be in old categories") + + new_categories = self.categories.apply_boolean_mask( + self.categories.isin(removals).unary_operator("not") + ) + if not self._categories_equal(new_categories): + return self._set_categories(new_categories) + return self + def reorder_categories( self, new_categories: Any, @@ -1404,6 +1414,16 @@ def reorder_categories( ) return self._set_categories(new_categories, ordered=ordered) + def rename_categories(self, new_categories) -> CategoricalColumn: + raise NotImplementedError( + "rename_categories is currently not supported." + ) + + def remove_unused_categories(self) -> Self: + raise NotImplementedError( + "remove_unused_categories is currently not supported." 
+ ) + def as_ordered(self, ordered: bool): if self.dtype.ordered == ordered: return self diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 156cb973a9a..8c3b091abec 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2721,6 +2721,10 @@ def __init__( data = data.as_ordered(ordered=False) super().__init__(data, name=name) + @property + def ordered(self) -> bool: + return self._column.ordered + @property # type: ignore @_performance_tracking def codes(self): @@ -2743,6 +2747,118 @@ def _is_boolean(self): def _is_categorical(self): return True + def add_categories(self, new_categories) -> Self: + """ + Add new categories. + + `new_categories` will be included at the last/highest place in the + categories and will be unused directly after this call. + """ + return type(self)._from_data( + {self.name: self._column.add_categories(new_categories)} + ) + + def as_ordered(self) -> Self: + """ + Set the Categorical to be ordered. + """ + return type(self)._from_data( + {self.name: self._column.as_ordered(ordered=True)} + ) + + def as_unordered(self) -> Self: + """ + Set the Categorical to be unordered. + """ + return type(self)._from_data( + {self.name: self._column.as_ordered(ordered=False)} + ) + + def remove_categories(self, removals) -> Self: + """ + Remove the specified categories. + + `removals` must be included in the old categories. + + Parameters + ---------- + removals : category or list of categories + The categories which should be removed. + """ + return type(self)._from_data( + {self.name: self._column.remove_categories(removals)} + ) + + def remove_unused_categories(self) -> Self: + """ + Remove categories which are not used. + + This method is currently not supported. + """ + return type(self)._from_data( + {self.name: self._column.remove_unused_categories()} + ) + + def rename_categories(self, new_categories) -> Self: + """ + Rename categories. + + This method is currently not supported. 
+ """ + return type(self)._from_data( + {self.name: self._column.rename_categories(new_categories)} + ) + + def reorder_categories(self, new_categories, ordered=None) -> Self: + """ + Reorder categories as specified in new_categories. + + ``new_categories`` need to include all old categories and no new category + items. + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : bool, optional + Whether or not the categorical is treated as a ordered categorical. + If not given, do not change the ordered information. + """ + return type(self)._from_data( + { + self.name: self._column.reorder_categories( + new_categories, ordered=ordered + ) + } + ) + + def set_categories( + self, new_categories, ordered=None, rename: bool = False + ) -> Self: + """ + Set the categories to the specified new_categories. + + Parameters + ---------- + new_categories : list-like + The categories in new order. + ordered : bool, default None + Whether or not the categorical is treated as + a ordered categorical. If not given, do + not change the ordered information. + rename : bool, default False + Whether or not the `new_categories` should be + considered as a rename of the old categories + or as reordered categories. 
+ """ + return type(self)._from_data( + { + self.name: self._column.set_categories( + new_categories, ordered=ordered, rename=rename + ) + } + ) + @_performance_tracking def interval_range( diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 9b6029582ce..ae58af8ebce 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -891,3 +891,59 @@ def test_categorical_maxima(op): result = getattr(ser.cat.as_ordered(), op)() result_pd = getattr(ser_pd.cat.as_ordered(), op)() assert_eq(result, result_pd) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_ordered(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) + cudf_ci = cudf.from_pandas(pd_ci) + assert pd_ci.ordered == cudf_ci.ordered + + +@pytest.mark.parametrize("method", ["as_ordered", "as_unordered"]) +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_as_ordered(method, ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = getattr(pd_ci, method)() + result = getattr(cudf_ci, method)() + assert_eq(result, expected) + + +def test_index_add_categories(): + pd_ci = pd.CategoricalIndex([1, 2, 3]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.add_categories([4]) + result = cudf_ci.add_categories([4]) + assert_eq(result, expected) + + +def test_index_remove_categories(): + pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.remove_categories([4]) + result = cudf_ci.remove_categories([4]) + assert_eq(result, expected) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_reorder_categories(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 3, 2, 4]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) + result = 
cudf_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) + assert_eq(result, expected) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_set_categories(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered) + result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered) + assert_eq(result, expected) From 8def2ec1acac6a538002db011d977bb22cfbda82 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Jul 2024 14:34:59 -0500 Subject: [PATCH 14/72] Add Java APIs to copy column data to host asynchronously (#16429) Adds Java methods to ColumnView to allow copying of column data to host memory asynchronously. This can be used to avoid many unnecessary stream synchronization when copying many columns to the host. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Nghia Truong (https://github.com/ttnghia) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/16429 --- .../main/java/ai/rapids/cudf/ColumnView.java | 52 +++++++++++++------ .../java/ai/rapids/cudf/HostColumnVector.java | 4 ++ .../ai/rapids/cudf/HostColumnVectorCore.java | 4 +- .../ai/rapids/cudf/JCudfSerialization.java | 5 +- 4 files changed, 45 insertions(+), 20 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 997ff77bae3..8ff2f0f0a73 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -5034,8 +5034,8 @@ private static NestedColumnVector createNestedColumnVector(DType type, long rows // DATA MOVEMENT ///////////////////////////////////////////////////////////////////////////// - private static HostColumnVectorCore copyToHostNestedHelper( - ColumnView deviceCvPointer, HostMemoryAllocator hostMemoryAllocator) { + private static HostColumnVectorCore copyToHostAsyncNestedHelper( + Cuda.Stream stream, ColumnView deviceCvPointer, HostMemoryAllocator hostMemoryAllocator) { if (deviceCvPointer == null) { return null; } @@ -5056,20 +5056,20 @@ private static HostColumnVectorCore copyToHostNestedHelper( currValidity = deviceCvPointer.getValid(); if (currData != null) { hostData = hostMemoryAllocator.allocate(currData.length); - hostData.copyFromDeviceBuffer(currData); + hostData.copyFromDeviceBufferAsync(currData, stream); } if (currValidity != null) { hostValid = hostMemoryAllocator.allocate(currValidity.length); - hostValid.copyFromDeviceBuffer(currValidity); + hostValid.copyFromDeviceBufferAsync(currValidity, stream); } if (currOffsets != null) { hostOffsets = hostMemoryAllocator.allocate(currOffsets.length); - hostOffsets.copyFromDeviceBuffer(currOffsets); + hostOffsets.copyFromDeviceBufferAsync(currOffsets, stream); } int numChildren = deviceCvPointer.getNumChildren(); for (int i = 0; i < numChildren; i++) { try(ColumnView childDevPtr = deviceCvPointer.getChildColumnView(i)) { - children.add(copyToHostNestedHelper(childDevPtr, hostMemoryAllocator)); + children.add(copyToHostAsyncNestedHelper(stream, childDevPtr, hostMemoryAllocator)); } } currNullCount = deviceCvPointer.getNullCount(); @@ -5103,11 +5103,20 @@ private static HostColumnVectorCore copyToHostNestedHelper( } } + /** Copy the data to the host synchronously. 
*/ + public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { + HostColumnVector result = copyToHostAsync(Cuda.DEFAULT_STREAM, hostMemoryAllocator); + Cuda.DEFAULT_STREAM.sync(); + return result; + } + /** - * Copy the data to the host. + * Copy the data to the host asynchronously. The caller MUST synchronize on the stream + * before examining the result. */ - public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { - try (NvtxRange toHost = new NvtxRange("ensureOnHost", NvtxColor.BLUE)) { + public HostColumnVector copyToHostAsync(Cuda.Stream stream, + HostMemoryAllocator hostMemoryAllocator) { + try (NvtxRange toHost = new NvtxRange("toHostAsync", NvtxColor.BLUE)) { HostMemoryBuffer hostDataBuffer = null; HostMemoryBuffer hostValidityBuffer = null; HostMemoryBuffer hostOffsetsBuffer = null; @@ -5127,16 +5136,16 @@ public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { if (!type.isNestedType()) { if (valid != null) { hostValidityBuffer = hostMemoryAllocator.allocate(valid.getLength()); - hostValidityBuffer.copyFromDeviceBuffer(valid); + hostValidityBuffer.copyFromDeviceBufferAsync(valid, stream); } if (offsets != null) { hostOffsetsBuffer = hostMemoryAllocator.allocate(offsets.length); - hostOffsetsBuffer.copyFromDeviceBuffer(offsets); + hostOffsetsBuffer.copyFromDeviceBufferAsync(offsets, stream); } // If a strings column is all null values there is no data buffer allocated if (data != null) { hostDataBuffer = hostMemoryAllocator.allocate(data.length); - hostDataBuffer.copyFromDeviceBuffer(data); + hostDataBuffer.copyFromDeviceBufferAsync(data, stream); } HostColumnVector ret = new HostColumnVector(type, rows, Optional.of(nullCount), hostDataBuffer, hostValidityBuffer, hostOffsetsBuffer); @@ -5145,21 +5154,21 @@ public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { } else { if (data != null) { hostDataBuffer = hostMemoryAllocator.allocate(data.length); - 
hostDataBuffer.copyFromDeviceBuffer(data); + hostDataBuffer.copyFromDeviceBufferAsync(data, stream); } if (valid != null) { hostValidityBuffer = hostMemoryAllocator.allocate(valid.getLength()); - hostValidityBuffer.copyFromDeviceBuffer(valid); + hostValidityBuffer.copyFromDeviceBufferAsync(valid, stream); } if (offsets != null) { hostOffsetsBuffer = hostMemoryAllocator.allocate(offsets.getLength()); - hostOffsetsBuffer.copyFromDeviceBuffer(offsets); + hostOffsetsBuffer.copyFromDeviceBufferAsync(offsets, stream); } List children = new ArrayList<>(); for (int i = 0; i < getNumChildren(); i++) { try (ColumnView childDevPtr = getChildColumnView(i)) { - children.add(copyToHostNestedHelper(childDevPtr, hostMemoryAllocator)); + children.add(copyToHostAsyncNestedHelper(stream, childDevPtr, hostMemoryAllocator)); } } HostColumnVector ret = new HostColumnVector(type, rows, Optional.of(nullCount), @@ -5192,10 +5201,19 @@ public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { } } + /** Copy the data to host memory synchronously */ public HostColumnVector copyToHost() { return copyToHost(DefaultHostMemoryAllocator.get()); } + /** + * Copy the data to the host asynchronously. The caller MUST synchronize on the stream + * before examining the result. + */ + public HostColumnVector copyToHostAsync(Cuda.Stream stream) { + return copyToHostAsync(stream, DefaultHostMemoryAllocator.get()); + } + /** * Calculate the total space required to copy the data to the host. This should be padded to * the alignment that the CPU requires. 
diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 6b41d10fee3..61b11673957 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -92,6 +92,8 @@ public interface EventHandler { public HostColumnVector(DType type, long rows, Optional nullCount, HostMemoryBuffer hostDataBuffer, HostMemoryBuffer hostValidityBuffer, HostMemoryBuffer offsetBuffer, List nestedHcv) { + // NOTE: This constructor MUST NOT examine the contents of any host buffers, as they may be + // asynchronously written by the device. super(type, rows, nullCount, hostDataBuffer, hostValidityBuffer, offsetBuffer, nestedHcv); refCount = 0; incRefCountInternal(true); @@ -100,6 +102,8 @@ public HostColumnVector(DType type, long rows, Optional nullCount, HostColumnVector(DType type, long rows, Optional nullCount, HostMemoryBuffer hostDataBuffer, HostMemoryBuffer hostValidityBuffer, HostMemoryBuffer offsetBuffer) { + // NOTE: This constructor MUST NOT examine the contents of any host buffers, as they may be + // asynchronously written by the device. super(type, rows, nullCount, hostDataBuffer, hostValidityBuffer, offsetBuffer, new ArrayList<>()); assert !type.equals(DType.LIST) : "This constructor should not be used for list type"; if (nullCount.isPresent() && nullCount.get() > 0 && hostValidityBuffer == null) { diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java index 95d209c0984..a225fbf34e1 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,6 +47,8 @@ public class HostColumnVectorCore implements AutoCloseable { public HostColumnVectorCore(DType type, long rows, Optional nullCount, HostMemoryBuffer data, HostMemoryBuffer validity, HostMemoryBuffer offsets, List nestedChildren) { + // NOTE: This constructor MUST NOT examine the contents of any host buffers, as they may be + // asynchronously written by the device. this.offHeap = new OffHeapState(data, validity, offsets); MemoryCleaner.register(this, offHeap); this.type = type; diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java index 666a8864003..89f363d2b29 100644 --- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java +++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -907,8 +907,9 @@ private static ColumnBufferProvider[] providersFrom(ColumnVector[] columns) { boolean success = false; try { for (int i = 0; i < columns.length; i++) { - onHost[i] = columns[i].copyToHost(); + onHost[i] = columns[i].copyToHostAsync(Cuda.DEFAULT_STREAM); } + Cuda.DEFAULT_STREAM.sync(); ColumnBufferProvider[] ret = providersFrom(onHost, true); success = true; return ret; From 79a1eed785fccbca2c20ff5cc844ec1a9e741ee5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:00:30 -0400 Subject: [PATCH 15/72] Remove checking for specific tests in memcheck script (#16412) Removes the checking for specific gtests in the `run_cudf_memcheck_ctests.sh` script. 
Each of those tests can check the `LIBCUDF_MEMCHECK_ENABLED` environment variable themselves. This simplifies the script logic and may help with replacing this with ctest logic in the future. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/16412 --- ci/run_cudf_memcheck_ctests.sh | 3 --- cpp/tests/error/error_handling_test.cu | 4 ++++ .../test_default_stream_identification.cu | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ci/run_cudf_memcheck_ctests.sh b/ci/run_cudf_memcheck_ctests.sh index aacd93e3b96..653829db419 100755 --- a/ci/run_cudf_memcheck_ctests.sh +++ b/ci/run_cudf_memcheck_ctests.sh @@ -15,9 +15,6 @@ export LIBCUDF_MEMCHECK_ENABLED=1 for gt in ./*_TEST ; do test_name=$(basename ${gt}) # Run gtests with compute-sanitizer - if [[ "$test_name" == "ERROR_TEST" ]] || [[ "$test_name" == "STREAM_IDENTIFICATION_TEST" ]]; then - continue - fi echo "Running compute-sanitizer on $test_name" compute-sanitizer --tool memcheck ${gt} "$@" done diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 46d01ec14ff..1dfe45556c4 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -50,6 +50,8 @@ CUDF_KERNEL void test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } // calls. 
TEST(StreamCheck, FailedKernel) { + if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { GTEST_SKIP(); } + rmm::cuda_stream stream; int a; test_kernel<<<0, 0, 0, stream.value()>>>(&a); @@ -61,6 +63,8 @@ TEST(StreamCheck, FailedKernel) TEST(StreamCheck, CatchFailedKernel) { + if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { GTEST_SKIP(); } + rmm::cuda_stream stream; int a; test_kernel<<<0, 0, 0, stream.value()>>>(&a); diff --git a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu index 268c7b37c81..c5fb75a7a8e 100644 --- a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu +++ b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu @@ -33,6 +33,7 @@ void test_cudaLaunchKernel() } catch (std::runtime_error&) { return; } + if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; } throw std::runtime_error("No exception raised for kernel on default stream!"); } From 9336c172b1f61408e2392cbbd953e7f7e6e9ae3d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 31 Jul 2024 16:27:26 +0100 Subject: [PATCH 16/72] Add upper bound pin for polars (#16442) This aligns the polars dependency with the most modern version supported by cudf-polars in this branch. 
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16442 --- dependencies.yaml | 2 +- python/cudf_polars/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 0fa32404156..aeb030313ed 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -630,7 +630,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.0 + - polars>=1.0,<1.3 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f8a1973bdbf..424c83a5199 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "cudf==24.10.*,>=0.0.0a0", - "polars>=1.0", + "polars>=1.0,<1.3", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", From 0f3b3808348debca8458bf73575745770b494ddc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 Jul 2024 07:38:56 -1000 Subject: [PATCH 17/72] Add environment variable to log cudf.pandas fallback calls (#16161) Introduces a new environment variable `LOG_FAST_FALLBACK` which will create a structured log of the call that failed. 
An example of the log is ``` INFO:root:{"debug_type": "LOG_FAST_FALLBACK", "failed_call": "pandas._libs.interval.Interval(0,1)", "exception": "Exception", "exception_message": "Cannot transform _Unusable", "pandas_object": "pandas._libs.interval.Interval", "passed_args": "0,1,", "passed_kwargs": {}} ``` I could turn this into a warning instead, but I imagine we would want to first utilize this to parse the failures and see generalized failures in aggregate Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16161 --- python/cudf/cudf/pandas/_logger.py | 80 ++++++++++++++++++++++ python/cudf/cudf/pandas/fast_slow_proxy.py | 6 +- 2 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/pandas/_logger.py diff --git a/python/cudf/cudf/pandas/_logger.py b/python/cudf/cudf/pandas/_logger.py new file mode 100644 index 00000000000..68923c3e35c --- /dev/null +++ b/python/cudf/cudf/pandas/_logger.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import logging + +logging.basicConfig( + filename="cudf_pandas_unit_tests_debug.log", level=logging.INFO +) +logger = logging.getLogger() + + +class StructuredMessage: + # https://docs.python.org/3/howto/logging-cookbook.html#implementing-structured-logging + def __init__(self, debug_type: str, /, **kwargs) -> None: + self.debug_type = debug_type + self.kwargs = kwargs + + def __str__(self) -> str: + log = {"debug_type": self.debug_type} + return json.dumps({**log, **self.kwargs}) + + +def reprify(arg) -> str: + """Attempt to return arg's repr for logging.""" + try: + return repr(arg) + except Exception: + return "" + + +def log_fallback( + slow_args: tuple, slow_kwargs: dict, exception: Exception +) -> None: + """Log when a fast call falls back to the slow path.""" + caller = slow_args[0] + module = getattr(caller, "__module__", "") + obj_name = getattr(caller, "__qualname__", type(caller).__qualname__) + if module: + slow_object = f"{module}.{obj_name}" + else: + slow_object = obj_name + # TODO: Maybe use inspect.signature to map called args and kwargs + # to their keyword names, but a user calling an API incorrectly would + # break this. 
+ caller_args = slow_args[1] + args_passed = ", ".join((reprify(arg) for arg in caller_args)) + args_types_passed = ", ".join((type(arg).__name__ for arg in caller_args)) + kwargs_passed = {} + kwargs_types_passed = "" + if len(slow_args) == 3: + caller_kwargs = slow_args[2] + if caller_kwargs: + fmt_kwargs = ", ".join( + f"{kwarg}={reprify(value)}" + for kwarg, value in caller_kwargs.items() + ) + kwargs_types_passed = ", ".join( + f"{kwarg}={type(value).__name__}" + for kwarg, value in caller_kwargs.items() + ) + args_passed = f"{args_passed}, {fmt_kwargs}" + kwargs_passed = { + kwarg: reprify(value) for kwarg, value in caller_kwargs.items() + } + message = StructuredMessage( + "LOG_FAST_FALLBACK", + failed_call=f"{slow_object}({args_passed})", + exception=type(exception).__name__, + exception_message=str(exception), + slow_object=slow_object, + args_passed=args_passed, + kwargs_passed=kwargs_passed, + args_types_passed=args_types_passed, + kwargs_types_passed=kwargs_types_passed, + ) + logger.info(message) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index dfb729cae6b..bb678fd1efe 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -930,13 +930,17 @@ def _fast_slow_function_call( "Pandas debugging mode failed. " f"The exception was {e}." 
) - except Exception: + except Exception as err: with nvtx.annotate( "EXECUTE_SLOW", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], domain="cudf_pandas", ): slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) + if _env_get_bool("LOG_FAST_FALLBACK", False): + from ._logger import log_fallback + + log_fallback(slow_args, slow_kwargs, err) with disable_module_accelerator(): result = func(*slow_args, **slow_kwargs) return _maybe_wrap_result(result, func, *args, **kwargs), fast From 5bcd8e062369a7d15222fa6d0bcc0b310553edbf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 Jul 2024 10:34:37 -1000 Subject: [PATCH 18/72] Align DatetimeIndex APIs with pandas 2.x (#16367) Mostly transferring methods that were defined on `Series.dt` methods to `DatetimeColumn` so it could be reused in `DatetimeIndex` Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16367 --- docs/cudf/source/conf.py | 2 + python/cudf/cudf/core/column/datetime.py | 56 ++++++ python/cudf/cudf/core/index.py | 211 ++++++++++++++++++++++- python/cudf/cudf/core/series.py | 43 ++--- python/cudf/cudf/tests/test_datetime.py | 107 ++++++++++++ 5 files changed, 385 insertions(+), 34 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 7421d9be298..7ebafc0da95 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -556,6 +556,8 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "Dtype"), # The following are erroneously warned due to # https://github.com/sphinx-doc/sphinx/issues/11225 + ("py:obj", "cudf.DatetimeIndex.time"), + ("py:obj", "cudf.DatetimeIndex.date"), ("py:obj", "cudf.Index.values_host"), ("py:class", "pa.Array"), ("py:class", "ScalarLike"), diff --git 
a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 73902789c11..81fbb914842 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -286,6 +286,62 @@ def dayofyear(self) -> ColumnBase: def day_of_year(self) -> ColumnBase: return self.get_dt_field("day_of_year") + @property + def is_month_start(self) -> ColumnBase: + return (self.day == 1).fillna(False) + + @property + def is_month_end(self) -> ColumnBase: + last_day_col = libcudf.datetime.last_day_of_month(self) + return (self.day == last_day_col.day).fillna(False) + + @property + def is_quarter_end(self) -> ColumnBase: + last_month = self.month.isin([3, 6, 9, 12]) + return (self.is_month_end & last_month).fillna(False) + + @property + def is_quarter_start(self) -> ColumnBase: + first_month = self.month.isin([1, 4, 7, 10]) + return (self.is_month_start & first_month).fillna(False) + + @property + def is_year_end(self) -> ColumnBase: + day_of_year = self.day_of_year + leap_dates = libcudf.datetime.is_leap_year(self) + + leap = day_of_year == cudf.Scalar(366) + non_leap = day_of_year == cudf.Scalar(365) + return libcudf.copying.copy_if_else(leap, non_leap, leap_dates).fillna( + False + ) + + @property + def is_year_start(self) -> ColumnBase: + return (self.day_of_year == 1).fillna(False) + + @property + def days_in_month(self) -> ColumnBase: + return libcudf.datetime.days_in_month(self) + + @property + def day_of_week(self) -> ColumnBase: + raise NotImplementedError("day_of_week is currently not implemented.") + + @property + def is_normalized(self) -> bool: + raise NotImplementedError( + "is_normalized is currently not implemented." + ) + + def to_julian_date(self) -> ColumnBase: + raise NotImplementedError( + "to_julian_date is currently not implemented." 
+ ) + + def normalize(self) -> ColumnBase: + raise NotImplementedError("normalize is currently not implemented.") + @property def values(self): """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8c3b091abec..40a5d9ff259 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -61,6 +61,7 @@ if TYPE_CHECKING: from collections.abc import Generator, Iterable + from datetime import tzinfo def ensure_index(index_like: Any) -> BaseIndex: @@ -1680,7 +1681,7 @@ class DatetimeIndex(Index): copy : bool Make a copy of input. freq : str, optional - This is not yet supported + Frequency of the DatetimeIndex tz : pytz.timezone or dateutil.tz.tzfile This is not yet supported ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' @@ -1847,6 +1848,210 @@ def searchsorted( value, side=side, ascending=ascending, na_position=na_position ) + def as_unit(self, unit: str, round_ok: bool = True) -> Self: + """ + Convert to a dtype with the given unit resolution. + + Currently not implemented. + + Parameters + ---------- + unit : {'s', 'ms', 'us', 'ns'} + round_ok : bool, default True + If False and the conversion requires rounding, raise ValueError. + """ + raise NotImplementedError("as_unit is currently not implemented") + + def mean(self, *, skipna: bool = True, axis: int | None = 0): + return self._column.mean(skipna=skipna) + + def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1): + return self._column.std(skipna=skipna, ddof=ddof) + + def strftime(self, date_format: str) -> Index: + """ + Convert to Index using specified date_format. + + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. + + Parameters + ---------- + date_format : str + Date format string (e.g. "%Y-%m-%d"). 
+ """ + return Index._from_data( + {self.name: self._column.strftime(date_format)} + ) + + @property + def asi8(self) -> cupy.ndarray: + return self._column.astype("int64").values + + @property + def inferred_freq(self) -> cudf.DateOffset | None: + raise NotImplementedError("inferred_freq is currently not implemented") + + @property + def freq(self) -> cudf.DateOffset | None: + return self._freq + + @freq.setter + def freq(self) -> None: + raise NotImplementedError("Setting freq is currently not supported.") + + @property + def freqstr(self) -> str: + raise NotImplementedError("freqstr is currently not implemented") + + @property + def resolution(self) -> str: + """ + Returns day, hour, minute, second, millisecond or microsecond + """ + raise NotImplementedError("resolution is currently not implemented") + + @property + def unit(self) -> str: + return self._column.time_unit + + @property + def tz(self) -> tzinfo | None: + """ + Return the timezone. + + Returns + ------- + datetime.tzinfo or None + Returns None when the array is tz-naive. + """ + return getattr(self.dtype, "tz", None) + + @property + def tzinfo(self) -> tzinfo | None: + """ + Alias for tz attribute + """ + return self.tz + + def to_pydatetime(self) -> np.ndarray: + """ + Return an ndarray of ``datetime.datetime`` objects. + + Returns + ------- + numpy.ndarray + An ndarray of ``datetime.datetime`` objects. + """ + return self.to_pandas().to_pydatetime() + + def to_julian_date(self) -> Index: + return Index._from_data({self.name: self._column.to_julian_date()}) + + def to_period(self, freq) -> pd.PeriodIndex: + return self.to_pandas().to_period(freq=freq) + + def normalize(self) -> Self: + """ + Convert times to midnight. + + Currently not implemented. + """ + return type(self)._from_data({self.name: self._column.normalize()}) + + @property + def time(self) -> np.ndarray: + """ + Returns numpy array of ``datetime.time`` objects. + + The time part of the Timestamps. 
+ """ + return self.to_pandas().time + + @property + def timetz(self) -> np.ndarray: + """ + Returns numpy array of ``datetime.time`` objects with timezones. + + The time part of the Timestamps. + """ + return self.to_pandas().timetz + + @property + def date(self) -> np.ndarray: + """ + Returns numpy array of python ``datetime.date`` objects. + + Namely, the date part of Timestamps without time and + timezone information. + """ + return self.to_pandas().date + + @property + def is_month_start(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the first day of the month. + """ + return self._column.is_month_start.values + + @property + def is_month_end(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the last day of the month. + """ + return self._column.is_month_end.values + + @property + def is_quarter_end(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the last day of the quarter. + """ + return self._column.is_quarter_end.values + + @property + def is_quarter_start(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the start day of the quarter. + """ + return self._column.is_quarter_start.values + + @property + def is_year_end(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the last day of the year. + """ + return self._column.is_year_end.values + + @property + def is_year_start(self) -> cupy.ndarray: + """ + Booleans indicating if dates are the first day of the year. + """ + return self._column.is_year_start.values + + @property + def is_normalized(self) -> bool: + """ + Returns True if all of the dates are at midnight ("no time") + """ + return self._column.is_normalized + + @property + def days_in_month(self) -> Index: + """ + Get the total number of days in the month that the date falls on. 
+ """ + return Index._from_data({self.name: self._column.days_in_month}) + + daysinmonth = days_in_month + + @property + def day_of_week(self) -> Index: + """ + Get the day of week that the date falls on. + """ + return Index._from_data({self.name: self._column.day_of_week}) + @property # type: ignore @_performance_tracking def year(self): @@ -3391,9 +3596,11 @@ def _get_nearest_indexer( return indexer -def _validate_freq(freq: Any) -> cudf.DateOffset: +def _validate_freq(freq: Any) -> cudf.DateOffset | None: if isinstance(freq, str): return cudf.DateOffset._from_freqstr(freq) + elif freq is None: + return freq elif freq is not None and not isinstance(freq, cudf.DateOffset): raise ValueError(f"Invalid frequency: {freq}") return cast(cudf.DateOffset, freq) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 10ac1fdfc1e..929af5cd981 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4415,7 +4415,9 @@ def is_month_start(self) -> Series: """ Booleans indicating if dates are the first day of the month. 
""" - return (self.day == 1).fillna(False) + return self._return_result_like_self( + self.series._column.is_month_start + ) @property # type: ignore @_performance_tracking @@ -4462,9 +4464,7 @@ def days_in_month(self) -> Series: 11 31 dtype: int16 """ - return self._return_result_like_self( - libcudf.datetime.days_in_month(self.series._column) - ) + return self._return_result_like_self(self.series._column.days_in_month) @property # type: ignore @_performance_tracking @@ -4505,9 +4505,7 @@ def is_month_end(self) -> Series: 8 False dtype: bool """ # noqa: E501 - last_day_col = libcudf.datetime.last_day_of_month(self.series._column) - last_day = self._return_result_like_self(last_day_col) - return (self.day == last_day.dt.day).fillna(False) + return self._return_result_like_self(self.series._column.is_month_end) @property # type: ignore @_performance_tracking @@ -4546,14 +4544,10 @@ def is_quarter_start(self) -> Series: 7 False dtype: bool """ - day = self.series._column.get_dt_field("day") - first_month = self.series._column.get_dt_field("month").isin( - [1, 4, 7, 10] + return self._return_result_like_self( + self.series._column.is_quarter_start ) - result = ((day == cudf.Scalar(1)) & first_month).fillna(False) - return self._return_result_like_self(result) - @property # type: ignore @_performance_tracking def is_quarter_end(self) -> Series: @@ -4591,16 +4585,10 @@ def is_quarter_end(self) -> Series: 7 False dtype: bool """ - day = self.series._column.get_dt_field("day") - last_day = libcudf.datetime.last_day_of_month(self.series._column) - last_day = last_day.get_dt_field("day") - last_month = self.series._column.get_dt_field("month").isin( - [3, 6, 9, 12] + return self._return_result_like_self( + self.series._column.is_quarter_end ) - result = ((day == last_day) & last_month).fillna(False) - return self._return_result_like_self(result) - @property # type: ignore @_performance_tracking def is_year_start(self) -> Series: @@ -4627,10 +4615,7 @@ def is_year_start(self) 
-> Series: 2 True dtype: bool """ - outcol = self.series._column.get_dt_field( - "day_of_year" - ) == cudf.Scalar(1) - return self._return_result_like_self(outcol.fillna(False)) + return self._return_result_like_self(self.series._column.is_year_start) @property # type: ignore @_performance_tracking @@ -4658,13 +4643,7 @@ def is_year_end(self) -> Series: 2 False dtype: bool """ - day_of_year = self.series._column.get_dt_field("day_of_year") - leap_dates = libcudf.datetime.is_leap_year(self.series._column) - - leap = day_of_year == cudf.Scalar(366) - non_leap = day_of_year == cudf.Scalar(365) - result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates) - return self._return_result_like_self(result.fillna(False)) + return self._return_result_like_self(self.series._column.is_year_end) @_performance_tracking def _get_dt_field(self, field: str) -> Series: diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 7ab9ff2ef23..6bc775d2a2c 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -7,6 +7,7 @@ import cupy as cp import numpy as np import pandas as pd +import pandas._testing as tm import pyarrow as pa import pytest @@ -2429,3 +2430,109 @@ def test_day_month_name_locale_not_implemented(meth, klass): obj = obj.dt with pytest.raises(NotImplementedError): getattr(obj, meth)(locale="pt_BR.utf8") + + +@pytest.mark.parametrize( + "attr", + [ + "is_month_start", + "is_month_end", + "is_quarter_end", + "is_quarter_start", + "is_year_end", + "is_year_start", + "days_in_month", + "timetz", + "time", + "date", + ], +) +def test_dti_datetime_attributes(attr): + data = [ + "2020-01-01", + "2020-01-31", + "2020-03-01", + "2020-03-31", + "2020-03-31", + "2020-12-31", + None, + ] + pd_dti = pd.DatetimeIndex(data, name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = getattr(cudf_dti, attr) + expected = getattr(pd_dti, attr) + if isinstance(result, np.ndarray): + # numpy 
doesn't assert object arrays with NaT correctly + tm.assert_numpy_array_equal(result, expected) + else: + assert_eq(result, expected) + + +@pytest.mark.parametrize("attr", ["freq", "unit"]) +def test_dti_properties(attr): + pd_dti = pd.DatetimeIndex( + ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]" + ) + cudf_dti = cudf.DatetimeIndex( + ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]" + ) + + result = getattr(cudf_dti, attr) + expected = getattr(pd_dti, attr) + assert result == expected + + +def test_dti_asi8(): + pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = pd_dti.asi8 + expected = cudf_dti.asi8 + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "method, kwargs", + [ + ["mean", {}], + pytest.param( + "std", + {}, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/16444" + ), + ), + pytest.param( + "std", + {"ddof": 0}, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/16444" + ), + ), + ], +) +def test_dti_reduction(method, kwargs): + pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = getattr(cudf_dti, method)(**kwargs) + expected = getattr(pd_dti, method)(**kwargs) + assert result == expected + + +@pytest.mark.parametrize( + "method, kwargs", + [ + ["to_pydatetime", {}], + ["to_period", {"freq": "D"}], + ["strftime", {"date_format": "%Y-%m-%d"}], + ], +) +def test_dti_methods(method, kwargs): + pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = getattr(cudf_dti, method)(**kwargs) + expected = getattr(pd_dti, method)(**kwargs) + assert_eq(result, expected) From e2d45d6f24adbeb3a21081e078a6c2776d550a06 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 Jul 2024 10:36:08 -1000 Subject: [PATCH 19/72] Align TimedeltaIndex APIs with 
pandas 2.x (#16368) Mostly exposing methods that were available on the `TimedeltaColumn` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16368 --- python/cudf/cudf/core/column/timedelta.py | 12 +++ python/cudf/cudf/core/index.py | 92 +++++++++++++++++++++++ python/cudf/cudf/tests/test_timedelta.py | 39 ++++++++++ 3 files changed, 143 insertions(+) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 59ea1cc002c..47c8ed6fd95 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -251,6 +251,18 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] + def total_seconds(self) -> ColumnBase: + raise NotImplementedError("total_seconds is currently not implemented") + + def ceil(self, freq: str) -> ColumnBase: + raise NotImplementedError("ceil is currently not implemented") + + def floor(self, freq: str) -> ColumnBase: + raise NotImplementedError("floor is currently not implemented") + + def round(self, freq: str) -> ColumnBase: + raise NotImplementedError("round is currently not implemented") + def as_numerical_column( self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 40a5d9ff259..888ea25cdae 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2759,6 +2759,98 @@ def __getitem__(self, index): return pd.Timedelta(value) return value + def as_unit(self, unit: str, round_ok: bool = True) -> Self: + """ + Convert to a dtype with the given unit resolution. + + Currently not implemented. + + Parameters + ---------- + unit : {'s', 'ms', 'us', 'ns'} + round_ok : bool, default True + If False and the conversion requires rounding, raise ValueError. 
+ """ + raise NotImplementedError("as_unit is currently not implemented") + + @property + def freq(self) -> cudf.DateOffset | None: + raise NotImplementedError("freq is currently not implemented") + + @property + def freqstr(self) -> str: + raise NotImplementedError("freqstr is currently not implemented") + + @property + def resolution(self) -> str: + """ + Returns day, hour, minute, second, millisecond or microsecond + """ + raise NotImplementedError("resolution is currently not implemented") + + @property + def unit(self) -> str: + return self._column.time_unit + + def to_pytimedelta(self) -> np.ndarray: + """ + Return an ndarray of ``datetime.timedelta`` objects. + + Returns + ------- + numpy.ndarray + An ndarray of ``datetime.timedelta`` objects. + """ + return self.to_pandas().to_pytimedelta() + + @property + def asi8(self) -> cupy.ndarray: + return self._column.astype("int64").values + + def sum(self, *, skipna: bool = True, axis: int | None = 0): + return self._column.sum(skipna=skipna) + + def mean(self, *, skipna: bool = True, axis: int | None = 0): + return self._column.mean(skipna=skipna) + + def median(self, *, skipna: bool = True, axis: int | None = 0): + return self._column.median(skipna=skipna) + + def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1): + return self._column.std(skipna=skipna, ddof=ddof) + + def total_seconds(self) -> cupy.ndarray: + """ + Return total duration of each element expressed in seconds. + + This method is currently not implemented. + """ + return self._column.total_seconds().values + + def ceil(self, freq: str) -> Self: + """ + Ceil to the specified resolution. + + This method is currently not implemented. + """ + return type(self)._from_data({self.name: self._column.ceil(freq)}) + + def floor(self, freq: str) -> Self: + """ + Floor to the specified resolution. + + This method is currently not implemented. 
+ """ + return type(self)._from_data({self.name: self._column.floor(freq)}) + + def round(self, freq: str) -> Self: + """ + Round to the specified resolution. + + This method is currently not implemented. + """ + return type(self)._from_data({self.name: self._column.round(freq)}) + @property # type: ignore @_performance_tracking def days(self): diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index c4a2349f535..d622ff6b94e 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1467,3 +1467,42 @@ def test_timedelta_series_cmpops_pandas_compatibility(data1, data2, op): got = op(gsr1, gsr2) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "method, kwargs", + [ + ["sum", {}], + ["mean", {}], + ["median", {}], + ["std", {}], + ["std", {"ddof": 0}], + ], +) +def test_tdi_reductions(method, kwargs): + pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"]) + cudf_tdi = cudf.from_pandas(pd_tdi) + + result = getattr(pd_tdi, method)(**kwargs) + expected = getattr(cudf_tdi, method)(**kwargs) + assert result == expected + + +def test_tdi_asi8(): + pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"]) + cudf_tdi = cudf.from_pandas(pd_tdi) + + result = pd_tdi.asi8 + expected = cudf_tdi.asi8 + assert_eq(result, expected) + + +def test_tdi_unit(): + pd_tdi = pd.TimedeltaIndex( + ["1 day", "2 days", "3 days"], dtype="timedelta64[ns]" + ) + cudf_tdi = cudf.from_pandas(pd_tdi) + + result = pd_tdi.unit + expected = cudf_tdi.unit + assert result == expected From dab8660df7ba823dcef8cb8276a3867c2bb27cc7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 Jul 2024 10:37:48 -1000 Subject: [PATCH 20/72] Align IntervalIndex APIs with pandas 2.x (#16371) Implemented the relatively straightforward, missing APIs and raised `NotImplementedError` for the others Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas 
Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16371 --- docs/cudf/source/conf.py | 15 ++- python/cudf/cudf/core/column/interval.py | 64 ++++++++- python/cudf/cudf/core/index.py | 123 ++++++++++++++++++ .../cudf/cudf/tests/indexes/test_interval.py | 33 +++++ 4 files changed, 229 insertions(+), 6 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 7ebafc0da95..43e2d6031bc 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -559,15 +559,20 @@ def on_missing_reference(app, env, node, contnode): ("py:obj", "cudf.DatetimeIndex.time"), ("py:obj", "cudf.DatetimeIndex.date"), ("py:obj", "cudf.Index.values_host"), - ("py:class", "pa.Array"), - ("py:class", "ScalarLike"), - ("py:class", "ParentType"), - ("py:class", "ColumnLike"), - ("py:class", "ColumnLike"), ("py:obj", "cudf.Index.transpose"), ("py:obj", "cudf.Index.T"), ("py:obj", "cudf.Index.to_flat_index"), ("py:obj", "cudf.MultiIndex.to_flat_index"), + ("py:meth", "pyarrow.Table.to_pandas"), + ("py:class", "pa.Array"), + ("py:class", "ScalarLike"), + ("py:class", "ParentType"), + ("py:class", "pyarrow.lib.DataType"), + ("py:class", "pyarrow.lib.Table"), + ("py:class", "pyarrow.lib.Scalar"), + ("py:class", "pyarrow.lib.ChunkedArray"), + ("py:class", "pyarrow.lib.Array"), + ("py:class", "ColumnLike"), # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index d09a1f66539..b2f79ef0c65 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,11 +1,18 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. 
+from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + import pandas as pd import pyarrow as pa import cudf -from cudf.core.column import StructColumn +from cudf.core.column import StructColumn, as_column from cudf.core.dtypes import IntervalDtype +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + class IntervalColumn(StructColumn): def __init__( @@ -85,6 +92,61 @@ def copy(self, deep=True): children=struct_copy.base_children, ) + @property + def is_empty(self) -> ColumnBase: + left_equals_right = (self.right == self.left).fillna(False) + not_closed_both = as_column( + self.dtype.closed != "both", length=len(self) + ) + return left_equals_right & not_closed_both + + @property + def is_non_overlapping_monotonic(self) -> bool: + raise NotImplementedError( + "is_overlapping is currently not implemented." + ) + + @property + def is_overlapping(self) -> bool: + raise NotImplementedError( + "is_overlapping is currently not implemented." + ) + + @property + def length(self) -> ColumnBase: + return self.right - self.left + + @property + def left(self) -> ColumnBase: + return self.children[0] + + @property + def mid(self) -> ColumnBase: + try: + return 0.5 * (self.left + self.right) + except TypeError: + # datetime safe version + return self.left + 0.5 * self.length + + @property + def right(self) -> ColumnBase: + return self.children[1] + + def overlaps(other) -> ColumnBase: + raise NotImplementedError("overlaps is not currently implemented.") + + def set_closed( + self, closed: Literal["left", "right", "both", "neither"] + ) -> IntervalColumn: + return IntervalColumn( + size=self.size, + dtype=IntervalDtype(self.dtype.fields["left"], closed), + mask=self.base_mask, + offset=self.offset, + null_count=self.null_count, + children=self.base_children, + ) + def as_interval_column(self, dtype): if isinstance(dtype, IntervalDtype): return IntervalColumn( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 
888ea25cdae..cd879d559cd 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3429,6 +3429,31 @@ def from_breaks( ) return IntervalIndex(interval_col, name=name, closed=closed) + @classmethod + def from_arrays( + cls, + left, + right, + closed: Literal["left", "right", "both", "neither"] = "right", + copy: bool = False, + dtype=None, + ) -> Self: + raise NotImplementedError("from_arrays is currently not supported.") + + @classmethod + def from_tuples( + cls, + data, + closed: Literal["left", "right", "both", "neither"] = "right", + name=None, + copy: bool = False, + dtype=None, + ) -> IntervalIndex: + piidx = pd.IntervalIndex.from_tuples( + data, closed=closed, name=name, copy=copy, dtype=dtype + ) + return cls.from_pandas(piidx) + def __getitem__(self, index): raise NotImplementedError( "Getting a scalar from an IntervalIndex is not yet supported" @@ -3443,6 +3468,104 @@ def _is_boolean(self): def _clean_nulls_from_index(self): return self + @property + def is_empty(self) -> cupy.ndarray: + """ + Indicates if an interval is empty, meaning it contains no points. + """ + return self._column.is_empty.values + + @property + def is_non_overlapping_monotonic(self) -> bool: + """ + Return a True if the IntervalIndex is non-overlapping and monotonic. + """ + return self._column.is_non_overlapping_monotonic + + @property + def is_overlapping(self) -> bool: + """ + Return True if the IntervalIndex has overlapping intervals, else False. + + Currently not implemented + """ + return self._column.is_overlapping + + @property + def length(self) -> Index: + """ + Return an Index with entries denoting the length of each Interval. + """ + return _index_from_data({None: self._column.length}) + + @property + def left(self) -> Index: + """ + Return left bounds of the intervals in the IntervalIndex. + + The left bounds of each interval in the IntervalIndex are + returned as an Index. 
The datatype of the left bounds is the + same as the datatype of the endpoints of the intervals. + """ + return _index_from_data({None: self._column.left}) + + @property + def mid(self) -> Index: + """ + Return the midpoint of each interval in the IntervalIndex as an Index. + + Each midpoint is calculated as the average of the left and right bounds + of each interval. + """ + return _index_from_data({None: self._column.mid}) + + @property + def right(self) -> Index: + """ + Return right bounds of the intervals in the IntervalIndex. + + The right bounds of each interval in the IntervalIndex are + returned as an Index. The datatype of the right bounds is the + same as the datatype of the endpoints of the intervals. + """ + return _index_from_data({None: self._column.right}) + + def overlaps(self, other) -> cupy.ndarray: + """ + Check elementwise if an Interval overlaps the values in the IntervalIndex. + + Currently not supported. + """ + return self._column.overlaps(other).values + + def set_closed( + self, closed: Literal["left", "right", "both", "neither"] + ) -> Self: + """ + Return an identical IntervalArray closed on the specified side. + + Parameters + ---------- + closed : {'left', 'right', 'both', 'neither'} + Whether the intervals are closed on the left-side, right-side, both + or neither. + """ + return type(self)._from_data( + {self.name: self._column.set_closed(closed)} + ) + + def to_tuples(self, na_tuple: bool = True) -> pd.Index: + """ + Return an Index of tuples of the form (left, right). + + Parameters + ---------- + na_tuple : bool, default True + If ``True``, return ``NA`` as a tuple ``(nan, nan)``. If ``False``, + just return ``NA`` as ``nan``. 
+ """ + return self.to_pandas().to_tuples(na_tuple=na_tuple) + @_performance_tracking def as_index( diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 87b76ab7609..3b3a9f96543 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -368,3 +368,36 @@ def test_intervalindex_conflicting_closed(): def test_intervalindex_invalid_data(): with pytest.raises(TypeError): cudf.IntervalIndex([1, 2]) + + +@pytest.mark.parametrize( + "attr", + [ + "is_empty", + "length", + "left", + "right", + "mid", + ], +) +def test_intervalindex_properties(attr): + pd_ii = pd.IntervalIndex.from_arrays([0, 1], [0, 2]) + cudf_ii = cudf.from_pandas(pd_ii) + + result = getattr(cudf_ii, attr) + expected = getattr(pd_ii, attr) + assert_eq(result, expected) + + +def test_set_closed(): + data = [pd.Interval(0, 1)] + result = cudf.IntervalIndex(data).set_closed("both") + expected = pd.IntervalIndex(data).set_closed("both") + assert_eq(result, expected) + + +def test_from_tuples(): + data = [(1, 2), (10, 20)] + result = cudf.IntervalIndex.from_tuples(data, closed="left", name="a") + expected = pd.IntervalIndex.from_tuples(data, closed="left", name="a") + assert_eq(result, expected) From be842259a835f4f7a5b9f7ff6fad1507d33c13cd Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:53:13 -0500 Subject: [PATCH 21/72] Remove cuDF dependency from pylibcudf column from_device tests (#16441) This removes the need to `import cudf` in `test_column_from_device` and removes a runtime dependency on numpy in the associated pylibcudf column method. 
Authors: - https://github.com/brandon-b-miller - Thomas Li (https://github.com/lithomas1) Approvers: - Thomas Li (https://github.com/lithomas1) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16441 --- python/cudf/cudf/_lib/pylibcudf/column.pyx | 9 ++--- .../cudf/_lib/pylibcudf/libcudf/types.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/types.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 16 +++++++- .../test_column_from_device.py | 39 +++++++++++++++---- 5 files changed, 54 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index a61e0629292..1d9902b0374 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -15,13 +15,11 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar -from .types cimport DataType, type_id +from .types cimport DataType, size_of, type_id from .utils cimport int_to_bitmask_ptr, int_to_void_ptr import functools -import numpy as np - cdef class Column: """A container of nullable device data as a column of elements. 
@@ -303,14 +301,15 @@ cdef class Column: raise ValueError("mask not yet supported.") typestr = iface['typestr'][1:] + data_type = _datatype_from_dtype_desc(typestr) + if not is_c_contiguous( iface['shape'], iface['strides'], - np.dtype(typestr).itemsize + size_of(data_type) ): raise ValueError("Data must be C-contiguous") - data_type = _datatype_from_dtype_desc(typestr) size = iface['shape'][0] return Column( data_type, diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd index 8e94ec296cf..eabae68bc90 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd @@ -98,3 +98,5 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: HIGHER MIDPOINT NEAREST + + cdef size_type size_of(data_type t) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index 7d3ddca14a1..1f3e1aa2fbb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -27,3 +27,5 @@ cdef class DataType: @staticmethod cdef DataType from_libcudf(data_type dt) + +cpdef size_type size_of(DataType t) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index c45c6071bb3..311f9ce4046 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -2,7 +2,12 @@ from libc.stdint cimport int32_t -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id +from cudf._lib.pylibcudf.libcudf.types cimport ( + data_type, + size_of as cpp_size_of, + size_type, + type_id, +) from cudf._lib.pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lint, isort:skip @@ -69,6 +74,15 @@ cdef class DataType: ret.c_obj = dt return ret +cpdef size_type size_of(DataType t): + """Returns the size in bytes of elements 
of the specified data_type. + + Only fixed-width types are supported. + + For details, see :cpp:func:`size_of`. + """ + with nogil: + return cpp_size_of(t.c_obj) SIZE_TYPE = DataType(type_to_id[size_type]()) SIZE_TYPE_ID = SIZE_TYPE.id() diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py index c4ff7bb43a5..78ee2cb100e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py +++ b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py @@ -4,7 +4,8 @@ import pytest from utils import assert_column_eq -import cudf +import rmm + from cudf._lib import pylibcudf as plc VALID_TYPES = [ @@ -35,17 +36,39 @@ def valid_type(request): return request.param +class DataBuffer: + def __init__(self, obj, dtype): + self.obj = rmm.DeviceBuffer.to_device(obj) + self.dtype = dtype + self.shape = (int(len(self.obj) / self.dtype.itemsize),) + self.strides = (self.dtype.itemsize,) + self.typestr = self.dtype.str + + @property + def __cuda_array_interface__(self): + return { + "data": self.obj.__cuda_array_interface__["data"], + "shape": self.shape, + "strides": self.strides, + "typestr": self.typestr, + "version": 0, + } + + @pytest.fixture -def valid_column(valid_type): +def input_column(valid_type): if valid_type == pa.bool_(): return pa.array([True, False, True], type=valid_type) return pa.array([1, 2, 3], type=valid_type) -def test_from_cuda_array_interface(valid_column): - col = plc.column.Column.from_cuda_array_interface_obj( - cudf.Series(valid_column) - ) - expect = valid_column +@pytest.fixture +def iface_obj(input_column): + data = input_column.to_numpy(zero_copy_only=False) + return DataBuffer(data.view("uint8"), data.dtype) + + +def test_from_cuda_array_interface(input_column, iface_obj): + col = plc.column.Column.from_cuda_array_interface_obj(iface_obj) - assert_column_eq(expect, col) + assert_column_eq(input_column, col) From 9d0c57a64d63d52182bd1c1e930180bf62404f1a Mon 
Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 1 Aug 2024 10:59:27 -0700 Subject: [PATCH 22/72] Add skiprows and nrows to parquet reader (#16214) closes #15144 Authors: - Thomas Li (https://github.com/lithomas1) - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16214 --- python/cudf/cudf/_lib/parquet.pyx | 35 ++++++++++++----- .../cudf/cudf/_lib/pylibcudf/io/parquet.pxd | 2 +- .../cudf/cudf/_lib/pylibcudf/io/parquet.pyx | 18 ++++----- python/cudf/cudf/io/parquet.py | 23 +++++++++++ .../cudf/pylibcudf_tests/io/test_parquet.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 39 +++++++++++++++++++ python/cudf/cudf/utils/ioutils.py | 10 +++++ python/cudf_polars/cudf_polars/dsl/ir.py | 2 +- 8 files changed, 110 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index a2eed94bb3c..4a4b13b0b31 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -22,7 +22,7 @@ from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from libc.stdint cimport uint8_t +from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr @@ -132,7 +132,10 @@ cdef object _process_metadata(object df, object filepaths_or_buffers, list pa_buffers, bool allow_range_index, - bool use_pandas_metadata): + bool use_pandas_metadata, + size_type nrows=-1, + int64_t skip_rows=0, + ): add_df_col_struct_names(df, child_names) index_col = None @@ -221,9 +224,13 @@ cdef object _process_metadata(object df, else: idx = cudf.Index(cudf.core.column.column_empty(0)) else: + start = range_index_meta["start"] + skip_rows + stop = range_index_meta["stop"] + if 
nrows != -1: + stop = start + nrows idx = cudf.RangeIndex( - start=range_index_meta['start'], - stop=range_index_meta['stop'], + start=start, + stop=stop, step=range_index_meta['step'], name=range_index_meta['name'] ) @@ -260,7 +267,9 @@ def read_parquet_chunked( row_groups=None, use_pandas_metadata=True, size_t chunk_read_limit=0, - size_t pass_read_limit=1024000000 + size_t pass_read_limit=1024000000, + size_type nrows=-1, + int64_t skip_rows=0 ): # Convert NativeFile buffers to NativeFileDatasource, # but save original buffers in case we need to use @@ -287,7 +296,9 @@ def read_parquet_chunked( row_groups, use_pandas_metadata, chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit + pass_read_limit=pass_read_limit, + skip_rows=skip_rows, + nrows=nrows, ) tbl_w_meta = reader.read_chunk() @@ -320,13 +331,16 @@ def read_parquet_chunked( df = _process_metadata(df, column_names, child_names, per_file_user_data, row_groups, filepaths_or_buffers, pa_buffers, - allow_range_index, use_pandas_metadata) + allow_range_index, use_pandas_metadata, + nrows=nrows, skip_rows=skip_rows) return df cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, use_pandas_metadata=True, - Expression filters=None): + Expression filters=None, + size_type nrows=-1, + int64_t skip_rows=0): """ Cython function to call into libcudf API, see `read_parquet`. 
@@ -362,6 +376,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, filters, convert_strings_to_categories = False, use_pandas_metadata = use_pandas_metadata, + skip_rows = skip_rows, + nrows = nrows, ) df = cudf.DataFrame._from_data( @@ -371,7 +387,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, df = _process_metadata(df, tbl_w_meta.column_names(include_children=False), tbl_w_meta.child_names, tbl_w_meta.per_file_user_data, row_groups, filepaths_or_buffers, pa_buffers, - allow_range_index, use_pandas_metadata) + allow_range_index, use_pandas_metadata, + nrows=nrows, skip_rows=skip_rows) return df cpdef read_parquet_metadata(filepaths_or_buffers): diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd index 027f215fb91..93ef849b813 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd @@ -28,7 +28,7 @@ cpdef read_parquet( bool convert_strings_to_categories = *, bool use_pandas_metadata = *, int64_t skip_rows = *, - size_type num_rows = *, + size_type nrows = *, # disabled see comment in parquet.pyx for more # ReaderColumnSchema reader_column_schema = *, # DataType timestamp_type = * diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx index 96119e1b714..84a79f9565f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx @@ -26,7 +26,7 @@ cdef parquet_reader_options _setup_parquet_reader_options( bool convert_strings_to_categories = False, bool use_pandas_metadata = True, int64_t skip_rows = 0, - size_type num_rows = -1, + size_type nrows = -1, # ReaderColumnSchema reader_column_schema = None, # DataType timestamp_type = DataType(type_id.EMPTY) ): @@ -40,8 +40,8 @@ cdef parquet_reader_options _setup_parquet_reader_options( ) if row_groups is not None: opts.set_row_groups(row_groups) - if 
num_rows != -1: - opts.set_num_rows(num_rows) + if nrows != -1: + opts.set_num_rows(nrows) if skip_rows != 0: opts.set_skip_rows(skip_rows) if columns is not None: @@ -73,7 +73,7 @@ cdef class ChunkedParquetReader: Whether to convert string columns to the category type skip_rows : int64_t, default 0 The number of rows to skip from the start of the file. - num_rows : size_type, default -1 + nrows : size_type, default -1 The number of rows to read. By default, read the entire file. chunk_read_limit : size_t, default 0 Limit on total number of bytes to be returned per read, @@ -90,7 +90,7 @@ cdef class ChunkedParquetReader: bool use_pandas_metadata=True, bool convert_strings_to_categories=False, int64_t skip_rows = 0, - size_type num_rows = -1, + size_type nrows = -1, size_t chunk_read_limit=0, size_t pass_read_limit=1024000000 ): @@ -103,7 +103,7 @@ cdef class ChunkedParquetReader: convert_strings_to_categories=convert_strings_to_categories, use_pandas_metadata=use_pandas_metadata, skip_rows=skip_rows, - num_rows=num_rows, + nrows=nrows, ) with nogil: @@ -152,7 +152,7 @@ cpdef read_parquet( bool convert_strings_to_categories = False, bool use_pandas_metadata = True, int64_t skip_rows = 0, - size_type num_rows = -1, + size_type nrows = -1, # Disabled, these aren't used by cudf-python # we should only add them back in if there's user demand # ReaderColumnSchema reader_column_schema = None, @@ -178,7 +178,7 @@ cpdef read_parquet( the per-file user metadata of the ``TableWithMetadata`` skip_rows : int64_t, default 0 The number of rows to skip from the start of the file. - num_rows : size_type, default -1 + nrows : size_type, default -1 The number of rows to read. By default, read the entire file. 
Returns @@ -195,7 +195,7 @@ cpdef read_parquet( convert_strings_to_categories, use_pandas_metadata, skip_rows, - num_rows, + nrows, ) with nogil: diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 7dab2f20100..4a419a2fbb6 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -539,6 +539,8 @@ def read_parquet( open_file_options=None, bytes_per_thread=None, dataset_kwargs=None, + nrows=None, + skip_rows=None, *args, **kwargs, ): @@ -685,6 +687,8 @@ def read_parquet( partition_keys=partition_keys, partition_categories=partition_categories, dataset_kwargs=dataset_kwargs, + nrows=nrows, + skip_rows=skip_rows, **kwargs, ) # Apply filters row-wise (if any are defined), and return @@ -813,6 +817,8 @@ def _parquet_to_frame( partition_keys=None, partition_categories=None, dataset_kwargs=None, + nrows=None, + skip_rows=None, **kwargs, ): # If this is not a partitioned read, only need @@ -820,11 +826,18 @@ def _parquet_to_frame( if not partition_keys: return _read_parquet( paths_or_buffers, + nrows=nrows, + skip_rows=skip_rows, *args, row_groups=row_groups, **kwargs, ) + if nrows is not None or skip_rows is not None: + raise NotImplementedError( + "nrows/skip_rows is not supported when reading a partitioned parquet dataset" + ) + partition_meta = None partitioning = (dataset_kwargs or {}).get("partitioning", None) if hasattr(partitioning, "schema"): @@ -912,6 +925,8 @@ def _read_parquet( columns=None, row_groups=None, use_pandas_metadata=None, + nrows=None, + skip_rows=None, *args, **kwargs, ): @@ -934,13 +949,21 @@ def _read_parquet( columns=columns, row_groups=row_groups, use_pandas_metadata=use_pandas_metadata, + nrows=nrows if nrows is not None else -1, + skip_rows=skip_rows if skip_rows is not None else 0, ) else: + if nrows is None: + nrows = -1 + if skip_rows is None: + skip_rows = 0 return libparquet.read_parquet( filepaths_or_buffers, columns=columns, row_groups=row_groups, 
use_pandas_metadata=use_pandas_metadata, + nrows=nrows, + skip_rows=skip_rows, ) else: if ( diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py b/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py index 07d2ab3d69a..dbd20cd473e 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py @@ -31,7 +31,7 @@ def test_read_parquet_basic( res = plc.io.parquet.read_parquet( plc.io.SourceInfo([source]), - num_rows=nrows, + nrows=nrows, skip_rows=skiprows, columns=columns, ) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 3806b901b10..879a2c50db7 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1978,6 +1978,25 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): assert fn == filename +@pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}]) +def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): + # Checks that write_to_dataset is wrapping to_parquet + # as expected + pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) + size = 100 + pdf = pd.DataFrame( + { + "a": np.arange(0, stop=size, dtype="int64"), + "b": np.random.choice(list("abcd"), size=size), + "c": np.random.choice(np.arange(4), size=size), + } + ) + pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"]) + + with pytest.raises(NotImplementedError): + cudf.read_parquet(pdf_dir, **kwargs) + + @pytest.mark.parametrize("return_meta", [True, False]) def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta): pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) @@ -3768,6 +3787,26 @@ def test_parquet_chunked_reader( assert_eq(expected, actual) +@pytest.mark.parametrize( + "nrows,skip_rows", + [ + (0, 0), + (1000, 0), + (0, 1000), + (1000, 10000), + ], +) +def test_parquet_reader_nrows_skiprows(nrows, skip_rows): + df = pd.DataFrame( + {"a": [1, 2, 3, 4] * 100000, "b": ["av", "qw", "hi", "xyz"] * 
100000} + ) + expected = df[skip_rows : skip_rows + nrows] + buffer = BytesIO() + df.to_parquet(buffer) + got = cudf.read_parquet(buffer, nrows=nrows, skip_rows=skip_rows) + assert_eq(expected, got) + + def test_parquet_reader_pandas_compatibility(): df = pd.DataFrame( {"a": [1, 2, 3, 4] * 10000, "b": ["av", "qw", "hi", "xyz"] * 10000} diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 80555750b3a..448a815fe1b 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -199,6 +199,16 @@ in parallel (using a python thread pool). Default allocation is {bytes_per_thread} bytes. This parameter is functional only when `use_python_file_object=False`. +skiprows : int, default None + If not None, the number of rows to skip from the start of the file. + + .. note:: + This option is not supported when the low-memory mode is on. +nrows : int, default None + If not None, the total number of rows to read. + + .. note: + This option is not supported when the low-memory mode is on. Returns ------- diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 7f62dff4389..3754addeb11 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -321,7 +321,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: tbl_w_meta = plc.io.parquet.read_parquet( plc.io.SourceInfo(self.paths), columns=with_columns, - num_rows=nrows, + nrows=nrows, ) df = DataFrame.from_table( tbl_w_meta.tbl, From 05745d04e08ea494a50d12bad977af7e71aaf27b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 1 Aug 2024 17:00:19 -0400 Subject: [PATCH 23/72] Improve performance of hash_character_ngrams using warp-per-string kernel (#16212) Improves the performance of `nvtext::hash_character_ngrams` using a warp-per-string kernel instead of a string per thread. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16212 --- cpp/src/text/generate_ngrams.cu | 161 ++++++++++++++++++++++---------- 1 file changed, 113 insertions(+), 48 deletions(-) diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 724f3603f29..6f700f84ec4 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -36,10 +36,12 @@ #include #include +#include +#include #include +#include #include #include -#include #include @@ -165,6 +167,47 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s namespace detail { namespace { +constexpr cudf::thread_index_type block_size = 256; +constexpr cudf::thread_index_type bytes_per_thread = 4; + +/** + * @brief Counts the number of ngrams in each row of the given strings column + * + * Each warp processes a single string. + * Formula is `count = max(0,str.length() - ngrams + 1)` + * If a string has less than ngrams characters, its count is 0. 
+ */ +CUDF_KERNEL void count_char_ngrams_kernel(cudf::column_device_view const d_strings, + cudf::size_type ngrams, + cudf::size_type* d_counts) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + + auto const str_idx = idx / cudf::detail::warp_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { + d_counts[str_idx] = 0; + return; + } + + namespace cg = cooperative_groups; + auto const warp = cg::tiled_partition(cg::this_thread_block()); + + auto const d_str = d_strings.element(str_idx); + auto const end = d_str.data() + d_str.size_bytes(); + + auto const lane_idx = warp.thread_rank(); + cudf::size_type count = 0; + for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; + itr += cudf::detail::warp_size * bytes_per_thread) { + for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { + count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); + } + } + auto const char_count = cg::reduce(warp, count, cg::plus()); + if (lane_idx == 0) { d_counts[str_idx] = cuda::std::max(0, char_count - ngrams + 1); } +} + /** * @brief Generate character ngrams for each string * @@ -220,17 +263,16 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto sizes_itr = cudf::detail::make_counting_transform_iterator( - 0, - cuda::proclaim_return_type( - [d_strings = *d_strings, ngrams] __device__(auto idx) { - if (d_strings.is_null(idx)) { return 0; } - auto const length = d_strings.element(idx).length(); - return std::max(0, static_cast(length + 1 - ngrams)); - })); - auto [offsets, total_ngrams] = - cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + auto [offsets, total_ngrams] = [&] { + auto counts = rmm::device_uvector(input.size(), stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(input.size()) * cudf::detail::warp_size, 
block_size); + count_char_ngrams_kernel<<>>( + *d_strings, ngrams, counts.data()); + return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); + }(); auto d_offsets = offsets->view().data(); + CUDF_EXPECTS(total_ngrams > 0, "Insufficient number of characters in each string to generate ngrams"); @@ -246,36 +288,64 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie } namespace { + /** * @brief Computes the hash of each character ngram * - * Each thread processes a single string. Substrings are resolved for every character + * Each warp processes a single string. Substrings are resolved for every character * of the string and hashed. */ -struct character_ngram_hash_fn { - cudf::column_device_view const d_strings; - cudf::size_type ngrams; - cudf::size_type const* d_ngram_offsets; - cudf::hash_value_type* d_results; +CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings, + cudf::size_type ngrams, + cudf::size_type const* d_ngram_offsets, + cudf::hash_value_type* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(d_strings.size()) * cudf::detail::warp_size)) { + return; + } - __device__ void operator()(cudf::size_type idx) const - { - if (d_strings.is_null(idx)) return; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) return; - auto itr = d_str.begin(); - auto const ngram_offset = d_ngram_offsets[idx]; - auto const ngram_count = d_ngram_offsets[idx + 1] - ngram_offset; - auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; - auto d_hashes = d_results + ngram_offset; - for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) { - auto const begin = itr.byte_offset(); - auto const end = (itr + ngrams).byte_offset(); - auto const ngram = cudf::string_view(d_str.data() + begin, end - begin); - *d_hashes++ = hasher(ngram); + auto const str_idx = idx / cudf::detail::warp_size; + + if (d_strings.is_null(str_idx)) { 
return; } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { return; } + + __shared__ cudf::hash_value_type hvs[block_size]; // temp store for hash values + + auto const ngram_offset = d_ngram_offsets[str_idx]; + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + + auto const end = d_str.data() + d_str.size_bytes(); + auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1; + auto const lane_idx = idx % cudf::detail::warp_size; + + auto d_hashes = d_results + ngram_offset; + auto itr = d_str.data() + lane_idx; + for (auto i = 0; i < warp_count; ++i) { + cudf::hash_value_type hash = 0; + if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) { + // resolve ngram substring + auto const sub_str = + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = + cudf::strings::detail::bytes_to_character_position(sub_str, ngrams); + if (left == 0) { hash = hasher(cudf::string_view(itr, bytes)); } + } + hvs[threadIdx.x] = hash; // store hash into shared memory + __syncwarp(); + if (lane_idx == 0) { + // copy valid hash values into d_hashes + auto const hashes = &hvs[threadIdx.x]; + d_hashes = thrust::copy_if( + thrust::seq, hashes, hashes + cudf::detail::warp_size, d_hashes, [](auto h) { + return h != 0; + }); } + __syncwarp(); + itr += cudf::detail::warp_size; } -}; +} } // namespace std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& input, @@ -291,18 +361,16 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co if (input.is_empty()) { return cudf::make_empty_column(output_type); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const grid = cudf::detail::grid_1d( + static_cast(input.size()) * cudf::detail::warp_size, block_size); // build offsets column by computing the number of ngrams per string - auto sizes_itr = cudf::detail::make_counting_transform_iterator( - 0, - 
cuda::proclaim_return_type( - [d_strings = *d_strings, ngrams] __device__(auto idx) { - if (d_strings.is_null(idx)) { return 0; } - auto const length = d_strings.element(idx).length(); - return std::max(0, static_cast(length + 1 - ngrams)); - })); - auto [offsets, total_ngrams] = - cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + auto [offsets, total_ngrams] = [&] { + auto counts = rmm::device_uvector(input.size(), stream); + count_char_ngrams_kernel<<>>( + *d_strings, ngrams, counts.data()); + return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); + }(); auto d_offsets = offsets->view().data(); CUDF_EXPECTS(total_ngrams > 0, @@ -313,11 +381,8 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co cudf::make_numeric_column(output_type, total_ngrams, cudf::mask_state::UNALLOCATED, stream, mr); auto d_hashes = hashes->mutable_view().data(); - character_ngram_hash_fn generator{*d_strings, ngrams, d_offsets, d_hashes}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::counting_iterator(0), - input.size(), - generator); + character_ngram_hash_kernel<<>>( + *d_strings, ngrams, d_offsets, d_hashes); return make_lists_column( input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr); From a8a367009ff64478d78eb916fc9dc65b77b89aac Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 1 Aug 2024 16:45:01 -0700 Subject: [PATCH 24/72] Move exception handler into pylibcudf from cudf (#16468) PR to help prepare for the splitting out of pylibcudf. 
Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16468 --- docs/cudf/source/developer_guide/pylibcudf.md | 2 +- .../{ => pylibcudf}/exception_handler.pxd | 6 +-- .../cudf/_lib/pylibcudf/libcudf/binaryop.pxd | 12 ++--- .../cudf/_lib/pylibcudf/libcudf/copying.pxd | 44 +++++++++---------- .../_lib/pylibcudf/libcudf/lists/contains.pxd | 12 ++--- 5 files changed, 38 insertions(+), 38 deletions(-) rename python/cudf/cudf/_lib/{ => pylibcudf}/exception_handler.pxd (95%) diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md index 0b881b2b057..2ae545a4955 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/developer_guide/pylibcudf.md @@ -149,7 +149,7 @@ Some guidelines on what should be tested: - Exception: In special cases where constructing suitable large tests is difficult in C++ (such as creating suitable input data for I/O testing), tests may be added to pylibcudf instead. - Nullable data should always be tested. - Expected exceptions should be tested. Tests should be written from the user's perspective in mind, and if the API is not currently throwing the appropriate exception it should be updated. - - Important note: If the exception should be produced by libcudf, the underlying libcudf API should be updated to throw the desired exception in C++. Such changes may require consultation with libcudf devs in nontrivial cases. [This issue](https://github.com/rapidsai/cudf/issues/12885) provides an overview and an indication of acceptable exception types that should cover most use cases. In rare cases a new C++ exception may need to be introduced in [`error.hpp`](https://github.com/rapidsai/cudf/blob/branch-24.04/cpp/include/cudf/utilities/error.hpp). 
If so, this exception will also need to be mapped to a suitable Python exception in [`exception_handler.pxd`](https://github.com/rapidsai/cudf/blob/branch-24.04/python/cudf/cudf/_lib/exception_handler.pxd). + - Important note: If the exception should be produced by libcudf, the underlying libcudf API should be updated to throw the desired exception in C++. Such changes may require consultation with libcudf devs in nontrivial cases. [This issue](https://github.com/rapidsai/cudf/issues/12885) provides an overview and an indication of acceptable exception types that should cover most use cases. In rare cases a new C++ exception may need to be introduced in [`error.hpp`](https://github.com/rapidsai/cudf/blob/branch-24.04/cpp/include/cudf/utilities/error.hpp). If so, this exception will also need to be mapped to a suitable Python exception in `exception_handler.pxd`. Some guidelines on how best to use pytests. - By default, fixtures producing device data containers should be of module scope and treated as immutable by tests. Allocating data on the GPU is expensive and slows tests. Almost all pylibcudf operations are out of place operations, so module-scoped fixtures should not typically be problematic to work with. Session-scoped fixtures would also work, but they are harder to reason about since they live in a different module, and if they need to change for any reason they could affect an arbitrarily large number of tests. Module scope is a good balance. diff --git a/python/cudf/cudf/_lib/exception_handler.pxd b/python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd similarity index 95% rename from python/cudf/cudf/_lib/exception_handler.pxd rename to python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd index 4337d8db285..6abcd0a1c0f 100644 --- a/python/cudf/cudf/_lib/exception_handler.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
# See @@ -24,7 +24,7 @@ cdef extern from *: * Since this function interoperates with Python's exception state, it * does not throw any C++ exceptions. */ - void cudf_exception_handler() + void libcudf_exception_handler() { // Catch a handful of different errors here and turn them into the // equivalent Python errors. @@ -66,4 +66,4 @@ cdef extern from *: } // anonymous namespace """ - cdef void cudf_exception_handler() + cdef void libcudf_exception_handler() diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd index b34fea6a775..78da5980db4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd @@ -5,7 +5,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string -from cudf._lib.exception_handler cimport cudf_exception_handler +from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar @@ -55,28 +55,28 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: const column_view& rhs, binary_operator op, data_type output_type - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const scalar& rhs, binary_operator op, data_type output_type - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, const string& op, data_type output_type - ) except +cudf_exception_handler + ) except 
+libcudf_exception_handler cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil: cdef bool is_supported_operation( @@ -84,4 +84,4 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil: data_type lhs_type, data_type rhs_type, binary_operator op - ) except +cudf_exception_handler + ) except +libcudf_exception_handler diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd index 001489d69bf..af3a16ad01b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd @@ -8,7 +8,7 @@ from libcpp.vector cimport vector from rmm._lib.device_buffer cimport device_buffer -from cudf._lib.exception_handler cimport cudf_exception_handler +from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( column_view, @@ -30,25 +30,25 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const table_view& source_table, const column_view& gather_map, out_of_bounds_policy policy - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] shift( const column_view& input, size_type offset, const scalar& fill_values - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[table] scatter ( const table_view& source_table, const column_view& scatter_map, const table_view& target_table, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[table] scatter ( const vector[reference_wrapper[constscalar]]& source_scalars, const column_view& indices, const table_view& target, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cpdef enum class mask_allocation_policy(int32_t): NEVER @@ -57,22 +57,22 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: cdef 
unique_ptr[column] empty_like ( const column_view& input_column - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, mask_allocation_policy policy - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, size_type size, mask_allocation_policy policy - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[table] empty_like ( const table_view& input_table - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef void copy_range_in_place ( const column_view& input_column, @@ -80,7 +80,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] copy_range ( const column_view& input_column, @@ -88,68 +88,68 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef vector[column_view] slice ( const column_view& input_column, vector[size_type] indices - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef vector[table_view] slice ( const table_view& input_table, vector[size_type] indices - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef vector[column_view] split ( const column_view& input_column, vector[size_type] splits - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef vector[table_view] split ( const table_view& input_table, vector[size_type] splits - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const column_view& rhs, const column_view& boolean_mask - ) except 
+cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const column_view& rhs, const column_view& boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const scalar& rhs, const column_view boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const scalar& rhs, const column_view boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const table_view& input, const table_view& target, const column_view& boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const vector[reference_wrapper[constscalar]]& input, const table_view& target, const column_view& boolean_mask - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[scalar] get_element ( const column_view& input, size_type index - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cpdef enum class sample_with_replacement(bool): FALSE diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd index 82aed7d70a0..40bb2e78970 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd @@ -3,7 +3,7 @@ from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr -from cudf._lib.exception_handler cimport cudf_exception_handler +from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( @@ -21,25 
+21,25 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] contains( const lists_column_view& lists, const scalar& search_key, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] contains( const lists_column_view& lists, const column_view& search_keys, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] contains_nulls( const lists_column_view& lists, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] index_of( const lists_column_view& lists, const scalar& search_key, duplicate_find_option find_option, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler cdef unique_ptr[column] index_of( const lists_column_view& lists, const column_view& search_keys, duplicate_find_option find_option, - ) except +cudf_exception_handler + ) except +libcudf_exception_handler From cc19d8a7b424abbc87f7767e3bc60c54390dc9e3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 2 Aug 2024 09:34:27 -1000 Subject: [PATCH 25/72] Use explicit construction of column subclass instead of `build_column` when type is known (#16470) When we need to construct a column with a specific type, we do not need to go through the indirection of `build_column`, which matches a column subclass to a passed type, and instead construct directly from the class instead Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16470 --- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/column/categorical.py | 46 +++++++++++++-------- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/datetime.py | 10 ++--- python/cudf/cudf/core/column/numerical.py | 43 ++++++++----------- python/cudf/cudf/core/column/string.py | 6 +-- 
python/cudf/cudf/core/column/timedelta.py | 8 ++-- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/index.py | 8 ++-- 9 files changed, 64 insertions(+), 65 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 18ab32d2c9e..9f36499586b 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -110,7 +110,7 @@ def _make_categorical_like(result, column): if isinstance(column, cudf.core.column.CategoricalColumn): result = cudf.core.column.build_categorical_column( categories=column.categories, - codes=cudf.core.column.build_column( + codes=cudf.core.column.NumericalColumn( result.base_data, dtype=result.dtype ), mask=result.base_mask, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9433a91b9c6..55bfae30470 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -572,13 +572,10 @@ def children(self) -> tuple[NumericalColumn]: codes_column = self.base_children[0] start = self.offset * codes_column.dtype.itemsize end = start + self.size * codes_column.dtype.itemsize - codes_column = cast( - cudf.core.column.NumericalColumn, - column.build_column( - data=codes_column.base_data[start:end], - dtype=codes_column.dtype, - size=self.size, - ), + codes_column = cudf.core.column.NumericalColumn( + data=codes_column.base_data[start:end], + dtype=codes_column.dtype, + size=self.size, ) self._children = (codes_column,) return self._children @@ -660,8 +657,9 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self: Self, cudf.core.column.build_categorical_column( categories=self.categories, - codes=cudf.core.column.build_column( - codes.base_data, dtype=codes.dtype + codes=cudf.core.column.NumericalColumn( + codes.base_data, # type: ignore[arg-type] + dtype=codes.dtype, ), mask=codes.base_mask, ordered=self.ordered, @@ -734,7 
+732,10 @@ def sort_values( codes = self.codes.sort_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories._values, - codes=column.build_column(codes.base_data, dtype=codes.dtype), + codes=cudf.core.column.NumericalColumn( + codes.base_data, # type: ignore[arg-type] + dtype=codes.dtype, + ), mask=codes.base_mask, size=codes.size, ordered=self.dtype.ordered, @@ -842,7 +843,10 @@ def unique(self) -> CategoricalColumn: codes = self.codes.unique() return column.build_categorical_column( categories=self.categories, - codes=column.build_column(codes.base_data, dtype=codes.dtype), + codes=cudf.core.column.NumericalColumn( + codes.base_data, # type: ignore[arg-type] + dtype=codes.dtype, + ), mask=codes.base_mask, offset=codes.offset, size=codes.size, @@ -980,7 +984,9 @@ def find_and_replace( result = column.build_categorical_column( categories=new_cats["cats"], - codes=column.build_column(output.base_data, dtype=output.dtype), + codes=cudf.core.column.NumericalColumn( + output.base_data, dtype=output.dtype + ), mask=output.base_mask, offset=output.offset, size=output.size, @@ -1176,8 +1182,9 @@ def _concat( return column.build_categorical_column( categories=column.as_column(cats), - codes=column.build_column( - codes_col.base_data, dtype=codes_col.dtype + codes=cudf.core.column.NumericalColumn( + codes_col.base_data, # type: ignore[arg-type] + dtype=codes_col.dtype, ), mask=codes_col.base_mask, size=codes_col.size, @@ -1190,8 +1197,9 @@ def _with_type_metadata( if isinstance(dtype, CategoricalDtype): return column.build_categorical_column( categories=dtype.categories._values, - codes=column.build_column( - self.codes.base_data, dtype=self.codes.dtype + codes=cudf.core.column.NumericalColumn( + self.codes.base_data, # type: ignore[arg-type] + dtype=self.codes.dtype, ), mask=self.codes.base_mask, ordered=dtype.ordered, @@ -1339,7 +1347,7 @@ def _set_categories( Self, column.build_categorical_column( categories=new_cats, - 
codes=column.build_column( + codes=cudf.core.column.NumericalColumn( new_codes.base_data, dtype=new_codes.dtype ), mask=new_codes.base_mask, @@ -1472,7 +1480,9 @@ def pandas_categorical_as_column( return column.build_categorical_column( categories=categorical.categories, - codes=column.build_column(codes.base_data, codes.dtype), + codes=cudf.core.column.NumericalColumn( + codes.base_data, dtype=codes.dtype + ), size=codes.size, mask=mask, ordered=categorical.ordered, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7e0d8ced595..a7d2cb441dd 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1506,7 +1506,7 @@ def column_empty( elif isinstance(dtype, CategoricalDtype): data = None children = ( - build_column( + cudf.core.column.NumericalColumn( data=as_buffer( rmm.DeviceBuffer( size=row_count diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 81fbb914842..ce67ce81e6b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -473,15 +473,15 @@ def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] def as_numerical_column( self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": - col = column.build_column( - data=self.base_data, - dtype=np.int64, + ) -> cudf.core.column.NumericalColumn: + col = cudf.core.column.NumericalColumn( + data=self.base_data, # type: ignore[arg-type] + dtype=np.dtype(np.int64), mask=self.base_mask, offset=self.offset, size=self.size, ) - return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) + return cast(cudf.core.column.NumericalColumn, col.astype(dtype)) def strftime(self, format: str) -> cudf.core.column.StringColumn: if len(self) == 0: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f9404eb3b40..c326a10c844 100644 --- 
a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -13,13 +13,7 @@ from cudf import _lib as libcudf from cudf._lib import pylibcudf from cudf.api.types import is_integer, is_scalar -from cudf.core.column import ( - ColumnBase, - as_column, - build_column, - column, - string, -) +from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError @@ -338,29 +332,23 @@ def as_string_column(self) -> cudf.core.column.StringColumn: def as_datetime_column( self, dtype: Dtype ) -> cudf.core.column.DatetimeColumn: - return cast( - "cudf.core.column.DatetimeColumn", - build_column( - data=self.astype("int64").base_data, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, - ), + return cudf.core.column.DatetimeColumn( + data=self.astype("int64").base_data, # type: ignore[arg-type] + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, ) def as_timedelta_column( self, dtype: Dtype ) -> cudf.core.column.TimeDeltaColumn: - return cast( - "cudf.core.column.TimeDeltaColumn", - build_column( - data=self.astype("int64").base_data, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, - ), + return cudf.core.column.TimeDeltaColumn( + data=self.astype("int64").base_data, # type: ignore[arg-type] + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, ) def as_decimal_column( @@ -637,7 +625,10 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: if isinstance(dtype, CategoricalDtype): return column.build_categorical_column( categories=dtype.categories._values, - codes=build_column(self.base_data, dtype=self.dtype), + codes=cudf.core.column.NumericalColumn( + self.base_data, # type: ignore[arg-type] + dtype=self.dtype, + ), mask=self.base_mask, ordered=dtype.ordered, size=self.size, diff --git 
a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ec95c50f455..b422ff86b17 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5934,9 +5934,9 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": n_bytes_to_view = str_end_byte_offset - str_byte_offset - to_view = column.build_column( - self.base_data, - dtype=cudf.api.types.dtype("int8"), + to_view = cudf.core.column.NumericalColumn( + self.base_data, # type: ignore[arg-type] + dtype=np.dtype(np.int8), offset=str_byte_offset, size=n_bytes_to_view, ) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 47c8ed6fd95..ba0dc4779bb 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -265,10 +265,10 @@ def round(self, freq: str) -> ColumnBase: def as_numerical_column( self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": - col = column.build_column( - data=self.base_data, - dtype=np.int64, + ) -> cudf.core.column.NumericalColumn: + col = cudf.core.column.NumericalColumn( + data=self.base_data, # type: ignore[arg-type] + dtype=np.dtype(np.int64), mask=self.base_mask, offset=self.offset, size=self.size, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 52dc29974bf..865d2706ca3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -46,10 +46,10 @@ from cudf.core.column import ( CategoricalColumn, ColumnBase, + NumericalColumn, StructColumn, as_column, build_categorical_column, - build_column, column_empty, concat_columns, ) @@ -8543,7 +8543,7 @@ def _reassign_categories(categories, cols, col_idxs): if idx in categories: cols[name] = build_categorical_column( categories=categories[idx], - codes=build_column( + codes=NumericalColumn( cols[name].base_data, dtype=cols[name].dtype ), mask=cols[name].base_mask, diff --git 
a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd879d559cd..0d29ef07e7d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2434,12 +2434,10 @@ def to_pandas( return result @_performance_tracking - def _get_dt_field(self, field): + def _get_dt_field(self, field: str) -> Index: + """Return an Index of a numerical component of the DatetimeIndex.""" out_column = self._values.get_dt_field(field) - # column.column_empty_like always returns a Column object - # but we need a NumericalColumn for Index.. - # how should this be handled? - out_column = column.build_column( + out_column = NumericalColumn( data=out_column.base_data, dtype=out_column.dtype, mask=out_column.base_mask, From e0d1ac1efa9153f0a084bd72b7d4c300f9640196 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 2 Aug 2024 17:44:45 -0500 Subject: [PATCH 26/72] Fix typo in dispatch_row_equal. (#16473) This PR fixes a small typo in the C++ code. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16473 --- cpp/src/stream_compaction/distinct.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index e5cf29f3ebf..e2c5aba6802 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -51,7 +51,7 @@ namespace { * @param func The input functor to invoke */ template -rmm::device_uvector dipatch_row_equal( +rmm::device_uvector dispatch_row_equal( null_equality compare_nulls, nan_equality compare_nans, bool has_nulls, @@ -110,9 +110,9 @@ rmm::device_uvector distinct_indices(table_view const& input, }; if (cudf::detail::has_nested_columns(input)) { - return dipatch_row_equal(nulls_equal, nans_equal, has_nulls, row_equal, helper_func); + return dispatch_row_equal(nulls_equal, 
nans_equal, has_nulls, row_equal, helper_func); } else { - return dipatch_row_equal(nulls_equal, nans_equal, has_nulls, row_equal, helper_func); + return dispatch_row_equal(nulls_equal, nans_equal, has_nulls, row_equal, helper_func); } } From af57286536fc21b47b80e45be222773b751600c9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 5 Aug 2024 07:16:34 -0500 Subject: [PATCH 27/72] Add missing pylibcudf strings docs (#16471) Noticed a few missing pylibcudf string docs that were missed, added them here. Authors: - https://github.com/brandon-b-miller - Thomas Li (https://github.com/lithomas1) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16471 --- .../api_docs/pylibcudf/strings/capitalize.rst | 6 +++ .../api_docs/pylibcudf/strings/char_types.rst | 6 +++ .../api_docs/pylibcudf/strings/find.rst | 6 +++ .../api_docs/pylibcudf/strings/index.rst | 5 ++ .../pylibcudf/strings/regex_flags.rst | 6 +++ .../pylibcudf/strings/regex_program.rst | 6 +++ .../_lib/pylibcudf/strings/capitalize.pyx | 48 ++++++++++++++++++- .../_lib/pylibcudf/strings/regex_program.pyx | 19 ++++++++ 8 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst new file mode 100644 index 00000000000..578b2b75e37 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst @@ -0,0 +1,6 @@ 
+========== +capitalize +========== + +.. automodule:: cudf._lib.pylibcudf.strings.capitalize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst new file mode 100644 index 00000000000..577ec34915b --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst @@ -0,0 +1,6 @@ +========== +char_types +========== + +.. automodule:: cudf._lib.pylibcudf.strings.char_types + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst new file mode 100644 index 00000000000..61d4079e9a3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst @@ -0,0 +1,6 @@ +==== +find +==== + +.. automodule:: cudf._lib.pylibcudf.strings.find + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index cecf1ccc9bb..462a756a092 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -4,6 +4,11 @@ strings .. toctree:: :maxdepth: 1 + capitalize + char_types contains + find + regex_flags + regex_program replace slice diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst new file mode 100644 index 00000000000..0126b6a3706 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst @@ -0,0 +1,6 @@ +=========== +regex_flags +=========== + +.. 
automodule:: cudf._lib.pylibcudf.strings.regex_flags + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst new file mode 100644 index 00000000000..2f398186d51 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst @@ -0,0 +1,6 @@ +============= +regex_program +============= + +.. automodule:: cudf._lib.pylibcudf.strings.regex_program + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx index d3f79088018..ccf84d25572 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx @@ -22,7 +22,22 @@ cpdef Column capitalize( # TODO: default scalar values # https://github.com/rapidsai/cudf/issues/15505 ): - + """Returns a column of capitalized strings. + + For details, see :cpp:func:`cudf::strings::capitalize`. + + Parameters + ---------- + input : Column + String column + delimiters : Scalar, default None + Characters for identifying words to capitalize + + Returns + ------- + pylibcudf.Column + Column of strings capitalized from the input column + """ cdef unique_ptr[column] c_result if delimiters is None: @@ -47,6 +62,23 @@ cpdef Column title( Column input, string_character_types sequence_type=string_character_types.ALPHA ): + """Modifies first character of each word to upper-case and lower-cases + the rest. + + For details, see :cpp:func:`cudf::strings::title`. 
+ + Parameters + ---------- + input : Column + String column + sequence_type : string_character_types, default string_character_types.ALPHA + The character type that is used when identifying words + + Returns + ------- + pylibcudf.Column + Column of titled strings + """ cdef unique_ptr[column] c_result with nogil: c_result = cpp_capitalize.title(input.view(), sequence_type) @@ -55,6 +87,20 @@ cpdef Column title( cpdef Column is_title(Column input): + """Checks if the strings in the input column are title formatted. + + For details, see :cpp:func:`cudf::strings::is_title`. + + Parameters + ---------- + input : Column + String column + + Returns + ------- + pylibcudf.Column + Column of type BOOL8 + """ cdef unique_ptr[column] c_result with nogil: c_result = cpp_capitalize.is_title(input.view()) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx index d605b0aba02..5f0b8868452 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx @@ -13,12 +13,31 @@ from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags cdef class RegexProgram: + """Regex program class. + This is the Cython representation of + :cpp:class:`cudf::strings::regex_program`. + + Do not instantiate this class directly, use the `create` method. + + """ def __init__(self, *args, **kwargs): raise ValueError("Do not instantiate RegexProgram directly, use create") @staticmethod def create(str pattern, int flags): + """Create a program from a pattern. + + For details, see :cpp:func:`cudf::strings::regex_program::create`. 
+ + Parameters + ---------- + pattern : str + Regex pattern + flags : Union[int, RegexFlags] + Regex flags for interpreting special characters in the pattern + + """ cdef unique_ptr[regex_program] c_prog cdef regex_flags c_flags cdef string c_pattern = pattern.encode() From 837dfe51a2f4d0268d6976464eed637645f524ff Mon Sep 17 00:00:00 2001 From: Rahul Prabhu <100436830+sdrp713@users.noreply.github.com> Date: Mon, 5 Aug 2024 14:14:41 -0700 Subject: [PATCH 28/72] Added batch memset to memset data and validity buffers in parquet reader (#16281) Under some situations in the Parquet reader (particularly the case with tables containing many columns or deeply nested column) we burn a decent amount of time doing cudaMemset() operations on output buffers. A good amount of this overhead seems to stem from the fact that we're simply launching many tiny kernels. This PR adds a batched memset kernel that takes a list of device spans as a single input and does all the work under a single kernel launch. 
This PR addresses issue #15773 ## Improvements Using out performance cluster, improvements of 2.39% were shown on running the overall NDS queries Additionally, benchmarks were added showing big improvements(around 20%) especially on fixed width data types which can be shown below data_type | num_cols | cardinality | run_length | bytes_per_second_before_this_pr | bytes_per_second_after_this_pr | speedup --- | --- | --- | --- | --- | --- | --- INTEGRAL | 1000 | 0 | 1 | 36514934834 | 42756531566 | 1.170932709 INTEGRAL | 1000 | 1000 | 1 | 35364061247 | 39112512476 | 1.105996062 INTEGRAL | 1000 | 0 | 32 | 37349112510 | 39641370858 | 1.061373837 INTEGRAL | 1000 | 1000 | 32 | 39167079622 | 43740824957 | 1.116775245 FLOAT | 1000 | 0 | 1 | 51877322003 | 64083898838 | 1.235296973 FLOAT | 1000 | 1000 | 1 | 48983612272 | 58705522023 | 1.198472699 FLOAT | 1000 | 0 | 32 | 46544977658 | 53715018581 | 1.154045426 FLOAT | 1000 | 1000 | 32 | 54493432148 | 66617609904 | 1.22248879 DECIMAL | 1000 | 0 | 1 | 47616412888 | 57952310685 | 1.217065864 DECIMAL | 1000 | 1000 | 1 | 47166138095 | 54283772484 | 1.1509056 DECIMAL | 1000 | 0 | 32 | 45266163387 | 53770390830 | 1.18787162 DECIMAL | 1000 | 1000 | 32 | 52292176603 | 58847723569 | 1.125363819 TIMESTAMP | 1000 | 0 | 1 | 50245415328 | 60797982330 | 1.210020495 TIMESTAMP | 1000 | 1000 | 1 | 50300238706 | 60810368331 | 1.208947908 TIMESTAMP | 1000 | 0 | 32 | 55338354243 | 66786275739 | 1.206871376 TIMESTAMP | 1000 | 1000 | 32 | 55680028082 | 69029227374 | 1.23974843 DURATION | 1000 | 0 | 1 | 54680007758 | 66855201896 | 1.222662626 DURATION | 1000 | 1000 | 1 | 54305832171 | 66602436269 | 1.226432477 DURATION | 1000 | 0 | 32 | 60040760815 | 72663056969 | 1.210228784 DURATION | 1000 | 1000 | 32 | 60212221703 | 75646396131 | 1.256329595 STRING | 1000 | 0 | 1 | 29691707753 | 33388700976 | 1.12451265 STRING | 1000 | 1000 | 1 | 31411129876 | 35407241037 | 1.127219593 STRING | 1000 | 0 | 32 | 29680479388 | 33382478907 | 1.124728427 STRING | 1000 | 
1000 | 32 | 35476213777 | 40478389269 | 1.141000827 LIST | 1000 | 0 | 1 | 6874253484 | 7370835717 | 1.072237987 LIST | 1000 | 1000 | 1 | 6763426009 | 7253762966 | 1.07249831 LIST | 1000 | 0 | 32 | 6981508808 | 7502741115 | 1.074658977 LIST | 1000 | 1000 | 32 | 6989374761 | 7506418252 | 1.073975643 STRUCT | 1000 | 0 | 1 | 2137525922 | 2189495762 | 1.024313081 STRUCT | 1000 | 1000 | 1 | 1057923939 | 1078475980 | 1.019426766 STRUCT | 1000 | 0 | 32 | 1637342446 | 1698913790 | 1.037604439 STRUCT | 1000 | 1000 | 32 | 1057587701 | 1082539399 | 1.02359303 Authors: - Rahul Prabhu (https://github.com/sdrp713) - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - https://github.com/nvdbaranec - Muhammad Haseeb (https://github.com/mhaseeb123) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16281 --- cpp/benchmarks/CMakeLists.txt | 5 + .../io/utilities/batched_memset_bench.cpp | 101 ++++++++++++++++++ cpp/include/cudf/io/detail/batched_memset.hpp | 82 ++++++++++++++ cpp/src/io/parquet/reader_impl_preprocess.cu | 29 ++++- cpp/src/io/utilities/column_buffer.cpp | 29 +++-- cpp/src/io/utilities/column_buffer.hpp | 23 +++- cpp/tests/CMakeLists.txt | 1 + .../utilities_tests/batched_memset_tests.cu | 97 +++++++++++++++++ 8 files changed, 353 insertions(+), 14 deletions(-) create mode 100644 cpp/benchmarks/io/utilities/batched_memset_bench.cpp create mode 100644 cpp/include/cudf/io/detail/batched_memset.hpp create mode 100644 cpp/tests/utilities_tests/batched_memset_tests.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index ff431c7f260..7be456ddfba 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -353,6 +353,11 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH 
io/json/json_writer.cpp) +# ################################################################################################## +# * multi buffer memset benchmark +# ---------------------------------------------------------------------- +ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) + # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp new file mode 100644 index 00000000000..2905895a63b --- /dev/null +++ b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include + +#include + +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput +constexpr size_t data_size = 512 << 20; + +void parquet_read_common(cudf::size_type num_rows_to_read, + cudf::size_type num_cols_to_read, + cuio_source_sink_pair& source_sink, + nvbench::state& state) +{ + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); + + auto mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + try_drop_l3_cache(); + + timer.start(); + auto const result = cudf::io::read_parquet(read_opts); + timer.stop(); + + CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); + CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); + }); + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); +} + +template +void bench_batched_memset(nvbench::state& state, nvbench::type_list>) +{ + auto const d_type = get_type_or_group(static_cast(DataType)); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); + auto const compression = 
cudf::io::compression_type::NONE; + cuio_source_sink_pair source_sink(source_type); + auto const tbl = + create_random_table(cycle_dtypes(d_type, num_cols), + table_size_bytes{data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(compression); + cudf::io::write_parquet(write_opts); + auto const num_rows = view.num_rows(); + + parquet_read_common(num_rows, num_cols, source_sink, state); +} + +using d_type_list = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list)) + .set_name("batched_memset") + .set_type_axes_names({"data_type"}) + .add_int64_axis("num_cols", {1000}) + .add_string_axis("io_type", {"DEVICE_BUFFER"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/io/detail/batched_memset.hpp new file mode 100644 index 00000000000..d0922cc64ee --- /dev/null +++ b/cpp/include/cudf/io/detail/batched_memset.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace io::detail { + +/** + * @brief A helper function that takes in a vector of device spans and memsets them to the + * value provided using batches sent to the GPU. + * + * @param bufs Vector with device spans of data + * @param value Value to memset all device spans to + * @param _stream Stream used for device memory operations and kernel launches + * + * @return The data in device spans all set to value + */ +template +void batched_memset(std::vector> const& bufs, + T const value, + rmm::cuda_stream_view stream) +{ + // define task and bytes parameters + auto const num_bufs = bufs.size(); + + // copy bufs into device memory and then get sizes + auto gpu_bufs = + cudf::detail::make_device_uvector_async(bufs, stream, rmm::mr::get_current_device_resource()); + + // get a vector with the sizes of all buffers + auto sizes = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); })); + + // get an iterator with a constant value to memset + auto iter_in = thrust::make_constant_iterator(thrust::make_constant_iterator(value)); + + // get an iterator pointing to each device span + auto iter_out = thrust::make_transform_iterator( + thrust::counting_iterator(0), + cuda::proclaim_return_type( + [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].data(); })); + + size_t temp_storage_bytes = 0; + + cub::DeviceCopy::Batched(nullptr, temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); + + rmm::device_buffer d_temp_storage( + temp_storage_bytes, stream, rmm::mr::get_current_device_resource()); + + cub::DeviceCopy::Batched( + d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); +} + +} // namespace io::detail +} // namespace CUDF_EXPORT 
cudf diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index e006cc7d714..557b1a45c1f 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -1494,6 +1495,11 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // buffers if they are not part of a list hierarchy. mark down // if we have any list columns that need further processing. bool has_lists = false; + // Casting to std::byte since data buffer pointer is void * + std::vector> memset_bufs; + // Validity Buffer is a uint32_t pointer + std::vector> nullmask_bufs; + for (size_t idx = 0; idx < _input_columns.size(); idx++) { auto const& input_col = _input_columns[idx]; size_t const max_depth = input_col.nesting_depth(); @@ -1514,13 +1520,19 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // we're going to start null mask as all valid and then turn bits off if necessary out_buf.create_with_mask( out_buf.type.id() == type_id::LIST && l_idx < max_depth ? 
num_rows + 1 : num_rows, - cudf::mask_state::ALL_VALID, + cudf::mask_state::UNINITIALIZED, + false, _stream, _mr); + memset_bufs.push_back(cudf::device_span(static_cast(out_buf.data()), + out_buf.data_size())); + nullmask_bufs.push_back(cudf::device_span( + out_buf.null_mask(), + cudf::util::round_up_safe(out_buf.null_mask_size(), sizeof(cudf::bitmask_type)) / + sizeof(cudf::bitmask_type))); } } } - // compute output column sizes by examining the pages of the -input- columns if (has_lists) { auto h_cols_info = @@ -1593,11 +1605,22 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // allocate // we're going to start null mask as all valid and then turn bits off if necessary - out_buf.create_with_mask(size, cudf::mask_state::ALL_VALID, _stream, _mr); + out_buf.create_with_mask(size, cudf::mask_state::UNINITIALIZED, false, _stream, _mr); + memset_bufs.push_back(cudf::device_span( + static_cast(out_buf.data()), out_buf.data_size())); + nullmask_bufs.push_back(cudf::device_span( + out_buf.null_mask(), + cudf::util::round_up_safe(out_buf.null_mask_size(), sizeof(cudf::bitmask_type)) / + sizeof(cudf::bitmask_type))); } } } } + + cudf::io::detail::batched_memset(memset_bufs, static_cast(0), _stream); + // Need to set null mask bufs to all high bits + cudf::io::detail::batched_memset( + nullmask_bufs, std::numeric_limits::max(), _stream); } std::vector reader::impl::calculate_page_string_offsets() diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 2f4272b0367..8abfb000b94 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -33,7 +33,7 @@ namespace cudf::io::detail { -void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void gather_column_buffer::allocate_strings_data(bool memset_data, rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // The 
contents of _strings will never be directly returned to the user. @@ -56,11 +56,12 @@ std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_ return make_strings_column(*_strings, stream, _mr); } -void cudf::io::detail::inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::allocate_strings_data(bool memset_data, + rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // size + 1 for final offset. _string_data will be initialized later. - _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); + _data = create_data(data_type{type_to_id()}, size + 1, memset_data, stream, _mr); } void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, @@ -93,6 +94,7 @@ void copy_buffer_data(string_policy const& buff, string_policy& new_buff) template void column_buffer_base::create_with_mask(size_type _size, cudf::mask_state null_mask_state, + bool memset_data, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -100,16 +102,20 @@ void column_buffer_base::create_with_mask(size_type _size, _mr = mr; switch (type.id()) { - case type_id::STRING: static_cast(this)->allocate_strings_data(stream); break; + case type_id::STRING: + static_cast(this)->allocate_strings_data(memset_data, stream); + break; // list columns store a buffer of int32's as offsets to represent // their individual rows - case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, _mr); break; + case type_id::LIST: + _data = create_data(data_type{type_to_id()}, size, memset_data, stream, _mr); + break; // struct columns store no data themselves. just validity and children. 
case type_id::STRUCT: break; - default: _data = create_data(type, size, stream, _mr); break; + default: _data = create_data(type, size, memset_data, stream, _mr); break; } if (is_nullable) { _null_mask = @@ -117,12 +123,21 @@ void column_buffer_base::create_with_mask(size_type _size, } } +template +void column_buffer_base::create(size_type _size, + bool memset_data, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + create_with_mask(_size, mask_state::ALL_NULL, memset_data, stream, mr); +} + template void column_buffer_base::create(size_type _size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - create_with_mask(_size, mask_state::ALL_NULL, stream, mr); + create_with_mask(_size, mask_state::ALL_NULL, true, stream, mr); } template diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index ed6bb8bbdca..b2290965bb9 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -44,6 +44,7 @@ namespace detail { * * @param type The intended data type to populate * @param size The number of elements to be represented by the mask + * @param memset_data Defines whether data should be memset to 0 * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @@ -51,17 +52,25 @@ namespace detail { */ inline rmm::device_buffer create_data(data_type type, size_type size, + bool memset_data, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { std::size_t data_size = size_of(type) * size; rmm::device_buffer data(data_size, stream, mr); - CUDF_CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value())); - + if (memset_data) { CUDF_CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value())); } return data; } +inline rmm::device_buffer create_data(data_type type, + size_type size, + rmm::cuda_stream_view stream, + 
rmm::device_async_resource_ref mr) +{ + return create_data(type, size, true, stream, mr); +} + using string_index_pair = thrust::pair; // forward declare friend functions @@ -113,12 +122,18 @@ class column_buffer_base { // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader + void create(size_type _size, + bool memset_data, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + void create(size_type _size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); // like create(), but also takes a `cudf::mask_state` to allow initializing the null mask as // something other than `ALL_NULL` void create_with_mask(size_type _size, cudf::mask_state null_mask_state, + bool memset_data, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -192,7 +207,7 @@ class gather_column_buffer : public column_buffer_base { create(_size, stream, mr); } - void allocate_strings_data(rmm::cuda_stream_view stream); + void allocate_strings_data(bool memset_data, rmm::cuda_stream_view stream); [[nodiscard]] void* data_impl() { return _strings ? _strings->data() : _data.data(); } [[nodiscard]] void const* data_impl() const { return _strings ? 
_strings->data() : _data.data(); } @@ -226,7 +241,7 @@ class inline_column_buffer : public column_buffer_base { create(_size, stream, mr); } - void allocate_strings_data(rmm::cuda_stream_view stream); + void allocate_strings_data(bool memset_data, rmm::cuda_stream_view stream); void* data_impl() { return _data.data(); } [[nodiscard]] void const* data_impl() const { return _data.data(); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4dffcb41ba2..5e85b3e8adf 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -393,6 +393,7 @@ ConfigureTest( utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp utilities_tests/type_list_tests.cpp + utilities_tests/batched_memset_tests.cu ) # ################################################################################################## diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu new file mode 100644 index 00000000000..9fc5baeec97 --- /dev/null +++ b/cpp/tests/utilities_tests/batched_memset_tests.cu @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +template +struct MultiBufferTestIntegral : public cudf::test::BaseFixture {}; + +TEST(MultiBufferTestIntegral, BasicTest1) +{ + std::vector const BUF_SIZES{ + 50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000}; + + // Device init + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + + // Creating base vector for data and setting it to all 0xFF + std::vector> expected; + std::transform(BUF_SIZES.begin(), BUF_SIZES.end(), std::back_inserter(expected), [](auto size) { + return std::vector(size + 2000, std::numeric_limits::max()); + }); + + // set buffer region to other value + std::for_each(thrust::make_zip_iterator(thrust::make_tuple(expected.begin(), BUF_SIZES.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected.end(), BUF_SIZES.end())), + [](auto elem) { + std::fill_n( + thrust::get<0>(elem).begin() + 1000, thrust::get<1>(elem), 0xEEEEEEEEEEEEEEEE); + }); + + // Copy host vector data to device + std::vector> device_bufs; + std::transform(expected.begin(), + expected.end(), + std::back_inserter(device_bufs), + [stream, mr](auto const& vec) { + return cudf::detail::make_device_uvector_async(vec, stream, mr); + }); + + // Initialize device buffers for memset + std::vector> memset_bufs; + std::transform( + thrust::make_zip_iterator(thrust::make_tuple(device_bufs.begin(), BUF_SIZES.begin())), + thrust::make_zip_iterator(thrust::make_tuple(device_bufs.end(), BUF_SIZES.end())), + std::back_inserter(memset_bufs), + [](auto const& elem) { + return cudf::device_span(thrust::get<0>(elem).data() + 1000, thrust::get<1>(elem)); + }); + + // Function Call + cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream); + + // Set all buffer regions to 0 for expected comparison + std::for_each( + 
thrust::make_zip_iterator(thrust::make_tuple(expected.begin(), BUF_SIZES.begin())), + thrust::make_zip_iterator(thrust::make_tuple(expected.end(), BUF_SIZES.end())), + [](auto elem) { std::fill_n(thrust::get<0>(elem).begin() + 1000, thrust::get<1>(elem), 0UL); }); + + // Compare to see that only given buffers are zeroed out + std::for_each( + thrust::make_zip_iterator(thrust::make_tuple(device_bufs.begin(), expected.begin())), + thrust::make_zip_iterator(thrust::make_tuple(device_bufs.end(), expected.end())), + [stream](auto const& elem) { + auto after_memset = cudf::detail::make_std_vector_async(thrust::get<0>(elem), stream); + EXPECT_TRUE( + std::equal(thrust::get<1>(elem).begin(), thrust::get<1>(elem).end(), after_memset.begin())); + }); +} From 8068a2d616b6647bcd80720a2c24af858cbffd2d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 5 Aug 2024 14:48:33 -0700 Subject: [PATCH 29/72] Fix build failures with GCC 13 (#16488) Closes #16395 This PR resolves two types of compilation errors, allowing for successful builds with GCC 13: - replacing the `cuco_allocator` strong type with an alias to fix a new build time check with GCC 13 - removing `std::move` when returning a temporary Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - David Wendt (https://github.com/davidwendt) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/16488 --- cpp/include/cudf/detail/cuco_helpers.hpp | 17 ++---- .../cudf/detail/distinct_hash_join.cuh | 2 +- .../cudf/detail/hash_reduce_by_row.cuh | 2 +- cpp/include/cudf/detail/join.hpp | 2 +- cpp/include/cudf_test/column_wrapper.hpp | 14 ++--- cpp/src/groupby/hash/groupby.cu | 19 +++---- cpp/src/io/json/json_tree.cu | 35 +++++++------ cpp/src/io/json/write_json.cu | 2 +- cpp/src/join/conditional_join.cu | 52 +++++++++---------- cpp/src/join/distinct_hash_join.cu | 2 +- cpp/src/join/hash_join.cu | 2 +- cpp/src/join/join_common_utils.hpp | 8 +-- cpp/src/join/mixed_join.cu | 22 ++++---- 
cpp/src/join/mixed_join_semi.cu | 11 ++-- cpp/src/reductions/histogram.cu | 12 +++-- cpp/src/search/contains_table.cu | 17 +++--- cpp/src/stream_compaction/distinct.cu | 19 +++---- cpp/src/stream_compaction/distinct_count.cu | 17 +++--- .../stream_compaction/distinct_helpers.hpp | 2 +- cpp/src/text/bpe/byte_pair_encoding.cuh | 4 +- cpp/src/text/bpe/load_merge_pairs.cu | 39 +++++++------- cpp/src/text/vocabulary_tokenize.cu | 4 +- cpp/tests/copying/gather_tests.cpp | 14 ++--- cpp/tests/reshape/byte_cast_tests.cpp | 22 ++++---- cpp/tests/structs/structs_column_tests.cpp | 48 ++++++++--------- 25 files changed, 195 insertions(+), 193 deletions(-) diff --git a/cpp/include/cudf/detail/cuco_helpers.hpp b/cpp/include/cudf/detail/cuco_helpers.hpp index dca5a39bece..926df921715 100644 --- a/cpp/include/cudf/detail/cuco_helpers.hpp +++ b/cpp/include/cudf/detail/cuco_helpers.hpp @@ -36,19 +36,10 @@ static double constexpr CUCO_DESIRED_LOAD_FACTOR = 0.5; * later expects a standard C++ `Allocator` interface. This allocator helper provides a simple way * to handle cuco memory allocation/deallocation with the given `stream` and the rmm default memory * resource. + * + * @tparam T The allocator's value type. 
*/ -class cuco_allocator - : public rmm::mr::stream_allocator_adaptor> { - /// Default stream-ordered allocator type - using default_allocator = rmm::mr::polymorphic_allocator; - /// The base allocator adaptor type - using base_type = rmm::mr::stream_allocator_adaptor; - - public: - /** - * @brief Constructs the allocator adaptor with the given `stream` - */ - cuco_allocator(rmm::cuda_stream_view stream) : base_type{default_allocator{}, stream} {} -}; +template +using cuco_allocator = rmm::mr::stream_allocator_adaptor>; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index c3bc3ad89fa..0b3d7ac58bf 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -99,7 +99,7 @@ struct distinct_hash_join { cuda::thread_scope_device, comparator_adapter, probing_scheme_type, - cudf::detail::cuco_allocator, + cudf::detail::cuco_allocator, cuco_storage_type>; bool _has_nulls; ///< true if nulls are present in either build table or probe table diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh index dfe79646167..7a1e38eefe0 100644 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh @@ -32,7 +32,7 @@ namespace cudf::detail { using hash_map_type = cuco::legacy:: - static_map; + static_map>; /** * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index ff7da4462a2..af46dd79cdb 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -59,7 +59,7 @@ struct hash_join { cuco::static_multimap, cuco::legacy::double_hashing>; hash_join() = delete; diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 4e504ec1d30..d00db222b62 100644 
--- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1337,7 +1337,7 @@ class lists_column_wrapper : public detail::column_wrapper { lists_column_wrapper(std::initializer_list elements) : column_wrapper{} { build_from_non_nested( - std::move(cudf::test::fixed_width_column_wrapper(elements).release())); + cudf::test::fixed_width_column_wrapper(elements).release()); } /** @@ -1361,7 +1361,7 @@ class lists_column_wrapper : public detail::column_wrapper { lists_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{} { build_from_non_nested( - std::move(cudf::test::fixed_width_column_wrapper(begin, end).release())); + cudf::test::fixed_width_column_wrapper(begin, end).release()); } /** @@ -1386,7 +1386,7 @@ class lists_column_wrapper : public detail::column_wrapper { : column_wrapper{} { build_from_non_nested( - std::move(cudf::test::fixed_width_column_wrapper(elements, v).release())); + cudf::test::fixed_width_column_wrapper(elements, v).release()); } /** @@ -1413,8 +1413,8 @@ class lists_column_wrapper : public detail::column_wrapper { lists_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v) : column_wrapper{} { - build_from_non_nested(std::move( - cudf::test::fixed_width_column_wrapper(begin, end, v).release())); + build_from_non_nested( + cudf::test::fixed_width_column_wrapper(begin, end, v).release()); } /** @@ -1435,7 +1435,7 @@ class lists_column_wrapper : public detail::column_wrapper { lists_column_wrapper(std::initializer_list elements) : column_wrapper{} { build_from_non_nested( - std::move(cudf::test::strings_column_wrapper(elements.begin(), elements.end()).release())); + cudf::test::strings_column_wrapper(elements.begin(), elements.end()).release()); } /** @@ -1460,7 +1460,7 @@ class lists_column_wrapper : public detail::column_wrapper { : column_wrapper{} { build_from_non_nested( - std::move(cudf::test::strings_column_wrapper(elements.begin(), elements.end(), 
v).release())); + cudf::test::strings_column_wrapper(elements.begin(), elements.end(), v).release()); } /** diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 5fe4a5eb30f..35161eada28 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -568,15 +568,16 @@ std::unique_ptr
groupby(table_view const& keys, cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = cuco::static_set{num_keys, - 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - cuco::thread_scope_device, - cuco::storage<1>{}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto const set = cuco::static_set{ + num_keys, + 0.5, // desired load factor + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_key_equal, + probing_scheme_type{d_row_hash}, + cuco::thread_scope_device, + cuco::storage<1>{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; // Compute all single pass aggs first compute_single_pass_aggs(keys, diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index ad807b57766..ee6bc0b9f4b 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -545,15 +545,15 @@ rmm::device_uvector hash_node_type_with_field_name(device_span{d_hasher}, - {}, - {}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto key_set = cuco::static_set{ + cuco::extent{compute_hash_table_size(num_fields, 40)}, // 40% occupancy + cuco::empty_key{empty_node_index_sentinel}, + d_equal, + cuco::linear_probing<1, hasher_type>{d_hasher}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; key_set.insert_if_async(iter, iter + num_nodes, thrust::counting_iterator(0), // stencil @@ -734,14 +734,15 @@ std::pair, rmm::device_uvector> hash_n constexpr size_type empty_node_index_sentinel = -1; using hasher_type = decltype(d_hashed_cache); - auto key_set = cuco::static_set{cuco::extent{compute_hash_table_size(num_nodes)}, - cuco::empty_key{empty_node_index_sentinel}, - d_equal, - cuco::linear_probing<1, hasher_type>{d_hashed_cache}, - {}, - {}, - 
cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto key_set = cuco::static_set{ + cuco::extent{compute_hash_table_size(num_nodes)}, + cuco::empty_key{empty_node_index_sentinel}, + d_equal, + cuco::linear_probing<1, hasher_type>{d_hashed_cache}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; // insert and convert node ids to unique set ids auto nodes_itr = thrust::make_counting_iterator(0); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index c688c809e04..60bb2366e87 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -649,7 +649,7 @@ struct column_to_strings_fn { auto const list_child_string = make_lists_column( column.size(), std::move(new_offsets), - std::move(child_string_with_null()), + child_string_with_null(), column.null_count(), cudf::detail::copy_bitmask(column, stream_, rmm::mr::get_current_device_resource()), stream_); diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index d4ef2747c9d..789702ce538 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -432,13 +432,13 @@ std::unique_ptr> conditional_left_semi_join( rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move(detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - output_size, - cudf::get_default_stream(), - mr)); + return detail::conditional_join_anti_semi(left, + right, + binary_predicate, + detail::join_kind::LEFT_SEMI_JOIN, + output_size, + cudf::get_default_stream(), + mr); } std::unique_ptr> conditional_left_anti_join( @@ -449,13 +449,13 @@ std::unique_ptr> conditional_left_anti_join( rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move(detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - output_size, - cudf::get_default_stream(), - mr)); + return 
detail::conditional_join_anti_semi(left, + right, + binary_predicate, + detail::join_kind::LEFT_ANTI_JOIN, + output_size, + cudf::get_default_stream(), + mr); } std::size_t conditional_inner_join_size(table_view const& left, @@ -484,12 +484,12 @@ std::size_t conditional_left_semi_join_size(table_view const& left, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move(detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), - mr)); + return detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + detail::join_kind::LEFT_SEMI_JOIN, + cudf::get_default_stream(), + mr); } std::size_t conditional_left_anti_join_size(table_view const& left, @@ -498,12 +498,12 @@ std::size_t conditional_left_anti_join_size(table_view const& left, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move(detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr)); + return detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + detail::join_kind::LEFT_ANTI_JOIN, + cudf::get_default_stream(), + mr); } } // namespace cudf diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index daa1bf17c0d..3d95b0c5a5c 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -119,7 +119,7 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, {}, cuco::thread_scope_device, cuco_storage_type{}, - cudf::detail::cuco_allocator{stream}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()} { CUDF_FUNC_RANGE(); diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index eb9b687630b..5d01482f44a 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -374,7 +374,7 @@ 
hash_join::hash_join(cudf::table_view const& build, cuco::empty_key{std::numeric_limits::max()}, cuco::empty_value{cudf::detail::JoinNoneValue}, stream.value(), - cudf::detail::cuco_allocator{stream}}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}, _build{build}, _preprocessed_build{ cudf::experimental::row::equality::preprocessed_table::create(_build, stream)} diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 4157100b67e..86402a0e7de 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -48,11 +48,13 @@ using mixed_multimap_type = cuco::static_multimap, cuco::legacy::double_hashing<1, hash_type, hash_type>>; -using semi_map_type = cuco::legacy:: - static_map; +using semi_map_type = cuco::legacy::static_map>; using row_hash_legacy = cudf::row_hasher; diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 90748e6f322..48b94c777de 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -126,11 +126,12 @@ mixed_join( auto build_view = table_device_view::create(build, stream); // Don't use multimap_type because we want a CG size of 1. - mixed_multimap_type hash_table{compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - stream.value(), - cudf::detail::cuco_allocator{stream}}; + mixed_multimap_type hash_table{ + compute_hash_table_size(build.num_rows()), + cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_value{cudf::detail::JoinNoneValue}, + stream.value(), + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}; // TODO: To add support for nested columns we will need to flatten in many // places. 
However, this probably isn't worth adding any time soon since we @@ -391,11 +392,12 @@ compute_mixed_join_output_size(table_view const& left_equality, auto build_view = table_device_view::create(build, stream); // Don't use multimap_type because we want a CG size of 1. - mixed_multimap_type hash_table{compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - stream.value(), - cudf::detail::cuco_allocator{stream}}; + mixed_multimap_type hash_table{ + compute_hash_table_size(build.num_rows()), + cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_value{cudf::detail::JoinNoneValue}, + stream.value(), + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}; // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index c147ea3c253..3e4188a0fbd 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -163,11 +163,12 @@ std::unique_ptr> mixed_join_semi( cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - semi_map_type hash_table{compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + semi_map_type hash_table{ + compute_hash_table_size(build.num_rows()), + cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_value{cudf::detail::JoinNoneValue}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many diff --git 
a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index bebb9d14923..d49c0c6f0d2 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -164,11 +164,13 @@ compute_row_frequencies(table_view const& input, "Nested types are not yet supported in histogram aggregation.", std::invalid_argument); - auto map = cudf::detail::hash_map_type{compute_hash_table_size(input.num_rows()), - cuco::empty_key{-1}, - cuco::empty_value{std::numeric_limits::min()}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto map = cudf::detail::hash_map_type{ + compute_hash_table_size(input.num_rows()), + cuco::empty_key{-1}, + cuco::empty_value{std::numeric_limits::min()}, + + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; auto const preprocessed_input = cudf::experimental::row::hash::preprocessed_table::create(input, stream); diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index 81227cb9a2d..66cefd0aa2f 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -229,14 +229,15 @@ rmm::device_uvector contains(table_view const& haystack, [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) { auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal}; - auto set = cuco::static_set{cuco::extent{compute_hash_table_size(haystack.num_rows())}, - cuco::empty_key{rhs_index_type{-1}}, - d_equal, - probing_scheme, - {}, - {}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto set = cuco::static_set{ + cuco::extent{compute_hash_table_size(haystack.num_rows())}, + cuco::empty_key{rhs_index_type{-1}}, + d_equal, + probing_scheme, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); diff 
--git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index e2c5aba6802..6afd6e34c50 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -97,15 +97,16 @@ rmm::device_uvector distinct_indices(table_view const& input, auto const helper_func = [&](auto const& d_equal) { using RowHasher = std::decay_t; - auto set = hash_set_type{num_rows, - 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_equal, - {row_hash.device_hasher(has_nulls)}, - {}, - {}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto set = hash_set_type{ + num_rows, + 0.5, // desired load factor + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_equal, + {row_hash.device_hasher(has_nulls)}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; return detail::reduce_by_row(set, num_rows, keep, stream, mr); }; diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 9843bb889f4..cdf9faddf31 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -141,14 +141,15 @@ cudf::size_type distinct_count(table_view const& keys, auto const comparator_helper = [&](auto const row_equal) { using hasher_type = decltype(hash_key); - auto key_set = cuco::static_set{cuco::extent{compute_hash_table_size(num_rows)}, - cuco::empty_key{-1}, - row_equal, - cuco::linear_probing<1, hasher_type>{hash_key}, - {}, - {}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; + auto key_set = cuco::static_set{ + cuco::extent{compute_hash_table_size(num_rows)}, + cuco::empty_key{-1}, + row_equal, + cuco::linear_probing<1, hasher_type>{hash_key}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; auto const iter = thrust::counting_iterator(0); // when nulls are equal, we skip hashing any 
row that has a null diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp index fca67c98873..bea02e3dbe8 100644 --- a/cpp/src/stream_compaction/distinct_helpers.hpp +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -57,7 +57,7 @@ using hash_set_type = cudf::experimental::row::hash::device_row_hasher< cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC>>, - cudf::detail::cuco_allocator, + cudf::detail::cuco_allocator, cuco::storage<1>>; /** diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh index a2e441c3284..69c77224eb7 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cuh +++ b/cpp/src/text/bpe/byte_pair_encoding.cuh @@ -106,7 +106,7 @@ using merge_pairs_map_type = cuco::static_map, cuco_storage>; /** @@ -164,7 +164,7 @@ using mp_table_map_type = cuco::static_map, cuco_storage>; } // namespace detail diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu index f34c5c4f7f6..9fb86aecce3 100644 --- a/cpp/src/text/bpe/load_merge_pairs.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -43,16 +43,16 @@ namespace { std::unique_ptr initialize_merge_pairs_map( cudf::column_device_view const& input, rmm::cuda_stream_view stream) { - auto merge_pairs_map = - std::make_unique(static_cast(input.size()), - cuco::empty_key{-1}, - cuco::empty_value{-1}, - bpe_equal{input}, - bpe_probe_scheme{bpe_hasher{input}}, - cuco::thread_scope_device, - cuco_storage{}, - cudf::detail::cuco_allocator{stream}, - stream.value()); + auto merge_pairs_map = std::make_unique( + static_cast(input.size()), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + bpe_equal{input}, + bpe_probe_scheme{bpe_hasher{input}}, + cuco::thread_scope_device, + cuco_storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()); auto iter = cudf::detail::make_counting_transform_iterator( 0, @@ -67,15 +67,16 @@ std::unique_ptr 
initialize_merge_pairs_map( std::unique_ptr initialize_mp_table_map( cudf::column_device_view const& input, rmm::cuda_stream_view stream) { - auto mp_table_map = std::make_unique(static_cast(input.size()), - cuco::empty_key{-1}, - cuco::empty_value{-1}, - mp_equal{input}, - mp_probe_scheme{mp_hasher{input}}, - cuco::thread_scope_device, - cuco_storage{}, - cudf::detail::cuco_allocator{stream}, - stream.value()); + auto mp_table_map = std::make_unique( + static_cast(input.size()), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + mp_equal{input}, + mp_probe_scheme{mp_hasher{input}}, + cuco::thread_scope_device, + cuco_storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()); auto iter = cudf::detail::make_counting_transform_iterator( 0, diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index 97abb1487d8..5945921ed9d 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -100,7 +100,7 @@ using vocabulary_map_type = cuco::static_map, cuco_storage>; } // namespace } // namespace detail @@ -152,7 +152,7 @@ tokenize_vocabulary::tokenize_vocabulary(cudf::strings_column_view const& input, detail::probe_scheme{detail::vocab_hasher{*d_vocabulary}}, cuco::thread_scope_device, detail::cuco_storage{}, - cudf::detail::cuco_allocator{stream}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()); // the row index is the token id (value for each key in the map) diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp index 284b6c4c50c..07ce672b14d 100644 --- a/cpp/tests/copying/gather_tests.cpp +++ b/cpp/tests/copying/gather_tests.cpp @@ -43,7 +43,7 @@ TYPED_TEST(GatherTest, IdentityTest) cudf::table_view source_table({source_column}); - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); for (auto i = 0; i < 
source_table.num_columns(); ++i) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(source_table.column(i), result->view().column(i)); @@ -66,7 +66,7 @@ TYPED_TEST(GatherTest, ReverseIdentityTest) cudf::table_view source_table({source_column}); - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); cudf::test::fixed_width_column_wrapper expect_column(reversed_data, reversed_data + source_size); @@ -94,7 +94,7 @@ TYPED_TEST(GatherTest, EveryOtherNullOdds) cudf::table_view source_table({source_column}); - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); auto expect_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); auto expect_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); @@ -126,7 +126,7 @@ TYPED_TEST(GatherTest, EveryOtherNullEvens) cudf::table_view source_table({source_column}); - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); auto expect_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2 + 1; }); @@ -160,7 +160,7 @@ TYPED_TEST(GatherTest, AllNull) cudf::table_view source_table({source_column}); - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); // Check that the result is also all invalid CUDF_TEST_EXPECT_TABLES_EQUAL(source_table, result->view()); @@ -190,7 +190,7 @@ TYPED_TEST(GatherTest, MultiColReverseIdentityTest) cudf::table_view source_table{source_columns}; - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); cudf::test::fixed_width_column_wrapper expect_column(reversed_data, reversed_data + source_size); @@ 
-228,7 +228,7 @@ TYPED_TEST(GatherTest, MultiColNulls) cudf::table_view source_table{source_columns}; - std::unique_ptr result = std::move(cudf::gather(source_table, gather_map)); + std::unique_ptr result = cudf::gather(source_table, gather_map); // Expected data auto expect_data = diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp index cd280302677..b3d9b2e2f5f 100644 --- a/cpp/tests/reshape/byte_cast_tests.cpp +++ b/cpp/tests/reshape/byte_cast_tests.cpp @@ -61,8 +61,8 @@ TEST_F(ByteCastTest, int16ValuesWithNulls) auto [null_mask, null_count] = cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5); auto int16_expected = cudf::make_lists_column( 5, - std::move(cudf::test::fixed_width_column_wrapper{0, 0, 2, 2, 4, 4}.release()), - std::move(int16_data.release()), + cudf::test::fixed_width_column_wrapper{0, 0, 2, 2, 4, 4}.release(), + int16_data.release(), null_count, std::move(null_mask)); @@ -109,8 +109,8 @@ TEST_F(ByteCastTest, int32ValuesWithNulls) auto int32_expected = cudf::make_lists_column( 5, - std::move(cudf::test::fixed_width_column_wrapper{0, 4, 4, 8, 8, 12}.release()), - std::move(int32_data.release()), + cudf::test::fixed_width_column_wrapper{0, 4, 4, 8, 8, 12}.release(), + int32_data.release(), null_count, std::move(null_mask)); @@ -163,9 +163,8 @@ TEST_F(ByteCastTest, int64ValuesWithNulls) auto [null_mask, null_count] = cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5); auto int64_expected = cudf::make_lists_column( 5, - std::move( - cudf::test::fixed_width_column_wrapper{0, 0, 8, 8, 16, 16}.release()), - std::move(int64_data.release()), + cudf::test::fixed_width_column_wrapper{0, 0, 8, 8, 16, 16}.release(), + int64_data.release(), null_count, std::move(null_mask)); @@ -226,8 +225,8 @@ TEST_F(ByteCastTest, fp32ValuesWithNulls) cudf::test::detail::make_null_mask(even_validity, even_validity + 5); auto fp32_expected = cudf::make_lists_column( 5, - 
std::move(cudf::test::fixed_width_column_wrapper{0, 4, 4, 8, 8, 12}.release()), - std::move(fp32_data.release()), + cudf::test::fixed_width_column_wrapper{0, 4, 4, 8, 8, 12}.release(), + fp32_data.release(), null_count, std::move(null_mask)); @@ -297,9 +296,8 @@ TEST_F(ByteCastTest, fp64ValuesWithNulls) auto [null_mask, null_count] = cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5); auto fp64_expected = cudf::make_lists_column( 5, - std::move( - cudf::test::fixed_width_column_wrapper{0, 0, 8, 8, 16, 16}.release()), - std::move(fp64_data.release()), + cudf::test::fixed_width_column_wrapper{0, 0, 8, 8, 16, 16}.release(), + fp64_data.release(), null_count, std::move(null_mask)); diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index df005dfa1dc..f0010fc1ed9 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -448,12 +448,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, ListOfStructOfList) cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); auto [null_mask, null_count] = detail::make_null_mask(list_of_struct_of_list_validity, list_of_struct_of_list_validity + 5); - auto list_of_struct_of_list = cudf::make_lists_column( - 5, - std::move(fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release()), - std::move(struct_of_lists_col), - null_count, - std::move(null_mask)); + auto list_of_struct_of_list = + cudf::make_lists_column(5, + fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), + std::move(struct_of_lists_col), + null_count, + std::move(null_mask)); // Compare with expected values. 
@@ -468,12 +468,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, ListOfStructOfList) std::tie(null_mask, null_count) = detail::make_null_mask(list_of_struct_of_list_validity, list_of_struct_of_list_validity + 5); - auto expected_level3_list = cudf::make_lists_column( - 5, - std::move(fixed_width_column_wrapper{0, 0, 2, 4, 4, 6}.release()), - std::move(expected_level2_struct), - null_count, - std::move(null_mask)); + auto expected_level3_list = + cudf::make_lists_column(5, + fixed_width_column_wrapper{0, 0, 2, 4, 4, 6}.release(), + std::move(expected_level2_struct), + null_count, + std::move(null_mask)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*list_of_struct_of_list, *expected_level3_list); } @@ -498,12 +498,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, StructOfListOfStruct) cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); auto [null_mask, null_count] = detail::make_null_mask(list_validity, list_validity + 5); - auto lists_col = cudf::make_lists_column( - 5, - std::move(fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release()), - std::move(structs_col), - null_count, - std::move(null_mask)); + auto lists_col = + cudf::make_lists_column(5, + fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), + std::move(structs_col), + null_count, + std::move(null_mask)); std::vector> cols; cols.push_back(std::move(lists_col)); @@ -519,12 +519,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, StructOfListOfStruct) std::tie(null_mask, null_count) = detail::make_null_mask(list_validity, list_validity + 5); - auto expected_lists_col = cudf::make_lists_column( - 5, - std::move(fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release()), - std::move(expected_structs_col), - null_count, - std::move(null_mask)); + auto expected_lists_col = + cudf::make_lists_column(5, + fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), + std::move(expected_structs_col), + null_count, + std::move(null_mask)); // Test that the lists child column is as expected. 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected_lists_col, struct_of_list_of_struct->child(0)); From e8156d42163fb02aa90baba9be20ab89bc9ebef1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 6 Aug 2024 17:03:10 -0400 Subject: [PATCH 30/72] Fix segmented-sort overlapped input/output indices (#16463) Fixes call to CUB `DeviceSegmentedSort::SortPairs` where the input and output indices pointed to the same temp memory. The documentation from https://nvidia.github.io/cccl/cub/api/structcub_1_1DeviceSegmentedSort.html#id8 indicates the `d_values_in` and `d_values_out` memory must not overlap so using the same pointer for both created invalid output in certain conditions. The internal function was implemented to expect the input values to be updated in-place. The fix uses separate device memory for the input and output indices. Closes #16455 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16463 --- cpp/src/sort/segmented_sort_impl.cuh | 4 +++- cpp/tests/sort/segmented_sort_tests.cpp | 26 ++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh index 6d472925b30..281fdfa6b8f 100644 --- a/cpp/src/sort/segmented_sort_impl.cuh +++ b/cpp/src/sort/segmented_sort_impl.cuh @@ -79,6 +79,8 @@ struct column_fast_sort_fn { stream, rmm::mr::get_current_device_resource()); mutable_column_view output_view = temp_col->mutable_view(); + auto temp_indices = cudf::column( + cudf::column_view(indices.type(), indices.size(), indices.head(), nullptr, 0), stream); // DeviceSegmentedSort is faster than DeviceSegmentedRadixSort at this time auto fast_sort_impl = [stream](bool ascending, [[maybe_unused]] auto&&... 
args) { @@ -118,7 +120,7 @@ struct column_fast_sort_fn { fast_sort_impl(ascending, input.begin(), output_view.begin(), - indices.begin(), + temp_indices.view().begin(), indices.begin(), input.size(), segment_offsets.size() - 1, diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp index da9666cbc74..f4fe2c5956a 100644 --- a/cpp/tests/sort/segmented_sort_tests.cpp +++ b/cpp/tests/sort/segmented_sort_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,9 @@ #include #include +#include #include +#include #include #include @@ -338,3 +340,25 @@ TEST_F(SegmentedSortInt, Bool) result = cudf::stable_segmented_sorted_order(cudf::table_view({test_col}), segments); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } + +// Specific test for fix in https://github.com/rapidsai/cudf/pull/16463 +TEST_F(SegmentedSortInt, UnbalancedOffsets) +{ + auto h_input = std::vector(3535); + std::iota(h_input.begin(), h_input.end(), 1); + std::sort(h_input.begin(), h_input.end(), std::greater{}); + std::fill_n(h_input.begin(), 4, 0); + std::fill(h_input.begin() + 3533, h_input.end(), 10000); + auto d_input = cudf::detail::make_device_uvector_sync( + h_input, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto input = cudf::column_view(cudf::device_span(d_input)); + auto segments = cudf::test::fixed_width_column_wrapper({0, 4, 3533, 3535}); + // full sort should match handcrafted input data here + auto expected = cudf::sort(cudf::table_view({input})); + + auto input_view = cudf::table_view({input}); + auto result = cudf::segmented_sort_by_key(input_view, input_view, segments); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected->view().column(0)); + result = 
cudf::stable_segmented_sort_by_key(input_view, input_view, segments); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected->view().column(0)); +} From 6b0bff4b096ea87cd3436dba86146ed75af0f81e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 6 Aug 2024 14:48:16 -1000 Subject: [PATCH 31/72] Disallow cudf.Series to accept column in favor of `._from_column` (#16454) `cudf.Series` is a public constructor that happens to accept a private `ColumnBase` object. Many ops return Columns and is natural to want to reconstruct a `Series`. This PR adds a `SingleColumnFrame._from_column` classmethod for instances where we need to wrap a new column in an `Index` or `Series`. This constructor also passes some unneeded validation in `ColumnAccessor` and `Series` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16454 --- python/cudf/cudf/core/byte_pair_encoding.py | 6 +- python/cudf/cudf/core/column/categorical.py | 16 +-- python/cudf/cudf/core/column/methods.py | 15 ++- python/cudf/cudf/core/column/numerical.py | 12 +- python/cudf/cudf/core/column/string.py | 23 ++-- python/cudf/cudf/core/dataframe.py | 116 +++++++++---------- python/cudf/cudf/core/groupby/groupby.py | 13 +-- python/cudf/cudf/core/index.py | 44 ++++++- python/cudf/cudf/core/indexed_frame.py | 18 +-- python/cudf/cudf/core/multiindex.py | 19 ++- python/cudf/cudf/core/reshape.py | 8 +- python/cudf/cudf/core/series.py | 101 ++++++++++++---- python/cudf/cudf/core/single_column_frame.py | 41 +++---- python/cudf/cudf/core/tokenize_vocabulary.py | 8 +- python/cudf/cudf/core/tools/datetimes.py | 11 +- python/cudf/cudf/core/tools/numeric.py | 29 ++--- python/cudf/cudf/datasets.py | 5 +- python/cudf/cudf/io/dlpack.py | 2 +- python/cudf/cudf/tests/test_apply_rows.py | 8 +- 
python/cudf/cudf/tests/test_column.py | 44 ++++--- python/cudf/cudf/tests/test_dataframe.py | 26 +++-- python/cudf/cudf/tests/test_decimal.py | 10 +- python/cudf/cudf/tests/test_df_protocol.py | 6 +- python/cudf/cudf/tests/test_list.py | 2 +- python/cudf/cudf/tests/test_pickling.py | 4 +- python/cudf/cudf/tests/test_replace.py | 6 +- python/cudf/cudf/tests/test_series.py | 10 +- python/cudf/cudf/tests/test_setitem.py | 10 +- python/cudf/cudf/tests/test_string.py | 2 +- python/cudf/cudf/tests/test_string_udfs.py | 4 +- python/dask_cudf/dask_cudf/backends.py | 7 +- python/dask_cudf/dask_cudf/core.py | 2 +- 32 files changed, 360 insertions(+), 268 deletions(-) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 4c881022ecf..6ca64a0a2be 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from __future__ import annotations @@ -27,7 +27,7 @@ class BytePairEncoder: def __init__(self, merges_pair: "cudf.Series"): self.merge_pairs = cpp_merge_pairs(merges_pair._column) - def __call__(self, text, separator: str = " "): + def __call__(self, text, separator: str = " ") -> cudf.Series: """ Parameters @@ -56,4 +56,4 @@ def __call__(self, text, separator: str = " "): sep = cudf.Scalar(separator, dtype="str") result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) - return cudf.Series(result) + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 55bfae30470..6fa69eb9cc1 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -123,7 +123,7 @@ def categories(self) -> "cudf.core.index.Index": return self._column.dtype.categories @property - def codes(self) -> "cudf.Series": + def codes(self) -> cudf.Series: """ Return Series of codes as well as the index. """ @@ -132,7 +132,7 @@ def codes(self) -> "cudf.Series": if isinstance(self._parent, cudf.Series) else None ) - return cudf.Series(self._column.codes, index=index) + return cudf.Series._from_column(self._column.codes, index=index) @property def ordered(self) -> bool: @@ -918,7 +918,7 @@ def find_and_replace( ) cur_categories = replaced.categories new_categories = cur_categories.apply_boolean_mask( - ~cudf.Series(cur_categories.isin(drop_values)) + cur_categories.isin(drop_values).unary_operator("not") ) replaced = replaced._set_categories(new_categories) df = df.dropna(subset=["new"]) @@ -943,7 +943,7 @@ def find_and_replace( # If a category is being replaced by an existing one, we # want to map it to None. 
If it's totally new, we want to # map it to the new label it is to be replaced by - dtype_replace = cudf.Series._from_data({None: replacement_col}) + dtype_replace = cudf.Series._from_column(replacement_col) dtype_replace[dtype_replace.isin(cats_col)] = None new_cats_col = cats_col.find_and_replace( to_replace_col, dtype_replace._column @@ -1273,12 +1273,8 @@ def _categories_equal( return False # if order doesn't matter, sort before the equals call below if not ordered: - cur_categories = cudf.Series(cur_categories).sort_values( - ignore_index=True - ) - new_categories = cudf.Series(new_categories).sort_values( - ignore_index=True - ) + cur_categories = cur_categories.sort_values() + new_categories = new_categories.sort_values() return cur_categories.equals(new_categories) def _set_categories( diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 7c6f4e05577..8c46d238057 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -7,6 +7,8 @@ from typing_extensions import Literal import cudf +import cudf.core.column +import cudf.core.column_accessor from cudf.utils.utils import NotIterable ParentType = Union["cudf.Series", "cudf.core.index.Index"] @@ -84,14 +86,11 @@ def _return_or_inplace( data=table, index=self._parent.index ) elif isinstance(self._parent, cudf.Series): - if retain_index: - return cudf.Series( - new_col, - name=self._parent.name, - index=self._parent.index, - ) - else: - return cudf.Series(new_col, name=self._parent.name) + return cudf.Series._from_column( + new_col, + name=self._parent.name, + index=self._parent.index if retain_index else None, + ) elif isinstance(self._parent, cudf.BaseIndex): return cudf.Index(new_col, name=self._parent.name) else: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index c326a10c844..df27134d458 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ 
b/python/cudf/cudf/core/column/numerical.py @@ -555,11 +555,8 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: if self.dtype.kind == "f": # Exclude 'np.inf', '-np.inf' - s = cudf.Series(self) - # TODO: replace np.inf with cudf scalar when - # https://github.com/rapidsai/cudf/pull/6297 merges - non_infs = s[~((s == np.inf) | (s == -np.inf))] - col = non_infs._column + not_inf = (self != np.inf) & (self != -np.inf) + col = self.apply_boolean_mask(not_inf) else: col = self @@ -599,8 +596,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: else: filled = self.fillna(0) return ( - cudf.Series(filled).astype(to_dtype).astype(filled.dtype) - == cudf.Series(filled) + filled.astype(to_dtype).astype(filled.dtype) == filled ).all() # want to cast float to int: @@ -615,7 +611,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: # NOTE(seberg): it would make sense to limit to the mantissa range. if (float(self.min()) >= min_) and (float(self.max()) <= max_): filled = self.fillna(0) - return (cudf.Series(filled) % 1 == 0).all() + return (filled % 1 == 0).all() else: return False diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b422ff86b17..1a4b558749d 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -358,7 +358,7 @@ def cat(self, others=None, sep=None, na_rep=None): ) if len(data) == 1 and data.null_count == 1: - data = [""] + data = cudf.core.column.as_column("", length=len(data)) # We only want to keep the index if we are adding something to each # row, not if we are joining all the rows into a single string. 
out = self._return_or_inplace(data, retain_index=others is not None) @@ -3623,7 +3623,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) - def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series": + def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ Find all first occurrences of patterns in the Series/Index. @@ -3679,12 +3679,12 @@ def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series": f"got: {patterns_column.dtype}" ) - return cudf.Series( + return cudf.Series._from_column( libstrings.find_multiple(self._column, patterns_column), + name=self._parent.name, index=self._parent.index if isinstance(self._parent, cudf.Series) else self._parent, - name=self._parent.name, ) def isempty(self) -> SeriesOrIndex: @@ -4376,14 +4376,9 @@ def code_points(self) -> SeriesOrIndex: 2 99 dtype: int32 """ - - new_col = libstrings.code_points(self._column) - if isinstance(self._parent, cudf.Series): - return cudf.Series(new_col, name=self._parent.name) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index(new_col, name=self._parent.name) - else: - return new_col + return self._return_or_inplace( + libstrings.code_points(self._column), retain_index=False + ) def translate(self, table: dict) -> SeriesOrIndex: """ @@ -4694,7 +4689,9 @@ def character_tokenize(self) -> SeriesOrIndex: if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) - return cudf.Series(result_col, name=self._parent.name, index=index) + return cudf.Series._from_column( + result_col, name=self._parent.name, index=index + ) elif isinstance(self._parent, cudf.BaseIndex): return cudf.Index(result_col, name=self._parent.name) else: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 865d2706ca3..a53c7bcc63c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ 
b/python/cudf/cudf/core/dataframe.py @@ -382,7 +382,10 @@ def _setitem_tuple_arg(self, key, value): length = len(idx) if idx is not None else 1 value = as_column(value, length=length) - new_col = cudf.Series(value, index=idx) + if isinstance(value, ColumnBase): + new_col = cudf.Series._from_column(value, index=idx) + else: + new_col = cudf.Series(value, index=idx) if len(self._frame.index) != 0: new_col = new_col._align_to_index( self._frame.index, how="right" @@ -500,28 +503,33 @@ def __getitem__(self, arg): return frame._slice(row_spec.key) elif isinstance(row_spec, indexing_utils.ScalarIndexer): result = frame._gather(row_spec.key, keep_index=True) + new_name = result.index[0] + new_index = ensure_index(result.keys()) # Attempt to turn into series. - try: - # Behaviour difference from pandas, which will merrily - # turn any heterogeneous set of columns into a series if - # you only ask for one row. - new_name = result.index[0] - result = Series._concat( - [result[name] for name in column_names], - index=result.keys(), - ) - result.name = new_name - return result - except TypeError: - # Couldn't find a common type, Hence: - # Raise in pandas compatibility mode, - # or just return a 1xN dataframe otherwise - if cudf.get_option("mode.pandas_compatible"): - raise TypeError( - "All columns need to be of same type, please " - "typecast to common dtype." + if len(column_names) == 0: + return Series([], index=new_index, name=new_name) + else: + try: + # Behaviour difference from pandas, which will merrily + # turn any heterogeneous set of columns into a series if + # you only ask for one row. + ser = Series._concat( + [result[name] for name in column_names], ) - return result + except TypeError as err: + # Couldn't find a common type, Hence: + # Raise in pandas compatibility mode, + # or just return a 1xN dataframe otherwise + if cudf.get_option("mode.pandas_compatible"): + raise TypeError( + "All columns need to be of same type, please " + "typecast to common dtype." 
+ ) from err + return result + else: + ser.index = new_index + ser.name = new_name + return ser elif isinstance(row_spec, indexing_utils.EmptyIndexer): return frame._empty_like(keep_index=True) assert_never(row_spec) @@ -1488,14 +1496,14 @@ def __delitem__(self, name): self._drop_column(name) @_performance_tracking - def memory_usage(self, index=True, deep=False): + def memory_usage(self, index=True, deep=False) -> cudf.Series: mem_usage = [col.memory_usage for col in self._data.columns] names = [str(name) for name in self._data.names] if index: mem_usage.append(self.index.memory_usage()) names.append("Index") - return Series._from_data( - data={None: as_column(mem_usage)}, + return Series._from_column( + as_column(mem_usage), index=cudf.Index(names), ) @@ -1752,7 +1760,7 @@ def _concat( if 1 == first_data_column_position: table_index = cudf.Index(cols[0]) elif first_data_column_position > 1: - table_index = DataFrame._from_data( + table_index = cudf.MultiIndex._from_data( data=dict( zip( indices[:first_data_column_position], @@ -3803,7 +3811,9 @@ def agg(self, aggs, axis=None): col_empty = column_empty( len(idxs), dtype=col.dtype, masked=True ) - ans = cudf.Series(data=col_empty, index=idxs) + ans = cudf.Series._from_column( + col_empty, index=cudf.Index(idxs) + ) if isinstance(aggs.get(key), abc.Iterable): # TODO : Allow simultaneous pass for multi-aggregation # as a future optimization @@ -4801,7 +4811,7 @@ def _func(x): # pragma: no cover # this could be written as a single kernel result = {} for name, col in self._data.items(): - apply_sr = Series._from_data({None: col}) + apply_sr = Series._from_column(col) result[name] = apply_sr.apply(_func)._column return DataFrame._from_data(result, index=self.index) @@ -6083,8 +6093,8 @@ def quantile( if q_is_number: result = result.transpose() - return Series( - data=result._columns[0], index=result.index, name=q + return Series._from_column( + result._columns[0], name=q, index=result.index ) else: # Ensure that qs is 
non-scalar so that we always get a column back. @@ -6346,13 +6356,9 @@ def count(self, axis=0, numeric_only=False): if axis != 0: raise NotImplementedError("Only axis=0 is currently supported.") length = len(self) - return Series._from_data( - { - None: as_column( - [length - col.null_count for col in self._columns] - ) - }, - cudf.Index(self._data.names), + return Series._from_column( + as_column([length - col.null_count for col in self._columns]), + index=cudf.Index(self._data.names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6480,7 +6486,7 @@ def _reduce( ) else: idx = cudf.Index(source._data.names) - return Series._from_data({None: as_column(result)}, idx) + return Series._from_column(as_column(result), index=idx) elif axis == 1: return source._apply_cupy_method_axis_1(op, **kwargs) else: @@ -6710,11 +6716,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result = result.set_mask( cudf._lib.transform.bools_to_mask(mask._column) ) - return Series( - result, - index=self.index, - dtype=result_dtype, - ) + return Series._from_column(result, index=self.index) else: result_df = DataFrame(result).set_index(self.index) result_df._set_columns_like(prepared._data) @@ -7302,9 +7304,7 @@ def unnamed_group_generator(): # Construct the resulting dataframe / series if not has_unnamed_levels: - result = Series._from_data( - data={None: stacked[0]}, index=new_index - ) + result = Series._from_column(stacked[0], index=new_index) else: if unnamed_level_values.nlevels == 1: unnamed_level_values = unnamed_level_values.get_level_values(0) @@ -7445,10 +7445,8 @@ def to_struct(self, name=None): size=len(self), offset=0, ) - return cudf.Series._from_data( - cudf.core.column_accessor.ColumnAccessor( - {name: col}, verify=False - ), + return cudf.Series._from_column( + col, index=self.index, name=name, ) @@ -7935,12 +7933,10 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): raise ValueError( "Cannot operate inplace if there is no assignment" ) - return 
Series._from_data( - { - None: libcudf.transform.compute_column( - [*self._columns], self._column_names, statements[0] - ) - } + return Series._from_column( + libcudf.transform.compute_column( + [*self._columns], self._column_names, statements[0] + ) ) targets = [] @@ -8484,7 +8480,9 @@ def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): return non_null_columns, dtypes -def _find_common_dtypes_and_categories(non_null_columns, dtypes): +def _find_common_dtypes_and_categories( + non_null_columns, dtypes +) -> dict[Any, ColumnBase]: # A mapping of {idx: categories}, where `categories` is a # column of all the unique categorical values from each # categorical column across all input frames @@ -8500,9 +8498,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): isinstance(col, cudf.core.column.CategoricalColumn) for col in cols ): # Combine and de-dupe the categories - categories[idx] = cudf.Series( - concat_columns([col.categories for col in cols]) - )._column.unique() + categories[idx] = concat_columns( + [col.categories for col in cols] + ).unique() # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_signed_type(len(categories[idx])) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3cfbd1d736a..92c4b73ceaa 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -458,12 +458,11 @@ def size(self): """ Return the size of each group. """ + col = cudf.core.column.column_empty( + len(self.obj), "int8", masked=False + ) return ( - cudf.Series( - cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ) - ) + cudf.Series._from_column(col) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) .agg("size") ) @@ -484,7 +483,7 @@ def cumcount(self, ascending: bool = True): "ascending is currently not implemented." 
) return ( - cudf.Series( + cudf.Series._from_column( cudf.core.column.column_empty( len(self.obj), "int8", masked=False ), @@ -1069,7 +1068,7 @@ def ngroup(self, ascending=True): # Count descending from num_groups - 1 to 0 groups = range(num_groups - 1, -1, -1) - group_ids = cudf.Series._from_data({None: as_column(groups)}) + group_ids = cudf.Series._from_column(as_column(groups)) if has_null_group: group_ids.iloc[-1] = cudf.NA diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0d29ef07e7d..094da09ab08 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -60,7 +60,7 @@ from cudf.utils.utils import _warn_no_dask_cudf, search_range if TYPE_CHECKING: - from collections.abc import Generator, Iterable + from collections.abc import Generator, Hashable, Iterable from datetime import tzinfo @@ -1071,6 +1071,16 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None + ) -> Self: + ca = cudf.core.column_accessor.ColumnAccessor( + {name: column}, verify=False + ) + return _index_from_data(ca) + @classmethod @_performance_tracking def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: @@ -1092,8 +1102,30 @@ def _from_data_like_self( @classmethod @_performance_tracking def from_arrow(cls, obj): + """Create from PyArrow Array/ChunkedArray. + + Parameters + ---------- + array : PyArrow Array/ChunkedArray + PyArrow Object which has to be converted. + + Raises + ------ + TypeError for invalid input type. 
+ + Returns + ------- + SingleColumnFrame + + Examples + -------- + >>> import cudf + >>> import pyarrow as pa + >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) + Index(['a', 'b', ], dtype='object') + """ try: - return cls(ColumnBase.from_arrow(obj)) + return cls._from_column(ColumnBase.from_arrow(obj)) except TypeError: # Try interpreting object as a MultiIndex before failing. return cudf.MultiIndex.from_arrow(obj) @@ -1297,22 +1329,22 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result.values) scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner") - (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) - result_series = cudf.Series(result) + result = libcudf.copying.scatter([indices], scatter_map, [result])[0] + result_series = cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( index=self, positions=result_series, method=method, - target_col=cudf.Series(needle), + target_col=cudf.Series._from_column(needle), tolerance=tolerance, ) elif method == "nearest": result_series = _get_nearest_indexer( index=self, positions=result_series, - target_col=cudf.Series(needle), + target_col=cudf.Series._from_column(needle), tolerance=tolerance, ) elif method is not None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 0678ebfdd81..24d947a574a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -35,6 +35,7 @@ is_list_like, is_scalar, ) +from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column @@ -67,7 +68,6 @@ Dtype, NotImplementedType, ) - from cudf.core._base_index import BaseIndex doc_reset_index_template = """ @@ -304,6 +304,10 @@ def _from_data( index: BaseIndex | None = None, ): 
out = super()._from_data(data) + if not (index is None or isinstance(index, BaseIndex)): + raise ValueError( + f"index must be None or a cudf.Index not {type(index).__name__}" + ) out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -2934,8 +2938,8 @@ def hash_values(self, method="murmur3", seed=None): # Note that both Series and DataFrame return Series objects from this # calculation, necessitating the unfortunate circular reference to the # child class here. - return cudf.Series._from_data( - {None: libcudf.hash.hash([*self._columns], method, seed)}, + return cudf.Series._from_column( + libcudf.hash.hash([*self._columns], method, seed), index=self.index, ) @@ -3219,13 +3223,13 @@ def duplicated(self, subset=None, keep="first"): distinct = libcudf.stream_compaction.distinct_indices( columns, keep=keep ) - (result,) = libcudf.copying.scatter( + result = libcudf.copying.scatter( [cudf.Scalar(False, dtype=bool)], distinct, [as_column(True, length=len(self), dtype=bool)], bounds_check=False, - ) - return cudf.Series(result, index=self.index) + )[0] + return cudf.Series._from_column(result, index=self.index) @_performance_tracking def _empty_like(self, keep_index=True) -> Self: @@ -3506,7 +3510,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): col = _post_process_output_col(ans_col, retty) col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) - result = cudf.Series._from_data({None: col}, self.index) + result = cudf.Series._from_column(col, index=self.index) return result diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 2788455aebf..9646b34830f 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -702,12 +702,8 @@ def _compute_validity_mask(self, index, row_tuple, max_length): data_table = cudf.concat( [ frame, - cudf.DataFrame( - { - "idx": cudf.Series( - column.as_column(range(len(frame))) - ) - } + cudf.DataFrame._from_data( + {"idx": 
column.as_column(range(len(frame)))} ), ], axis=1, @@ -786,7 +782,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( out_index._num_columns, k, - cudf.Series._from_data({None: index._data.columns[k]}), + cudf.Series._from_column(index._data.columns[k]), ) # determine if we should downcast from a DataFrame to a Series @@ -852,7 +848,10 @@ def _get_row_major( valid_indices = self._get_valid_indices_by_tuple( df.index, row_tuple, len(df.index) ) - indices = cudf.Series(valid_indices) + if isinstance(valid_indices, column.ColumnBase): + indices = cudf.Series._from_column(valid_indices) + else: + indices = cudf.Series(valid_indices) result = df.take(indices) final = self._index_and_downcast(result, result.index, row_tuple) return final @@ -1925,8 +1924,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): *join_keys, how="inner", ) - (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) - result_series = cudf.Series(result) + result = libcudf.copying.scatter([indices], scatter_map, [result])[0] + result_series = cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index e7248977b1d..52a55760d4a 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -484,9 +484,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): if len(new_objs) == 1 and not ignore_index: return new_objs[0] else: - return cudf.Series._concat( - objs, axis=axis, index=None if ignore_index else True - ) + return cudf.Series._concat(objs, axis=axis, index=not ignore_index) elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, cudf.Index): @@ -632,7 +630,7 @@ def melt( def _tile(A, reps): series_list = [A] * reps if reps > 0: - return cudf.Series._concat(objs=series_list, index=None) + return 
cudf.Series._concat(objs=series_list, index=False) else: return cudf.Series([], dtype=A.dtype) @@ -661,7 +659,7 @@ def _tile(A, reps): # Step 3: add values mdata[value_name] = cudf.Series._concat( - objs=[frame[val] for val in value_vars], index=None + objs=[frame[val] for val in value_vars], index=False ) return cudf.DataFrame(mdata) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 929af5cd981..de57ac5f290 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -69,6 +69,8 @@ from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: + import pyarrow as pa + from cudf._typing import ( ColumnLike, DataFrameOrSeries, @@ -294,8 +296,8 @@ def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: return result try: arg = self._loc_to_iloc(arg) - except (TypeError, KeyError, IndexError, ValueError): - raise KeyError(arg) + except (TypeError, KeyError, IndexError, ValueError) as err: + raise KeyError(arg) from err return self._frame.iloc[arg] @@ -394,8 +396,10 @@ def _loc_to_iloc(self, arg): return _indices_from_labels(self._frame, arg) else: - arg = cudf.core.series.Series(cudf.core.column.as_column(arg)) - if arg.dtype in (bool, np.bool_): + arg = cudf.core.series.Series._from_column( + cudf.core.column.as_column(arg) + ) + if arg.dtype.kind == "b": return arg else: indices = _indices_from_labels(self._frame, arg) @@ -510,7 +514,37 @@ def from_categorical(cls, categorical, codes=None): col = cudf.core.column.categorical.pandas_categorical_as_column( categorical, codes=codes ) - return Series(data=col) + return Series._from_column(col) + + @classmethod + @_performance_tracking + def from_arrow(cls, array: pa.Array): + """Create from PyArrow Array/ChunkedArray. + + Parameters + ---------- + array : PyArrow Array/ChunkedArray + PyArrow Object which has to be converted. + + Raises + ------ + TypeError for invalid input type. 
+ + Returns + ------- + SingleColumnFrame + + Examples + -------- + >>> import cudf + >>> import pyarrow as pa + >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) + 0 a + 1 b + 2 + dtype: object + """ + return cls._from_column(ColumnBase.from_arrow(array)) @classmethod @_performance_tracking @@ -560,7 +594,8 @@ def from_masked_array(cls, data, mask, null_count=None): dtype: int64 """ col = as_column(data).set_mask(mask) - return cls(data=col) + ca = ColumnAccessor({None: col}, verify=False) + return cls._from_data(ca) @_performance_tracking def __init__( @@ -586,10 +621,10 @@ def __init__( column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) if isinstance(data, (pd.Series, Series)): index_from_data = ensure_index(data.index) - elif isinstance(data, ColumnAccessor): + elif isinstance(data, (ColumnAccessor, ColumnBase)): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " - "ColumnAccessor" + "ColumnAccessor or a ColumnBase" ) elif isinstance(data, dict): if not data: @@ -656,6 +691,18 @@ def __init__( self._index = second_index self._check_data_index_length_match() + @classmethod + @_performance_tracking + def _from_column( + cls, + column: ColumnBase, + *, + name: abc.Hashable = None, + index: BaseIndex | None = None, + ) -> Self: + ca = ColumnAccessor({name: column}, verify=False) + return cls._from_data(ca, index=index) + @classmethod @_performance_tracking def _from_data( @@ -1535,17 +1582,21 @@ def dtype(self): @classmethod @_performance_tracking - def _concat(cls, objs, axis=0, index=True): + def _concat(cls, objs, axis=0, index: bool = True): # Concatenate index if not provided if index is True: if isinstance(objs[0].index, cudf.MultiIndex): - index = cudf.MultiIndex._concat([o.index for o in objs]) + result_index = cudf.MultiIndex._concat([o.index for o in objs]) else: with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) - index = cudf.core.index.Index._concat( + result_index = 
cudf.core.index.Index._concat( [o.index for o in objs] ) + elif index is False: + result_index = None + else: + raise ValueError(f"{index=} must be a bool") names = {obj.name for obj in objs} if len(names) == 1: @@ -1597,7 +1648,9 @@ def _concat(cls, objs, axis=0, index=True): if len(objs): col = col._with_type_metadata(objs[0].dtype) - return cls(data=col, index=index, name=name) + return cls._from_data( + ColumnAccessor({name: col}, verify=False), index=result_index + ) @property # type: ignore @_performance_tracking @@ -2709,8 +2762,8 @@ def mode(self, dropna=True): if len(val_counts) > 0: val_counts = val_counts[val_counts == val_counts.iloc[0]] - return Series._from_data( - {self.name: val_counts.index.sort_values()._column}, name=self.name + return Series._from_column( + val_counts.index.sort_values()._column, name=self.name ) @_performance_tracking @@ -2999,8 +3052,8 @@ def isin(self, values): f"to isin(), you passed a [{type(values).__name__}]" ) - return Series._from_data( - {self.name: self._column.isin(values)}, index=self.index + return Series._from_column( + self._column.isin(values), name=self.name, index=self.index ) @_performance_tracking @@ -3036,7 +3089,7 @@ def unique(self): res = self._column.unique() if cudf.get_option("mode.pandas_compatible"): return res.values - return Series(res, name=self.name) + return Series._from_column(res, name=self.name) @_performance_tracking def value_counts( @@ -3268,8 +3321,9 @@ def quantile( if return_scalar: return result - return Series._from_data( - data={self.name: result}, + return Series._from_column( + result, + name=self.name, index=cudf.Index(np_array_q) if quant_index else None, ) @@ -3351,8 +3405,9 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - return Series( - cudf.core.column.numerical.digitize(self._column, bins, right) + return Series._from_column( + cudf.core.column.numerical.digitize(self._column, bins, right), + name=self.name, ) @_performance_tracking @@ -5293,10 +5348,10 @@ 
def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): elif b_col.null_count: null_values = b_col.isnull() else: - return Series(result_col, index=index) + return Series._from_column(result_col, index=index) result_col[null_values] = False if equal_nan is True and a_col.null_count and b_col.null_count: result_col[equal_nulls] = True - return Series(result_col, index=index) + return Series._from_column(result_col, index=index) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index a5ff1223791..eb6714029cf 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -15,11 +15,14 @@ is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column +from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable if TYPE_CHECKING: + from collections.abc import Hashable + import cupy import numpy import pyarrow as pa @@ -112,35 +115,17 @@ def values_host(self) -> numpy.ndarray: # noqa: D102 @classmethod @_performance_tracking - def from_arrow(cls, array) -> Self: - """Create from PyArrow Array/ChunkedArray. - - Parameters - ---------- - array : PyArrow Array/ChunkedArray - PyArrow Object which has to be converted. - - Raises - ------ - TypeError for invalid input type. 
- - Returns - ------- - SingleColumnFrame + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None + ) -> Self: + """Constructor for a single Column.""" + ca = ColumnAccessor({name: column}, verify=False) + return cls._from_data(ca) - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - Index(['a', 'b', None], dtype='object') - >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) - 0 a - 1 b - 2 - dtype: object - """ - return cls(ColumnBase.from_arrow(array)) + @classmethod + @_performance_tracking + def from_arrow(cls, array) -> Self: + raise NotImplementedError @_performance_tracking def to_arrow(self) -> pa.Array: diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index afb3496311b..99d85c0c5c0 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from __future__ import annotations @@ -22,7 +22,9 @@ class TokenizeVocabulary: def __init__(self, vocabulary: "cudf.Series"): self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column) - def tokenize(self, text, delimiter: str = "", default_id: int = -1): + def tokenize( + self, text, delimiter: str = "", default_id: int = -1 + ) -> cudf.Series: """ Parameters ---------- @@ -45,4 +47,4 @@ def tokenize(self, text, delimiter: str = "", default_id: int = -1): text._column, self.vocabulary, delim, default_id ) - return cudf.Series(result) + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index c6e2b5d10e1..2f77778116f 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -18,6 +18,8 @@ ) from cudf.api.types import is_integer, is_scalar from cudf.core import column +from cudf.core.column_accessor import ColumnAccessor +from cudf.core.index import ensure_index # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { @@ -275,7 +277,7 @@ def to_datetime( format=format, utc=utc, ) - return cudf.Series(col, index=arg.index) + return cudf.Series._from_column(col, index=arg.index) else: col = _process_col( col=column.as_column(arg), @@ -286,9 +288,12 @@ def to_datetime( utc=utc, ) if isinstance(arg, (cudf.BaseIndex, pd.Index)): - return cudf.Index(col, name=arg.name) + ca = ColumnAccessor({arg.name: col}, verify=False) + return cudf.DatetimeIndex._from_data(ca) elif isinstance(arg, (cudf.Series, pd.Series)): - return cudf.Series(col, index=arg.index, name=arg.name) + return cudf.Series._from_column( + col, name=arg.name, index=ensure_index(arg.index) + ) elif is_scalar(arg): return col.element_indexing(0) else: diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 07158e4ee61..8b95f6f6a04 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ 
b/python/cudf/cudf/core/tools/numeric.py @@ -1,6 +1,8 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations import warnings +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -11,8 +13,12 @@ from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype +from cudf.core.index import ensure_index from cudf.utils.dtypes import can_convert_to_column +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + def to_numeric(arg, errors="raise", downcast=None): """ @@ -164,7 +170,9 @@ def to_numeric(arg, errors="raise", downcast=None): break if isinstance(arg, (cudf.Series, pd.Series)): - return cudf.Series(col, index=arg.index, name=arg.name) + return cudf.Series._from_column( + col, name=arg.name, index=ensure_index(arg.index) + ) else: if col.has_nulls(): # To match pandas, always return a floating type filled with nan. @@ -226,25 +234,10 @@ def _convert_str_col(col, errors, _downcast=None): raise ValueError("Unable to convert some strings to numerics.") -def _proc_inf_empty_strings(col): +def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: """Handles empty and infinity strings""" col = libstrings.to_lower(col) - col = _proc_empty_strings(col) - col = _proc_inf_strings(col) - return col - - -def _proc_empty_strings(col): - """Replaces empty strings with NaN""" - s = cudf.Series(col) - s = s.where(s != "", "NaN") - return s._column - - -def _proc_inf_strings(col): - """Convert "inf/infinity" strings into "Inf", the native string - representing infinity in libcudf - """ + col = col.find_and_replace(as_column([""]), as_column(["NaN"])) # TODO: This can be handled by libcudf in # future see StringColumn.as_numerical_column col = libstrings.replace_multi( diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index 7b183d5f1a3..dbabaacf6b5 100644 --- a/python/cudf/cudf/datasets.py +++ 
b/python/cudf/cudf/datasets.py @@ -5,7 +5,6 @@ import cudf from cudf._lib.transform import bools_to_mask -from cudf.core.column_accessor import ColumnAccessor __all__ = ["timeseries", "randomdata"] @@ -73,9 +72,7 @@ def timeseries( ) mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) masked_col = gdf[col]._column.set_mask(mask_buf) - gdf[col] = cudf.Series._from_data( - ColumnAccessor({None: masked_col}), index=gdf.index - ) + gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index) return gdf diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index d3d99aab0cd..1347b2cc38f 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -71,7 +71,7 @@ def to_dlpack(cudf_obj): if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)): gdf = cudf_obj elif isinstance(cudf_obj, ColumnBase): - gdf = cudf.Series._from_data({None: cudf_obj}) + gdf = cudf.Series._from_column(cudf_obj) else: raise TypeError( f"Input of type {type(cudf_obj)} cannot be converted " diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py index a11022c1a17..f9b0d9c1e78 100644 --- a/python/cudf/cudf/tests/test_apply_rows.py +++ b/python/cudf/cudf/tests/test_apply_rows.py @@ -27,8 +27,12 @@ def test_dataframe_apply_rows(dtype, has_nulls, pessimistic): gdf_series_expected = gdf_series_a * gdf_series_b else: # optimistically ignore the null masks - a = cudf.Series(column.build_column(gdf_series_a.data, dtype)) - b = cudf.Series(column.build_column(gdf_series_b.data, dtype)) + a = cudf.Series._from_column( + column.build_column(gdf_series_a.data, dtype) + ) + b = cudf.Series._from_column( + column.build_column(gdf_series_b.data, dtype) + ) gdf_series_expected = a * b df_expected = cudf.DataFrame( diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index c288155112c..4aa7fb27c9b 100644 --- a/python/cudf/cudf/tests/test_column.py +++ 
b/python/cudf/cudf/tests/test_column.py @@ -95,7 +95,7 @@ def test_column_offset_and_size(pandas_input, offset, size): else: assert col.size == (col.data.size / col.dtype.itemsize) - got = cudf.Series(col) + got = cudf.Series._from_column(col) if offset is None: offset = 0 @@ -112,8 +112,8 @@ def test_column_offset_and_size(pandas_input, offset, size): def column_slicing_test(col, offset, size, cast_to_float=False): col_slice = col.slice(offset, offset + size) - series = cudf.Series(col) - sliced_series = cudf.Series(col_slice) + series = cudf.Series._from_column(col) + sliced_series = cudf.Series._from_column(col_slice) if cast_to_float: pd_series = series.astype(float).to_pandas() @@ -208,7 +208,9 @@ def test_as_column_scalar_with_nan(nan_as_null, scalar, size): ) got = ( - cudf.Series(as_column(scalar, length=size, nan_as_null=nan_as_null)) + cudf.Series._from_column( + as_column(scalar, length=size, nan_as_null=nan_as_null) + ) .dropna() .to_numpy() ) @@ -250,12 +252,18 @@ def test_column_chunked_array_creation(): actual_column = cudf.core.column.as_column(chunked_array, dtype="float") expected_column = cudf.core.column.as_column(pyarrow_array, dtype="float") - assert_eq(cudf.Series(actual_column), cudf.Series(expected_column)) + assert_eq( + cudf.Series._from_column(actual_column), + cudf.Series._from_column(expected_column), + ) actual_column = cudf.core.column.as_column(chunked_array) expected_column = cudf.core.column.as_column(pyarrow_array) - assert_eq(cudf.Series(actual_column), cudf.Series(expected_column)) + assert_eq( + cudf.Series._from_column(actual_column), + cudf.Series._from_column(expected_column), + ) @pytest.mark.parametrize( @@ -287,7 +295,7 @@ def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): gpu_data_view = gpu_data.view(to_dtype) expect = pd.Series(cpu_data_view, dtype=cpu_data_view.dtype) - got = cudf.Series(gpu_data_view, dtype=gpu_data_view.dtype) + got = 
cudf.Series._from_column(gpu_data_view).astype(gpu_data_view.dtype) gpu_ptr = gpu_data.data.get_ptr(mode="read") assert gpu_ptr == got._column.data.get_ptr(mode="read") @@ -327,7 +335,7 @@ def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): ], ) def test_column_view_valid_string_to_numeric(data, to_dtype): - expect = cudf.Series(cudf.Series(data)._column.view(to_dtype)) + expect = cudf.Series._from_column(cudf.Series(data)._column.view(to_dtype)) got = cudf.Series(str_host_view(data, to_dtype)) assert_eq(expect, got) @@ -342,7 +350,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="int32") expect = cudf.Series(expect_data, dtype="float32") - got = cudf.Series(sr._column.view("float32")) + got = cudf.Series._from_column(sr._column.view("float32")) assert_eq(expect, got) @@ -354,7 +362,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="float64") expect = cudf.Series(expect_data, dtype="int64") - got = cudf.Series(sr._column.view("int64")) + got = cudf.Series._from_column(sr._column.view("int64")) assert_eq(expect, got) @@ -365,7 +373,9 @@ def test_column_view_numeric_slice(slc): sr = cudf.Series(data) expect = cudf.Series(data[slc].view("int64")) - got = cudf.Series(sr._column.slice(slc.start, slc.stop).view("int64")) + got = cudf.Series._from_column( + sr._column.slice(slc.start, slc.stop).view("int64") + ) assert_eq(expect, got) @@ -376,7 +386,7 @@ def test_column_view_numeric_slice(slc): def test_column_view_string_slice(slc): data = ["a", "bcde", "cd", "efg", "h"] - expect = cudf.Series( + expect = cudf.Series._from_column( cudf.Series(data)._column.slice(slc.start, slc.stop).view("int8") ) got = cudf.Series(str_host_view(data[slc], "int8")) @@ -409,7 +419,10 @@ def test_as_column_buffer(data, expected): actual_column = cudf.core.column.as_column( cudf.core.buffer.as_buffer(data), dtype=data.dtype ) - assert_eq(cudf.Series(actual_column), cudf.Series(expected)) + assert_eq( + 
cudf.Series._from_column(actual_column), + cudf.Series._from_column(expected), + ) @pytest.mark.parametrize( @@ -436,7 +449,10 @@ def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs): pyarrow_data = pa.array(data, **pyarrow_kwargs) cudf_from_pyarrow = as_column(pyarrow_data) expected = as_column(data, **cudf_kwargs) - assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected)) + assert_eq( + cudf.Series._from_column(cudf_from_pyarrow), + cudf.Series._from_column(expected), + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e2ce5c03b70..2c59253d500 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4264,34 +4264,36 @@ def test_empty_dataframe_describe(): def test_as_column_types(): col = column.as_column(cudf.Series([], dtype="float64")) assert_eq(col.dtype, np.dtype("float64")) - gds = cudf.Series(col) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="float64")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32") assert_eq(col.dtype, np.dtype("float32")) - gds = cudf.Series(col) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="float32")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="str") assert_eq(col.dtype, np.dtype("object")) - gds = cudf.Series(col) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="str")) assert_eq(pds, gds) col = column.as_column(cudf.Series([], dtype="float64"), dtype="object") assert_eq(col.dtype, np.dtype("object")) - gds = cudf.Series(col) + gds = cudf.Series._from_column(col) pds = pd.Series(pd.Series([], dtype="object")) assert_eq(pds, gds) pds = pd.Series(np.array([1, 2, 3]), dtype="float32") - gds = cudf.Series(column.as_column(np.array([1, 2, 3]), dtype="float32")) + gds = cudf.Series._from_column( + column.as_column(np.array([1, 
2, 3]), dtype="float32") + ) assert_eq(pds, gds) @@ -4301,23 +4303,25 @@ def test_as_column_types(): assert_eq(pds, gds) pds = pd.Series([], dtype="float64") - gds = cudf.Series(column.as_column(pds)) + gds = cudf.Series._from_column(column.as_column(pds)) assert_eq(pds, gds) pds = pd.Series([1, 2, 4], dtype="int64") - gds = cudf.Series(column.as_column(cudf.Series([1, 2, 4]), dtype="int64")) + gds = cudf.Series._from_column( + column.as_column(cudf.Series([1, 2, 4]), dtype="int64") + ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="float32") - gds = cudf.Series( + gds = cudf.Series._from_column( column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32") ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="str") - gds = cudf.Series( + gds = cudf.Series._from_column( column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str") ) @@ -6521,7 +6525,9 @@ def test_from_pandas_for_series_nan_as_null(nan_as_null): data = [np.nan, 2.0, 3.0] psr = pd.Series(data) - expected = cudf.Series(column.as_column(data, nan_as_null=nan_as_null)) + expected = cudf.Series._from_column( + column.as_column(data, nan_as_null=nan_as_null) + ) got = cudf.from_pandas(psr, nan_as_null=nan_as_null) assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index 65f739bc74a..b63788d20b7 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -106,7 +106,7 @@ def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype): pa_arr = got.to_arrow().cast( pa.decimal128(to_dtype.precision, to_dtype.scale) ) - expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) + expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) @@ -146,7 +146,7 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): .cast("float64") .cast(pa.decimal128(to_dtype.precision, to_dtype.scale)) ) - expected = 
cudf.Series(Decimal64Column.from_arrow(pa_arr)) + expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) got = got.astype(to_dtype) @@ -206,9 +206,9 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype): pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False ) if isinstance(to_dtype, Decimal32Dtype): - expected = cudf.Series(Decimal32Column.from_arrow(pa_arr)) + expected = cudf.Series._from_column(Decimal32Column.from_arrow(pa_arr)) elif isinstance(to_dtype, Decimal64Dtype): - expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) + expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) with expect_warning_if(to_dtype.scale < s.dtype.scale, UserWarning): got = s.astype(to_dtype) @@ -245,7 +245,7 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype): pa_arr = got.to_arrow().cast(to_dtype, safe=False) got = got.astype(to_dtype) - expected = cudf.Series(NumericalColumn.from_arrow(pa_arr)) + expected = cudf.Series._from_column(NumericalColumn.from_arrow(pa_arr)) assert_eq(got, expected) assert_eq(got.dtype, expected.dtype) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 7f48e414180..44270d20d59 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -78,7 +78,7 @@ def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol): # FIXME: In gh-10202 some minimal fixes were added to unblock CI. But # currently only non-null values are compared, null positions are # unchecked. 
- non_null_idxs = ~cudf.Series(cudfcol).isna() + non_null_idxs = cudfcol.notnull() assert_eq( col_from_buf.apply_boolean_mask(non_null_idxs), cudfcol.apply_boolean_mask(non_null_idxs), @@ -86,8 +86,8 @@ def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol): array_from_dlpack = cp.from_dlpack(buf.__dlpack__()).get() col_array = cp.asarray(cudfcol.data_array_view(mode="read")).get() assert_eq( - array_from_dlpack[non_null_idxs.to_numpy()].flatten(), - col_array[non_null_idxs.to_numpy()].flatten(), + array_from_dlpack[non_null_idxs.values_host].flatten(), + col_array[non_null_idxs.values_host].flatten(), ) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 36bcaa66d7d..c4c883ca9f9 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -946,5 +946,5 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage(): null_count=col.null_count, children=(column_empty(0, col.children[0].dtype), empty_inner), ) - ser = cudf.Series._from_data({None: col_empty_offset}) + ser = cudf.Series._from_column(col_empty_offset) assert ser.memory_usage() == 8 diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 719e8a33285..0f13a9e173a 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -127,7 +127,7 @@ def test_pickle_categorical_column(slices): pickled = pickle.dumps(input_col) out = pickle.loads(pickled) - assert_eq(Series(out), Series(input_col)) + assert_eq(Series._from_column(out), Series._from_column(input_col)) @pytest.mark.parametrize( @@ -148,4 +148,4 @@ def test_pickle_string_column(slices): pickled = pickle.dumps(input_col) out = pickle.loads(pickled) - assert_eq(Series(out), Series(input_col)) + assert_eq(Series._from_column(out), Series._from_column(input_col)) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 
d4fe5ff3bb5..1973fe6fb41 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -817,12 +817,12 @@ def test_fillna_string(ps_data, fill_value, inplace): def test_series_fillna_invalid_dtype(data_dtype): gdf = cudf.Series([1, 2, None, 3], dtype=data_dtype) fill_value = 2.5 - with pytest.raises(TypeError) as raises: - gdf.fillna(fill_value) - raises.match( + msg = ( f"Cannot safely cast non-equivalent" f" {type(fill_value).__name__} to {gdf.dtype.type.__name__}" ) + with pytest.raises(TypeError, match=msg): + gdf.fillna(fill_value) @pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8ed78d804bf..6a1887afb1f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2041,7 +2041,7 @@ def test_series_ordered_dedup(): sr = cudf.Series(np.random.randint(0, 100, 1000)) # pandas unique() preserves order expect = pd.Series(sr.to_pandas().unique()) - got = cudf.Series(sr._column.unique()) + got = cudf.Series._from_column(sr._column.unique()) assert_eq(expect.values, got.values) @@ -2697,7 +2697,9 @@ def test_series_duplicate_index_reindex(): def test_list_category_like_maintains_dtype(): dtype = cudf.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True) data = [1, 2, 3] - result = cudf.Series(cudf.core.column.as_column(data, dtype=dtype)) + result = cudf.Series._from_column( + cudf.core.column.as_column(data, dtype=dtype) + ) expected = pd.Series(data, dtype=dtype.to_pandas()) assert_eq(result, expected) @@ -2705,7 +2707,9 @@ def test_list_category_like_maintains_dtype(): def test_list_interval_like_maintains_dtype(): dtype = cudf.IntervalDtype(subtype=np.int8) data = [pd.Interval(1, 2)] - result = cudf.Series(cudf.core.column.as_column(data, dtype=dtype)) + result = cudf.Series._from_column( + cudf.core.column.as_column(data, dtype=dtype) + ) expected = pd.Series(data, 
dtype=dtype.to_pandas()) assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 69122cdbafa..5406836ba61 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -178,13 +178,19 @@ def test_column_set_equal_length_object_by_mask(): bool_col = cudf.Series([True, True, True, True, True])._column data[bool_col] = replace_data - assert_eq(cudf.Series(data), cudf.Series(replace_data)) + assert_eq( + cudf.Series._from_column(data), + cudf.Series._from_column(replace_data), + ) data = cudf.Series([0, 0, 1, 1, 1])._column bool_col = cudf.Series([True, False, True, False, True])._column data[bool_col] = replace_data - assert_eq(cudf.Series(data), cudf.Series([100, 0, 300, 1, 500])) + assert_eq( + cudf.Series._from_column(data), + cudf.Series([100, 0, 300, 1, 500]), + ) def test_column_set_unequal_length_object_by_mask(): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index f447759d010..4bd084a3938 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -2677,7 +2677,7 @@ def test_string_int_to_ipv4(): ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"] ) - got = cudf.Series(gsr._column.int2ip()) + got = cudf.Series._from_column(gsr._column.int2ip()) assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 4432d2afc8e..69876d97aad 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -96,7 +96,7 @@ def run_udf_test(data, func, dtype): else: result = output - got = cudf.Series(result, dtype=dtype) + got = cudf.Series._from_column(result.astype(dtype)) assert_eq(expect, got, check_dtype=False) with _CUDFNumbaConfig(): udf_str_kernel.forall(len(data))(str_views, output) @@ -105,7 +105,7 @@ def run_udf_test(data, func, dtype): else: 
result = output - got = cudf.Series(result, dtype=dtype) + got = cudf.Series._from_column(result.astype(dtype)) assert_eq(expect, got, check_dtype=False) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 4bdb5d921ec..2b1f745fc04 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -102,6 +102,7 @@ def _nest_list_data(data, leaf_type): @_dask_cudf_performance_tracking def _get_non_empty_data(s): + """Return a non empty column as metadata.""" if isinstance(s, cudf.core.column.CategoricalColumn): categories = ( s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] @@ -128,7 +129,7 @@ def _get_non_empty_data(s): data = [{key: None for key in struct_dtype.fields.keys()}] * 2 data = cudf.core.column.as_column(data, dtype=s.dtype) elif is_string_dtype(s.dtype): - data = pa.array(["cat", "dog"]) + data = cudf.core.column.as_column(pa.array(["cat", "dog"])) elif isinstance(s.dtype, pd.DatetimeTZDtype): from cudf.utils.dtypes import get_time_unit @@ -153,7 +154,7 @@ def _nonempty_series(s, idx=None): idx = _nonempty_index(s.index) data = _get_non_empty_data(s._column) - return cudf.Series(data, name=s.name, index=idx) + return cudf.Series._from_column(data, name=s.name, index=idx) @meta_nonempty.register(cudf.DataFrame) @@ -424,7 +425,7 @@ def hash_object_cudf_index(ind, index=None): return ind.to_frame(index=False).hash_values() col = cudf.core.column.as_column(ind) - return cudf.Series(col).hash_values() + return cudf.Series._from_column(col).hash_values() @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index aab56e3a1b0..3181c8d69ec 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -342,7 +342,7 @@ def groupby(self, by=None, **kwargs): def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", 
x) - return cudf.Series(outcol) + return cudf.Series._from_column(outcol) @_dask_cudf_performance_tracking From 3fd8783e49246f4ae61351375201d616d5ab6b55 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 7 Aug 2024 13:00:09 -0700 Subject: [PATCH 32/72] Add `stream` param to stream compaction APIs (#16295) Add `stream` param to a bunch of stream compaction APIs. Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) - Karthikeyan (https://github.com/karthikeyann) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16295 --- cpp/include/cudf/detail/stream_compaction.hpp | 30 +- .../cudf/lists/detail/stream_compaction.hpp | 9 +- cpp/include/cudf/stream_compaction.hpp | 30 +- .../stream_compaction/apply_boolean_mask.cu | 3 +- cpp/src/stream_compaction/distinct.cu | 4 +- cpp/src/stream_compaction/distinct_count.cu | 11 +- cpp/src/stream_compaction/drop_nans.cu | 6 +- cpp/src/stream_compaction/drop_nulls.cu | 6 +- cpp/src/stream_compaction/unique.cu | 3 +- cpp/src/stream_compaction/unique_count.cu | 8 +- .../stream_compaction/unique_count_column.cu | 7 +- cpp/tests/streams/stream_compaction_test.cpp | 365 ++++++++++++++---- java/src/main/native/src/TableJni.cpp | 1 + 13 files changed, 362 insertions(+), 121 deletions(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 05194148a70..85d2ee9790f 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -29,9 +29,7 @@ namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::drop_nulls(table_view const&, std::vector const&, - * cudf::size_type, rmm::device_async_resource_ref) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
+ * cudf::size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ std::unique_ptr
drop_nulls(table_view const& input, std::vector const& keys, @@ -41,9 +39,7 @@ std::unique_ptr
drop_nulls(table_view const& input, /** * @copydoc cudf::drop_nans(table_view const&, std::vector const&, - * cudf::size_type, rmm::device_async_resource_ref) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * cudf::size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref) */ std::unique_ptr
drop_nans(table_view const& input, std::vector const& keys, @@ -53,8 +49,6 @@ std::unique_ptr
drop_nans(table_view const& input, /** * @copydoc cudf::apply_boolean_mask - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
apply_boolean_mask(table_view const& input, column_view const& boolean_mask, @@ -63,8 +57,6 @@ std::unique_ptr
apply_boolean_mask(table_view const& input, /** * @copydoc cudf::unique - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
unique(table_view const& input, std::vector const& keys, @@ -75,8 +67,6 @@ std::unique_ptr
unique(table_view const& input, /** * @copydoc cudf::distinct - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
distinct(table_view const& input, std::vector const& keys, @@ -110,9 +100,7 @@ rmm::device_uvector distinct_indices(table_view const& input, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy, rmm::cuda_stream_view) */ cudf::size_type unique_count(column_view const& input, null_policy null_handling, @@ -120,18 +108,14 @@ cudf::size_type unique_count(column_view const& input, rmm::cuda_stream_view stream); /** - * @copydoc cudf::unique_count(table_view const&, null_equality) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::unique_count(table_view const&, null_equality, rmm::cuda_stream_view) */ cudf::size_type unique_count(table_view const& input, null_equality nulls_equal, rmm::cuda_stream_view stream); /** - * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy, rmm::cuda_stream_view) */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, @@ -139,9 +123,7 @@ cudf::size_type distinct_count(column_view const& input, rmm::cuda_stream_view stream); /** - * @copydoc cudf::distinct_count(table_view const&, null_equality) - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
+ * @copydoc cudf::distinct_count(table_view const&, null_equality, rmm::cuda_stream_view) */ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal, diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp index c11e07cd190..be0bd27083c 100644 --- a/cpp/include/cudf/lists/detail/stream_compaction.hpp +++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp @@ -26,10 +26,7 @@ namespace CUDF_EXPORT cudf { namespace lists::detail { /** - * @copydoc cudf::lists::apply_boolean_mask(lists_column_view const&, lists_column_view const&, - * rmm::device_async_resource_ref) - * - * @param stream CUDA stream used for device memory operations and kernel launches + * @copydoc cudf::lists::apply_boolean_mask */ std::unique_ptr apply_boolean_mask(lists_column_view const& input, lists_column_view const& boolean_mask, @@ -37,9 +34,7 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::list::distinct - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::lists::distinct */ std::unique_ptr distinct(lists_column_view const& input, null_equality nulls_equal, diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index cfe404ff6ab..ced8d5849d0 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -67,6 +67,7 @@ namespace CUDF_EXPORT cudf { * @param[in] keys vector of indices representing key columns from `input` * @param[in] keep_threshold The minimum number of non-null fields in a row * required to keep the row. 
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Table containing all rows of the `input` with at least @p * keep_threshold non-null fields in @p keys. @@ -75,6 +76,7 @@ std::unique_ptr
drop_nulls( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -99,6 +101,7 @@ std::unique_ptr
drop_nulls( * * @param[in] input The input `table_view` to filter * @param[in] keys vector of indices representing key columns from `input` + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Table containing all rows of the `input` without nulls in the columns * of @p keys. @@ -106,6 +109,7 @@ std::unique_ptr
drop_nulls( std::unique_ptr
drop_nulls( table_view const& input, std::vector const& keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -141,6 +145,7 @@ std::unique_ptr
drop_nulls( * @param[in] keys vector of indices representing key columns from `input` * @param[in] keep_threshold The minimum number of non-NAN elements in a row * required to keep the row. + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Table containing all rows of the `input` with at least @p * keep_threshold non-NAN elements in @p keys. @@ -149,6 +154,7 @@ std::unique_ptr
drop_nans( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -174,6 +180,7 @@ std::unique_ptr
drop_nans( * * @param[in] input The input `table_view` to filter * @param[in] keys vector of indices representing key columns from `input` + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Table containing all rows of the `input` without NANs in the columns * of @p keys. @@ -181,6 +188,7 @@ std::unique_ptr
drop_nans( std::unique_ptr
drop_nans( table_view const& input, std::vector const& keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -200,6 +208,7 @@ std::unique_ptr
drop_nans( * @param[in] input The input table_view to filter * @param[in] boolean_mask A nullable column_view of type type_id::BOOL8 used * as a mask to filter the `input`. + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Table containing copy of all rows of @p input passing * the filter defined by @p boolean_mask. @@ -207,6 +216,7 @@ std::unique_ptr
drop_nans( std::unique_ptr
apply_boolean_mask( table_view const& input, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -241,6 +251,7 @@ enum class duplicate_keep_option { * @param[in] keep keep any, first, last, or none of the found duplicates * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device * memory * @@ -251,6 +262,7 @@ std::unique_ptr
unique( std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -269,6 +281,7 @@ std::unique_ptr
unique( * @param keep Copy any, first, last, or none of the found duplicates * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table * @return Table with distinct rows in an unspecified order */ @@ -278,6 +291,7 @@ std::unique_ptr
distinct( duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -346,12 +360,14 @@ std::unique_ptr
stable_distinct( * @param[in] input The column_view whose consecutive groups of equivalent rows will be counted * @param[in] null_handling flag to include or ignore `null` while counting * @param[in] nan_handling flag to consider `NaN==null` or not + * @param[in] stream CUDA stream used for device memory operations and kernel launches * * @return number of consecutive groups of equivalent rows in the column */ cudf::size_type unique_count(column_view const& input, null_policy null_handling, - nan_policy nan_handling); + nan_policy nan_handling, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Count the number of consecutive groups of equivalent rows in a table. @@ -359,11 +375,13 @@ cudf::size_type unique_count(column_view const& input, * @param[in] input Table whose consecutive groups of equivalent rows will be counted * @param[in] nulls_equal flag to denote if null elements should be considered equal * nulls are not equal if null_equality::UNEQUAL. + * @param[in] stream CUDA stream used for device memory operations and kernel launches * * @return number of consecutive groups of equivalent rows in the column */ cudf::size_type unique_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL); + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Count the distinct elements in the column_view. 
@@ -382,12 +400,14 @@ cudf::size_type unique_count(table_view const& input, * @param[in] input The column_view whose distinct elements will be counted * @param[in] null_handling flag to include or ignore `null` while counting * @param[in] nan_handling flag to consider `NaN==null` or not + * @param[in] stream CUDA stream used for device memory operations and kernel launches * * @return number of distinct rows in the table */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, - nan_policy nan_handling); + nan_policy nan_handling, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Count the distinct rows in a table. @@ -395,11 +415,13 @@ cudf::size_type distinct_count(column_view const& input, * @param[in] input Table whose distinct rows will be counted * @param[in] nulls_equal flag to denote if null elements should be considered equal. * nulls are not equal if null_equality::UNEQUAL. + * @param[in] stream CUDA stream used for device memory operations and kernel launches * * @return number of distinct rows in the table */ cudf::size_type distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL); + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/stream_compaction/apply_boolean_mask.cu b/cpp/src/stream_compaction/apply_boolean_mask.cu index cdca9517d94..9812f4ffbd7 100644 --- a/cpp/src/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/stream_compaction/apply_boolean_mask.cu @@ -91,9 +91,10 @@ std::unique_ptr
apply_boolean_mask(table_view const& input, */ std::unique_ptr
apply_boolean_mask(table_view const& input, column_view const& boolean_mask, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::apply_boolean_mask(input, boolean_mask, cudf::get_default_stream(), mr); + return detail::apply_boolean_mask(input, boolean_mask, stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 6afd6e34c50..24e2692cb6f 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -150,11 +150,11 @@ std::unique_ptr
distinct(table_view const& input, duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::distinct( - input, keys, keep, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::distinct(input, keys, keep, nulls_equal, nans_equal, stream, mr); } std::unique_ptr distinct_indices(table_view const& input, diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index cdf9faddf31..78eb0fa5212 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -218,15 +218,18 @@ cudf::size_type distinct_count(column_view const& input, cudf::size_type distinct_count(column_view const& input, null_policy null_handling, - nan_policy nan_handling) + nan_policy nan_handling, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, null_handling, nan_handling, cudf::get_default_stream()); + return detail::distinct_count(input, null_handling, nan_handling, stream); } -cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal) +cudf::size_type distinct_count(table_view const& input, + null_equality nulls_equal, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, nulls_equal, cudf::get_default_stream()); + return detail::distinct_count(input, nulls_equal, stream); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_nans.cu b/cpp/src/stream_compaction/drop_nans.cu index b46381c8ff6..b98ebbc2ecc 100644 --- a/cpp/src/stream_compaction/drop_nans.cu +++ b/cpp/src/stream_compaction/drop_nans.cu @@ -117,20 +117,22 @@ std::unique_ptr
drop_nans(table_view const& input, std::unique_ptr
drop_nans(table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::drop_nans(input, keys, keep_threshold, cudf::get_default_stream(), mr); + return detail::drop_nans(input, keys, keep_threshold, stream, mr); } /* * Filters a table to remove nan elements. */ std::unique_ptr
drop_nans(table_view const& input, std::vector const& keys, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::drop_nans(input, keys, keys.size(), cudf::get_default_stream(), mr); + return detail::drop_nans(input, keys, keys.size(), stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu index cb7cd61bf02..2497e4e5065 100644 --- a/cpp/src/stream_compaction/drop_nulls.cu +++ b/cpp/src/stream_compaction/drop_nulls.cu @@ -90,20 +90,22 @@ std::unique_ptr
drop_nulls(table_view const& input, std::unique_ptr
drop_nulls(table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::drop_nulls(input, keys, keep_threshold, cudf::get_default_stream(), mr); + return detail::drop_nulls(input, keys, keep_threshold, stream, mr); } /* * Filters a table to remove null elements. */ std::unique_ptr
drop_nulls(table_view const& input, std::vector const& keys, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::drop_nulls(input, keys, keys.size(), cudf::get_default_stream(), mr); + return detail::drop_nulls(input, keys, keys.size(), stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu index edb47984d13..93de0e60b6d 100644 --- a/cpp/src/stream_compaction/unique.cu +++ b/cpp/src/stream_compaction/unique.cu @@ -119,10 +119,11 @@ std::unique_ptr
unique(table_view const& input, std::vector const& keys, duplicate_keep_option const keep, null_equality nulls_equal, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::unique(input, keys, keep, nulls_equal, cudf::get_default_stream(), mr); + return detail::unique(input, keys, keep, nulls_equal, stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu index 19607fe8105..d842f63cd7b 100644 --- a/cpp/src/stream_compaction/unique_count.cu +++ b/cpp/src/stream_compaction/unique_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,10 +67,12 @@ cudf::size_type unique_count(table_view const& keys, } // namespace detail -cudf::size_type unique_count(table_view const& input, null_equality nulls_equal) +cudf::size_type unique_count(table_view const& input, + null_equality nulls_equal, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - return detail::unique_count(input, nulls_equal, cudf::get_default_stream()); + return detail::unique_count(input, nulls_equal, stream); } } // namespace cudf diff --git a/cpp/src/stream_compaction/unique_count_column.cu b/cpp/src/stream_compaction/unique_count_column.cu index 16758b6e3a7..89ce2391a7b 100644 --- a/cpp/src/stream_compaction/unique_count_column.cu +++ b/cpp/src/stream_compaction/unique_count_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -101,10 +101,11 @@ cudf::size_type unique_count(column_view const& input, cudf::size_type unique_count(column_view const& input, null_policy null_handling, - nan_policy nan_handling) + nan_policy nan_handling, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - return detail::unique_count(input, null_handling, nan_handling, cudf::get_default_stream()); + return detail::unique_count(input, null_handling, nan_handling, stream); } } // namespace cudf diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 56443870602..443f4548b2c 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -41,6 +41,7 @@ auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; +using int16s_col = cudf::test::fixed_width_column_wrapper; using int32s_col = cudf::test::fixed_width_column_wrapper; using floats_col = cudf::test::fixed_width_column_wrapper; @@ -51,50 +52,9 @@ using cudf::test::iterators::no_nulls; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; -struct StableDistinctKeepAny : public cudf::test::BaseFixture {}; +struct StreamCompactionTest : public cudf::test::BaseFixture {}; -struct StableDistinctKeepFirstLastNone : public cudf::test::BaseFixture {}; - -TEST_F(StableDistinctKeepAny, NoNullsTableWithNaNs) -{ - // Column(s) used to test KEEP_ANY needs to have same rows in contiguous - // groups for equivalent keys because KEEP_ANY is nondeterministic. 
- auto const col1 = int32s_col{6, 6, 6, 1, 1, 1, 3, 5, 8, 5}; - auto const col2 = floats_col{6, 6, 6, 1, 1, 1, 3, 4, 9, 4}; - auto const keys1 = int32s_col{20, 20, 20, 15, 15, 15, 20, 19, 21, 9}; - auto const keys2 = floats_col{19., 19., 19., NaN, NaN, NaN, 20., 20., 9., 21.}; - - auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; - auto const key_idx = std::vector{2, 3}; - - // NaNs are unequal. - { - auto const exp_col1 = int32s_col{6, 1, 1, 1, 3, 5, 8, 5}; - auto const exp_col2 = floats_col{6, 1, 1, 1, 3, 4, 9, 4}; - auto const exp_keys1 = int32s_col{20, 15, 15, 15, 20, 19, 21, 9}; - auto const exp_keys2 = floats_col{19., NaN, NaN, NaN, 20., 20., 9., 21.}; - auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; - - auto const result = cudf::stable_distinct( - input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); - } - - // NaNs are equal. - { - auto const exp_col1 = int32s_col{6, 1, 3, 5, 8, 5}; - auto const exp_col2 = floats_col{6, 1, 3, 4, 9, 4}; - auto const exp_keys1 = int32s_col{20, 15, 20, 19, 21, 9}; - auto const exp_keys2 = floats_col{19., NaN, 20., 20., 9., 21.}; - auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; - - auto const result = cudf::stable_distinct( - input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); - } -} - -TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs) +TEST_F(StreamCompactionTest, StableDistinctKeepAny) { auto constexpr null{0.0}; // shadow the global `null` variable of type int @@ -150,7 +110,7 @@ TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs) } } -TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual) +TEST_F(StreamCompactionTest, StableDistinctKeepFirstLastNone) { // Column(s) used to test needs to have different rows for the same keys. 
auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6}; @@ -192,44 +152,313 @@ TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual) } } -TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsUnequal) +TEST_F(StreamCompactionTest, DropNaNs) { - // Column(s) used to test needs to have different rows for the same keys. - auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6, 7}; - auto const keys = floats_col{20., NaN, NaN, 19., 21., 19., 22., 20.}; - auto const input = cudf::table_view{{col, keys}}; - auto const key_idx = std::vector{1}; + auto const col1 = floats_col{{1., 2., NaN, NaN, 5., 6.}, nulls_at({2, 5})}; + auto const col2 = int32s_col{{10, 40, 70, 5, 2, 10}, nulls_at({2, 5})}; + auto const col3 = floats_col{{NaN, 40., 70., NaN, 2., 10.}, nulls_at({2, 5})}; + cudf::table_view input{{col1, col2, col3}}; + + std::vector keys{0, 2}; - // KEEP_FIRST { - auto const exp_col = int32s_col{0, 1, 2, 3, 4, 6}; - auto const exp_keys = floats_col{20., NaN, NaN, 19., 21., 22.}; - auto const expected = cudf::table_view{{exp_col, exp_keys}}; + // With keep_threshold + auto const col1_expected = floats_col{{1., 2., 3., 5., 6.}, nulls_at({2, 4})}; + auto const col2_expected = int32s_col{{10, 40, 70, 2, 10}, nulls_at({2, 4})}; + auto const col3_expected = floats_col{{NaN, 40., 70., 2., 10.}, nulls_at({2, 4})}; + cudf::table_view expected{{col1_expected, col2_expected, col3_expected}}; + + auto result = cudf::drop_nans(input, keys, keys.size() - 1, cudf::test::get_default_stream()); - auto const result = cudf::stable_distinct( - input, key_idx, KEEP_FIRST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); } - // KEEP_LAST { - auto const exp_col = int32s_col{1, 2, 4, 5, 6, 7}; - auto const exp_keys = floats_col{NaN, NaN, 21., 19., 22., 20.}; - auto const expected = cudf::table_view{{exp_col, exp_keys}}; + // Without keep_threshold + auto const col1_expected = floats_col{{2., 3., 5., 6.}, nulls_at({1, 3})}; + auto const 
col2_expected = int32s_col{{40, 70, 2, 10}, nulls_at({1, 3})}; + auto const col3_expected = floats_col{{40., 70., 2., 10.}, nulls_at({1, 3})}; + cudf::table_view expected{{col1_expected, col2_expected, col3_expected}}; + + auto result = cudf::drop_nans(input, keys, cudf::test::get_default_stream()); - auto const result = cudf::stable_distinct( - input, key_idx, KEEP_LAST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); } +} + +TEST_F(StreamCompactionTest, DropNulls) +{ + auto const col1 = int16s_col{{1, 0, 1, 0, 1, 0}, nulls_at({2, 5})}; + auto const col2 = int32s_col{{10, 40, 70, 5, 2, 10}, nulls_at({2})}; + auto const col3 = floats_col{{10., 40., 70., 5., 2., 10.}, no_nulls()}; + cudf::table_view input{{col1, col2, col3}}; + std::vector keys{0, 1, 2}; - // KEEP_NONE { - auto const exp_col = int32s_col{1, 2, 4, 6}; - auto const exp_keys = floats_col{NaN, NaN, 21., 22.}; - auto const expected = cudf::table_view{{exp_col, exp_keys}}; + // With keep_threshold + auto const col1_expected = int16s_col{{1, 0, 0, 1, 0}, null_at(4)}; + auto const col2_expected = int32s_col{{10, 40, 5, 2, 10}, no_nulls()}; + auto const col3_expected = floats_col{{10., 40., 5., 2., 10.}, no_nulls()}; + cudf::table_view expected{{col1_expected, col2_expected, col3_expected}}; + + auto result = cudf::drop_nulls(input, keys, keys.size() - 1, cudf::test::get_default_stream()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + { + // Without keep_threshold + auto const col1_expected = int16s_col{{1, 0, 0, 1}, no_nulls()}; + auto const col2_expected = int32s_col{{10, 40, 5, 2}, no_nulls()}; + auto const col3_expected = floats_col{{10., 40., 5., 2.}, no_nulls()}; + cudf::table_view expected{{col1_expected, col2_expected, col3_expected}}; + + auto result = cudf::drop_nulls(input, keys, cudf::test::get_default_stream()); - auto const result = cudf::stable_distinct( - input, key_idx, KEEP_NONE, NULL_UNEQUAL, NAN_UNEQUAL, 
cudf::test::get_default_stream()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); } } + +TEST_F(StreamCompactionTest, Unique) +{ + auto const col1 = int32s_col{5, 4, 3, 5, 8, 5}; + auto const col2 = floats_col{4., 5., 3., 4., 9., 4.}; + auto const col1_key = int32s_col{20, 20, 20, 19, 21, 9}; + auto const col2_key = int32s_col{19, 19, 20, 20, 9, 21}; + + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys = {2, 3}; + + { + // KEEP_FIRST + auto const exp_col1_first = int32s_col{5, 3, 5, 8, 5}; + auto const exp_col2_first = floats_col{4., 3., 4., 9., 4.}; + auto const exp_col1_key_first = int32s_col{20, 20, 19, 21, 9}; + auto const exp_col2_key_first = int32s_col{19, 20, 20, 9, 21}; + cudf::table_view expected_first{ + {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; + + auto const result = cudf::unique(input, + keys, + cudf::duplicate_keep_option::KEEP_FIRST, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, *result); + } + + { + // KEEP_LAST + auto const exp_col1_last = int32s_col{4, 3, 5, 8, 5}; + auto const exp_col2_last = floats_col{5., 3., 4., 9., 4.}; + auto const exp_col1_key_last = int32s_col{20, 20, 19, 21, 9}; + auto const exp_col2_key_last = int32s_col{19, 20, 20, 9, 21}; + cudf::table_view expected_last{ + {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; + + auto const result = cudf::unique(input, + keys, + cudf::duplicate_keep_option::KEEP_LAST, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, *result); + } + + { + // KEEP_NONE + auto const exp_col1_unique = int32s_col{3, 5, 8, 5}; + auto const exp_col2_unique = floats_col{3., 4., 9., 4.}; + auto const exp_col1_key_unique = int32s_col{20, 19, 21, 9}; + auto const exp_col2_key_unique = int32s_col{20, 20, 9, 21}; + cudf::table_view expected_unique{ + {exp_col1_unique, exp_col2_unique, 
exp_col1_key_unique, exp_col2_key_unique}}; + + auto const result = cudf::unique(input, + keys, + cudf::duplicate_keep_option::KEEP_NONE, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, *result); + } +} + +TEST_F(StreamCompactionTest, Distinct) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col1 = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const col2 = floats_col{10, 11, 12, 13, 14, 15, 16}; + auto const keys1 = int32s_col{20, 20, 20, 20, 19, 21, 9}; + auto const keys2 = int32s_col{19, 19, 19, 20, 20, 9, 21}; + + auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const key_idx = std::vector{2, 3}; + + // KEEP_FIRST + { + auto const exp_col1_sort = int32s_col{6, 4, 0, 3, 5}; + auto const exp_col2_sort = floats_col{16, 14, 10, 13, 15}; + auto const exp_keys1_sort = int32s_col{9, 19, 20, 20, 21}; + auto const exp_keys2_sort = int32s_col{21, 20, 19, 20, 9}; + auto const expected_sort = + cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}}; + + auto const result = cudf::distinct(input, + key_idx, + cudf::duplicate_keep_option::KEEP_FIRST, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); + auto const result_sort = + cudf::sort_by_key(*result, result->select(key_idx), {}, {}, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // KEEP_LAST + { + auto const exp_col1_sort = int32s_col{6, 4, 2, 3, 5}; + auto const exp_col2_sort = floats_col{16, 14, 12, 13, 15}; + auto const exp_keys1_sort = int32s_col{9, 19, 20, 20, 21}; + auto const exp_keys2_sort = int32s_col{21, 20, 19, 20, 9}; + auto const expected_sort = + cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}}; + + auto const result = cudf::distinct(input, + key_idx, + cudf::duplicate_keep_option::KEEP_LAST, + 
cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); + auto const result_sort = + cudf::sort_by_key(*result, result->select(key_idx), {}, {}, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } + + // KEEP_NONE + { + auto const exp_col1_sort = int32s_col{6, 4, 3, 5}; + auto const exp_col2_sort = floats_col{16, 14, 13, 15}; + auto const exp_keys1_sort = int32s_col{9, 19, 20, 21}; + auto const exp_keys2_sort = int32s_col{21, 20, 20, 9}; + auto const expected_sort = + cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}}; + + auto const result = cudf::distinct(input, + key_idx, + cudf::duplicate_keep_option::KEEP_NONE, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); + auto const result_sort = + cudf::sort_by_key(*result, result->select(key_idx), {}, {}, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort); + } +} + +TEST_F(StreamCompactionTest, ApplyBooleanMask) +{ + auto const col = int32s_col{ + 9668, 9590, 9526, 9205, 9434, 9347, 9160, 9569, 9143, 9807, 9606, 9446, 9279, 9822, 9691}; + cudf::test::fixed_width_column_wrapper mask({false, + false, + true, + false, + false, + true, + false, + true, + false, + true, + false, + false, + true, + false, + true}); + cudf::table_view input({col}); + auto const col_expected = int32s_col{9526, 9347, 9569, 9807, 9279, 9691}; + cudf::table_view expected({col_expected}); + auto const result = cudf::apply_boolean_mask(input, mask, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StreamCompactionTest, UniqueCountColumn) +{ + std::vector const input = {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, + 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + std::vector input_data(input.begin(), input.end()); + + auto 
const new_end = std::unique(input_data.begin(), input_data.end()); + auto const expected = std::distance(input_data.begin(), new_end); + EXPECT_EQ( + expected, + cudf::unique_count( + input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID, cudf::test::get_default_stream())); +} + +TEST_F(StreamCompactionTest, UniqueCountTable) +{ + std::vector const input1 = {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, + 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}; + std::vector const input2 = {3, 3, 3, 4, 31, 1, 8, 5, 0, 4, 1, + 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}; + + std::vector> pair_input; + std::transform(input1.begin(), + input1.end(), + input2.begin(), + std::back_inserter(pair_input), + [](int32_t a, int32_t b) { return std::pair(a, b); }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); + + auto const new_end = std::unique(pair_input.begin(), pair_input.end()); + auto const result = std::distance(pair_input.begin(), new_end); + EXPECT_EQ( + result, + cudf::unique_count(input_table, null_equality::EQUAL, cudf::test::get_default_stream())); +} + +TEST_F(StreamCompactionTest, DistinctCountColumn) +{ + std::vector const input = {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, + 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::distinct_count( + input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID, cudf::test::get_default_stream())); +} + +TEST_F(StreamCompactionTest, DistinctCountTable) +{ + std::vector const input1 = {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, + 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}; + std::vector const input2 = {3, 3, 3, 4, 31, 1, 8, 5, 0, 4, 1, + 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}; + + std::vector> pair_input; + 
std::transform(input1.begin(), + input1.end(), + input2.begin(), + std::back_inserter(pair_input), + [](int32_t a, int32_t b) { return std::pair(a, b); }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); + + auto const expected = static_cast( + std::set>(pair_input.begin(), pair_input.end()).size()); + EXPECT_EQ( + expected, + cudf::distinct_count(input_table, null_equality::EQUAL, cudf::test::get_default_stream())); +} diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index a9ace1398e4..76ca8c533ce 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -3919,6 +3919,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( keep_option, nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL, cudf::nan_equality::ALL_EQUAL, + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return convert_table_for_return(env, result); } From b933b54858a84082980f20522738fda4969a1318 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 7 Aug 2024 20:07:42 -0500 Subject: [PATCH 33/72] Use tool.scikit-build.cmake.version, set scikit-build-core minimum-version (#16503) Contributes to https://github.com/rapidsai/build-planning/issues/58. `scikit-build-core==0.10.0` was released today (https://github.com/scikit-build/scikit-build-core/releases/tag/v0.10.0), and wheel-building configurations across RAPIDS are incompatible with it. 
This proposes upgrading to that version and fixing configuration here in a way that: * is compatible with that new `scikit-build-core` version * takes advantage of the forward-compatibility mechanism (`minimum-version`) that `scikit-build-core` provides, to reduce the risk of needing to do this again in the future Authors: - James Lamb (https://github.com/jameslamb) Approvers: - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/16503 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- dependencies.yaml | 4 ++-- python/cudf/pyproject.toml | 5 +++-- python/cudf_kafka/pyproject.toml | 5 +++-- 7 files changed, 12 insertions(+), 10 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index d04804cafaf..8d5fc2e31d9 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -82,7 +82,7 @@ dependencies: - rich - rmm==24.10.*,>=0.0.0a0 - s3fs>=2022.3.0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - scipy - spdlog>=1.12.0,<1.13 - sphinx diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index e2c3558030d..7b0485d7f29 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -80,7 +80,7 @@ dependencies: - rich - rmm==24.10.*,>=0.0.0a0 - s3fs>=2022.3.0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - scipy - spdlog>=1.12.0,<1.13 - sphinx diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 9137f099ad1..8d7ef63715b 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -62,7 +62,7 @@ requirements: - python - cython >=3.0.3 - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - scikit-build-core 
>=0.7.0 + - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 # TODO: Change to `2.0` for NumPy 2 - numpy 1.23 diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 1b0e0e2c236..748a32e5518 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -61,7 +61,7 @@ requirements: - cudf ={{ version }} - libcudf_kafka ={{ version }} - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - scikit-build-core >=0.7.0 + - scikit-build-core >=0.10.0 {% if cuda_major != "11" %} - cuda-cudart-dev {% endif %} diff --git a/dependencies.yaml b/dependencies.yaml index abb55a5e011..b0d62a9fb0d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -301,10 +301,10 @@ dependencies: - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0 - output_types: conda packages: - - scikit-build-core>=0.7.0 + - scikit-build-core>=0.10.0 - output_types: [requirements, pyproject] packages: - - scikit-build-core[pyproject]>=0.7.0 + - scikit-build-core[pyproject]>=0.10.0 rapids_build_setuptools: common: - output_types: [requirements, pyproject] diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index b2ddb06d8c9..60ac171f3d7 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "rapids_build_backend.build" requires = [ "rapids-build-backend>=0.3.0,<0.4.0.dev0", - "scikit-build-core[pyproject]>=0.7.0", + "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project] @@ -133,7 +133,8 @@ requires = [ [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" -cmake.minimum-version = "3.26.4" +cmake.version = "CMakeLists.txt" +minimum-version = "build-system.requires" ninja.make-fallback = true sdist.exclude = ["*tests*"] sdist.reproducible = true diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a9b60133f42..63c5b07c5f3 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "rapids_build_backend.build" requires = [ "rapids-build-backend>=0.3.0,<0.4.0.dev0", - "scikit-build-core[pyproject]>=0.7.0", + "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] @@ -86,7 +86,8 @@ filterwarnings = [ [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" -cmake.minimum-version = "3.26.4" +cmake.version = "CMakeLists.txt" +minimum-version = "build-system.requires" ninja.make-fallback = true sdist.exclude = ["*tests*"] sdist.reproducible = true From c146eed6f36e7c82052a3288e1bf6ab8c2216637 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 7 Aug 2024 22:19:46 -0700 Subject: [PATCH 34/72] Expose `stream` param in transform APIs (#16452) Exposes the `stream` param in transform APIs Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16452 --- cpp/include/cudf/transform.hpp | 21 +++- cpp/src/interop/to_arrow.cu | 2 +- cpp/src/interop/to_arrow_device.cu | 4 +- cpp/src/interop/to_arrow_host.cu | 2 +- cpp/src/transform/bools_to_mask.cu | 4 +- cpp/src/transform/compute_column.cu | 3 +- cpp/src/transform/encode.cu | 4 +- cpp/src/transform/mask_to_bools.cu | 3 +- 
cpp/src/transform/nans_to_nulls.cu | 4 +- cpp/src/transform/one_hot_encode.cu | 3 +- cpp/src/transform/row_bit_count.cu | 11 +- cpp/src/transform/transform.cpp | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/transform_test.cpp | 164 +++++++++++++++++++++++++++ 14 files changed, 210 insertions(+), 19 deletions(-) create mode 100644 cpp/tests/streams/transform_test.cpp diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index adc5bdb2af8..f16214260f7 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -47,6 +47,7 @@ namespace CUDF_EXPORT cudf { * @param unary_udf The PTX/CUDA string of the unary function to apply * @param output_type The output type that is compatible with the output type in the UDF * @param is_ptx true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The column resulting from applying the unary function to * every element of the input @@ -56,6 +57,7 @@ std::unique_ptr transform( std::string const& unary_udf, data_type output_type, bool is_ptx, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -65,12 +67,14 @@ std::unique_ptr transform( * @throws cudf::logic_error if `input.type()` is a non-floating type * * @param input An immutable view of the input column of floating-point type + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned bitmask * @return A pair containing a `device_buffer` with the new bitmask and it's * null count obtained by replacing `NaN` in `input` with null. 
*/ std::pair, size_type> nans_to_nulls( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -83,12 +87,14 @@ std::pair, size_type> nans_to_nulls( * * @param table The table used for expression evaluation * @param expr The root of the expression tree + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource * @return Output column */ std::unique_ptr compute_column( table_view const& table, ast::expression const& expr, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -101,6 +107,7 @@ std::unique_ptr compute_column( * @throws cudf::logic_error if `input.type()` is a non-boolean type * * @param input Boolean elements to convert to a bitmask + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned bitmask * @return A pair containing a `device_buffer` with the new bitmask and it's * null count obtained from input considering `true` represent `valid`/`1` and @@ -108,6 +115,7 @@ std::unique_ptr compute_column( */ std::pair, cudf::size_type> bools_to_mask( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -130,12 +138,14 @@ std::pair, cudf::size_type> bools_to_mask( * @endcode * * @param input Table containing values to be encoded + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return A pair containing the distinct row of the input table in sorter order, * and a column of integer indices representing the encoded rows. 
*/ std::pair, std::unique_ptr> encode( cudf::table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -162,12 +172,14 @@ std::pair, std::unique_ptr> encode( * * @param input Column containing values to be encoded * @param categories Column containing categories + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return A pair containing the owner to all encoded data and a table view into the data */ std::pair, table_view> one_hot_encode( column_view const& input, column_view const& categories, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -188,6 +200,7 @@ std::pair, table_view> one_hot_encode( * @param bitmask A device pointer to the bitmask which needs to be converted * @param begin_bit position of the bit from which the conversion should start * @param end_bit position of the bit before which the conversion should stop + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory * @return A boolean column representing the given mask from [begin_bit, end_bit) */ @@ -195,6 +208,7 @@ std::unique_ptr mask_to_bools( bitmask_type const* bitmask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -219,11 +233,14 @@ std::unique_ptr mask_to_bools( * row_bit_count(column(x)) >= row_bit_count(gather(column(x))) * * @param t The table view to perform the computation on + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the 
returned columns' device memory * @return A 32-bit integer column containing the per-row bit counts */ std::unique_ptr row_bit_count( - table_view const& t, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + table_view const& t, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for @@ -240,12 +257,14 @@ std::unique_ptr row_bit_count( * * @param t The table view to perform the computation on * @param segment_length The number of rows in each segment for which the total size is computed + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory * @return A 32-bit integer column containing the bit counts for each segment of rows */ std::unique_ptr segmented_row_bit_count( table_view const& t, size_type segment_length, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 3d41f856f4f..a867d4adfa1 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -247,7 +247,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto bitmask = bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); + auto bitmask = detail::bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); auto data_buffer = allocate_arrow_buffer(static_cast(bitmask.first->size()), ar_mr); diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index cea7cdebcba..a5f3f9d87f5 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ 
b/cpp/src/interop/to_arrow_device.cu @@ -200,7 +200,7 @@ int dispatch_to_arrow_device::operator()(cudf::column&& column, nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); - auto bitmask = bools_to_mask(column.view(), stream, mr); + auto bitmask = detail::bools_to_mask(column.view(), stream, mr); auto contents = column.release(); NANOARROW_RETURN_NOT_OK(set_null_mask(contents, tmp.get())); NANOARROW_RETURN_NOT_OK( @@ -442,7 +442,7 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); - auto bitmask = bools_to_mask(column, stream, mr); + auto bitmask = detail::bools_to_mask(column, stream, mr); NANOARROW_RETURN_NOT_OK( set_buffer(std::move(bitmask.first), fixed_width_data_buffer_idx, tmp.get())); NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get())); diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index 193b3a3b5a2..26f7c7e6e53 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -147,7 +147,7 @@ int dispatch_to_arrow_host::operator()(ArrowArray* out) const NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); - auto bitmask = bools_to_mask(column, stream, mr); + auto bitmask = detail::bools_to_mask(column, stream, mr); NANOARROW_RETURN_NOT_OK(populate_data_buffer( device_span(reinterpret_cast(bitmask.first->data()), bitmask.first->size()), diff --git a/cpp/src/transform/bools_to_mask.cu b/cpp/src/transform/bools_to_mask.cu index c12f65deb46..452aebf4428 100644 --- a/cpp/src/transform/bools_to_mask.cu +++ b/cpp/src/transform/bools_to_mask.cu @@ -59,10 +59,10 @@ std::pair, cudf::size_type> bools_to_mask( } // namespace detail std::pair, cudf::size_type> bools_to_mask( - column_view const& input, 
rmm::device_async_resource_ref mr) + column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::bools_to_mask(input, cudf::get_default_stream(), mr); + return detail::bools_to_mask(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu index 7960731f3a1..c4fc8d58552 100644 --- a/cpp/src/transform/compute_column.cu +++ b/cpp/src/transform/compute_column.cu @@ -138,10 +138,11 @@ std::unique_ptr compute_column(table_view const& table, std::unique_ptr compute_column(table_view const& table, ast::expression const& expr, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_column(table, expr, cudf::get_default_stream(), mr); + return detail::compute_column(table, expr, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 7a044b9f6f7..1c9d52bce1b 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -72,10 +72,10 @@ std::pair, std::unique_ptr> encode(table_view con } // namespace detail std::pair, std::unique_ptr> encode( - cudf::table_view const& input, rmm::device_async_resource_ref mr) + cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::encode(input, cudf::get_default_stream(), mr); + return detail::encode(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/mask_to_bools.cu b/cpp/src/transform/mask_to_bools.cu index adf5db02d9c..be0b80a2633 100644 --- a/cpp/src/transform/mask_to_bools.cu +++ b/cpp/src/transform/mask_to_bools.cu @@ -62,9 +62,10 @@ std::unique_ptr mask_to_bools(bitmask_type const* bitmask, std::unique_ptr mask_to_bools(bitmask_type const* bitmask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { 
CUDF_FUNC_RANGE(); - return detail::mask_to_bools(bitmask, begin_bit, end_bit, cudf::get_default_stream(), mr); + return detail::mask_to_bools(bitmask, begin_bit, end_bit, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu index fd4f33c594c..a24ba304004 100644 --- a/cpp/src/transform/nans_to_nulls.cu +++ b/cpp/src/transform/nans_to_nulls.cu @@ -93,10 +93,10 @@ std::pair, cudf::size_type> nans_to_nulls( } // namespace detail std::pair, cudf::size_type> nans_to_nulls( - column_view const& input, rmm::device_async_resource_ref mr) + column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::nans_to_nulls(input, cudf::get_default_stream(), mr); + return detail::nans_to_nulls(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu index 808f2d1b284..46e6e55b0b7 100644 --- a/cpp/src/transform/one_hot_encode.cu +++ b/cpp/src/transform/one_hot_encode.cu @@ -115,9 +115,10 @@ std::pair, table_view> one_hot_encode(column_view const& std::pair, table_view> one_hot_encode(column_view const& input, column_view const& categories, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::one_hot_encode(input, categories, cudf::get_default_stream(), mr); + return detail::one_hot_encode(input, categories, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 12a15eb7e34..4530fabf889 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -561,23 +561,26 @@ std::unique_ptr row_bit_count(table_view const& t, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return segmented_row_bit_count(t, 1, stream, mr); + return detail::segmented_row_bit_count(t, 1, stream, mr); } } // namespace detail std::unique_ptr 
segmented_row_bit_count(table_view const& t, size_type segment_length, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::segmented_row_bit_count(t, segment_length, cudf::get_default_stream(), mr); + return detail::segmented_row_bit_count(t, segment_length, stream, mr); } -std::unique_ptr row_bit_count(table_view const& t, rmm::device_async_resource_ref mr) +std::unique_ptr row_bit_count(table_view const& t, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::row_bit_count(t, cudf::get_default_stream(), mr); + return detail::row_bit_count(t, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 98ec44758b9..f5e9048fa0a 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -97,10 +97,11 @@ std::unique_ptr transform(column_view const& input, std::string const& unary_udf, data_type output_type, bool is_ptx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::transform(input, unary_udf, output_type, is_ptx, cudf::get_default_stream(), mr); + return detail::transform(input, unary_udf, output_type, is_ptx, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 5e85b3e8adf..8c4b0f1e367 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -736,6 +736,7 @@ ConfigureTest( STREAM_MODE testing ) +ConfigureTest(STREAM_TRANSFORM_TEST streams/transform_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing) # ################################################################################################## diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp new file mode 100644 index 00000000000..9187672221c --- /dev/null +++ b/cpp/tests/streams/transform_test.cpp @@ 
-0,0 +1,164 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +class TransformTest : public cudf::test::BaseFixture {}; + +template +void test_udf(char const udf[], Data data_init, cudf::size_type size, bool is_ptx) +{ + auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init); + cudf::test::fixed_width_column_wrapper in( + data_iter, data_iter + size, all_valid); + cudf::transform( + in, udf, cudf::data_type(cudf::type_to_id()), is_ptx, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, Transform) +{ + char const* cuda = + R"***( +__device__ inline void fdsf ( + float* C, + float a +) +{ + *C = a*a*a*a; +} +)***"; + + char const* ptx = + R"***( +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-24817639 +// Cuda compilation tools, release 10.0, V10.0.130 +// Based on LLVM 3.4svn +// + +.version 6.3 +.target sm_70 +.address_size 64 + + // .globl _ZN8__main__7add$241Ef +.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Ef; +.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Efx; + +.visible .func (.param .b32 func_retval0) _ZN8__main__7add$241Ef( + 
.param .b64 _ZN8__main__7add$241Ef_param_0, + .param .b32 _ZN8__main__7add$241Ef_param_1 +) +{ + .reg .f32 %f<4>; + .reg .b32 %r<2>; + .reg .b64 %rd<2>; + + + ld.param.u64 %rd1, [_ZN8__main__7add$241Ef_param_0]; + ld.param.f32 %f1, [_ZN8__main__7add$241Ef_param_1]; + mul.f32 %f2, %f1, %f1; + mul.f32 %f3, %f2, %f2; + st.f32 [%rd1], %f3; + mov.u32 %r1, 0; + st.param.b32 [func_retval0+0], %r1; + ret; +} +)***"; + + auto data_init = [](cudf::size_type row) { return row % 3; }; + test_udf(cuda, data_init, 500, false); + test_udf(ptx, data_init, 500, true); +} + +TEST_F(TransformTest, ComputeColumn) +{ + auto c_0 = cudf::test::fixed_width_column_wrapper{3, 20, 1, 50}; + auto c_1 = cudf::test::fixed_width_column_wrapper{10, 7, 20, 0}; + auto table = cudf::table_view{{c_0, c_1}}; + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + cudf::compute_column(table, expression, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, BoolsToMask) +{ + std::vector input({1, 0, 1, 0, 1, 0, 1, 0}); + cudf::test::fixed_width_column_wrapper input_column(input.begin(), input.end()); + cudf::bools_to_mask(input_column, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, MaskToBools) +{ + cudf::mask_to_bools(nullptr, 0, 0, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, Encode) +{ + cudf::test::fixed_width_column_wrapper input{{1, 2, 3, 2, 3, 2, 1}}; + cudf::encode(cudf::table_view({input}), cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, OneHotEncode) +{ + auto input = cudf::test::fixed_width_column_wrapper{8, 8, 8, 9, 9}; + auto category = cudf::test::fixed_width_column_wrapper{8, 9}; + cudf::one_hot_encode(input, category, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, NaNsToNulls) +{ + std::vector input = {1, 2, 3, 4, 5}; + std::vector mask = {true, true, true, true, false, 
false}; + auto input_column = + cudf::test::fixed_width_column_wrapper(input.begin(), input.end(), mask.begin()); + cudf::nans_to_nulls(input_column, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, RowBitCount) +{ + std::vector strings{"abc", "ï", "", "z", "bananas", "warp", "", "zing"}; + cudf::test::strings_column_wrapper col(strings.begin(), strings.end()); + cudf::row_bit_count(cudf::table_view({col}), cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, SegmentedRowBitCount) +{ + // clang-format off + std::vector const strings { "daïs", "def", "", "z", "bananas", "warp", "", "zing" }; + std::vector const valids { 1, 0, 0, 1, 0, 1, 1, 1 }; + // clang-format on + cudf::test::strings_column_wrapper const col(strings.begin(), strings.end(), valids.begin()); + auto const input = cudf::table_view({col}); + auto constexpr segment_length = 2; + cudf::segmented_row_bit_count(input, segment_length, cudf::test::get_default_stream()); +} From a94512a568bd0351fd20b0c2cbcd6067fd4d504b Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 7 Aug 2024 22:20:57 -0700 Subject: [PATCH 35/72] Add interop example for `arrow::StringViewArray` to `cudf::column` (#16498) Demonstrates the conversion from an `arrow:StringViewArray` to a `cudf::column` Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16498 --- cpp/examples/build.sh | 1 + cpp/examples/interop/CMakeLists.txt | 20 ++++ cpp/examples/interop/interop.cpp | 176 ++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+) create mode 100644 cpp/examples/interop/CMakeLists.txt create mode 100644 cpp/examples/interop/interop.cpp diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index dce81fb1677..2d6f6f316c7 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -61,3 +61,4 @@ build_example tpch build_example strings build_example nested_types build_example 
parquet_io +build_example interop diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt new file mode 100644 index 00000000000..a1f99c1d2fd --- /dev/null +++ b/cpp/examples/interop/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +cmake_minimum_required(VERSION 3.26.4) + +include(../set_cuda_architecture.cmake) + +rapids_cuda_init_architectures(interop_example) +rapids_cuda_set_architectures(RAPIDS) + +project( + interop_example + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +include(../fetch_dependencies.cmake) + +add_executable(interop interop.cpp) +target_link_libraries(interop PRIVATE cudf::cudf) +target_compile_features(interop PRIVATE cxx_std_17) diff --git a/cpp/examples/interop/interop.cpp b/cpp/examples/interop/interop.cpp new file mode 100644 index 00000000000..8271c3836e4 --- /dev/null +++ b/cpp/examples/interop/interop.cpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include +#include + +// Helper functuons to create StringViews +inline arrow::StringViewType::c_type to_inline_string_view(const void* data, int32_t const& size) +{ + arrow::StringViewType::c_type out; + out.inlined = {size, {}}; + memcpy(&out.inlined.data, data, size); + return out; +} +inline arrow::StringViewType::c_type to_inline_string_view(std::string_view const& v) +{ + return to_inline_string_view(v.data(), static_cast(v.size())); +} +inline arrow::StringViewType::c_type to_string_view(const void* data, + int32_t const& size, + int32_t const& buffer_index, + int32_t const& offset) +{ + if (size <= arrow::StringViewType::kInlineSize) { return to_inline_string_view(data, size); } + arrow::StringViewType::c_type out; + out.ref = {size, {}, buffer_index, offset}; + memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix)); + return out; +} +inline arrow::StringViewType::c_type to_string_view(std::string_view const& v, + int32_t const& buffer_index, + int32_t const& offset) +{ + return to_string_view(v.data(), static_cast(v.size()), buffer_index, offset); +} + +/** + * @brief Create a StringViewArray + * + * @param data_buffers The data buffers + * @param views The string views + * @param validate Whether to validate the array + */ +arrow::Result> make_string_view_array( + arrow::BufferVector const& data_buffers, + std::vector const& views, + bool validate = true) +{ + auto const length = static_cast(views.size()); + auto const arr = std::make_shared( + arrow::utf8_view(), length, arrow::Buffer::FromVector(views), std::move(data_buffers)); + if (validate) { RETURN_NOT_OK(arr->ValidateFull()); } + return arr; +} + +/** + * @brief Convert a vector of strings into a vector of the + * constituent chars and a vector of offsets. 
+ * + * @param strings The vector of strings + */ +auto make_chars_and_offsets(std::vector const& strings) +{ + std::vector chars{}; + std::vector offsets(1, 0); + for (auto& str : strings) { + chars.insert(chars.end(), std::cbegin(str), std::cend(str)); + auto const last_offset = static_cast(offsets.back()); + auto const next_offset = last_offset + str.length(); + CUDF_EXPECTS( + next_offset < static_cast(std::numeric_limits::max()), + "Cannot use arrow_string_view_to_cudf_column to build a large strings column"); + offsets.push_back(static_cast(next_offset)); + } + return std::make_tuple(std::move(chars), std::move(offsets)); +}; + +/** + * @brief Convert an Arrow StringViewArray to a cudf::column + * + * @param array The Arrow StringViewArray + * @param stream The CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr arrow_string_view_to_cudf_column( + std::shared_ptr const& array, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + // Convert the string views into chars and offsets + std::vector strings; + for (auto i = 0; i < array->length(); i++) { + strings.push_back(array->GetString(i)); + } + auto const [chars, offsets] = make_chars_and_offsets(strings); + + // Copy the chars vector to the device + rmm::device_uvector d_chars(chars.size(), stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_chars.data(), chars.data(), chars.size() * sizeof(char), cudaMemcpyDefault, stream.value())); + + // Copy the offsets vector to the device + // and wrap it in a cudf::column + rmm::device_uvector d_offsets(offsets.size(), stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(d_offsets.data(), + offsets.data(), + offsets.size() * sizeof(cudf::size_type), + cudaMemcpyDefault, + stream.value())); + auto offsets_col = + std::make_unique(std::move(d_offsets), 
rmm::device_buffer{0, stream, mr}, 0); + + // Create a string column out of the chars and offsets + return cudf::make_strings_column(array->length(), + std::move(offsets_col), + d_chars.release(), + 0, + rmm::device_buffer{0, stream, mr}); +} + +int main(int argc, char** argv) +{ + std::vector> data_buffers; + std::vector views; + + // Define the data buffers and string views + auto const buffer_a = + arrow::Buffer::FromString("hello rapids teamapache arrow interopnvidiacudf"); + data_buffers.push_back(buffer_a); + views.push_back(to_string_view("hello rapid steam", 0, 0)); + views.push_back(to_string_view("apache arrow interop", 0, 17)); + views.push_back(to_inline_string_view("nvidia")); + views.push_back(to_inline_string_view("cudf")); + + // Create a StringViewArray + auto const string_view_col = make_string_view_array(data_buffers, views, true).ValueOrDie(); + std::cout << string_view_col->ToString() << std::endl; + + // Convert the StringViewArray to a cudf::column + auto const cudf_col = arrow_string_view_to_cudf_column(string_view_col); + + // Write the cudf::column as CSV + auto const tbl_view = cudf::table_view({cudf_col->view()}); + std::vector const names = {"col_a"}; + + std::vector h_buffer; + cudf::io::csv_writer_options writer_options = + cudf::io::csv_writer_options::builder(cudf::io::sink_info(&h_buffer), tbl_view) + .include_header(not names.empty()) + .names(names); + + cudf::io::write_csv(writer_options); + auto const result = std::string(h_buffer.data(), h_buffer.size()); + std::cout << result << std::endl; + + return 0; +} From cc75b05b426920e6522c49527f8b684f780f38e3 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 8 Aug 2024 10:00:22 -0400 Subject: [PATCH 36/72] Change IPv4 convert APIs to support UINT32 instead of INT64 (#16489) Changes the integer type for `cudf::strings::ipv4_to_integers` and `cudf::strings::integers_to_ipv4` to use UINT32 types instead of INT64. 
The INT64 type was originally chosen because libcudf did not support unsigned types at the time. This is a breaking change since the basic input/output type is changed. Closes #16324 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - https://github.com/brandon-b-miller - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16489 --- cpp/include/cudf/strings/convert/convert_ipv4.hpp | 11 +++-------- cpp/src/strings/convert/convert_ipv4.cu | 14 +++++++------- cpp/tests/strings/ipv4_tests.cpp | 8 ++++---- python/cudf/cudf/core/column/numerical.py | 4 ++-- python/cudf/cudf/tests/test_string.py | 6 ++++-- 5 files changed, 20 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp index 04a04907c12..97d1dfee017 100644 --- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp +++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp @@ -44,15 +44,12 @@ namespace strings { * No checking is done on the format. If a string is not in IPv4 format, the resulting * integer is undefined. * - * The resulting 32-bit integer is placed in an int64_t to avoid setting the sign-bit - * in an int32_t type. This could be changed if cudf supported a UINT32 type in the future. - * * Any null entries will result in corresponding null entries in the output column. * * @param input Strings instance for this operation * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New INT64 column converted from strings + * @return New UINT32 column converted from strings */ std::unique_ptr ipv4_to_integers( strings_column_view const& input, @@ -68,13 +65,11 @@ std::unique_ptr ipv4_to_integers( * Each input integer is dissected into four integers by dividing the input into 8-bit sections. 
* These sub-integers are then converted into [0-9] characters and placed between '.' characters. * - * No checking is done on the input integer value. Only the lower 32-bits are used. - * * Any null entries will result in corresponding null entries in the output column. * - * @throw cudf::logic_error if the input column is not INT64 type. + * @throw cudf::logic_error if the input column is not UINT32 type. * - * @param integers Integer (INT64) column to convert + * @param integers Integer (UINT32) column to convert * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 68a24e000ae..13d6e9bc3ba 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -46,7 +46,7 @@ namespace { struct ipv4_to_integers_fn { column_device_view const d_strings; - __device__ int64_t operator()(size_type idx) + __device__ uint32_t operator()(size_type idx) { if (d_strings.is_null(idx)) return 0; string_view d_str = d_strings.element(idx); @@ -66,7 +66,7 @@ struct ipv4_to_integers_fn { } } uint32_t result = (ipvals[0] << 24) + (ipvals[1] << 16) + (ipvals[2] << 8) + ipvals[3]; - return static_cast(result); + return result; } }; @@ -79,18 +79,18 @@ std::unique_ptr ipv4_to_integers(strings_column_view const& input, { size_type strings_count = input.size(); if (strings_count == 0) { - return make_numeric_column(data_type{type_id::INT64}, 0, mask_state::UNALLOCATED, stream); + return make_numeric_column(data_type{type_id::UINT32}, 0, mask_state::UNALLOCATED, stream); } auto strings_column = column_device_view::create(input.parent(), stream); // create output column copying the strings' null-mask - auto results = make_numeric_column(data_type{type_id::INT64}, + auto results = 
make_numeric_column(data_type{type_id::UINT32}, strings_count, cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr); - auto d_results = results->mutable_view().data(); + auto d_results = results->mutable_view().data(); // fill output column with ipv4 integers thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -135,7 +135,7 @@ struct integers_to_ipv4_fn { return; } - auto const ip_number = d_column.element(idx); + auto const ip_number = d_column.element(idx); char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; int shift_bits = 24; @@ -165,7 +165,7 @@ std::unique_ptr integers_to_ipv4(column_view const& integers, { if (integers.is_empty()) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type"); + CUDF_EXPECTS(integers.type().id() == type_id::UINT32, "Input column must be UINT32 type"); auto d_column = column_device_view::create(integers, stream); auto [offsets_column, chars] = diff --git a/cpp/tests/strings/ipv4_tests.cpp b/cpp/tests/strings/ipv4_tests.cpp index 3bfe0f9727e..ea3ac439e62 100644 --- a/cpp/tests/strings/ipv4_tests.cpp +++ b/cpp/tests/strings/ipv4_tests.cpp @@ -40,8 +40,8 @@ TEST_F(StringsConvertTest, IPv4ToIntegers) auto strings_view = cudf::strings_column_view(strings); auto results = cudf::strings::ipv4_to_integers(strings_view); - std::vector h_expected{0, 0, 0, 698875905, 2130706433, 700776449, 3232235521}; - cudf::test::fixed_width_column_wrapper expected( + std::vector h_expected{0, 0, 0, 698875905, 2130706433, 700776449, 3232235521}; + cudf::test::fixed_width_column_wrapper expected( h_expected.cbegin(), h_expected.cend(), thrust::make_transform_iterator(h_strings.begin(), @@ -59,8 +59,8 @@ TEST_F(StringsConvertTest, IntegersToIPv4) thrust::make_transform_iterator(h_strings.begin(), [](auto const str) { return str != nullptr; })); - std::vector h_column{3232235521, 167772161, 0, 0, 
700055553, 700776449}; - cudf::test::fixed_width_column_wrapper column( + std::vector h_column{3232235521, 167772161, 0, 0, 700055553, 700776449}; + cudf::test::fixed_width_column_wrapper column( h_column.cbegin(), h_column.cend(), thrust::make_transform_iterator(h_strings.begin(), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index df27134d458..b83d7600c82 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -313,8 +313,8 @@ def normalize_binop_value( return NotImplemented def int2ip(self) -> "cudf.core.column.StringColumn": - if self.dtype != cudf.dtype("int64"): - raise TypeError("Only int64 type can be converted to ip") + if self.dtype != cudf.dtype("uint32"): + raise TypeError("Only uint32 type can be converted to ip") return libcudf.string_casting.int2ip(self) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 4bd084a3938..a2a3e874c91 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -2672,7 +2672,9 @@ def test_string_ip4_to_int(): def test_string_int_to_ipv4(): - gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]) + gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]).astype( + "uint32" + ) expected = cudf.Series( ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"] ) @@ -2718,7 +2720,7 @@ def test_string_isipv4(): @pytest.mark.parametrize( - "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"int64", "uint64"})) + "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"uint32"})) ) def test_string_int_to_ipv4_dtype_fail(dtype): gsr = cudf.Series([1, 2, 3, 4, 5]).astype(dtype) From da51cad6c25f54ab344b0aa25e3dc1e4adb4550a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 8 Aug 2024 10:25:11 -0500 Subject: [PATCH 37/72] Improve update-version.sh (#16506) A few small tweaks to `update-version.sh` for alignment across 
RAPIDS. The `UCX_PY` curl call is unused. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16506 --- ci/release/update-version.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index ad96aff3930..132e58249e6 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -18,18 +18,16 @@ CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}') CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}') CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} -#Get . for next version +# Get . for next version NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} -NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" # Need to distutils-normalize the versions for some use cases CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))") NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))") -echo "current is ${CURRENT_SHORT_TAG_PEP440}, next is ${NEXT_SHORT_TAG_PEP440}" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -61,7 +59,7 @@ for DEP in "${DEPENDENCIES[@]}"; do sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done for FILE in python/*/pyproject.toml; do - sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE} + 
sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}" done done @@ -77,7 +75,7 @@ sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_ # CI files for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" - sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/branch-[0-9]+\.[0-9]+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh From 792dd0686f4970c70f9bdba62c54a3de0a495fd5 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 8 Aug 2024 12:56:36 -0400 Subject: [PATCH 38/72] Update pre-commit hooks (#16510) This PR updates pre-commit hooks to the latest versions that are supported without causing style check errors. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16510 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bbcd78d051f..1b17eae0842 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -144,7 +144,7 @@ repos: - id: ruff-format files: python/.*$ - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.2.0 + rev: v0.3.1 hooks: - id: verify-copyright exclude: | From 1bbe440ee7ddbc021f945e4156220f9bc270a443 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 8 Aug 2024 12:25:29 -0500 Subject: [PATCH 39/72] Add keep option to distinct nvbench (#16497) This PR adopts some work from @srinivasyadav18 with additional modifications. This is meant to complement #16484. 
Authors: - Bradley Dice (https://github.com/bdice) - Srinivas Yadav (https://github.com/srinivasyadav18) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Srinivas Yadav (https://github.com/srinivasyadav18) URL: https://github.com/rapidsai/cudf/pull/16497 --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/stream_compaction/distinct.cpp | 45 ++++++++++++------- .../stream_compaction/stable_distinct.cpp | 45 ++++++++++++------- .../stream_compaction_common.cpp | 35 +++++++++++++++ .../stream_compaction_common.hpp | 19 ++++++++ 5 files changed, 113 insertions(+), 32 deletions(-) create mode 100644 cpp/benchmarks/stream_compaction/stream_compaction_common.cpp create mode 100644 cpp/benchmarks/stream_compaction/stream_compaction_common.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7be456ddfba..483b7b0a539 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -162,6 +162,7 @@ ConfigureNVBench( stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp stream_compaction/stable_distinct.cpp + stream_compaction/stream_compaction_common.cpp stream_compaction/unique.cpp stream_compaction/unique_count.cpp ) diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index c04b6516903..d7deebca89a 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -23,15 +24,29 @@ #include +#include + NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); template void nvbench_distinct(nvbench::state& state, nvbench::type_list) { - cudf::size_type const num_rows = state.get_int64("NumRows"); + cudf::size_type const num_rows = state.get_int64("NumRows"); + auto const keep = get_keep(state.get_string("keep")); + cudf::size_type const cardinality = state.get_int64("cardinality"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } - data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + data_profile profile = data_profile_builder() + .cardinality(cardinality) + .null_probability(0.01) + .distribution(cudf::type_to_id(), + distribution_id::UNIFORM, + static_cast(0), + std::numeric_limits::max()); auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); @@ -40,20 +55,19 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::distinct(input_table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL); + auto result = cudf::distinct( + input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); }); } -using data_type = nvbench::type_list; +using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) - .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + .add_string_axis("keep", {"any", "first", "last", "none"}) + .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 
1'000'000'000}) + .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); template void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) @@ -61,6 +75,7 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) auto const size = state.get_int64("ColumnSize"); auto const dtype = cudf::type_to_id(); double const null_probability = state.get_float64("null_probability"); + auto const keep = get_keep(state.get_string("keep")); auto builder = data_profile_builder().null_probability(null_probability); if (dtype == cudf::type_id::LIST) { @@ -80,11 +95,8 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::distinct(*table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL); + auto result = + cudf::distinct(*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); }); } @@ -92,5 +104,6 @@ NVBENCH_BENCH_TYPES(nvbench_distinct_list, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("distinct_list") .set_type_axes_names({"Type"}) + .add_string_axis("keep", {"any", "first", "last", "none"}) .add_float64_axis("null_probability", {0.0, 0.1}) .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/benchmarks/stream_compaction/stable_distinct.cpp b/cpp/benchmarks/stream_compaction/stable_distinct.cpp index bcee3048013..0a8836c0583 100644 --- a/cpp/benchmarks/stream_compaction/stable_distinct.cpp +++ b/cpp/benchmarks/stream_compaction/stable_distinct.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -23,15 +24,29 @@ #include +#include + NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); template void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list) { - cudf::size_type const num_rows = state.get_int64("NumRows"); + cudf::size_type const num_rows = state.get_int64("NumRows"); + auto const keep = get_keep(state.get_string("keep")); + cudf::size_type const cardinality = state.get_int64("cardinality"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } - data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + data_profile profile = data_profile_builder() + .cardinality(cardinality) + .null_probability(0.01) + .distribution(cudf::type_to_id(), + distribution_id::UNIFORM, + static_cast(0), + std::numeric_limits::max()); auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); @@ -40,20 +55,19 @@ void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::stable_distinct(input_table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL); + auto result = cudf::stable_distinct( + input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); }); } -using data_type = nvbench::type_list; +using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("stable_distinct") .set_type_axes_names({"Type"}) - .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + .add_string_axis("keep", {"any", "first", "last", "none"}) + 
.add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) + .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); template void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list) @@ -61,6 +75,7 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list(); double const null_probability = state.get_float64("null_probability"); + auto const keep = get_keep(state.get_string("keep")); auto builder = data_profile_builder().null_probability(null_probability); if (dtype == cudf::type_id::LIST) { @@ -80,11 +95,8 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list)) .set_name("stable_distinct_list") .set_type_axes_names({"Type"}) + .add_string_axis("keep", {"any", "first", "last", "none"}) .add_float64_axis("null_probability", {0.0, 0.1}) .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp b/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp new file mode 100644 index 00000000000..8cbb2956777 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +cudf::duplicate_keep_option get_keep(std::string const& keep_str) +{ + if (keep_str == "any") { + return cudf::duplicate_keep_option::KEEP_ANY; + } else if (keep_str == "first") { + return cudf::duplicate_keep_option::KEEP_FIRST; + } else if (keep_str == "last") { + return cudf::duplicate_keep_option::KEEP_LAST; + } else if (keep_str == "none") { + return cudf::duplicate_keep_option::KEEP_NONE; + } else { + CUDF_FAIL("Unsupported keep option."); + } +} diff --git a/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp b/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp new file mode 100644 index 00000000000..d1ef2b10f41 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +cudf::duplicate_keep_option get_keep(std::string const& keep_str); From 2c8de625b69bf5f7f3315c45a34bdf9ba45315a9 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 9 Aug 2024 08:25:58 -0500 Subject: [PATCH 40/72] enable list to be forced as string in JSON reader. (#16472) closes #15278 This PR allows list type also forced as string when mixed type as string is enabled and a user given schema specifies a column as string, in JSON reader. 
Authors: - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/16472 --- cpp/src/io/json/json_column.cu | 22 ++++--- cpp/tests/io/json/json_test.cpp | 113 ++++++++++++++++++++++---------- 2 files changed, 90 insertions(+), 45 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 17fa7abdffe..e5e21e054a6 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -567,22 +567,22 @@ void make_device_json_column(device_span input, thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); }; - auto initialize_json_columns = [&](auto i, auto& col) { - if (column_categories[i] == NC_ERR || column_categories[i] == NC_FN) { + auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { + if (column_category == NC_ERR || column_category == NC_FN) { return; - } else if (column_categories[i] == NC_VAL || column_categories[i] == NC_STR) { + } else if (column_category == NC_VAL || column_category == NC_STR) { col.string_offsets.resize(max_row_offsets[i] + 1, stream); col.string_lengths.resize(max_row_offsets[i] + 1, stream); init_to_zero(col.string_offsets); init_to_zero(col.string_lengths); - } else if (column_categories[i] == NC_LIST) { + } else if (column_category == NC_LIST) { col.child_offsets.resize(max_row_offsets[i] + 2, stream); init_to_zero(col.child_offsets); } col.num_rows = max_row_offsets[i] + 1; col.validity = cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_categories[i]); + col.type = to_json_col_type(column_category); }; auto reinitialize_as_string = [&](auto i, auto& col) { @@ -764,21 +764,23 @@ void make_device_json_column(device_span input, } } + auto this_column_category = 
column_categories[this_col_id]; if (is_enabled_mixed_types_as_string) { - // get path of this column, check if it is a struct forced as string, and enforce it + // get path of this column, check if it is a struct/list forced as string, and enforce it auto const nt = tree_path.get_path(this_col_id); std::optional const user_dtype = get_path_data_type(nt, options); - if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and - user_dtype.value().id() == type_id::STRING) { + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { is_mixed_type_column[this_col_id] = 1; - column_categories[this_col_id] = NC_STR; + this_column_category = NC_STR; } } CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); // move into parent device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col); + initialize_json_columns(this_col_id, col, this_column_category); auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); if (not replaced) parent_col.column_order.push_back(name); diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 993ab82f423..0a485e26b71 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2351,7 +2351,7 @@ TEST_F(JsonReaderTest, MapTypes) // Testing function for mixed types in JSON (for spark json reader) auto test_fn = [](std::string_view json_string, bool lines, std::vector types) { std::map dtype_schema{ - {"foo1", {data_type{type_id::STRING}}}, // list won't be a string + {"foo1", {data_type{type_id::STRING}}}, // list forced as a string {"foo2", {data_type{type_id::STRING}}}, // struct forced as a string {"1", {data_type{type_id::STRING}}}, {"2", {data_type{type_id::STRING}}}, @@ -2378,17 
+2378,17 @@ TEST_F(JsonReaderTest, MapTypes) test_fn(R"([{ "foo1": [1,2,3], "bar": 123 }, { "foo2": { "a": 1 }, "bar": 456 }])", false, - {type_id::LIST, type_id::INT32, type_id::STRING}); + {type_id::STRING, type_id::INT32, type_id::STRING}); // jsonl test_fn(R"( { "foo1": [1,2,3], "bar": 123 } { "foo2": { "a": 1 }, "bar": 456 })", true, - {type_id::LIST, type_id::INT32, type_id::STRING}); + {type_id::STRING, type_id::INT32, type_id::STRING}); // jsonl-array test_fn(R"([123, [1,2,3]] [456, null, { "a": 1 }])", true, - {type_id::INT64, type_id::LIST, type_id::STRING}); + {type_id::INT64, type_id::STRING, type_id::STRING}); // json-array test_fn(R"([[[1,2,3], null, 123], [null, { "a": 1 }, 456 ]])", @@ -2678,38 +2678,81 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter) TEST_F(JsonReaderTest, JSONMixedTypeChildren) { - std::string const json_str = R"( -{ "Root": { "Key": [ { "EE": "A" } ] } } -{ "Root": { "Key": { } } } -{ "Root": { "Key": [{ "YY": 1}] } } -)"; - // Column "EE" is created and destroyed - // Column "YY" should not be created - - cudf::io::json_reader_options options = - cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()}) - .lines(true) - .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) - .normalize_single_quotes(true) - .normalize_whitespace(false) - .mixed_types_as_string(true) - .keep_quotes(true); - - auto result = cudf::io::read_json(options); + // struct mixed. 
+ { + std::string const json_str = R"( + { "Root": { "Key": [ { "EE": "A" } ] } } + { "Root": { "Key": { } } } + { "Root": { "Key": [{ "YY": 1}] } } + )"; + // Column "EE" is created and destroyed + // Column "YY" should not be created + + cudf::io::json_reader_options options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_str.c_str(), json_str.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(false) + .mixed_types_as_string(true) + .keep_quotes(true); + + auto result = cudf::io::read_json(options); + + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + cudf::test::strings_column_wrapper expected( + {R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); + } - ASSERT_EQ(result.tbl->num_columns(), 1); - ASSERT_EQ(result.metadata.schema_info.size(), 1); - EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); - ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); - EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); - EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); - // types - EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); - 
EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); - cudf::test::strings_column_wrapper expected({R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); + // list mixed. + { + std::string const json_str = R"( + { "Root": { "Key": [ { "EE": "A" } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [{ "YY": 1}] } } + )"; + // Column "EE" is created and destroyed + // Column "YY" should not be created + + cudf::io::json_reader_options options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_str.c_str(), json_str.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(false) + .mixed_types_as_string(true) + .keep_quotes(true); + + auto result = cudf::io::read_json(options); + + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + cudf::test::strings_column_wrapper expected( + {R"([ { "EE": "A" } ])", "\"abc\"", R"([{ "YY": 1}])"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); + } } CUDF_TEST_PROGRAM_MAIN() From 9ec34ad81152a4d7889bdf1f5b92032000b09b8f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:24:31 -0400 Subject: [PATCH 41/72] Remove a deprecated multibyte_split API (#16501) 
Removes overloaded `cudf::io::text::multibyte_split` API deprecated in 24.08 and is no longer needed. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16501 --- cpp/include/cudf/io/text/multibyte_split.hpp | 20 -------------------- cpp/src/io/text/multibyte_split.cu | 14 -------------- 2 files changed, 34 deletions(-) diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 8624a386d0f..3a1f9611324 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -96,26 +96,6 @@ std::unique_ptr multibyte_split( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -/** - * @brief Splits the source text into a strings column using a multiple byte delimiter. - * - * @deprecated Since 24.08 - * - * @param source The source input data encoded in UTF-8 - * @param delimiter UTF-8 encoded string for which to find offsets in the source - * @param byte_range The position and size within `source` to produce the column from - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Memory resource to use for the device memory allocation - * @return The strings found by splitting the source by the delimiter within the relevant byte - * range. 
- */ -[[deprecated]] std::unique_ptr multibyte_split( - data_chunk_source const& source, - std::string const& delimiter, - std::optional byte_range, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of group } // namespace text diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index be2e2b9a79c..97729a091fb 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -567,20 +567,6 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source } // namespace detail -// deprecated in 24.08 -std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, - std::string const& delimiter, - std::optional byte_range, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - return multibyte_split(source, - delimiter, - parse_options{byte_range.value_or(create_byte_range_info_max())}, - stream, - mr); -} - std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, parse_options options, From 8009dc800bf79ba5fbacc9658235a212590640ba Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Fri, 9 Aug 2024 09:07:47 -0700 Subject: [PATCH 42/72] Update docs of the TPC-H derived examples (#16423) Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16423 --- .gitignore | 2 + cpp/examples/tpch/README.md | 37 ++++++------ .../tpch/datagen/correct_datatypes.py | 60 +++++++++++++++++++ cpp/examples/tpch/datagen/datagen.sh | 31 ++++++++++ cpp/examples/tpch/datagen/tpch.patch | 33 ++++++++++ 5 files changed, 145 insertions(+), 18 deletions(-) create mode 100644 cpp/examples/tpch/datagen/correct_datatypes.py create mode 100755 cpp/examples/tpch/datagen/datagen.sh create mode 100644 
cpp/examples/tpch/datagen/tpch.patch diff --git a/.gitignore b/.gitignore index c89fb49697a..153c7f59744 100644 --- a/.gitignore +++ b/.gitignore @@ -79,6 +79,8 @@ Debug build/ cpp/build/ cpp/examples/*/install/ +cpp/examples/*/build/ +cpp/examples/tpch/datagen/datafusion cpp/include/cudf/ipc_generated/*.h cpp/thirdparty/googletest/ diff --git a/cpp/examples/tpch/README.md b/cpp/examples/tpch/README.md index 1ea71ae9824..8c046c3f1e8 100644 --- a/cpp/examples/tpch/README.md +++ b/cpp/examples/tpch/README.md @@ -1,38 +1,39 @@ -# TPC-H Inspired Examples +# TPC-H Derived Examples Implements TPC-H queries using `libcudf`. We leverage the data generator (wrapper around official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) for generating data in Parquet format. ## Requirements - Rust +- [libcudf](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment) -## Generating the Dataset +## Running Queries -1. Clone the datafusion repository. +1. Build the `libcudf` examples. ```bash -git clone git@github.com:apache/datafusion.git +cd cudf/cpp/examples +./build.sh ``` +The TPC-H query binaries would be built inside `tpch/build`. -2. Run the data generator. The data will be placed in a `data/` subdirectory. +2. Generate the dataset. ```bash -cd datafusion/benchmarks/ -./bench.sh data tpch - -# for scale factor 10, -./bench.sh data tpch10 +cd tpch/datagen +./datagen.sh [scale factor (1/10)] ``` -## Running Queries +The parquet files will be generated in `tpch/datagen/datafusion/benchmarks/data/tpch_sf[scale factor]`. -1. Build the examples. +3. Set these environment variables for optimized runtimes. ```bash -cd cpp/examples -./build.sh +export KVIKIO_COMPAT_MODE="on" +export LIBCUDF_CUFILE_POLICY="KVIKIO" +export CUDA_MODULE_LOADING="EAGER" ``` -The TPC-H query binaries would be built inside `examples/tpch/build`. -2. Execute the queries. +4. Execute the queries. 
```bash -./tpch/build/tpch_q1 +./tpch/build/tpch_q[query no] [path to dataset] [memory resource type (cuda/pool/managed/managed_pool)] ``` -A parquet file named `q1.parquet` would be generated holding the results of the query. + +A parquet file named `q[query no].parquet` would be generated containing the results of the query. diff --git a/cpp/examples/tpch/datagen/correct_datatypes.py b/cpp/examples/tpch/datagen/correct_datatypes.py new file mode 100644 index 00000000000..8564774647b --- /dev/null +++ b/cpp/examples/tpch/datagen/correct_datatypes.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import os +import sys + +import pyarrow as pa +import pyarrow.parquet as pq +import pandas as pd + +if __name__ == "__main__": + dataset_path = str(sys.argv[1]) + tables = ["lineitem", "part", "partsupp", "orders", "supplier", "customer", "nation", "region"] + for table in tables: + filepath = os.path.join(dataset_path, f"{table}.parquet") + print("Reading file ", filepath) + + if filepath.endswith("lineitem.parquet"): + df = pd.read_parquet(filepath) + df["l_linenumber"] = df["l_linenumber"].astype("int64") + df["l_quantity"] = df["l_quantity"].astype("int64") + df["l_extendedprice"] = df["l_extendedprice"].astype("float64") + df["l_discount"] = df["l_discount"].astype("float64") + df["l_tax"] = df["l_tax"].astype("float64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("part.parquet"): + df = pd.read_parquet(filepath) + df["p_size"] = df["p_size"].astype("int64") + df["p_retailprice"] = df["p_retailprice"].astype("float64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("partsupp.parquet"): + df = pd.read_parquet(filepath) + df["ps_availqty"] = df["ps_availqty"].astype("int64") + df["ps_supplycost"] = df["ps_supplycost"].astype("float64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif 
filepath.endswith("orders.parquet"): + df = pd.read_parquet(filepath) + df["o_totalprice"] = df["o_totalprice"].astype("float64") + df["o_shippriority"] = df["o_shippriority"].astype("int64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("supplier.parquet"): + df = pd.read_parquet(filepath) + df["s_acctbal"] = df["s_acctbal"].astype("float64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("customer.parquet"): + df = pd.read_parquet(filepath) + df["c_acctbal"] = df["c_acctbal"].astype("float64") + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("nation.parquet"): + df = pd.read_parquet(filepath) + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") + + elif filepath.endswith("region.parquet"): + df = pd.read_parquet(filepath) + pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy") diff --git a/cpp/examples/tpch/datagen/datagen.sh b/cpp/examples/tpch/datagen/datagen.sh new file mode 100755 index 00000000000..0b03753daea --- /dev/null +++ b/cpp/examples/tpch/datagen/datagen.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +set -e + +scale_factor=$1 +script_dir=$(pwd) + +# Clone the datafusion repository and apply a patch +# for single threaded data generation so that a +# single parquet file is generated for each table +rm -rf datafusion +git clone https://github.com/apache/datafusion.git datafusion +cd datafusion/ +git checkout 679a85f +git apply ${script_dir}/tpch.patch +cd benchmarks/ + +# Generate the data +# Currently, we support only scale factor 1 and 10 +if [ ${scale_factor} -eq 1 ]; then + ./bench.sh data tpch +elif [ ${scale_factor} -eq 10 ]; then + ./bench.sh data tpch10 +else + echo "Unsupported scale factor" + exit 1 +fi + +# Correct the datatypes of the parquet files +python3 ${script_dir}/correct_datatypes.py data/tpch_sf${scale_factor} diff --git a/cpp/examples/tpch/datagen/tpch.patch b/cpp/examples/tpch/datagen/tpch.patch new file mode 100644 index 00000000000..42727aa9904 --- /dev/null +++ b/cpp/examples/tpch/datagen/tpch.patch @@ -0,0 +1,33 @@ +diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh +index 3b854f6dc..f000f09c0 100755 +--- a/benchmarks/bench.sh ++++ b/benchmarks/bench.sh +@@ -311,6 +311,15 @@ data_tpch() { + $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet + popd > /dev/null + fi ++ ++ cp ${TPCH_DIR}/lineitem/part-0.parquet ${TPCH_DIR}/lineitem.parquet ++ cp ${TPCH_DIR}/orders/part-0.parquet ${TPCH_DIR}/orders.parquet ++ cp ${TPCH_DIR}/part/part-0.parquet ${TPCH_DIR}/part.parquet ++ cp ${TPCH_DIR}/partsupp/part-0.parquet ${TPCH_DIR}/partsupp.parquet ++ cp ${TPCH_DIR}/customer/part-0.parquet ${TPCH_DIR}/customer.parquet ++ cp ${TPCH_DIR}/supplier/part-0.parquet ${TPCH_DIR}/supplier.parquet ++ cp ${TPCH_DIR}/nation/part-0.parquet ${TPCH_DIR}/nation.parquet ++ cp ${TPCH_DIR}/region/part-0.parquet ${TPCH_DIR}/region.parquet + } + + # Runs the tpch benchmark +diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs +index b5204b343..84fd2e78d 100644 +--- 
a/datafusion/common/src/config.rs ++++ b/datafusion/common/src/config.rs +@@ -250,7 +250,7 @@ config_namespace! { + /// concurrency. + /// + /// Defaults to the number of CPU cores on the system +- pub target_partitions: usize, default = num_cpus::get() ++ pub target_partitions: usize, default = 1 + + /// The default time zone + /// From 4446cf0188c03b82cbec28493aa131027f25dffa Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:43:23 -0500 Subject: [PATCH 43/72] Update json normalization to take device_buffer (#16520) This change updates json normalization calls (quote and whitespace normalization) to take owning buffer of device_buffer as input rather than device_uvector. It makes it easy to hand over a string_column's char buffer to normalization calls. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/16520 --- cpp/include/cudf/io/detail/json.hpp | 4 ++-- cpp/src/io/json/json_normalization.cu | 20 +++++++++---------- cpp/src/io/json/read_json.cu | 16 +++++++-------- .../io/json/json_quote_normalization_test.cpp | 9 ++++----- .../json_whitespace_normalization_test.cu | 7 +++---- 5 files changed, 27 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 42b10a78ce8..38ba4f675c3 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -61,7 +61,7 @@ void write_json(data_sink* sink, * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ -void normalize_single_quotes(datasource::owning_buffer>& indata, +void normalize_single_quotes(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -72,7 +72,7 @@ 
void normalize_single_quotes(datasource::owning_buffer * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ -void normalize_whitespace(datasource::owning_buffer>& indata, +void normalize_whitespace(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace io::json::detail diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 760b2214365..cb8b4e97ebb 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -298,7 +298,7 @@ struct TransduceToNormalizedWS { namespace detail { -void normalize_single_quotes(datasource::owning_buffer>& indata, +void normalize_single_quotes(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -311,22 +311,22 @@ void normalize_single_quotes(datasource::owning_buffer outbuf(indata.size() * 2, stream, mr); + rmm::device_buffer outbuf(indata.size() * 2, stream, mr); rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(indata.data(), + parser.Transduce(reinterpret_cast(indata.data()), static_cast(indata.size()), - outbuf.data(), + static_cast(outbuf.data()), thrust::make_discard_iterator(), outbuf_size.data(), normalize_quotes::start_state, stream); outbuf.resize(outbuf_size.value(stream), stream); - datasource::owning_buffer> outdata(std::move(outbuf)); + datasource::owning_buffer outdata(std::move(outbuf)); std::swap(indata, outdata); } -void normalize_whitespace(datasource::owning_buffer>& indata, +void normalize_whitespace(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -339,18 +339,18 @@ void normalize_whitespace(datasource::owning_buffer normalize_whitespace::TransduceToNormalizedWS{}), stream); - rmm::device_uvector outbuf(indata.size(), stream, mr); + rmm::device_buffer 
outbuf(indata.size(), stream, mr); rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(indata.data(), + parser.Transduce(reinterpret_cast(indata.data()), static_cast(indata.size()), - outbuf.data(), + static_cast(outbuf.data()), thrust::make_discard_iterator(), outbuf_size.data(), normalize_whitespace::start_state, stream); outbuf.resize(outbuf_size.value(stream), stream); - datasource::owning_buffer> outdata(std::move(outbuf)); + datasource::owning_buffer outdata(std::move(outbuf)); std::swap(indata, outdata); } diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 590f70864b1..e0d0497e0a2 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -168,7 +168,7 @@ size_t estimate_size_per_subchunk(size_t chunk_size) * @param stream CUDA stream used for device memory operations and kernel launches * @returns Data source owning buffer enclosing the bytes read */ -datasource::owning_buffer> get_record_range_raw_input( +datasource::owning_buffer get_record_range_raw_input( host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream) @@ -200,8 +200,8 @@ datasource::owning_buffer> get_record_range_raw_input( ? total_source_size * estimated_compression_ratio + header_size : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) + num_extra_delimiters; - rmm::device_uvector buffer(buffer_size, stream); - device_span bufspan(buffer); + rmm::device_buffer buffer(buffer_size, stream); + device_span bufspan(reinterpret_cast(buffer.data()), buffer.size()); // Offset within buffer indicating first read position std::int64_t buffer_offset = 0; @@ -213,8 +213,8 @@ datasource::owning_buffer> get_record_range_raw_input( chunk_offset == 0 ? 
0 : find_first_delimiter(readbufspan, '\n', stream); if (first_delim_pos == -1) { // return empty owning datasource buffer - auto empty_buf = rmm::device_uvector(0, stream); - return datasource::owning_buffer>(std::move(empty_buf)); + auto empty_buf = rmm::device_buffer(0, stream); + return datasource::owning_buffer(std::move(empty_buf)); } else if (!should_load_all_sources) { // Find next delimiter std::int64_t next_delim_pos = -1; @@ -232,12 +232,12 @@ datasource::owning_buffer> get_record_range_raw_input( } if (next_delim_pos < buffer_offset) next_delim_pos = buffer_offset + readbufspan.size(); - return datasource::owning_buffer>( + return datasource::owning_buffer( std::move(buffer), reinterpret_cast(buffer.data()) + first_delim_pos + shift_for_nonzero_offset, next_delim_pos - first_delim_pos - shift_for_nonzero_offset); } - return datasource::owning_buffer>( + return datasource::owning_buffer( std::move(buffer), reinterpret_cast(buffer.data()) + first_delim_pos + shift_for_nonzero_offset, readbufspan.size() - first_delim_pos - shift_for_nonzero_offset); @@ -249,7 +249,7 @@ table_with_metadata read_batch(host_span> sources, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - datasource::owning_buffer> bufview = + datasource::owning_buffer bufview = get_record_range_raw_input(sources, reader_opts, stream); // If input JSON buffer has single quotes and option to normalize single quotes is enabled, diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index 55ad0afe499..3a9ba8d9f3b 100644 --- a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include @@ -42,12 +42,11 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou std::make_shared(); auto stream_view = cudf::test::get_default_stream(); - auto device_input = 
cudf::detail::make_device_uvector_async( - host_input, stream_view, rmm::mr::get_current_device_resource()); + auto device_input = rmm::device_buffer( + host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource()); // Preprocessing FST - cudf::io::datasource::owning_buffer> device_data( - std::move(device_input)); + cudf::io::datasource::owning_buffer device_data(std::move(device_input)); cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get()); std::string preprocessed_host_output(device_data.size(), 0); diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu index 8ed5fa81b12..01dd17fab98 100644 --- a/cpp/tests/io/json/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu @@ -38,12 +38,11 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou // Prepare cuda stream for data transfers & kernels auto stream_view = cudf::test::get_default_stream(); - auto device_input = cudf::detail::make_device_uvector_async( - host_input, stream_view, rmm::mr::get_current_device_resource()); + auto device_input = rmm::device_buffer( + host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource()); // Preprocessing FST - cudf::io::datasource::owning_buffer> device_data( - std::move(device_input)); + cudf::io::datasource::owning_buffer device_data(std::move(device_input)); cudf::io::json::detail::normalize_whitespace( device_data, stream_view, rmm::mr::get_current_device_resource()); From 16aa0eaa54d00d88f897766d91f9e531f64b0070 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Aug 2024 09:33:19 -1000 Subject: [PATCH 44/72] Allow DataFrame.sort_values(by=) to select an index level (#16519) closes #14794 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray 
(https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16519 --- python/cudf/cudf/core/index.py | 13 ++++++++++++- python/cudf/cudf/core/indexed_frame.py | 26 +++++++++++++++++++++++++- python/cudf/cudf/tests/test_sorting.py | 20 ++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 094da09ab08..7f40428c1b8 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,6 +5,7 @@ import operator import pickle import warnings +from collections.abc import Hashable from functools import cache, cached_property from numbers import Number from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast @@ -60,7 +61,7 @@ from cudf.utils.utils import _warn_no_dask_cudf, search_range if TYPE_CHECKING: - from collections.abc import Generator, Hashable, Iterable + from collections.abc import Generator, Iterable from datetime import tzinfo @@ -450,6 +451,16 @@ def __getitem__(self, index): return self.start + index * self.step return self._as_int_index()[index] + def _get_columns_by_label(self, labels) -> Index: + # used in .sort_values + if isinstance(labels, Hashable): + if labels == self.name: + return self._as_int_index() + elif is_list_like(labels): + if list(self.names) == list(labels): + return self._as_int_index() + raise KeyError(labels) + @_performance_tracking def equals(self, other) -> bool: if isinstance(other, RangeIndex): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 24d947a574a..3b44a0f5864 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3592,10 +3592,34 @@ def sort_values( if len(self) == 0: return self + try: + by_in_columns = self._get_columns_by_label(by) + except KeyError: + by_in_columns = None + if self.ndim == 1: + # For Series case, we're never selecting an index level. 
+ by_in_index = None + else: + try: + by_in_index = self.index._get_columns_by_label(by) + except KeyError: + by_in_index = None + + if by_in_columns is not None and by_in_index is not None: + raise ValueError( + f"{by=} appears in the {type(self).__name__} columns " + "and as an index level which is ambiguous." + ) + elif by_in_columns is not None: + by_columns = by_in_columns + elif by_in_index is not None: + by_columns = by_in_index + else: + raise KeyError(by) # argsort the `by` column out = self._gather( GatherMap.from_column_unchecked( - self._get_columns_by_label(by)._get_sorted_inds( + by_columns._get_sorted_inds( ascending=ascending, na_position=na_position ), len(self), diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index a8ffce6e88b..2cf2259d9ec 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -405,3 +405,23 @@ def test_dataframe_scatter_by_map_empty(): df = DataFrame({"a": [], "b": []}, dtype="float64") scattered = df.scatter_by_map(df["a"]) assert len(scattered) == 0 + + +def test_sort_values_by_index_level(): + df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="b")) + cudf_df = DataFrame.from_pandas(df) + result = cudf_df.sort_values("b") + expected = df.sort_values("b") + assert_eq(result, expected) + + +def test_sort_values_by_ambiguous(): + df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="a")) + cudf_df = DataFrame.from_pandas(df) + + assert_exceptions_equal( + lfunc=df.sort_values, + rfunc=cudf_df.sort_values, + lfunc_args_and_kwargs=(["a"], {}), + rfunc_args_and_kwargs=(["a"], {}), + ) From 4cd87d3fdb0de6154504f8486ed49b685a9dceec Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Aug 2024 09:33:53 -1000 Subject: [PATCH 45/72] Fix `date_range(start, end, freq)` when end-start is divisible by freq (#16516) xref https://github.com/rapidsai/cudf/issues/16507 
`date_range` generates its dates via `range`, and the end of this range was calculated via `math.ceil((end - start) / freq)`. If `(end - start) / freq` did not produce a remainder, `math.ceil` would not correctly increment this value by `1` to capture the last date. Instead, this PR uses `math.floor((end - start) / freq) + 1` to always ensure the last date is captured Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16516 --- python/cudf/cudf/core/index.py | 6 ++++-- python/cudf/cudf/core/series.py | 3 +++ python/cudf/cudf/core/tools/datetimes.py | 9 +++++---- python/cudf/cudf/tests/test_datetime.py | 6 ++++++ 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7f40428c1b8..3eab27bd165 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2414,11 +2414,13 @@ def day_name(self, locale: str | None = None) -> Index: >>> datetime_index = cudf.date_range("2016-12-31", "2017-01-08", freq="D") >>> datetime_index DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', - '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07'], + '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', + '2017-01-08'], dtype='datetime64[ns]', freq='D') >>> datetime_index.day_name() Index(['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', - 'Friday', 'Saturday'], dtype='object') + 'Friday', 'Saturday', 'Sunday'], + dtype='object') """ day_names = self._column.get_day_names(locale) return Index._from_data({self.name: day_names}) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index de57ac5f290..53675d339ac 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -801,14 +801,17 @@ def dt(self): >>> s.dt.hour 0 12 1 13 + 2 14 dtype: int16 >>> s.dt.second 0 0 1 0 + 2 0 dtype: int16 >>> 
s.dt.day 0 3 1 3 + 2 3 dtype: int16 Returns diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 2f77778116f..c50a36b68b5 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -951,7 +951,7 @@ def date_range( end = cudf.Scalar(end, dtype=dtype) _is_increment_sequence = end >= start - periods = math.ceil( + periods = math.floor( int(end - start) / _offset_to_nanoseconds_lower_bound(offset) ) @@ -959,9 +959,10 @@ def date_range( # Mismatched sign between (end-start) and offset, return empty # column periods = 0 - elif periods == 0: - # end == start, return exactly 1 timestamp (start) - periods = 1 + else: + # If end == start, periods == 0 and we return exactly 1 timestamp (start). + # Otherwise, since closed="both", we ensure the end point is included. + periods += 1 # We compute `end_estim` (the estimated upper bound of the date # range) below, but don't always use it. We do this to ensure diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 6bc775d2a2c..7be4faa42c3 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2536,3 +2536,9 @@ def test_dti_methods(method, kwargs): result = getattr(cudf_dti, method)(**kwargs) expected = getattr(pd_dti, method)(**kwargs) assert_eq(result, expected) + + +def test_date_range_start_end_divisible_by_freq(): + result = cudf.date_range("2011-01-01", "2011-01-02", freq="h") + expected = pd.date_range("2011-01-01", "2011-01-02", freq="h") + assert_eq(result, expected) From 45b20d135a290d5f14e291316e94674653f71737 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:22:15 -1000 Subject: [PATCH 46/72] Preserve array name in MultiIndex.from_arrays (#16515) xref https://github.com/rapidsai/cudf/issues/16507 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - 
Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16515 --- python/cudf/cudf/core/multiindex.py | 4 ++++ python/cudf/cudf/tests/test_multiindex.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 9646b34830f..ab88b191570 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1394,12 +1394,16 @@ def from_arrays( raise TypeError(error_msg) codes = [] levels = [] + names_from_arrays = [] for array in arrays: if not (is_list_like(array) or is_column_like(array)): raise TypeError(error_msg) code, level = factorize(array, sort=True) codes.append(code) levels.append(level) + names_from_arrays.append(getattr(array, "name", None)) + if names is None: + names = names_from_arrays return cls( codes=codes, levels=levels, sortorder=sortorder, names=names ) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b7314a36e73..a68f4574da3 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2179,3 +2179,13 @@ def test_unique_level(): result = pd_mi.unique(level=1) expected = cudf_mi.unique(level=1) assert_eq(result, expected) + + +@pytest.mark.parametrize( + "idx", [pd.Index, pd.CategoricalIndex, pd.DatetimeIndex, pd.TimedeltaIndex] +) +def test_from_arrays_infer_names(idx): + arrays = [idx([1], name="foo"), idx([2], name="bar")] + expected = pd.MultiIndex.from_arrays(arrays) + result = cudf.MultiIndex.from_arrays(arrays) + assert_eq(result, expected) From a3dc14fcea938729c7c9468bd6a64331395b2f78 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 12 Aug 2024 07:56:48 -1000 Subject: [PATCH 47/72] Disallow indexing by selecting duplicate labels (#16514) xref https://github.com/rapidsai/cudf/issues/16507 I would say this was a bug before because we would silently 
return a new DataFrame with just `len(set(column_labels))` when selecting by column. Now this operation raises since duplicate column labels are generally not supported. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16514 --- python/cudf/cudf/core/column_accessor.py | 4 ++++ python/cudf/cudf/tests/test_indexing.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 819d351b2c4..83596704672 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -530,6 +530,10 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: ) else: data = {k: self._grouped_data[k] for k in key} + if len(data) != len(key): + raise ValueError( + "Selecting duplicate column labels is not supported." + ) if self.multiindex: data = dict(_to_flat_dict_inner(data)) return self.__class__( diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 7005cbc6834..716b4dc6acd 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -2361,3 +2361,11 @@ def test_sliced_categorical_as_ordered(): name="a", ) assert_eq(result, expected) + + +def test_duplicate_labels_raises(): + df = cudf.DataFrame([[1, 2]], columns=["a", "b"]) + with pytest.raises(ValueError): + df[["a", "a"]] + with pytest.raises(ValueError): + df.loc[:, ["a", "a"]] From 091cb72294a394deb176600e74c7cb115cfff05a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:48:02 -0400 Subject: [PATCH 48/72] Remove deprecated public APIs from libcudf (#16524) Removing some more deprecated public libcudf APIs. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16524 --- cpp/include/cudf/strings/replace.hpp | 12 ------------ cpp/include/cudf/utilities/type_checks.hpp | 19 ------------------- cpp/src/strings/replace/multi.cu | 11 ----------- cpp/src/utilities/type_checks.cpp | 5 ----- 4 files changed, 47 deletions(-) diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index 5b4ffb98f99..f450b77ad7a 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -160,18 +160,6 @@ std::unique_ptr replace_multiple( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -/** - * @copydoc cudf::strings::replace_multiple - * - * @deprecated since 24.08 - */ -[[deprecated]] std::unique_ptr replace( - strings_column_view const& input, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of doxygen group } // namespace strings } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp index 4fcbca09d17..aeb5db57830 100644 --- a/cpp/include/cudf/utilities/type_checks.hpp +++ b/cpp/include/cudf/utilities/type_checks.hpp @@ -22,25 +22,6 @@ namespace CUDF_EXPORT cudf { -/** - * @brief Compare the types of two `column_view`s - * - * @deprecated Since 24.06. Use cudf::have_same_types instead. - * - * This function returns true if the type of `lhs` equals that of `rhs`. - * - For fixed point types, the scale is compared. - * - For dictionary types, the type of the keys are compared if both are - * non-empty columns. 
- * - For lists types, the type of child columns are compared recursively. - * - For struct types, the type of each field are compared in order. - * - For all other types, the `id` of `data_type` is compared. - * - * @param lhs The first `column_view` to compare - * @param rhs The second `column_view` to compare - * @return true if column types match - */ -[[deprecated]] bool column_types_equal(column_view const& lhs, column_view const& rhs); - /** * @brief Compare the type IDs of two `column_view`s * diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 2ca22f0e017..b5248700d53 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -533,16 +533,5 @@ std::unique_ptr replace_multiple(strings_column_view const& strings, return detail::replace_multiple(strings, targets, repls, stream, mr); } -// deprecated in 24.08 -std::unique_ptr replace(strings_column_view const& strings, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::replace_multiple(strings, targets, repls, stream, mr); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp index dac981fb532..3095b342748 100644 --- a/cpp/src/utilities/type_checks.cpp +++ b/cpp/src/utilities/type_checks.cpp @@ -139,11 +139,6 @@ bool have_same_types(column_view const& lhs, column_view const& rhs) return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs); } -bool column_types_equal(column_view const& lhs, column_view const& rhs) -{ - return have_same_types(lhs, rhs); -} - bool have_same_types(column_view const& lhs, scalar const& rhs) { return type_dispatcher(lhs.type(), column_scalar_equal_fn{}, lhs, rhs); From cce00c00b0ae374ee72332aaea5fcd1cc121e85a Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 12 Aug 2024 14:38:37 -0700 Subject: 
[PATCH 49/72] Pass batch size to JSON reader using environment variable (#16502) The JSON reader set the batch size to `INT_MAX` bytes since the motivation for implementing a batched JSON reader was to parse source files whose total size is larger than `INT_MAX` (#16138, #16162). However, we can use a much smaller batch size to evaluate the correctness of the reader and speed up tests significantly. This PR focuses on reducing runtime of the batched reader test by setting the batch size to be used by the reader as an environment variable. The runtime of `JsonLargeReaderTest.MultiBatch` in `LARGE_STRINGS_TEST` gtest drops from ~52s to ~3s. Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16502 --- cpp/CMakeLists.txt | 1 - cpp/src/io/json/byte_range_info.cu | 37 ---- cpp/src/io/json/read_json.cu | 291 +++++++++++++++----------- cpp/src/io/json/read_json.hpp | 28 ++- cpp/tests/large_strings/json_tests.cu | 20 +- 5 files changed, 204 insertions(+), 173 deletions(-) delete mode 100644 cpp/src/io/json/byte_range_info.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 310bc99b279..eeafc411874 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -392,7 +392,6 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp - src/io/json/byte_range_info.cu src/io/json/json_column.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu diff --git a/cpp/src/io/json/byte_range_info.cu b/cpp/src/io/json/byte_range_info.cu deleted file mode 100644 index 258a40b0dd3..00000000000 --- a/cpp/src/io/json/byte_range_info.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include - -#include - -namespace cudf::io::json::detail { - -// Extract the first character position in the string. -size_type find_first_delimiter(device_span d_data, - char const delimiter, - rmm::cuda_stream_view stream) -{ - auto const first_delimiter_position = - thrust::find(rmm::exec_policy(stream), d_data.begin(), d_data.end(), delimiter); - return first_delimiter_position != d_data.end() ? first_delimiter_position - d_data.begin() : -1; -} - -} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index e0d0497e0a2..2658cbbed2f 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -38,11 +39,14 @@ namespace cudf::io::json::detail { -size_t sources_size(host_span> const sources, - size_t range_offset, - size_t range_size) +namespace { + +// Return total size of sources enclosing the passed range +std::size_t sources_size(host_span> const sources, + std::size_t range_offset, + std::size_t range_size) { - return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) { + return std::accumulate(sources.begin(), sources.end(), 0ul, [=](std::size_t sum, auto& source) { auto const size = source->size(); // TODO take care of 0, 0, or *, 0 case. 
return sum + @@ -50,109 +54,55 @@ size_t sources_size(host_span> const sources, }); } +// Return estimated size of subchunk using a heuristic involving the byte range size and the minimum +// subchunk size +std::size_t estimate_size_per_subchunk(std::size_t chunk_size) +{ + auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; + // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to + // 10kb) and the byte range size + return geometric_mean(std::ceil(static_cast(chunk_size) / num_subchunks), + min_subchunk_size); +} + /** - * @brief Read from array of data sources into RMM buffer. The size of the returned device span - can be larger than the number of bytes requested from the list of sources when - the range to be read spans across multiple sources. This is due to the delimiter - characters inserted after the end of each accessed source. + * @brief Return the upper bound on the batch size for the JSON reader. * - * @param buffer Device span buffer to which data is read - * @param sources Array of data sources - * @param compression Compression format of source - * @param range_offset Number of bytes to skip from source start - * @param range_size Number of bytes to read from source - * @param stream CUDA stream used for device memory operations and kernel launches - * @returns A subspan of the input device span containing data read + * The datasources passed to the JSON reader are split into batches demarcated by byte range + * offsets and read iteratively. The batch size is capped at INT_MAX bytes, which is the + * default value returned by the function. 
This value can be overridden at runtime using the + * environment variable LIBCUDF_JSON_BATCH_SIZE + * + * @return size in bytes */ -device_span ingest_raw_input(device_span buffer, - host_span> sources, - compression_type compression, - size_t range_offset, - size_t range_size, - rmm::cuda_stream_view stream) +std::size_t get_batch_size_upper_bound() { - CUDF_FUNC_RANGE(); - // We append a line delimiter between two files to make sure the last line of file i and the first - // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line - // delimiter. - auto constexpr num_delimiter_chars = 1; - - if (compression == compression_type::NONE) { - auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); - std::vector prefsum_source_sizes(sources.size()); - std::vector> h_buffers; - size_t bytes_read = 0; - std::transform_inclusive_scan(sources.begin(), - sources.end(), - prefsum_source_sizes.begin(), - std::plus{}, - [](std::unique_ptr const& s) { return s->size(); }); - auto upper = - std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset); - size_t start_source = std::distance(prefsum_source_sizes.begin(), upper); - - auto const total_bytes_to_read = - std::min(range_size, prefsum_source_sizes.back() - range_offset); - range_offset -= start_source ? 
prefsum_source_sizes[start_source - 1] : 0; - for (size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) { - if (sources[i]->is_empty()) continue; - auto data_size = - std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); - auto destination = reinterpret_cast(buffer.data()) + bytes_read + - (num_delimiter_chars * delimiter_map.size()); - if (sources[i]->is_device_read_preferred(data_size)) { - bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); - } else { - h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); - auto const& h_buffer = h_buffers.back(); - CUDF_CUDA_TRY(cudaMemcpyAsync( - destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value())); - bytes_read += h_buffer->size(); - } - range_offset = 0; - delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); - } - // Removing delimiter inserted after last non-empty source is read - if (!delimiter_map.empty()) { delimiter_map.pop_back(); } - - // If this is a multi-file source, we scatter the JSON line delimiters between files - if (sources.size() > 1) { - static_assert(num_delimiter_chars == 1, - "Currently only single-character delimiters are supported"); - auto const delimiter_source = thrust::make_constant_iterator('\n'); - auto const d_delimiter_map = cudf::detail::make_device_uvector_async( - delimiter_map, stream, rmm::mr::get_current_device_resource()); - thrust::scatter(rmm::exec_policy_nosync(stream), - delimiter_source, - delimiter_source + d_delimiter_map.size(), - d_delimiter_map.data(), - buffer.data()); - } - stream.synchronize(); - return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); - } - // TODO: allow byte range reading from multiple compressed files. 
- auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset); - auto hbuffer = std::vector(remaining_bytes_to_read); - // Single read because only a single compressed source is supported - // Reading to host because decompression of a single block is much faster on the CPU - sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data()); - auto uncomp_data = decompress(compression, hbuffer); - CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(), - reinterpret_cast(uncomp_data.data()), - uncomp_data.size() * sizeof(char), - cudaMemcpyHostToDevice, - stream.value())); - stream.synchronize(); - return buffer.first(uncomp_data.size()); + auto const batch_size_str = std::getenv("LIBCUDF_JSON_BATCH_SIZE"); + int64_t const batch_size = batch_size_str != nullptr ? std::atol(batch_size_str) : 0L; + auto const batch_limit = static_cast(std::numeric_limits::max()); + auto const batch_size_upper_bound = static_cast( + (batch_size > 0 && batch_size < batch_limit) ? batch_size : batch_limit); + return batch_size_upper_bound; } -size_t estimate_size_per_subchunk(size_t chunk_size) +/** + * @brief Extract the first delimiter character position in the string + * + * @param d_data Device span in which to search for delimiter character + * @param delimiter Delimiter character to search for + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Position of first delimiter character in device array + */ +size_type find_first_delimiter(device_span d_data, + char const delimiter, + rmm::cuda_stream_view stream) { - auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; - // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to - // 10kb) and the byte range size - return geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size); + auto const first_delimiter_position = + thrust::find(rmm::exec_policy(stream), d_data.begin(), 
d_data.end(), delimiter); + return first_delimiter_position != d_data.end() + ? static_cast(thrust::distance(d_data.begin(), first_delimiter_position)) + : -1; } /** @@ -175,12 +125,12 @@ datasource::owning_buffer get_record_range_raw_input( { CUDF_FUNC_RANGE(); - size_t const total_source_size = sources_size(sources, 0, 0); + std::size_t const total_source_size = sources_size(sources, 0, 0); auto constexpr num_delimiter_chars = 1; auto const num_extra_delimiters = num_delimiter_chars * (sources.size() - 1); compression_type const reader_compression = reader_opts.get_compression(); - size_t const chunk_offset = reader_opts.get_byte_range_offset(); - size_t chunk_size = reader_opts.get_byte_range_size(); + std::size_t const chunk_offset = reader_opts.get_byte_range_offset(); + std::size_t chunk_size = reader_opts.get_byte_range_size(); CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset, "Invalid offsetting", @@ -188,14 +138,14 @@ datasource::owning_buffer get_record_range_raw_input( auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; - int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced; - size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); + int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced; + std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); // The allocation for single source compressed input is estimated by assuming a ~4:1 // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea // of subchunks. auto constexpr header_size = 4096; - size_t const buffer_size = + std::size_t const buffer_size = reader_compression != compression_type::NONE ? 
total_source_size * estimated_compression_ratio + header_size : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) + @@ -217,8 +167,8 @@ datasource::owning_buffer get_record_range_raw_input( return datasource::owning_buffer(std::move(empty_buf)); } else if (!should_load_all_sources) { // Find next delimiter - std::int64_t next_delim_pos = -1; - size_t next_subchunk_start = chunk_offset + chunk_size; + std::int64_t next_delim_pos = -1; + std::size_t next_subchunk_start = chunk_offset + chunk_size; while (next_subchunk_start < total_source_size && next_delim_pos < buffer_offset) { buffer_offset += readbufspan.size(); readbufspan = ingest_raw_input(bufspan.last(buffer_size - buffer_offset), @@ -243,6 +193,8 @@ datasource::owning_buffer get_record_range_raw_input( readbufspan.size() - first_delim_pos - shift_for_nonzero_offset); } +// Helper function to read the current batch using byte range offsets and size +// passed table_with_metadata read_batch(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, @@ -270,6 +222,92 @@ table_with_metadata read_batch(host_span> sources, return device_parse_nested_json(buffer, reader_opts, stream, mr); } +} // anonymous namespace + +device_span ingest_raw_input(device_span buffer, + host_span> sources, + compression_type compression, + std::size_t range_offset, + std::size_t range_size, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + // We append a line delimiter between two files to make sure the last line of file i and the first + // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line + // delimiter. 
+ auto constexpr num_delimiter_chars = 1; + + if (compression == compression_type::NONE) { + auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); + std::vector prefsum_source_sizes(sources.size()); + std::vector> h_buffers; + std::size_t bytes_read = 0; + std::transform_inclusive_scan(sources.begin(), + sources.end(), + prefsum_source_sizes.begin(), + std::plus{}, + [](std::unique_ptr const& s) { return s->size(); }); + auto upper = + std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset); + std::size_t start_source = std::distance(prefsum_source_sizes.begin(), upper); + + auto const total_bytes_to_read = + std::min(range_size, prefsum_source_sizes.back() - range_offset); + range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0; + for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; + i++) { + if (sources[i]->is_empty()) continue; + auto data_size = + std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); + auto destination = reinterpret_cast(buffer.data()) + bytes_read + + (num_delimiter_chars * delimiter_map.size()); + if (sources[i]->is_device_read_preferred(data_size)) { + bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); + } else { + h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); + auto const& h_buffer = h_buffers.back(); + CUDF_CUDA_TRY(cudaMemcpyAsync( + destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value())); + bytes_read += h_buffer->size(); + } + range_offset = 0; + delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size())); + } + // Removing delimiter inserted after last non-empty source is read + if (!delimiter_map.empty()) { delimiter_map.pop_back(); } + + // If this is a multi-file source, we scatter the JSON line delimiters between files + if (sources.size() > 1) { + 
static_assert(num_delimiter_chars == 1, + "Currently only single-character delimiters are supported"); + auto const delimiter_source = thrust::make_constant_iterator('\n'); + auto const d_delimiter_map = cudf::detail::make_device_uvector_async( + delimiter_map, stream, rmm::mr::get_current_device_resource()); + thrust::scatter(rmm::exec_policy_nosync(stream), + delimiter_source, + delimiter_source + d_delimiter_map.size(), + d_delimiter_map.data(), + buffer.data()); + } + stream.synchronize(); + return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); + } + // TODO: allow byte range reading from multiple compressed files. + auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset); + auto hbuffer = std::vector(remaining_bytes_to_read); + // Single read because only a single compressed source is supported + // Reading to host because decompression of a single block is much faster on the CPU + sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data()); + auto uncomp_data = decompress(compression, hbuffer); + CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(), + reinterpret_cast(uncomp_data.data()), + uncomp_data.size() * sizeof(char), + cudaMemcpyHostToDevice, + stream.value())); + stream.synchronize(); + return buffer.first(uncomp_data.size()); +} + table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, @@ -296,15 +334,16 @@ table_with_metadata read_json(host_span> sources, * Note that the batched reader does not work for compressed inputs or for regular * JSON inputs. */ - size_t const total_source_size = sources_size(sources, 0, 0); - size_t chunk_offset = reader_opts.get_byte_range_offset(); - size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? 
total_source_size - chunk_offset - : std::min(chunk_size, total_source_size - chunk_offset); + std::size_t const total_source_size = sources_size(sources, 0, 0); + std::size_t chunk_offset = reader_opts.get_byte_range_offset(); + std::size_t chunk_size = reader_opts.get_byte_range_size(); + chunk_size = !chunk_size ? total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); - size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); - size_t const batch_size_ub = - std::numeric_limits::max() - (max_subchunks_prealloced * size_per_subchunk); + std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); + std::size_t const batch_size_upper_bound = get_batch_size_upper_bound(); + std::size_t const batch_size = + batch_size_upper_bound - (max_subchunks_prealloced * size_per_subchunk); /* * Identify the position (zero-indexed) of starting source file from which to begin @@ -314,10 +353,10 @@ table_with_metadata read_json(host_span> sources, */ // Prefix sum of source file sizes - size_t pref_source_size = 0; + std::size_t pref_source_size = 0; // Starting source file from which to being batching evaluated using byte range offset - size_t const start_source = [chunk_offset, &sources, &pref_source_size]() { - for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) { + std::size_t const start_source = [chunk_offset, &sources, &pref_source_size]() { + for (std::size_t src_idx = 0; src_idx < sources.size(); ++src_idx) { if (pref_source_size + sources[src_idx]->size() > chunk_offset) { return src_idx; } pref_source_size += sources[src_idx]->size(); } @@ -329,16 +368,16 @@ table_with_metadata read_json(host_span> sources, * batch begins, and `end_bytes_size` gives the terminal bytes position after which reading * stops. 
*/ - size_t pref_bytes_size = chunk_offset; - size_t end_bytes_size = chunk_offset + chunk_size; - std::vector batch_offsets{pref_bytes_size}; - for (size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) { + std::size_t pref_bytes_size = chunk_offset; + std::size_t end_bytes_size = chunk_offset + chunk_size; + std::vector batch_offsets{pref_bytes_size}; + for (std::size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) { pref_source_size += sources[i]->size(); // If the current source file can subsume multiple batches, we split the file until the // boundary of the last batch exceeds the end of the file (indexed by `pref_source_size`) while (pref_bytes_size < end_bytes_size && - pref_source_size >= std::min(pref_bytes_size + batch_size_ub, end_bytes_size)) { - auto next_batch_size = std::min(batch_size_ub, end_bytes_size - pref_bytes_size); + pref_source_size >= std::min(pref_bytes_size + batch_size, end_bytes_size)) { + auto next_batch_size = std::min(batch_size, end_bytes_size - pref_bytes_size); batch_offsets.push_back(batch_offsets.back() + next_batch_size); pref_bytes_size += next_batch_size; } @@ -356,7 +395,7 @@ table_with_metadata read_json(host_span> sources, // Dispatch individual batches to read_batch and push the resulting table into // partial_tables array. Note that the reader options need to be updated for each // batch to adjust byte range offset and byte range size. 
- for (size_t i = 0; i < batch_offsets.size() - 1; i++) { + for (std::size_t i = 0; i < batch_offsets.size() - 1; i++) { batched_reader_opts.set_byte_range_offset(batch_offsets[i]); batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]); partial_tables.emplace_back( diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index 32de4ebabfa..7e3a920f00d 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -37,6 +37,20 @@ constexpr size_t min_subchunk_size = 10000; constexpr int estimated_compression_ratio = 4; constexpr int max_subchunks_prealloced = 3; +/** + * @brief Read from array of data sources into RMM buffer. The size of the returned device span + can be larger than the number of bytes requested from the list of sources when + the range to be read spans across multiple sources. This is due to the delimiter + characters inserted after the end of each accessed source. + * + * @param buffer Device span buffer to which data is read + * @param sources Array of data sources + * @param compression Compression format of source + * @param range_offset Number of bytes to skip from source start + * @param range_size Number of bytes to read from source + * @param stream CUDA stream used for device memory operations and kernel launches + * @returns A subspan of the input device span containing data read + */ device_span ingest_raw_input(device_span buffer, host_span> sources, compression_type compression, @@ -44,14 +58,20 @@ device_span ingest_raw_input(device_span buffer, size_t range_size, rmm::cuda_stream_view stream); +/** + * @brief Reads and returns the entire data set in batches. 
+ * + * @param sources Input `datasource` objects to read the dataset from + * @param reader_opts Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return cudf::table object that contains the array of cudf::column. + */ table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -size_type find_first_delimiter(device_span d_data, - char const delimiter, - rmm::cuda_stream_view stream); - } // namespace io::json::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/tests/large_strings/json_tests.cu b/cpp/tests/large_strings/json_tests.cu index 49abf7b484d..e34ab991c11 100644 --- a/cpp/tests/large_strings/json_tests.cu +++ b/cpp/tests/large_strings/json_tests.cu @@ -28,13 +28,17 @@ struct JsonLargeReaderTest : public cudf::test::StringsLargeTest {}; TEST_F(JsonLargeReaderTest, MultiBatch) { - std::string json_string = R"( + std::string json_string = R"( { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } { "a": { "y" : 6}, "b" : [6 ], "c": 13 } { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; - constexpr size_t batch_size_ub = std::numeric_limits::max(); - constexpr size_t expected_file_size = 1.5 * static_cast(batch_size_ub); + + std::size_t const batch_size_upper_bound = std::numeric_limits::max() / 16; + // set smaller batch_size to reduce file size and execution time + setenv("LIBCUDF_JSON_BATCH_SIZE", std::to_string(batch_size_upper_bound).c_str(), 1); + + constexpr std::size_t expected_file_size = 1.5 * static_cast(batch_size_upper_bound); std::size_t const log_repetitions = static_cast(std::ceil(std::log2(expected_file_size / json_string.size()))); @@ -66,8 +70,11 @@ TEST_F(JsonLargeReaderTest, MultiBatch) datasources.emplace_back(cudf::io::datasource::create(hb)); } // Test 
for different chunk sizes - std::vector chunk_sizes{ - batch_size_ub / 4, batch_size_ub / 2, batch_size_ub, static_cast(batch_size_ub * 2)}; + std::vector chunk_sizes{batch_size_upper_bound / 4, + batch_size_upper_bound / 2, + batch_size_upper_bound, + static_cast(batch_size_upper_bound * 2)}; + for (auto chunk_size : chunk_sizes) { auto const tables = split_byte_range_reading(datasources, @@ -86,4 +93,7 @@ TEST_F(JsonLargeReaderTest, MultiBatch) // cannot use EQUAL due to concatenate removing null mask CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view()); } + + // go back to normal batch_size + unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); } From e5f8dd33d78a2c964f8d6bac895deb73a9be7aa6 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 12 Aug 2024 16:52:52 -0500 Subject: [PATCH 50/72] Update the java code to properly deal with lists being returned as strings (#16536) Recently some JSON parsing was updated so lists could be returned as strings. This updates the java code so that when cleaning up the results to match the desired schema that it can handle corner cases associated with lists and structs properly. Tests are covered in the Spark plugin, but I am happy to add some here if we really want to validate that part of this. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16536 --- java/src/main/java/ai/rapids/cudf/Table.java | 29 +++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 4e737451ed6..36e342cae13 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1084,7 +1084,12 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest // The types don't match so just return the input unchanged... 
return DidViewChange.no(); } else { - String[] foundNames = children.getNames(); + String[] foundNames; + if (children == null) { + foundNames = new String[0]; + } else { + foundNames = children.getNames(); + } HashMap indices = new HashMap<>(); for (int i = 0; i < foundNames.length; i++) { indices.put(foundNames[i], i); @@ -1101,8 +1106,9 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest for (int i = 0; i < columns.length; i++) { String neededColumnName = neededNames[i]; Integer index = indices.get(neededColumnName); + Schema childSchema = schema.getChild(i); if (index != null) { - if (schema.getChild(i).isStructOrHasStructDescendant()) { + if (childSchema.isStructOrHasStructDescendant()) { ColumnView child = cv.getChildColumnView(index); boolean shouldCloseChild = true; try { @@ -1131,8 +1137,23 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest } } else { somethingChanged = true; - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + if (types[i] == DType.LIST) { + try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } + } else if (types[i] == DType.STRUCT) { + int numStructChildren = childSchema.getNumChildren(); + HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren]; + for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) { + structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType(); + } + try (Scalar s = Scalar.structFromNull(structChildren)) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } + } else { + try (Scalar s = Scalar.fromNull(types[i])) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } } } } From 7178bf2eb34334db909a151926d8112c441b3b09 Mon Sep 17 00:00:00 2001 From: David 
Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 13 Aug 2024 08:45:44 -0400 Subject: [PATCH 51/72] Rework cudf::io::text::byte_range_info class member functions (#16518) Adds `const` declarations to appropriate member functions in class `cudf::io::text::byte_range_info` and moves the ctor implementation to .cpp file. This helps with using the `byte_range_info` objects in `const` variables and inside of `const` functions. Found while working on #15983 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16518 --- cpp/include/cudf/io/text/byte_range_info.hpp | 21 ++++++++------------ cpp/src/io/text/byte_range_info.cpp | 7 +++++++ cpp/src/io/text/multibyte_split.cu | 2 +- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp index 7e9256be1d3..5f3c91dc99c 100644 --- a/cpp/include/cudf/io/text/byte_range_info.hpp +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include @@ -40,53 +39,49 @@ class byte_range_info { int64_t _size{}; ///< size in bytes public: - constexpr byte_range_info() = default; + byte_range_info() = default; /** * @brief Constructs a byte_range_info object * * @param offset offset in bytes * @param size size in bytes */ - constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) - { - CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); - CUDF_EXPECTS(size >= 0, "size must be non-negative"); - } + byte_range_info(int64_t offset, int64_t size); /** * @brief Copy constructor * * @param other byte_range_info object to copy */ - constexpr byte_range_info(byte_range_info const& other) noexcept = default; + byte_range_info(byte_range_info const& other) noexcept = default; /** * @brief Copy assignment operator * 
* @param other byte_range_info object to copy * @return this object after copying */ - constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default; + byte_range_info& operator=(byte_range_info const& other) noexcept = default; /** * @brief Get the offset in bytes * * @return Offset in bytes */ - [[nodiscard]] constexpr int64_t offset() { return _offset; } + [[nodiscard]] int64_t offset() const { return _offset; } /** * @brief Get the size in bytes * * @return Size in bytes */ - [[nodiscard]] constexpr int64_t size() { return _size; } + [[nodiscard]] int64_t size() const { return _size; } /** * @brief Returns whether the span is empty. * - * @return true iff the span is empty, i.e. `size() == 0` + * @return true iff the range is empty, i.e. `size() == 0` */ - [[nodiscard]] constexpr bool empty() { return size() == 0; } + [[nodiscard]] bool is_empty() const { return size() == 0; } }; /** diff --git a/cpp/src/io/text/byte_range_info.cpp b/cpp/src/io/text/byte_range_info.cpp index 6a7836ed4e1..fe811739b97 100644 --- a/cpp/src/io/text/byte_range_info.cpp +++ b/cpp/src/io/text/byte_range_info.cpp @@ -16,6 +16,7 @@ #include #include +#include #include @@ -23,6 +24,12 @@ namespace cudf { namespace io { namespace text { +byte_range_info::byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) +{ + CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); + CUDF_EXPECTS(size >= 0, "size must be non-negative"); +} + byte_range_info create_byte_range_info_max() { return {0, std::numeric_limits::max()}; } std::vector create_byte_range_infos_consecutive(int64_t total_bytes, diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 97729a091fb..e3435a24b18 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -310,7 +310,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source { CUDF_FUNC_RANGE(); - if (byte_range.empty()) { return 
make_empty_column(type_id::STRING); } + if (byte_range.is_empty()) { return make_empty_column(type_id::STRING); } auto device_delim = cudf::string_scalar(delimiter, true, stream, mr); From 419fb99fa9ac471ae00ebe7787543b8e9cc154b5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 13 Aug 2024 08:52:30 -0400 Subject: [PATCH 52/72] Fix all-empty input column for strings split APIs (#16466) Fixes specialized behavior for all empty input column on the strings split APIs. Verifying behavior with Pandas `str.split( pat, expand, regex )` `pat=None -- whitespace` `expand=False -- record APIs` `regex=True -- re APIs` - [x] `split` - [x] `split` - whitespace - [x] `rsplit` - [x] `rsplit` - whitespace - [x] `split_record` - [x] `split_record` - whitespace - [x] `rsplit_record` - [x] `rsplit_record` - whitespace - [x] `split_re` - [x] `rsplit_re` - [x] `split_record_re` - [x] `rsplit_record_re` Closes #16453 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16466 --- cpp/src/strings/split/split.cuh | 24 ++++++-------- cpp/src/strings/split/split_re.cu | 4 +++ cpp/tests/strings/split_tests.cpp | 47 ++++++++++++++++++++++++--- python/cudf/cudf/tests/test_string.py | 16 +++++++++ 4 files changed, 73 insertions(+), 18 deletions(-) diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 4d7096c02ca..af70367678e 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -142,7 +142,7 @@ struct base_split_tokenizer { // max_tokens already included in token counts if (d_tokens.size() == 1) { - d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; + d_tokens[0] = string_index_pair{(d_str.empty() ? 
"" : d_str.data()), d_str.size_bytes()}; return; } @@ -357,24 +357,20 @@ std::pair, rmm::device_uvector> split auto const chars_bytes = get_offset_value(input.offsets(), input.offset() + strings_count, stream) - get_offset_value(input.offsets(), input.offset(), stream); - if (chars_bytes == 0) { - auto offsets = cudf::make_column_from_scalar( - numeric_scalar(0, true, stream), strings_count + 1, stream, mr); - auto tokens = rmm::device_uvector(0, stream); - return std::pair{std::move(offsets), std::move(tokens)}; - } auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // count the number of delimiters in the entire column rmm::device_scalar d_count(0, stream); - constexpr int64_t block_size = 512; - constexpr size_type bytes_per_thread = 4; - auto const num_blocks = util::div_rounding_up_safe( - util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); - count_delimiters_kernel - <<>>( - tokenizer, d_offsets, chars_bytes, d_count.data()); + if (chars_bytes > 0) { + constexpr int64_t block_size = 512; + constexpr size_type bytes_per_thread = 4; + auto const num_blocks = util::div_rounding_up_safe( + util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); + count_delimiters_kernel + <<>>( + tokenizer, d_offsets, chars_bytes, d_count.data()); + } // Create a vector of every delimiter position in the chars column. 
// These may include overlapping or otherwise out-of-bounds delimiters which diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index d72ec1085b5..e0aacf07ef0 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -71,6 +71,10 @@ struct token_reader_fn { auto const token_offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - token_offset; auto const d_result = d_tokens + token_offset; // store tokens here + if (nchars == 0) { + d_result[0] = string_index_pair{"", 0}; + return; + } int64_t token_idx = 0; auto itr = d_str.begin(); diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 4c020cb4c29..7ece08b19f2 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -307,24 +307,46 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } -TEST_F(StringsSplitTest, SplitRecordAllEmpty) +TEST_F(StringsSplitTest, SplitAllEmpty) { auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); auto sv = cudf::strings_column_view(input); + auto empty = cudf::string_scalar(""); auto delimiter = cudf::string_scalar("s"); + + auto result = cudf::strings::split(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + result = cudf::strings::rsplit(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + + // whitespace hits a special case where nothing matches returns an all-null column + auto expected = cudf::test::strings_column_wrapper({"", "", "", ""}, {0, 0, 0, 0}); + result = cudf::strings::split(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected); + result = cudf::strings::rsplit(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected); +} + +TEST_F(StringsSplitTest, SplitRecordAllEmpty) +{ + auto input = 
cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); auto empty = cudf::string_scalar(""); + auto delimiter = cudf::string_scalar("s"); using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{}, LCW{}, LCW{}, LCW{}}); + LCW expected({LCW{""}, LCW{""}, LCW{""}, LCW{""}}); + LCW expected_empty({LCW{}, LCW{}, LCW{}, LCW{}}); + auto result = cudf::strings::split_record(sv, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); result = cudf::strings::split_record(sv, empty); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty); result = cudf::strings::rsplit_record(sv, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); result = cudf::strings::rsplit_record(sv, empty); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty); } TEST_F(StringsSplitTest, MultiByteDelimiters) @@ -575,6 +597,23 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary) } } +TEST_F(StringsSplitTest, SplitRegexAllEmpty) +{ + auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); + auto prog = cudf::strings::regex_program::create("[ _]"); + + auto result = cudf::strings::split_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + result = cudf::strings::rsplit_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + + auto rec_result = cudf::strings::split_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input); + rec_result = cudf::strings::rsplit_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input); +} + TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 
a2a3e874c91..30880f074c0 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -978,6 +978,22 @@ def test_string_split_re(data, pat, n, expand): assert_eq(expect, got) +@pytest.mark.parametrize("pat", [None, "\\s+"]) +@pytest.mark.parametrize("regex", [False, True]) +@pytest.mark.parametrize("expand", [False, True]) +def test_string_split_all_empty(pat, regex, expand): + ps = pd.Series(["", "", "", ""], dtype="str") + gs = cudf.Series(["", "", "", ""], dtype="str") + + expect = ps.str.split(pat=pat, expand=expand, regex=regex) + got = gs.str.split(pat=pat, expand=expand, regex=regex) + + if isinstance(got, cudf.DataFrame): + assert_eq(expect, got, check_column_type=False) + else: + assert_eq(expect, got) + + @pytest.mark.parametrize( "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] ) From 3a791cb8a83ca2cf446a910cb94d5a4e3edf2b9f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 13 Aug 2024 08:56:43 -0400 Subject: [PATCH 53/72] Remove unneeded pair-iterator benchmark (#16511) Removes the pair-iterator benchmark logic. The remaining benchmarks use the null-replacement-iterator which uses the libcudf pair-iterator internally. There is no need for benchmarking this unique iterator pattern that is not used by libcudf. The `cpp/benchmarks/iterator/iterator.cu` failed to compile with gcc 12 because the sum-reduce function cannot resolve adding `thrust::pair` objects together likely due to some recent changes in CCCL. Regardless, adding `thrust::pair` objects is not something we need to benchmark. The existing benchmark benchmarks libcudf's usage of the internal pair-iterator correctly. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16511 --- cpp/benchmarks/iterator/iterator.cu | 77 ----------------------------- 1 file changed, 77 deletions(-) diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index ada7a9bd73d..fd0cebb12ea 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -161,68 +160,6 @@ void BM_iterator(benchmark::State& state) sizeof(TypeParam)); } -// operator+ defined for pair iterator reduction -template -__device__ thrust::pair operator+(thrust::pair lhs, thrust::pair rhs) -{ - return thrust::pair{lhs.first * lhs.second + rhs.first * rhs.second, - lhs.second + rhs.second}; -} -// ----------------------------------------------------------------------------- -template -void pair_iterator_bench_cub(cudf::column_view& col, - rmm::device_uvector>& result) -{ - thrust::pair init{0, false}; - auto d_col = cudf::column_device_view::create(col); - int num_items = col.size(); - auto begin = d_col->pair_begin(); - reduce_by_cub(result.begin(), begin, num_items, init); -} - -template -void pair_iterator_bench_thrust(cudf::column_view& col, - rmm::device_uvector>& result) -{ - thrust::pair init{0, false}; - auto d_col = cudf::column_device_view::create(col); - auto d_in = d_col->pair_begin(); - auto d_end = d_in + col.size(); - thrust::reduce(thrust::device, d_in, d_end, init, cudf::DeviceSum{}); -} - -template -void BM_pair_iterator(benchmark::State& state) -{ - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - using T = TypeParam; - auto num_gen = thrust::counting_iterator(0); - auto null_gen = - thrust::make_transform_iterator(num_gen, [](cudf::size_type row) { return row % 2 == 0; }); - - cudf::test::fixed_width_column_wrapper 
wrap_hasnull_F(num_gen, num_gen + column_size); - cudf::test::fixed_width_column_wrapper wrap_hasnull_T( - num_gen, num_gen + column_size, null_gen); - cudf::column_view hasnull_F = wrap_hasnull_F; - cudf::column_view hasnull_T = wrap_hasnull_T; - - // Initialize dev_result to false - auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>( - 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - if (cub_or_thrust) { - pair_iterator_bench_cub(hasnull_T, - dev_result); // driven by pair iterator with nulls - } else { - pair_iterator_bench_thrust(hasnull_T, - dev_result); // driven by pair iterator with nulls - } - } - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - sizeof(TypeParam)); -} - #define ITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust, raw_or_iterator) \ BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \ { \ @@ -238,17 +175,3 @@ ITER_BM_BENCHMARK_DEFINE(double_cub_raw, double, true, true); ITER_BM_BENCHMARK_DEFINE(double_cub_iter, double, true, false); ITER_BM_BENCHMARK_DEFINE(double_thrust_raw, double, false, true); ITER_BM_BENCHMARK_DEFINE(double_thrust_iter, double, false, false); - -#define PAIRITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust) \ - BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \ - { \ - BM_pair_iterator(state); \ - } \ - BENCHMARK_REGISTER_F(Iterator, name) \ - ->RangeMultiplier(10) \ - ->Range(1000, 10000000) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -PAIRITER_BM_BENCHMARK_DEFINE(double_cub_pair, double, true); -PAIRITER_BM_BENCHMARK_DEFINE(double_thrust_pair, double, false); From 3801f811ab7713e4cb9cc3bb34d282f8a04e71e4 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 13 Aug 2024 12:40:40 -0500 Subject: [PATCH 54/72] Remove hardcoded versions from workflows. 
(#16540) This PR removes hardcoded Python versions from CI workflows. It is a prerequisite for dropping Python 3.9. See https://github.com/rapidsai/build-planning/issues/88. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16540 --- .github/workflows/pandas-tests.yaml | 3 ++- .github/workflows/pr.yaml | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index cf0c2b377dd..10c803f7921 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -19,7 +19,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c2e7f64f952..ea8a1762b2c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -187,6 +187,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/cudf_pandas_scripts/run_tests.sh @@ -196,7 +197,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. From 5780c4d8fb5afac2e04988a2ff5531f94c22d3a3 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 13 Aug 2024 13:46:31 -0700 Subject: [PATCH 55/72] Register `read_parquet` and `read_csv` with dask-expr (#16535) After https://github.com/dask/dask-expr/pull/1114, Dask cuDF must register specific `read_parquet` and `read_csv` functions to be used when query-planning is enabled (the default). **This PR is required for CI to pass with dask>2024.8.0** **NOTE**: It probably doesn't make sense to add specific tests for this change. Once the 2014.7.1 dask pin is removed, all `dask_cudf` tests using `read_parquet` and `read_csv` will fail without this change... Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/16535 --- python/dask_cudf/dask_cudf/backends.py | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 2b1f745fc04..01bab30190a 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -667,6 +667,41 @@ def from_dict( constructor=constructor, ) + @staticmethod + def read_parquet(*args, engine=None, **kwargs): + import dask_expr as dx + + from dask_cudf.io.parquet import CudfEngine + + return _default_backend( + dx.read_parquet, *args, engine=CudfEngine, **kwargs + ) + + @staticmethod + def read_csv( + path, + *args, + header="infer", + dtype_backend=None, + storage_options=None, + **kwargs, + ): + import dask_expr as dx + from fsspec.utils import stringify_path + + if not isinstance(path, str): + path = stringify_path(path) + return dx.new_collection( + dx.io.csv.ReadCSV( + path, + dtype_backend=dtype_backend, + storage_options=storage_options, + kwargs=kwargs, + header=header, + dataframe_backend="cudf", + ) + ) + @staticmethod def read_json(*args, **kwargs): from dask_cudf.io.json import read_json as read_json_impl From cf3fabf7d090dcd983080e3c844002ebb7280e77 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 13 Aug 2024 22:59:47 +0200 Subject: [PATCH 56/72] Ensure comparisons with pyints and integer series always succeed (#16532) When Python integers are compared to a series of integers, the result can always be correctly defined no matter the values of the Python integer. This was always a very mild issue. But with NumPy 2 behavior not upcasting the computation result type based on the value anymore, even things like: ``` cudf.Series([1, 2, 3], dtype="int8") < 1000 ``` would fail. 
(Similar paths could be taken for other integer scalars, but there would be mostly nice for performance.) N.B. NumPy/pandas also support exact comparisons when mixing e.g. uint64 and int64. This is another rare exception that cudf currently does not support. Closes gh-16282 Authors: - Sebastian Berg (https://github.com/seberg) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16532 --- python/cudf/cudf/core/column/numerical.py | 54 +++++++++++++++++------ python/cudf/cudf/tests/test_binops.py | 41 +++++++++++++++++ 2 files changed, 81 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b83d7600c82..bbc74ef349e 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -199,16 +199,53 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: np.bool_: np.float32, } + out_dtype = None if op in {"__truediv__", "__rtruediv__"}: # Division with integer types results in a suitable float. if truediv_type := int_float_dtype_mapping.get(self.dtype.type): return self.astype(truediv_type)._binaryop(other, op) + elif op in { + "__lt__", + "__gt__", + "__le__", + "__ge__", + "__eq__", + "__ne__", + }: + out_dtype = "bool" + + # If `other` is a Python integer and it is out-of-bounds + # promotion could fail but we can trivially define the result + # in terms of `notnull` or `NULL_NOT_EQUALS`. 
+ if type(other) is int and self.dtype.kind in "iu": # noqa: E721 + truthiness = None + iinfo = np.iinfo(self.dtype) + if iinfo.min > other: + truthiness = op in {"__ne__", "__gt__", "__ge__"} + elif iinfo.max < other: + truthiness = op in {"__ne__", "__lt__", "__le__"} + + # Compare with minimum value so that the result is true/false + if truthiness is True: + other = iinfo.min + op = "__ge__" + elif truthiness is False: + other = iinfo.min + op = "__lt__" + + elif op in {"NULL_EQUALS", "NULL_NOT_EQUALS"}: + out_dtype = "bool" reflect, op = self._check_reflected_op(op) if (other := self._wrap_binop_normalization(other)) is NotImplemented: return NotImplemented - out_dtype = self.dtype - if other is not None: + + if out_dtype is not None: + pass # out_dtype was already set to bool + if other is None: + # not a binary operator, so no need to promote + out_dtype = self.dtype + elif out_dtype is None: out_dtype = np.result_type(self.dtype, other.dtype) if op in {"__mod__", "__floordiv__"}: tmp = self if reflect else other @@ -225,17 +262,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: out_dtype = cudf.dtype("float64") elif is_scalar(tmp) and tmp == 0: out_dtype = cudf.dtype("float64") - if op in { - "__lt__", - "__gt__", - "__le__", - "__ge__", - "__eq__", - "__ne__", - "NULL_EQUALS", - "NULL_NOT_EQUALS", - }: - out_dtype = "bool" if op in {"__and__", "__or__", "__xor__"}: if self.dtype.kind == "f" or other.dtype.kind == "f": @@ -247,7 +273,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if self.dtype.kind == "b" or other.dtype.kind == "b": out_dtype = "bool" - if ( + elif ( op == "__pow__" and self.dtype.kind in "iu" and (is_integer(other) or other.dtype.kind in "iu") diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 503b1a975b4..4256ec872e6 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -290,6 +290,47 @@ def 
test_series_compare(cmpop, obj_class, dtype): np.testing.assert_equal(result3.to_numpy(), cmpop(arr1, arr2)) +@pytest.mark.parametrize( + "dtype,val", + [("int8", 200), ("int32", 2**32), ("uint8", -128), ("uint64", -1)], +) +@pytest.mark.parametrize( + "op", + [ + operator.eq, + operator.ne, + operator.lt, + operator.le, + operator.gt, + operator.ge, + ], +) +@pytest.mark.parametrize("reverse", [False, True]) +def test_series_compare_integer(dtype, val, op, reverse): + # Tests that these actually work, even though they are out of bound. + force_cast_val = np.array(val).astype(dtype) + sr = Series( + [np.iinfo(dtype).min, np.iinfo(dtype).max, force_cast_val, None], + dtype=dtype, + ) + + if reverse: + _op = op + + def op(x, y): + return _op(y, x) + + # We expect the same result as comparing to a value within range (e.g. 0) + # except that a NULL value evaluates to False + if op(0, val): + expected = Series([True, True, True, None]) + else: + expected = Series([False, False, False, None]) + + res = op(sr, val) + assert_eq(res, expected) + + def _series_compare_nulls_typegen(): return [ *combinations_with_replacement(DATETIME_TYPES, 2), From 1f0d0c93f315f64698ffcc80082926896facf13a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 14 Aug 2024 09:07:22 -0400 Subject: [PATCH 57/72] Change cudf::empty_like to not include offsets for empty strings columns (#16529) Fixes `cudf::empty_like` to only create empty child columns for nested types. The empty child columns are needed to store the types for consistency with `cudf::make_empty_column`. 
Closes #16490 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/16529 --- cpp/src/copying/copy.cpp | 6 ++++++ cpp/tests/copying/pack_tests.cpp | 6 ++++-- cpp/tests/replace/replace_nulls_tests.cpp | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 98ee6aa8f68..bac8dbe5d95 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -143,6 +143,12 @@ std::unique_ptr empty_like(column_view const& input) { CUDF_FUNC_RANGE(); + // test_dataframe.py passes an EMPTY column type here; + // this causes is_nested to throw an error since it uses the type-dispatcher + if ((input.type().id() == type_id::EMPTY) || !cudf::is_nested(input.type())) { + return make_empty_column(input.type()); + } + std::vector> children; std::transform(input.child_begin(), input.child_end(), diff --git a/cpp/tests/copying/pack_tests.cpp b/cpp/tests/copying/pack_tests.cpp index ea4408efa6a..8a50e071cb9 100644 --- a/cpp/tests/copying/pack_tests.cpp +++ b/cpp/tests/copying/pack_tests.cpp @@ -573,6 +573,8 @@ TEST_F(PackUnpackTest, SlicedEmpty) cudf::table_view t({a, b, c, d}); - auto sliced = cudf::split(t, {0}); - this->run_test(sliced[0]); + auto sliced = cudf::split(t, {0}); + auto packed = cudf::pack(t); + auto unpacked = cudf::unpack(packed); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t, unpacked); } diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp index 9603ea44a76..fcee27305f2 100644 --- a/cpp/tests/replace/replace_nulls_tests.cpp +++ b/cpp/tests/replace/replace_nulls_tests.cpp @@ -674,7 +674,7 @@ TEST_F(ReplaceDictionaryTest, ReplaceNullsEmpty) cudf::test::fixed_width_column_wrapper input_empty_w({}); auto input_empty = cudf::dictionary::encode(input_empty_w); auto result = cudf::replace_nulls(input_empty->view(), 
input_empty->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), input_empty->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), input_empty->view()); } TEST_F(ReplaceDictionaryTest, ReplaceNullsNoNulls) From c20d6b3a3588c70d985e0d737fed844a9c0c6426 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 14 Aug 2024 09:07:51 -0400 Subject: [PATCH 58/72] Remove unneeded output size parameter from internal count_matches utility (#16531) Removes `output_size` parameter from `cudf::strings::detail::count_matches` utility since the output size should equal the input size from the first parameter. This also removes an unnecessary `assert()` call. The parameter became unnecessary as part of the large strings work. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/16531 --- cpp/src/strings/contains.cu | 2 +- cpp/src/strings/count_matches.cu | 9 +++------ cpp/src/strings/count_matches.hpp | 2 -- cpp/src/strings/extract/extract_all.cu | 2 +- cpp/src/strings/search/findall.cu | 2 +- cpp/src/strings/split/split_re.cu | 6 +++--- 6 files changed, 9 insertions(+), 14 deletions(-) diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 718ac41e36c..79d241205df 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -112,7 +112,7 @@ std::unique_ptr count_re(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - auto result = count_matches(*d_strings, *d_prog, input.size(), stream, mr); + auto result = count_matches(*d_strings, *d_prog, stream, mr); if (input.has_nulls()) { result->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu index 
e8672ea5335..4ad3a75baf7 100644 --- a/cpp/src/strings/count_matches.cu +++ b/cpp/src/strings/count_matches.cu @@ -60,18 +60,15 @@ struct count_fn { std::unique_ptr count_matches(column_device_view const& d_strings, reprog_device& d_prog, - size_type output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - assert(output_size >= d_strings.size() and "Unexpected output size"); - auto results = make_numeric_column( - data_type{type_to_id()}, output_size, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, d_strings.size(), mask_state::UNALLOCATED, stream, mr); - if (d_strings.size() == 0) return results; + if (d_strings.size() == 0) { return results; } - auto d_results = results->mutable_view().data(); + auto d_results = results->mutable_view().data(); launch_transform_kernel(count_fn{d_strings}, d_prog, d_results, d_strings.size(), stream); diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp index 4a5efac37fd..eab9863b975 100644 --- a/cpp/src/strings/count_matches.hpp +++ b/cpp/src/strings/count_matches.hpp @@ -37,14 +37,12 @@ class reprog_device; * * @param d_strings Device view of the input strings column. * @param d_prog Regex instance to evaluate on each string. - * @param output_size Number of rows for the output column. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return Integer column of match counts */ std::unique_ptr count_matches(column_device_view const& d_strings, reprog_device& d_prog, - size_type output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 27691068d5a..897eba58833 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -119,7 +119,7 @@ std::unique_ptr extract_all_record(strings_column_view const& input, // Get the match counts for each string. // This column will become the output lists child offsets column. - auto counts = count_matches(*d_strings, *d_prog, strings_count, stream, mr); + auto counts = count_matches(*d_strings, *d_prog, stream, mr); auto d_counts = counts->mutable_view().data(); // Compute null output rows diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 0d0962258cf..2f7e7352458 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -104,7 +104,7 @@ std::unique_ptr findall(strings_column_view const& input, auto d_prog = regex_device_builder::create_prog_device(prog, stream); // Create lists offsets column - auto const sizes = count_matches(*d_strings, *d_prog, strings_count, stream, mr); + auto const sizes = count_matches(*d_strings, *d_prog, stream, mr); auto [offsets, total_matches] = cudf::detail::make_offsets_child_column( sizes->view().begin(), sizes->view().end(), stream, mr); auto const d_offsets = offsets->view().data(); diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index e0aacf07ef0..d273c93ec12 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -210,8 +210,8 @@ std::unique_ptr
split_re(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string - auto const counts = count_matches( - *d_strings, *d_prog, strings_count, stream, rmm::mr::get_current_device_resource()); + auto const counts = + count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); // get the split tokens from the input column; this also converts the counts into offsets auto [tokens, offsets] = @@ -275,7 +275,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string - auto counts = count_matches(*d_strings, *d_prog, strings_count, stream, mr); + auto counts = count_matches(*d_strings, *d_prog, stream, mr); // get the split tokens from the input column; this also converts the counts into offsets auto [tokens, offsets] = From bf3372b1aa02939db32b2df62ab816a0eb9abdde Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 14 Aug 2024 12:06:29 -0500 Subject: [PATCH 59/72] Switch python version to `3.10` in `cudf.pandas` pandas test scripts (#16559) python 3.9 support was recently dropped in rapids, hence changing the python version to 3.10 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16559 --- ci/cudf_pandas_scripts/pandas-tests/diff.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh index 6cf70a2347f..5dbb4ba991c 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh @@ -12,7 +12,7 @@ RAPIDS_FULL_VERSION=$(<./VERSION) rapids-logger "Github job name: ${GH_JOB_NAME}" rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}" -PY_VER="39" +PY_VER="310" 
MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json From d684ae0e80d179d4d711c00278d00b5f66625303 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Aug 2024 12:36:51 -1000 Subject: [PATCH 60/72] Raise NotImplementedError for Series.rename that's not a scalar (#16525) xref https://github.com/rapidsai/cudf/issues/16507 Raising a `NotImplementedError` gives a chance for this work in `cudf.pandas` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16525 --- python/cudf/cudf/core/series.py | 4 ++++ python/cudf/cudf/tests/test_series.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 53675d339ac..822b966364f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3589,6 +3589,10 @@ def rename( raise NotImplementedError("level is currently not supported.") if errors != "ignore": raise NotImplementedError("errors is currently not supported.") + if not is_scalar(index): + raise NotImplementedError( + ".rename does not currently support relabeling the index." 
+ ) out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 6a1887afb1f..c7aea563535 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2289,6 +2289,13 @@ def test_series_rename(initial_name, name): assert_eq(actual, expected) +@pytest.mark.parametrize("index", [lambda x: x * 2, {1: 2}]) +def test_rename_index_not_supported(index): + ser = cudf.Series(range(2)) + with pytest.raises(NotImplementedError): + ser.rename(index=index) + + @pytest.mark.parametrize( "data", [ From 0253e976ede25d954c607663da61b445e213523f Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 14 Aug 2024 21:27:52 -0400 Subject: [PATCH 61/72] [FEA] Support named aggregations in `df.groupby().agg()` (#16528) Closes #15967 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16528 --- python/cudf/cudf/core/column_accessor.py | 7 +--- python/cudf/cudf/core/groupby/groupby.py | 41 ++++++++++++------- python/cudf/cudf/tests/groupby/test_agg.py | 30 ++++++++++++++ .../cudf/cudf/tests/test_column_accessor.py | 4 ++ python/cudf/cudf/tests/test_dataframe.py | 1 - 5 files changed, 62 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 83596704672..48bc84070b1 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -610,7 +610,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any: return key + (pad_value,) * (self.nlevels - len(key)) def rename_levels( - self, mapper: Mapping[Any, Any] | Callable, level: int | None + self, mapper: Mapping[Any, Any] | Callable, level: int | None = None ) -> ColumnAccessor: """ Rename the specified levels of 
the given ColumnAccessor @@ -653,10 +653,7 @@ def rename_column(x): return x if level is None: - raise NotImplementedError( - "Renaming columns with a MultiIndex and level=None is" - "not supported" - ) + level = 0 new_col_names = (rename_column(k) for k in self.keys()) else: diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 92c4b73ceaa..9b71ea57f1f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -548,7 +548,7 @@ def _groupby(self): ) @_performance_tracking - def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): """ Apply aggregation(s) to the groups. @@ -648,11 +648,10 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): raise NotImplementedError( "Passing args to func is currently not supported." ) - if kwargs: - raise NotImplementedError( - "Passing kwargs to func is currently not supported." - ) - column_names, columns, normalized_aggs = self._normalize_aggs(func) + + column_names, columns, normalized_aggs = self._normalize_aggs( + func, **kwargs + ) orig_dtypes = tuple(c.dtype for c in columns) # Note: When there are no key columns, the below produces @@ -1266,11 +1265,11 @@ def _grouped(self, *, include_groups: bool = True): return (group_names, offsets, grouped_keys, grouped_values) def _normalize_aggs( - self, aggs: MultiColumnAggType + self, aggs: MultiColumnAggType, **kwargs ) -> tuple[Iterable[Any], tuple[ColumnBase, ...], list[list[AggType]]]: """ Normalize aggs to a list of list of aggregations, where `out[i]` - is a list of aggregations for column `self.obj[i]`. We support three + is a list of aggregations for column `self.obj[i]`. We support four different form of `aggs` input here: - A single agg, such as "sum". This agg is applied to all value columns. 
@@ -1279,18 +1278,30 @@ def _normalize_aggs( - A mapping of column name to aggs, such as {"a": ["sum"], "b": ["mean"]}, the aggs are applied to specified column. + - Pairs of column name and agg tuples passed as kwargs + eg. col1=("a", "sum"), col2=("b", "prod"). The output column names are + the keys. The aggs are applied to the corresponding column in the tuple. Each agg can be string or lambda functions. """ aggs_per_column: Iterable[AggType | Iterable[AggType]] - if isinstance(aggs, dict): - column_names, aggs_per_column = aggs.keys(), aggs.values() - columns = tuple(self.obj._data[col] for col in column_names) + # TODO: Remove isinstance condition when the legacy dask_cudf API is removed. + # See https://github.com/rapidsai/cudf/pull/16528#discussion_r1715482302 for information. + if aggs or isinstance(aggs, dict): + if isinstance(aggs, dict): + column_names, aggs_per_column = aggs.keys(), aggs.values() + columns = tuple(self.obj._data[col] for col in column_names) + else: + values = self.grouping.values + column_names = values._column_names + columns = values._columns + aggs_per_column = (aggs,) * len(columns) + elif not aggs and kwargs: + column_names, aggs_per_column = kwargs.keys(), kwargs.values() + columns = tuple(self.obj._data[x[0]] for x in kwargs.values()) + aggs_per_column = tuple(x[1] for x in kwargs.values()) else: - values = self.grouping.values - column_names = values._column_names - columns = values._columns - aggs_per_column = (aggs,) * len(columns) + raise TypeError("Must provide at least one aggregation function.") # is_list_like performs type narrowing but type-checkers don't # know it. 
One could add a TypeGuard annotation to diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index f8847f02d5a..99e7523031b 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -3,6 +3,7 @@ import pytest import cudf +from cudf.testing import assert_eq @pytest.mark.parametrize( @@ -26,3 +27,32 @@ def test_series_agg(attr): pd_agg = getattr(pdf.groupby(["a"])["a"], attr)("count") assert agg.ndim == pd_agg.ndim + + +@pytest.mark.parametrize("func", ["sum", "prod", "mean", "count"]) +@pytest.mark.parametrize("attr", ["agg", "aggregate"]) +def test_dataframe_agg(attr, func): + df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) + pdf = df.to_pandas() + + agg = getattr(df.groupby("a"), attr)(func) + pd_agg = getattr(pdf.groupby(["a"]), attr)(func) + + assert_eq(agg, pd_agg) + + agg = getattr(df.groupby("a"), attr)({"b": func}) + pd_agg = getattr(pdf.groupby(["a"]), attr)({"b": func}) + + assert_eq(agg, pd_agg) + + agg = getattr(df.groupby("a"), attr)([func]) + pd_agg = getattr(pdf.groupby(["a"]), attr)([func]) + + assert_eq(agg, pd_agg) + + agg = getattr(df.groupby("a"), attr)(foo=("b", func), bar=("a", func)) + pd_agg = getattr(pdf.groupby(["a"]), attr)( + foo=("b", func), bar=("a", func) + ) + + assert_eq(agg, pd_agg) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index e84e1433c10..2d7bc809d4d 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -362,6 +362,10 @@ def test_replace_level_values_MultiColumn(): got = ca.rename_levels(mapper={"a": "f"}, level=0) check_ca_equal(expect, got) + # passing without level kwarg assumes level=0 + got = ca.rename_levels(mapper={"a": "f"}) + check_ca_equal(expect, got) + def test_clear_nrows_empty_before(): ca = ColumnAccessor({}) diff --git a/python/cudf/cudf/tests/test_dataframe.py 
b/python/cudf/cudf/tests/test_dataframe.py index 2c59253d500..89eb5a12c71 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9409,7 +9409,6 @@ def test_rename_for_level_RangeIndex_dataframe(): assert_eq(expect, got) -@pytest_xfail(reason="level=None not implemented yet") def test_rename_for_level_is_None_MC(): gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) From 19846b6c0ac40fc91ad28573af04ac7403754acb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:15:03 -1000 Subject: [PATCH 62/72] Disallow cudf.Index accepting column in favor of ._from_column (#16549) Similar to https://github.com/rapidsai/cudf/pull/16454, this PR disallows the public `cudf.Index` accepting a private `ColumnBase` object in favor of `_from_column` (which was added in the linked PR) Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16549 --- python/cudf/cudf/_lib/parquet.pyx | 4 +- python/cudf/cudf/_lib/utils.pyx | 6 +- python/cudf/cudf/api/types.py | 2 +- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/algorithms.py | 6 +- python/cudf/cudf/core/column/categorical.py | 8 +- python/cudf/cudf/core/column/datetime.py | 10 +- python/cudf/cudf/core/column/methods.py | 6 +- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/cut.py | 2 +- python/cudf/cudf/core/dataframe.py | 8 +- python/cudf/cudf/core/dtypes.py | 14 +- python/cudf/cudf/core/groupby/groupby.py | 9 +- python/cudf/cudf/core/index.py | 238 ++++++++++++-------- python/cudf/cudf/core/indexed_frame.py | 24 +- python/cudf/cudf/core/multiindex.py | 7 +- python/cudf/cudf/core/resample.py | 4 +- python/cudf/cudf/core/series.py | 4 
+- python/cudf/cudf/core/tools/datetimes.py | 16 +- python/cudf/cudf/testing/testing.py | 8 +- python/cudf/cudf/tests/test_multiindex.py | 4 +- python/cudf/cudf/tests/test_string.py | 2 +- 22 files changed, 232 insertions(+), 154 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 4a4b13b0b31..0fffb6ade58 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -222,7 +222,7 @@ cdef object _process_metadata(object df, if len(filtered_idx) > 0: idx = cudf.concat(filtered_idx) else: - idx = cudf.Index(cudf.core.column.column_empty(0)) + idx = cudf.Index._from_column(cudf.core.column.column_empty(0)) else: start = range_index_meta["start"] + skip_rows stop = range_index_meta["stop"] @@ -240,7 +240,7 @@ cdef object _process_metadata(object df, index_data = df[index_col] actual_index_names = list(index_col_names.values()) if len(index_data._data) == 1: - idx = cudf.Index( + idx = cudf.Index._from_column( index_data._data.columns[0], name=actual_index_names[0] ) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index f136cd997a7..267432a0182 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -93,12 +93,12 @@ cpdef generate_pandas_metadata(table, index): materialize_index = False if index is not False: for level, name in enumerate(table._index.names): - if isinstance(table._index, cudf.core.multiindex.MultiIndex): + if isinstance(table._index, cudf.MultiIndex): idx = table.index.get_level_values(level) else: idx = table.index - if isinstance(idx, cudf.core.index.RangeIndex): + if isinstance(idx, cudf.RangeIndex): if index is None: descr = { "kind": "range", @@ -110,7 +110,7 @@ cpdef generate_pandas_metadata(table, index): else: materialize_index = True # When `index=True`, RangeIndex needs to be materialized. 
- materialized_idx = cudf.Index(idx._values, name=idx.name) + materialized_idx = idx._as_int_index() descr = _index_level_name( index_name=materialized_idx.name, level=level, diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 294ae2fd985..9c436dfad18 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -249,7 +249,7 @@ def _union_categoricals( new_categories=sorted_categories ) - return cudf.Index(result_col) + return cudf.CategoricalIndex._from_column(result_col) def is_bool_dtype(arr_or_dtype): diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index c91514202c5..d13351c49dd 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1979,7 +1979,7 @@ def from_pandas(cls, index: pd.Index, nan_as_null=no_default): name=index.name, ) else: - return cudf.Index( + return cudf.Index._from_column( column.as_column(index, nan_as_null=nan_as_null), name=index.name, ) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 6c69fbd2637..e27d6ec8d3e 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -8,7 +8,7 @@ import numpy as np from cudf.core.column import as_column -from cudf.core.index import RangeIndex, ensure_index +from cudf.core.index import Index, RangeIndex from cudf.core.scalar import Scalar from cudf.options import get_option from cudf.utils.dtypes import can_convert_to_column @@ -112,7 +112,9 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): dtype="int64" if get_option("mode.pandas_compatible") else None, ).values - return labels, cats.values if return_cupy_array else ensure_index(cats) + return labels, cats.values if return_cupy_array else Index._from_column( + cats + ) def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/categorical.py 
b/python/cudf/cudf/core/column/categorical.py index 6fa69eb9cc1..d25983842f9 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -601,11 +601,13 @@ def __setitem__(self, key, value): to_add_categories = 0 else: if cudf.api.types.is_scalar(value): - arr = [value] + arr = column.as_column(value, length=1, nan_as_null=False) else: - arr = value + arr = column.as_column(value, nan_as_null=False) to_add_categories = len( - cudf.Index(arr, nan_as_null=False).difference(self.categories) + cudf.Index._from_column(arr).difference( + cudf.Index._from_column(self.categories) + ) ) if to_add_categories > 0: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index ce67ce81e6b..1dbc94384d3 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -250,6 +250,10 @@ def __contains__(self, item: ScalarLike) -> bool: def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] + @property + def quarter(self) -> ColumnBase: + return libcudf.datetime.extract_quarter(self) + @property def year(self) -> ColumnBase: return self.get_dt_field("year") @@ -308,7 +312,7 @@ def is_quarter_start(self) -> ColumnBase: @property def is_year_end(self) -> ColumnBase: day_of_year = self.day_of_year - leap_dates = libcudf.datetime.is_leap_year(self) + leap_dates = self.is_leap_year leap = day_of_year == cudf.Scalar(366) non_leap = day_of_year == cudf.Scalar(365) @@ -316,6 +320,10 @@ def is_year_end(self) -> ColumnBase: False ) + @property + def is_leap_year(self) -> ColumnBase: + return libcudf.datetime.is_leap_year(self) + @property def is_year_start(self) -> ColumnBase: return (self.day_of_year == 1).fillna(False) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 8c46d238057..05a0ab2e09a 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -65,8 
+65,8 @@ def _return_or_inplace( """ if inplace: self._parent._mimic_inplace( - self._parent.__class__._from_data( - {self._parent.name: new_col} + type(self._parent)._from_column( + new_col, name=self._parent.name ), inplace=True, ) @@ -92,6 +92,6 @@ def _return_or_inplace( index=self._parent.index if retain_index else None, ) elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index(new_col, name=self._parent.name) + return cudf.Index._from_column(new_col, name=self._parent.name) else: return self._parent._mimic_inplace(new_col, inplace=False) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1a4b558749d..a710a9f46c2 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4693,7 +4693,7 @@ def character_tokenize(self) -> SeriesOrIndex: result_col, name=self._parent.name, index=index ) elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index(result_col, name=self._parent.name) + return cudf.Index._from_column(result_col, name=self._parent.name) else: return result_col diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 197f46ee9fe..a4ceea266b4 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -292,7 +292,7 @@ def cut( ) # we return a categorical index, as we don't have a Categorical method - categorical_index = cudf.CategoricalIndex._from_data({None: col}) + categorical_index = cudf.CategoricalIndex._from_column(col) if isinstance(orig_x, (pd.Series, cudf.Series)): # if we have a series input we return a series output diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a53c7bcc63c..3033abd53f5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -326,7 +326,7 @@ def _getitem_tuple_arg(self, arg): range(len(tmp_arg[0])) ) }, - index=cudf.Index(tmp_arg[0]), + index=cudf.Index._from_column(tmp_arg[0]), ) 
columns_df[cantor_name] = column.as_column( range(len(columns_df)) @@ -1758,7 +1758,7 @@ def _concat( for cols in columns: table_index = None if 1 == first_data_column_position: - table_index = cudf.Index(cols[0]) + table_index = cudf.Index._from_column(cols[0]) elif first_data_column_position > 1: table_index = cudf.MultiIndex._from_data( data=dict( @@ -1810,7 +1810,7 @@ def _concat( if not isinstance(out.index, MultiIndex) and isinstance( out.index.dtype, cudf.CategoricalDtype ): - out = out.set_index(cudf.Index(out.index._values)) + out = out.set_index(out.index) for name, col in out._data.items(): out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype @@ -3007,7 +3007,7 @@ def set_index( and not isinstance(keys[0], (cudf.MultiIndex, pd.MultiIndex)) ): # Don't turn single level MultiIndex into an Index - idx = cudf.Index(data_to_add[0], name=names[0]) + idx = cudf.Index._from_column(data_to_add[0], name=names[0]) else: idx = MultiIndex._from_data(dict(enumerate(data_to_add))) idx.names = names diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 27afec18b4e..6d532e01cba 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -182,7 +182,7 @@ def __init__(self, categories=None, ordered: bool = False) -> None: self._ordered = ordered @property - def categories(self) -> "cudf.core.index.Index": + def categories(self) -> cudf.Index: """ An ``Index`` containing the unique categories allowed. 
@@ -194,10 +194,12 @@ def categories(self) -> "cudf.core.index.Index": Index(['b', 'a'], dtype='object') """ if self._categories is None: - return cudf.Index( - cudf.core.column.column_empty(0, dtype="object", masked=False) + col = cudf.core.column.column_empty( + 0, dtype="object", masked=False ) - return cudf.Index(self._categories, copy=False) + else: + col = self._categories + return cudf.Index._from_column(col) @property def type(self): @@ -259,7 +261,9 @@ def to_pandas(self) -> pd.CategoricalDtype: categories = self._categories.to_pandas() return pd.CategoricalDtype(categories=categories, ordered=self.ordered) - def _init_categories(self, categories: Any): + def _init_categories( + self, categories: Any + ) -> cudf.core.column.ColumnBase | None: if categories is None: return categories if len(categories) == 0 and not isinstance( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 9b71ea57f1f..4f283d41b17 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -403,8 +403,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: if len(group_keys) > 1: index = cudf.MultiIndex.from_arrays(group_keys) else: - (group_keys,) = group_keys - index = cudf.Index(group_keys) + index = cudf.Index._from_column(group_keys[0]) return dict( zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1])) ) @@ -2583,7 +2582,7 @@ def _mimic_pandas_order( # corresponding output rows in pandas, to do that here # expand the result by reindexing. 
ri = cudf.RangeIndex(0, len(self.obj)) - result.index = cudf.Index(ordering) + result.index = cudf.Index._from_column(ordering) # This reorders and expands result = result.reindex(ri) else: @@ -3154,7 +3153,9 @@ def keys(self): dict(zip(range(nkeys), self._key_columns)) )._set_names(self.names) else: - return cudf.Index(self._key_columns[0], name=self.names[0]) + return cudf.Index._from_column( + self._key_columns[0], name=self.names[0] + ) @property def values(self) -> cudf.core.frame.Frame: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3eab27bd165..c55f86d48e1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -18,7 +18,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.datetime import extract_quarter, is_leap_year from cudf._lib.filling import sequence from cudf._lib.search import search_sorted from cudf._lib.types import size_type_dtype @@ -819,22 +818,23 @@ def sort_values( @_performance_tracking def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) - return cudf.Index._from_data( - {self.name: self._values.take(gather_map, nullify, check_bounds)} + return cudf.Index._from_column( + self._column.take(gather_map, nullify, check_bounds), + name=self.name, ) @_performance_tracking def _apply_boolean_mask(self, boolean_mask): - return cudf.Index._from_data( - {self.name: self._values.apply_boolean_mask(boolean_mask)} + return cudf.Index._from_column( + self._column.apply_boolean_mask(boolean_mask), name=self.name ) def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return cudf.Index._from_data( - {self.name: self._as_int_index()._split(splits)} + return cudf.Index._from_column( + self._as_int_index()._split(splits), name=self.name ) def _binaryop(self, other, op: str): @@ -1087,10 +1087,13 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): 
def _from_column( cls, column: ColumnBase, *, name: Hashable = None ) -> Self: - ca = cudf.core.column_accessor.ColumnAccessor( - {name: column}, verify=False - ) - return _index_from_data(ca) + if cls is Index: + ca = cudf.core.column_accessor.ColumnAccessor( + {name: column}, verify=False + ) + return _index_from_data(ca) + else: + return super()._from_column(column, name=name) @classmethod @_performance_tracking @@ -1223,8 +1226,8 @@ def _concat(cls, objs): if all(isinstance(obj, RangeIndex) for obj in non_empties): result = _concat_range_index(non_empties) else: - data = concat_columns([o._values for o in non_empties]) - result = Index(data) + data = concat_columns([o._column for o in non_empties]) + result = Index._from_column(data) names = {obj.name for obj in objs} if len(names) == 1: @@ -1491,7 +1494,7 @@ def __repr__(self): def __getitem__(self, index): res = self._get_elements_from_column(index) if isinstance(res, ColumnBase): - res = Index(res, name=self.name) + res = Index._from_column(res, name=self.name) return res @property # type: ignore @@ -1610,8 +1613,8 @@ def _clean_nulls_from_index(self): if isinstance(self, (DatetimeIndex, TimedeltaIndex)) else str(cudf.NA) ) - return cudf.Index( - self._values.astype("str").fillna(fill_value), + return cudf.Index._from_column( + self._column.astype("str").fillna(fill_value), name=self.name, ) @@ -1866,6 +1869,17 @@ def _from_data( result._freq = _validate_freq(freq) return result + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None + ) -> Self: + if column.dtype.kind != "M": + raise ValueError("column must have a datetime type.") + result = super()._from_column(column, name=name) + result._freq = _validate_freq(freq) + return result + def __getitem__(self, index): value = super().__getitem__(index) if cudf.get_option("mode.pandas_compatible") and isinstance( @@ -1923,8 +1937,8 @@ def strftime(self, date_format: str) -> Index: 
date_format : str Date format string (e.g. "%Y-%m-%d"). """ - return Index._from_data( - {self.name: self._column.strftime(date_format)} + return Index._from_column( + self._column.strftime(date_format), name=self.name ) @property @@ -1989,7 +2003,9 @@ def to_pydatetime(self) -> np.ndarray: return self.to_pandas().to_pydatetime() def to_julian_date(self) -> Index: - return Index._from_data({self.name: self._column.to_julian_date()}) + return Index._from_column( + self._column.to_julian_date(), name=self.name + ) def to_period(self, freq) -> pd.PeriodIndex: return self.to_pandas().to_period(freq=freq) @@ -2000,7 +2016,9 @@ def normalize(self) -> Self: Currently not implemented. """ - return type(self)._from_data({self.name: self._column.normalize()}) + return type(self)._from_column( + self._column.normalize(), name=self.name + ) @property def time(self) -> np.ndarray: @@ -2084,7 +2102,7 @@ def days_in_month(self) -> Index: """ Get the total number of days in the month that the date falls on. """ - return Index._from_data({self.name: self._column.days_in_month}) + return Index._from_column(self._column.days_in_month, name=self.name) daysinmonth = days_in_month @@ -2093,7 +2111,7 @@ def day_of_week(self) -> Index: """ Get the day of week that the date falls on. """ - return Index._from_data({self.name: self._column.day_of_week}) + return Index._from_column(self._column.day_of_week, name=self.name) @property # type: ignore @_performance_tracking @@ -2234,15 +2252,15 @@ def microsecond(self): >>> datetime_index.microsecond Index([0, 1, 2], dtype='int32') """ # noqa: E501 - return Index( + return Index._from_column( ( # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this # __mul__ returns an int16 column. 
- self._values.get_dt_field("millisecond").astype("int32") + self._column.get_dt_field("millisecond").astype("int32") * cudf.Scalar(1000, dtype="int32") ) - + self._values.get_dt_field("microsecond"), + + self._column.get_dt_field("microsecond"), name=self.name, ) @@ -2374,7 +2392,7 @@ def is_leap_year(self) -> cupy.ndarray: ndarray Booleans indicating if dates belong to a leap year. """ - res = is_leap_year(self._values).fillna(False) + res = self._column.is_leap_year.fillna(False) return cupy.asarray(res) @property # type: ignore @@ -2400,8 +2418,7 @@ def quarter(self): >>> gIndex.quarter Index([2, 4], dtype='int8') """ - res = extract_quarter(self._values) - return Index(res, dtype="int8") + return Index._from_column(self._column.quarter.astype("int8")) @_performance_tracking def day_name(self, locale: str | None = None) -> Index: @@ -2423,7 +2440,7 @@ def day_name(self, locale: str | None = None) -> Index: dtype='object') """ day_names = self._column.get_day_names(locale) - return Index._from_data({self.name: day_names}) + return Index._from_column(day_names, name=self.name) @_performance_tracking def month_name(self, locale: str | None = None) -> Index: @@ -2442,7 +2459,7 @@ def month_name(self, locale: str | None = None) -> Index: Index(['December', 'January', 'January', 'January', 'January', 'February'], dtype='object') """ month_names = self._column.get_month_names(locale) - return Index._from_data({self.name: month_names}) + return Index._from_column(month_names, name=self.name) @_performance_tracking def isocalendar(self) -> cudf.DataFrame: @@ -2481,14 +2498,14 @@ def to_pandas( @_performance_tracking def _get_dt_field(self, field: str) -> Index: """Return an Index of a numerical component of the DatetimeIndex.""" - out_column = self._values.get_dt_field(field) + out_column = self._column.get_dt_field(field) out_column = NumericalColumn( data=out_column.base_data, dtype=out_column.dtype, mask=out_column.base_mask, offset=out_column.offset, ) - return 
Index(out_column, name=self.name) + return Index._from_column(out_column, name=self.name) def _is_boolean(self): return False @@ -2522,9 +2539,7 @@ def ceil(self, freq): >>> gIndex.ceil("T") DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') """ # noqa: E501 - out_column = self._values.ceil(freq) - - return self.__class__._from_data({self.name: out_column}) + return type(self)._from_column(self._column.ceil(freq), name=self.name) @_performance_tracking def floor(self, freq): @@ -2555,9 +2570,9 @@ def floor(self, freq): >>> gIndex.floor("T") DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') """ # noqa: E501 - out_column = self._values.floor(freq) - - return self.__class__._from_data({self.name: out_column}) + return type(self)._from_column( + self._column.floor(freq), name=self.name + ) @_performance_tracking def round(self, freq): @@ -2595,9 +2610,9 @@ def round(self, freq): >>> dt_idx.round('T') DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]') """ # noqa: E501 - out_column = self._values.round(freq) - - return self.__class__._from_data({self.name: out_column}) + return type(self)._from_column( + self._column.round(freq), name=self.name + ) def tz_localize( self, @@ -2647,8 +2662,8 @@ def tz_localize( to 'NaT'. 
""" # noqa: E501 result_col = self._column.tz_localize(tz, ambiguous, nonexistent) - return DatetimeIndex._from_data( - {self.name: result_col}, freq=self._freq + return DatetimeIndex._from_column( + result_col, name=self.name, freq=self._freq ) def tz_convert(self, tz: str | None): @@ -2684,7 +2699,7 @@ def tz_convert(self, tz: str | None): dtype='datetime64[ns, Europe/London]') """ # noqa: E501 result_col = self._column.tz_convert(tz) - return DatetimeIndex._from_data({self.name: result_col}) + return DatetimeIndex._from_column(result_col, name=self.name) def repeat(self, repeats, axis=None): res = super().repeat(repeats, axis=axis) @@ -2794,6 +2809,15 @@ def __init__( super().__init__(data, name=name) + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None + ) -> Self: + if column.dtype.kind != "m": + raise ValueError("column must have a timedelta type.") + return super()._from_column(column, name=name) + def __getitem__(self, index): value = super().__getitem__(index) if cudf.get_option("mode.pandas_compatible") and isinstance( @@ -2876,7 +2900,7 @@ def ceil(self, freq: str) -> Self: This method is currently not implemented. """ - return type(self)._from_data({self.name: self._column.ceil(freq)}) + return type(self)._from_column(self._column.ceil(freq), name=self.name) def floor(self, freq: str) -> Self: """ @@ -2884,7 +2908,9 @@ def floor(self, freq: str) -> Self: This method is currently not implemented. """ - return type(self)._from_data({self.name: self._column.floor(freq)}) + return type(self)._from_column( + self._column.floor(freq), name=self.name + ) def round(self, freq: str) -> Self: """ @@ -2892,41 +2918,51 @@ def round(self, freq: str) -> Self: This method is currently not implemented. 
""" - return type(self)._from_data({self.name: self._column.round(freq)}) + return type(self)._from_column( + self._column.round(freq), name=self.name + ) @property # type: ignore @_performance_tracking - def days(self): + def days(self) -> cudf.Index: """ Number of days for each element. """ # Need to specifically return `int64` to avoid overflow. - return Index(self._values.days, name=self.name, dtype="int64") + return Index._from_column( + self._column.days.astype("int64"), name=self.name + ) @property # type: ignore @_performance_tracking - def seconds(self): + def seconds(self) -> cudf.Index: """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return Index(self._values.seconds, name=self.name, dtype="int32") + return Index._from_column( + self._column.seconds.astype("int32"), name=self.name + ) @property # type: ignore @_performance_tracking - def microseconds(self): + def microseconds(self) -> cudf.Index: """ Number of microseconds (>= 0 and less than 1 second) for each element. """ - return Index(self._values.microseconds, name=self.name, dtype="int32") + return Index._from_column( + self._column.microseconds.astype("int32"), name=self.name + ) @property # type: ignore @_performance_tracking - def nanoseconds(self): + def nanoseconds(self) -> cudf.Index: """ Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. 
""" - return Index(self._values.nanoseconds, name=self.name, dtype="int32") + return Index._from_column( + self._column.nanoseconds.astype("int32"), name=self.name + ) @property # type: ignore @_performance_tracking @@ -3061,17 +3097,26 @@ def __init__( data = data.as_ordered(ordered=False) super().__init__(data, name=name) + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None + ) -> Self: + if not isinstance(column.dtype, cudf.CategoricalDtype): + raise ValueError("column must have a categorial type.") + return super()._from_column(column, name=name) + @property def ordered(self) -> bool: return self._column.ordered @property # type: ignore @_performance_tracking - def codes(self): + def codes(self) -> cudf.Index: """ The category codes of this categorical. """ - return Index(self._values.codes) + return Index._from_column(self._column.codes) @property # type: ignore @_performance_tracking @@ -3094,24 +3139,24 @@ def add_categories(self, new_categories) -> Self: `new_categories` will be included at the last/highest place in the categories and will be unused directly after this call. """ - return type(self)._from_data( - {self.name: self._column.add_categories(new_categories)} + return type(self)._from_column( + self._column.add_categories(new_categories), name=self.name ) def as_ordered(self) -> Self: """ Set the Categorical to be ordered. """ - return type(self)._from_data( - {self.name: self._column.as_ordered(ordered=True)} + return type(self)._from_column( + self._column.as_ordered(ordered=True), name=self.name ) def as_unordered(self) -> Self: """ Set the Categorical to be unordered. 
""" - return type(self)._from_data( - {self.name: self._column.as_ordered(ordered=False)} + return type(self)._from_column( + self._column.as_ordered(ordered=False), name=self.name ) def remove_categories(self, removals) -> Self: @@ -3125,8 +3170,8 @@ def remove_categories(self, removals) -> Self: removals : category or list of categories The categories which should be removed. """ - return type(self)._from_data( - {self.name: self._column.remove_categories(removals)} + return type(self)._from_column( + self._column.remove_categories(removals), name=self.name ) def remove_unused_categories(self) -> Self: @@ -3135,8 +3180,8 @@ def remove_unused_categories(self) -> Self: This method is currently not supported. """ - return type(self)._from_data( - {self.name: self._column.remove_unused_categories()} + return type(self)._from_column( + self._column.remove_unused_categories(), name=self.name ) def rename_categories(self, new_categories) -> Self: @@ -3145,8 +3190,8 @@ def rename_categories(self, new_categories) -> Self: This method is currently not supported. """ - return type(self)._from_data( - {self.name: self._column.rename_categories(new_categories)} + return type(self)._from_column( + self._column.rename_categories(new_categories), name=self.name ) def reorder_categories(self, new_categories, ordered=None) -> Self: @@ -3164,12 +3209,9 @@ def reorder_categories(self, new_categories, ordered=None) -> Self: Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. """ - return type(self)._from_data( - { - self.name: self._column.reorder_categories( - new_categories, ordered=ordered - ) - } + return type(self)._from_column( + self._column.reorder_categories(new_categories, ordered=ordered), + name=self.name, ) def set_categories( @@ -3191,12 +3233,11 @@ def set_categories( considered as a rename of the old categories or as reordered categories. 
""" - return type(self)._from_data( - { - self.name: self._column.set_categories( - new_categories, ordered=ordered, rename=rename - ) - } + return type(self)._from_column( + self._column.set_categories( + new_categories, ordered=ordered, rename=rename + ), + name=self.name, ) @@ -3411,6 +3452,15 @@ def __init__( def closed(self): return self.dtype.closed + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None + ) -> Self: + if not isinstance(column.dtype, cudf.IntervalDtype): + raise ValueError("column must have a interval type.") + return super()._from_column(column, name=name) + @classmethod @_performance_tracking def from_breaks( @@ -3593,8 +3643,8 @@ def set_closed( Whether the intervals are closed on the left-side, right-side, both or neither. """ - return type(self)._from_data( - {self.name: self._column.set_closed(closed)} + return type(self)._from_column( + self._column.set_closed(closed), name=self.name ) def to_tuples(self, na_tuple: bool = True) -> pd.Index: @@ -3680,15 +3730,7 @@ def as_index( elif isinstance(arbitrary, BaseIndex): idx = arbitrary.copy(deep=copy).rename(name) elif isinstance(arbitrary, ColumnBase): - idx = _index_from_data({name: arbitrary}) - elif isinstance(arbitrary, cudf.Series): - return as_index( - arbitrary._column, - nan_as_null=nan_as_null, - copy=copy, - name=name, - dtype=dtype, - ) + raise ValueError("Use cudf.Index._from_column instead.") elif isinstance(arbitrary, (pd.RangeIndex, range)): idx = RangeIndex( start=arbitrary.start, @@ -3708,11 +3750,9 @@ def as_index( elif isinstance(arbitrary, cudf.DataFrame) or is_scalar(arbitrary): raise ValueError("Index data must be 1-dimensional and list-like") else: - return as_index( + return Index._from_column( column.as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null), - copy=copy, name=name, - dtype=dtype, ) if dtype is not None: idx = idx.astype(dtype) @@ -3749,7 +3789,9 @@ def 
_concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: elif step is None: # First non-empty index had only one element if obj.start == start: - result = Index(concat_columns([x._values for x in indexes])) + result = Index._from_column( + concat_columns([x._column for x in indexes]) + ) return result step = obj.start - start @@ -3757,7 +3799,9 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: next_ is not None and obj.start != next_ ) if non_consecutive: - result = Index(concat_columns([x._values for x in indexes])) + result = Index._from_column( + concat_columns([x._column for x in indexes]) + ) return result if step is not None: next_ = obj[-1] + step diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3b44a0f5864..8be9f0ad78e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -182,11 +182,16 @@ def _indices_from_labels(obj, labels): ) else: labels = labels.astype(obj.index.dtype) + idx_labels = cudf.Index._from_column(labels) + else: + idx_labels = labels # join is not guaranteed to maintain the index ordering # so we will sort it with its initial ordering which is stored # in column "__" - lhs = cudf.DataFrame({"__": as_column(range(len(labels)))}, index=labels) + lhs = cudf.DataFrame( + {"__": as_column(range(len(idx_labels)))}, index=idx_labels + ) rhs = cudf.DataFrame({"_": as_column(range(len(obj)))}, index=obj.index) return lhs.join(rhs).sort_values(by=["__", "_"])["_"] @@ -6642,7 +6647,11 @@ def _drop_rows_by_labels( # 3. 
Use "leftanti" join to drop # TODO: use internal API with "leftanti" and specify left and right # join keys to bypass logic check - to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) + if isinstance(labels, ColumnBase): + join_index = cudf.Index._from_column(labels, name=level) + else: + join_index = cudf.Index(labels, name=level) + to_join = cudf.DataFrame(index=join_index) join_res = working_df.join(to_join, how="leftanti") # 4. Reconstruct original layout, and rename @@ -6669,12 +6678,11 @@ def _drop_rows_by_labels( if errors == "raise" and not labels.isin(obj.index).all(): raise KeyError("One or more values not found in axis") - key_df = cudf.DataFrame._from_data( - data={}, - index=cudf.Index( - labels, name=getattr(labels, "name", obj.index.name) - ), - ) + if isinstance(labels, ColumnBase): + idx = cudf.Index._from_column(labels, name=obj.index.name) + else: + idx = cudf.Index(labels, name=labels.name) + key_df = cudf.DataFrame._from_data(data={}, index=idx) if isinstance(obj, cudf.DataFrame): res = obj.join(key_df, how="leftanti") else: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index ab88b191570..a66e2936e3b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -811,8 +811,9 @@ def _index_and_downcast(self, result, index, index_key): # it into an Index and name the final index values according # to that column's name. 
*_, last_column = index._data.columns - out_index = cudf.Index(last_column) - out_index.name = index.names[-1] + out_index = cudf.Index._from_column( + last_column, name=index.names[-1] + ) index = out_index elif out_index._num_columns > 1: # Otherwise pop the leftmost levels, names, and codes from the @@ -1061,7 +1062,7 @@ def get_level_values(self, level): raise KeyError(f"Level not found: '{level}'") else: level_idx = colnames.index(level) - level_values = cudf.Index( + level_values = cudf.Index._from_column( self._data[level], name=self.names[level_idx] ) return level_values diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 715bbf89b15..e0aee28bfeb 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -145,7 +145,9 @@ def copy(self, deep=True): def keys(self): index = super().keys if self._freq is not None and isinstance(index, cudf.DatetimeIndex): - return cudf.DatetimeIndex._from_data(index._data, freq=self._freq) + return cudf.DatetimeIndex._from_column( + index._column, name=index.name, freq=self._freq + ) return index def serialize(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 822b966364f..2fb4fde6552 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3245,8 +3245,8 @@ def value_counts( interval_col = IntervalColumn.from_struct_column( res.index._column._get_decategorized_column() ) - res.index = cudf.IntervalIndex._from_data( - {res.index.name: interval_col} + res.index = cudf.IntervalIndex._from_column( + interval_col, name=res.index.name ) res.name = result_name return res diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index c50a36b68b5..a92bf420147 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -18,7 +18,6 @@ ) from cudf.api.types import is_integer, is_scalar from cudf.core import column -from 
cudf.core.column_accessor import ColumnAccessor from cudf.core.index import ensure_index # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 @@ -288,8 +287,7 @@ def to_datetime( utc=utc, ) if isinstance(arg, (cudf.BaseIndex, pd.Index)): - ca = ColumnAccessor({arg.name: col}, verify=False) - return cudf.DatetimeIndex._from_data(ca) + return cudf.DatetimeIndex._from_column(col, name=arg.name) elif isinstance(arg, (cudf.Series, pd.Series)): return cudf.Series._from_column( col, name=arg.name, index=ensure_index(arg.index) @@ -297,7 +295,7 @@ def to_datetime( elif is_scalar(arg): return col.element_indexing(0) else: - return cudf.Index(col) + return cudf.Index._from_column(col) except Exception as e: if errors == "raise": raise e @@ -900,7 +898,9 @@ def date_range( end = cudf.Scalar(end, dtype=dtype).value.astype("int64") arr = np.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") - return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz) + return cudf.DatetimeIndex._from_column(result, name=name).tz_localize( + tz + ) # The code logic below assumes `freq` is defined. It is first normalized # into `DateOffset` for further computation with timestamps. 
@@ -1001,9 +1001,9 @@ def date_range( "datetime64[ns]" ) - return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize( - tz - ) + return cudf.DatetimeIndex._from_column( + res, name=name, freq=freq + ).tz_localize(tz) def _has_fixed_frequency(freq: DateOffset) -> bool: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index c2072d90e98..31ad24a4664 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -398,8 +398,12 @@ def assert_index_equal( ) for level in range(left.nlevels): - llevel = cudf.Index(left._columns[level], name=left.names[level]) - rlevel = cudf.Index(right._columns[level], name=right.names[level]) + llevel = cudf.Index._from_column( + left._columns[level], name=left.names[level] + ) + rlevel = cudf.Index._from_column( + right._columns[level], name=right.names[level] + ) mul_obj = f"MultiIndex level [{level}]" assert_index_equal( llevel, diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index a68f4574da3..b1e095e8853 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -167,7 +167,9 @@ def test_string_index(): pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) - stringIndex = cudf.Index(as_column(["a", "b", "c", "d", "e"]), name="name") + stringIndex = cudf.Index._from_column( + as_column(["a", "b", "c", "d", "e"]), name="name" + ) pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 30880f074c0..cc88cc79769 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1092,7 +1092,7 @@ def test_string_index(): pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) - stringIndex = cudf.Index( + stringIndex = cudf.Index._from_column( 
cudf.core.column.as_column(["a", "b", "c", "d", "e"]), name="name" ) pdf.index = stringIndex.to_pandas() From 89863a3b791250a2285b90d2c13f51f009638f44 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:22:31 -1000 Subject: [PATCH 63/72] Align public utility function signatures with pandas 2.x (#16565) The following function signatures have a breaking change * `concat` * `get_dummies` * `date_range` Additionally deprecates the `cat` argument in `get_dummies` (doesn't exist in pandas and not tested), and fixes a bug in `interval_range` where `names` was not being respected Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16565 --- python/cudf/cudf/__init__.py | 2 + python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/reshape.py | 74 ++++++++++++++----- python/cudf/cudf/core/tools/datetimes.py | 12 +-- python/cudf/cudf/core/tools/numeric.py | 9 ++- .../cudf/cudf/tests/indexes/test_interval.py | 6 ++ python/cudf/cudf/tests/test_onehot.py | 6 ++ 7 files changed, 84 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index e14815a1b0d..77ae0791b81 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -97,6 +97,7 @@ "DatetimeIndex", "Decimal32Dtype", "Decimal64Dtype", + "Decimal128Dtype", "Grouper", "Index", "IntervalDtype", @@ -126,6 +127,7 @@ "isclose", "melt", "merge", + "option_context", "pivot", "pivot_table", "read_avro", diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c55f86d48e1..d02633a97fa 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3350,14 +3350,14 @@ def interval_range( if len(right_col) == 0 or len(left_col) == 0: dtype = IntervalDtype("int64", closed) data = column.column_empty_like_same_mask(left_col, dtype) - return 
IntervalIndex(data, closed=closed) + return IntervalIndex(data, closed=closed, name=name) interval_col = IntervalColumn( dtype=IntervalDtype(left_col.dtype, closed), size=len(left_col), children=(left_col, right_col), ) - return IntervalIndex(interval_col, closed=closed) + return IntervalIndex(interval_col, closed=closed, name=name) class IntervalIndex(Index): diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 52a55760d4a..df471692702 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -118,7 +118,17 @@ def _normalize_series_and_dataframe(objs, axis): objs[idx] = obj.to_frame(name=name) -def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): +def concat( + objs, + axis=0, + join="outer", + ignore_index=False, + keys=None, + levels=None, + names=None, + verify_integrity=False, + sort=None, +): """Concatenate DataFrames, Series, or Indices row-wise. Parameters @@ -132,6 +142,21 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): ignore_index : bool, default False Set True to ignore the index of the *objs* and provide a default range index instead. + keys : sequence, default None + If multiple levels passed, should contain tuples. Construct + hierarchical index using the passed keys as the outermost level. + Currently not supported. + levels : list of sequences, default None + Specific levels (unique values) to use for constructing a + MultiIndex. Otherwise they will be inferred from the keys. + Currently not supported. + names : list, default None + Names for the levels in the resulting hierarchical index. + Currently not supported. + verify_integrity : bool, default False + Check whether the new concatenated axis contains duplicates. This can + be very expensive relative to the actual data concatenation. + Currently not supported. sort : bool, default False Sort non-concatenation axis if it is not already aligned. 
@@ -243,6 +268,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): 0 a 1 c 3 1 b 2 d 4 """ + if keys is not None: + raise NotImplementedError("keys is currently not supported") + if levels is not None: + raise NotImplementedError("levels is currently not supported") + if names is not None: + raise NotImplementedError("names is currently not supported") # TODO: Do we really need to have different error messages for an empty # list and a list of None? if not objs: @@ -260,7 +291,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): f"Can only concatenate dictionary input along axis=1, not {axis}" ) objs = {k: obj for k, obj in objs.items() if obj is not None} - keys = list(objs) + keys_objs = list(objs) objs = list(objs.values()) if any(isinstance(o, cudf.BaseIndex) for o in objs): raise TypeError( @@ -268,7 +299,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): ) else: objs = [obj for obj in objs if obj is not None] - keys = None + keys_objs = None if not objs: raise ValueError("All objects passed were None") @@ -317,8 +348,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): result = obj.to_frame() else: result = obj.copy(deep=True) - if keys is not None and isinstance(result, cudf.DataFrame): - k = keys[0] + if keys_objs is not None and isinstance(result, cudf.DataFrame): + k = keys_objs[0] result.columns = cudf.MultiIndex.from_tuples( [ (k, *c) if isinstance(c, tuple) else (k, c) @@ -370,7 +401,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): objs = _align_objs(objs, how=join, sort=sort) df.index = objs[0].index - if keys is None: + if keys_objs is None: for o in objs: for name, col in o._data.items(): if name in df._data: @@ -408,9 +439,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): "label types in cuDF at this time. You must convert " "the labels to the same type." 
) - for k, o in zip(keys, objs): + for k, o in zip(keys_objs, objs): for name, col in o._data.items(): - # if only series, then only keep keys as column labels + # if only series, then only keep keys_objs as column labels # if the existing column is multiindex, prepend it # to handle cases where dfs and srs are concatenated if only_series: @@ -426,7 +457,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): else: df[col_label] = col - if keys is None: + if keys_objs is None: df.columns = result_columns.unique() if ignore_index: df.columns = cudf.RangeIndex(len(result_columns.unique())) @@ -666,7 +697,7 @@ def _tile(A, reps): def get_dummies( - df, + data, prefix=None, prefix_sep="_", dummy_na=False, @@ -681,7 +712,7 @@ def get_dummies( Parameters ---------- - df : array-like, Series, or DataFrame + data : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, dict, or sequence, optional Prefix to append. Either a str (to apply a constant prefix), dict @@ -759,17 +790,22 @@ def get_dummies( if cats is None: cats = {} + else: + warnings.warn( + "cats is deprecated and will be removed in a future version.", + FutureWarning, + ) if sparse: raise NotImplementedError("sparse is not supported yet") if drop_first: raise NotImplementedError("drop_first is not supported yet") - if isinstance(df, cudf.DataFrame): + if isinstance(data, cudf.DataFrame): encode_fallback_dtypes = ["object", "category"] if columns is None or len(columns) == 0: - columns = df.select_dtypes( + columns = data.select_dtypes( include=encode_fallback_dtypes )._column_names @@ -796,33 +832,33 @@ def get_dummies( # If we have no columns to encode, we need to drop # fallback columns(if any) if len(columns) == 0: - return df.select_dtypes(exclude=encode_fallback_dtypes) + return data.select_dtypes(exclude=encode_fallback_dtypes) else: result_data = { col_name: col - for col_name, col in df._data.items() + for col_name, col in data._data.items() if 
col_name not in columns } for name in columns: if name not in cats: unique = _get_unique( - column=df._data[name], dummy_na=dummy_na + column=data._data[name], dummy_na=dummy_na ) else: unique = as_column(cats[name]) col_enc_data = _one_hot_encode_column( - column=df._data[name], + column=data._data[name], categories=unique, prefix=prefix_map.get(name, prefix), prefix_sep=prefix_sep_map.get(name, prefix_sep), dtype=dtype, ) result_data.update(col_enc_data) - return cudf.DataFrame._from_data(result_data, index=df.index) + return cudf.DataFrame._from_data(result_data, index=data.index) else: - ser = cudf.Series(df) + ser = cudf.Series(data) unique = _get_unique(column=ser._column, dummy_na=dummy_na) data = _one_hot_encode_column( column=ser._column, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index a92bf420147..7197560b5a4 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -785,7 +785,7 @@ def date_range( tz=None, normalize: bool = False, name=None, - closed: Literal["left", "right", "both", "neither"] = "both", + inclusive: Literal["left", "right", "both", "neither"] = "both", *, unit: str | None = None, ): @@ -823,7 +823,7 @@ def date_range( name : str, default None Name of the resulting DatetimeIndex - closed : {"left", "right", "both", "neither"}, default "both" + inclusive : {"left", "right", "both", "neither"}, default "both" Whether to set each bound as closed or open. Currently only "both" is supported @@ -839,7 +839,7 @@ def date_range( ----- Of the four parameters `start`, `end`, `periods`, and `freq`, exactly three must be specified. If `freq` is omitted, the resulting DatetimeIndex will - have periods linearly spaced elements between start and end (closed on both + have periods linearly spaced elements between start and end (inclusive on both sides). 
cudf supports `freq` specified with either fixed-frequency offset @@ -866,8 +866,8 @@ def date_range( '2026-04-23 08:00:00'], dtype='datetime64[ns]') """ - if closed != "both": - raise NotImplementedError(f"{closed=} is currently unsupported.") + if inclusive != "both": + raise NotImplementedError(f"{inclusive=} is currently unsupported.") if unit is not None: raise NotImplementedError(f"{unit=} is currently unsupported.") if normalize is not False: @@ -961,7 +961,7 @@ def date_range( periods = 0 else: # If end == start, periods == 0 and we return exactly 1 timestamp (start). - # Otherwise, since closed="both", we ensure the end point is included. + # Otherwise, since inclusive="both", we ensure the end point is included. periods += 1 # We compute `end_estim` (the estimated upper bound of the date diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 8b95f6f6a04..6cecf3fa170 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -20,7 +20,7 @@ from cudf.core.column import ColumnBase -def to_numeric(arg, errors="raise", downcast=None): +def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): """ Convert argument into numerical types. @@ -48,6 +48,8 @@ def to_numeric(arg, errors="raise", downcast=None): Note that downcast behavior is decoupled from parsing. Errors encountered during downcast is raised regardless of ``errors`` parameter. + dtype_backend : None + Not implemented. Returns ------- @@ -93,7 +95,10 @@ def to_numeric(arg, errors="raise", downcast=None): For example ``[1, 'a']``. A ``TypeError`` will be raised when such input is received, regardless of ``errors`` parameter. """ - + if dtype_backend is not None: + raise NotImplementedError( + "dtype_backend is not currently implemented." 
+ ) if errors not in {"raise", "ignore", "coerce"}: raise ValueError("invalid error value specified") elif errors == "ignore": diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 3b3a9f96543..a567c27f584 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -401,3 +401,9 @@ def test_from_tuples(): result = cudf.IntervalIndex.from_tuples(data, closed="left", name="a") expected = pd.IntervalIndex.from_tuples(data, closed="left", name="a") assert_eq(result, expected) + + +def test_interval_range_name(): + expected = pd.interval_range(start=0, periods=5, freq=2, name="foo") + result = cudf.interval_range(start=0, periods=5, freq=2, name="foo") + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index 154e1e19072..cc17dc46e0a 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -155,3 +155,9 @@ def test_get_dummies_array_like_with_nan(): actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_") assert_eq(expected, actual) + + +def test_get_dummies_cats_deprecated(): + df = cudf.DataFrame(range(3)) + with pytest.warns(FutureWarning): + cudf.get_dummies(df, cats={0: [0, 1, 2]}) From 2bcb7ecd2c077b3989ced1b8be8727e1b71f93b1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:24:48 -1000 Subject: [PATCH 64/72] Fix `.replace(Index, Index)` raising a TypeError (#16513) Since `cudf.Index` is list-like, passing this to `.replace` should act like replacing a list of values with a corresponding list of values. 
Discovered while working on https://github.com/rapidsai/cuml/pull/6019 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16513 --- python/cudf/cudf/core/indexed_frame.py | 14 +++++++------- python/cudf/cudf/tests/test_replace.py | 6 ++++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8be9f0ad78e..ae7369c80d1 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6469,7 +6469,7 @@ def _get_replacement_values_for_columns( to_replace_columns = {col: [to_replace] for col in columns_dtype_map} values_columns = {col: [value] for col in columns_dtype_map} elif cudf.api.types.is_list_like(to_replace) or isinstance( - to_replace, ColumnBase + to_replace, (ColumnBase, BaseIndex) ): if is_scalar(value): to_replace_columns = {col: to_replace for col in columns_dtype_map} @@ -6483,7 +6483,9 @@ def _get_replacement_values_for_columns( ) for col in columns_dtype_map } - elif cudf.api.types.is_list_like(value): + elif cudf.api.types.is_list_like( + value + ) or cudf.utils.dtypes.is_column_like(value): if len(to_replace) != len(value): raise ValueError( f"Replacement lists must be " @@ -6495,9 +6497,6 @@ def _get_replacement_values_for_columns( col: to_replace for col in columns_dtype_map } values_columns = {col: value for col in columns_dtype_map} - elif cudf.utils.dtypes.is_column_like(value): - to_replace_columns = {col: to_replace for col in columns_dtype_map} - values_columns = {col: value for col in columns_dtype_map} else: raise TypeError( "value argument must be scalar, list-like or Series" @@ -6592,12 +6591,13 @@ def _get_replacement_values_for_columns( return all_na_columns, to_replace_columns, values_columns -def _is_series(obj): +def _is_series(obj: Any) -> bool: """ Checks if the `obj` is of type 
`cudf.Series` instead of checking for isinstance(obj, cudf.Series) + to avoid circular imports. """ - return isinstance(obj, Frame) and obj.ndim == 1 and obj.index is not None + return isinstance(obj, IndexedFrame) and obj.ndim == 1 @_performance_tracking diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 1973fe6fb41..e5ee0127a74 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1378,3 +1378,9 @@ def test_fillna_nan_and_null(): result = ser.fillna(2.2) expected = cudf.Series([2.2, 2.2, 1.1]) assert_eq(result, expected) + + +def test_replace_with_index_objects(): + result = cudf.Series([1, 2]).replace(cudf.Index([1]), cudf.Index([2])) + expected = pd.Series([1, 2]).replace(pd.Index([1]), pd.Index([2])) + assert_eq(result, expected) From ac42bc870a65d807784cae63e25b9e9ca788eb23 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 15 Aug 2024 09:37:43 -0400 Subject: [PATCH 65/72] Hide all gtest symbols in cudftestutil (#16546) By hiding the gtest symbols in cudftestutil it allows consumers of the library to build with a differing version of gtest without issue. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Marcus D. Hanwell (https://github.com/cryos) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16546 --- cpp/cmake/thirdparty/get_gtest.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 10e6b026d9a..ec8cbd8c568 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -16,9 +16,18 @@ function(find_and_configure_gtest) include(${rapids-cmake-dir}/cpm/gtest.cmake) + # Mark all the non explicit googletest symbols as hidden. This ensures that libcudftestutil can be + # used by consumers with a different shared gtest. 
+ set(gtest_hide_internal_symbols ON) + # Find or install GoogleTest rapids_cpm_gtest(BUILD_STATIC) + # Mark all the explicit googletest symbols as hidden. This ensures that libcudftestutil can be + # used by consumers with a different shared gtest. + if(TARGET gtest) + target_compile_definitions(gtest PUBLIC "$") + endif() endfunction() find_and_configure_gtest() From f4a9b1c5016e254ebf2de55ac9946af6420ebff5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 15 Aug 2024 11:14:06 -1000 Subject: [PATCH 66/72] Use more idomatic cudf APIs in dask_cudf meta generation (#16487) Namely: * Avoiding `cudf.core` imports by checking public column `.dtype`s * Using more straightforward cudf APIs to construct meta objects Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16487 --- python/dask_cudf/dask_cudf/backends.py | 124 ++++++++++++------------- 1 file changed, 58 insertions(+), 66 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 01bab30190a..82ea2ac033a 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -55,37 +55,31 @@ @meta_nonempty.register(cudf.BaseIndex) @_dask_cudf_performance_tracking def _nonempty_index(idx): - if isinstance(idx, cudf.core.index.RangeIndex): - return cudf.core.index.RangeIndex(2, name=idx.name) - elif isinstance(idx, cudf.core.index.DatetimeIndex): - start = "1970-01-01" - data = np.array([start, "1970-01-02"], dtype=idx.dtype) + """Return a non-empty cudf.Index as metadata.""" + # TODO: IntervalIndex, TimedeltaIndex? 
+ if isinstance(idx, cudf.RangeIndex): + return cudf.RangeIndex(2, name=idx.name) + elif isinstance(idx, cudf.DatetimeIndex): + data = np.array(["1970-01-01", "1970-01-02"], dtype=idx.dtype) values = cudf.core.column.as_column(data) - return cudf.core.index.DatetimeIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.index.CategoricalIndex): - key = tuple(idx._data.keys()) - assert len(key) == 1 - categories = idx._data[key[0]].categories - codes = [0, 0] - ordered = idx._data[key[0]].ordered + return cudf.DatetimeIndex(values, name=idx.name) + elif isinstance(idx, cudf.CategoricalIndex): values = cudf.core.column.build_categorical_column( - categories=categories, codes=codes, ordered=ordered + categories=idx.categories, codes=[0, 0], ordered=idx.ordered ) - return cudf.core.index.CategoricalIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.multiindex.MultiIndex): + return cudf.CategoricalIndex(values, name=idx.name) + elif isinstance(idx, cudf.MultiIndex): levels = [meta_nonempty(lev) for lev in idx.levels] - codes = [[0, 0] for i in idx.levels] - return cudf.core.multiindex.MultiIndex( - levels=levels, codes=codes, names=idx.names - ) - elif isinstance(idx._column, cudf.core.column.StringColumn): + codes = [[0, 0]] * idx.nlevels + return cudf.MultiIndex(levels=levels, codes=codes, names=idx.names) + elif is_string_dtype(idx.dtype): return cudf.Index(["cat", "dog"], name=idx.name) - elif isinstance(idx, cudf.core.index.Index): - return cudf.core.index.Index( - np.arange(2, dtype=idx.dtype), name=idx.name - ) + elif isinstance(idx, cudf.Index): + return cudf.Index(np.arange(2, dtype=idx.dtype), name=idx.name) - raise TypeError(f"Don't know how to handle index of type {type(idx)}") + raise TypeError( + f"Don't know how to handle index of type {type(idx).__name__}" + ) def _nest_list_data(data, leaf_type): @@ -101,50 +95,49 @@ def _nest_list_data(data, leaf_type): @_dask_cudf_performance_tracking -def _get_non_empty_data(s): - """Return a non 
empty column as metadata.""" - if isinstance(s, cudf.core.column.CategoricalColumn): +def _get_non_empty_data( + s: cudf.core.column.ColumnBase, +) -> cudf.core.column.ColumnBase: + """Return a non-empty column as metadata from a column.""" + if isinstance(s.dtype, cudf.CategoricalDtype): categories = ( - s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] + s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] # type: ignore[attr-defined] ) codes = cudf.core.column.as_column( 0, dtype=cudf._lib.types.size_type_dtype, length=2, ) - ordered = s.ordered - data = cudf.core.column.build_categorical_column( + ordered = s.ordered # type: ignore[attr-defined] + return cudf.core.column.build_categorical_column( categories=categories, codes=codes, ordered=ordered ) - elif isinstance(s, cudf.core.column.ListColumn): + elif isinstance(s.dtype, cudf.ListDtype): leaf_type = s.dtype.leaf_type if is_string_dtype(leaf_type): data = ["cat", "dog"] else: data = np.array([0, 1], dtype=leaf_type).tolist() data = _nest_list_data(data, s.dtype) * 2 - data = cudf.core.column.as_column(data, dtype=s.dtype) - elif isinstance(s, cudf.core.column.StructColumn): + return cudf.core.column.as_column(data, dtype=s.dtype) + elif isinstance(s.dtype, cudf.StructDtype): + # Handles IntervalColumn struct_dtype = s.dtype - data = [{key: None for key in struct_dtype.fields.keys()}] * 2 - data = cudf.core.column.as_column(data, dtype=s.dtype) + struct_data = [{key: None for key in struct_dtype.fields.keys()}] * 2 + return cudf.core.column.as_column(struct_data, dtype=s.dtype) elif is_string_dtype(s.dtype): - data = cudf.core.column.as_column(pa.array(["cat", "dog"])) + return cudf.core.column.as_column(pa.array(["cat", "dog"])) elif isinstance(s.dtype, pd.DatetimeTZDtype): - from cudf.utils.dtypes import get_time_unit - - data = cudf.date_range("2001-01-01", periods=2, freq=get_time_unit(s)) - data = data.tz_localize(str(s.dtype.tz))._column + date_data = cudf.date_range("2001-01-01", 
periods=2, freq=s.time_unit) # type: ignore[attr-defined] + return date_data.tz_localize(str(s.dtype.tz))._column + elif s.dtype.kind in "fiubmM": + return cudf.core.column.as_column( + np.arange(start=0, stop=2, dtype=s.dtype) + ) else: - if pd.api.types.is_numeric_dtype(s.dtype): - data = cudf.core.column.as_column( - cp.arange(start=0, stop=2, dtype=s.dtype) - ) - else: - data = cudf.core.column.as_column( - cp.arange(start=0, stop=2, dtype="int64") - ).astype(s.dtype) - return data + raise TypeError( + f"Don't know how to handle column of type {type(s).__name__}" + ) @meta_nonempty.register(cudf.Series) @@ -162,24 +155,25 @@ def _nonempty_series(s, idx=None): def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() - res = cudf.DataFrame(index=idx) - for col in x._data.names: - dtype = str(x._data[col].dtype) - if dtype in ("list", "struct", "category"): + res = {} + for col_label, col in x._data.items(): + dtype = col.dtype + if isinstance( + dtype, + (cudf.ListDtype, cudf.StructDtype, cudf.CategoricalDtype), + ): # 1. Not possible to hash and store list & struct types # as they can contain different levels of nesting or # fields. - # 2. Not possible to has `category` types as + # 2. Not possible to hash `category` types as # they often contain an underlying types to them. 
- res._data[col] = _get_non_empty_data(x._data[col]) + res[col_label] = _get_non_empty_data(col) else: if dtype not in columns_with_dtype: - columns_with_dtype[dtype] = cudf.core.column.as_column( - _get_non_empty_data(x._data[col]) - ) - res._data[col] = columns_with_dtype[dtype] + columns_with_dtype[dtype] = _get_non_empty_data(col) + res[col_label] = columns_with_dtype[dtype] - return res + return cudf.DataFrame._from_data(res, index=idx) @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) @@ -197,9 +191,7 @@ def make_meta_cudf_index(x, index=None): @_dask_cudf_performance_tracking def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": - return cudf.Series( - [UNKNOWN_CATEGORIES], dtype=dtype, name=name, index=index - ).iloc[:0] + dtype = cudf.CategoricalDtype(categories=[UNKNOWN_CATEGORIES]) return cudf.Series([], dtype=dtype, name=name, index=index) @@ -337,7 +329,7 @@ def percentile_cudf(a, q, interpolation="linear"): if isinstance(q, Iterator): q = list(q) - if cudf.api.types._is_categorical_dtype(a.dtype): + if isinstance(a.dtype, cudf.CategoricalDtype): result = cp.percentile(a.cat.codes, q, interpolation=interpolation) return ( @@ -346,7 +338,7 @@ def percentile_cudf(a, q, interpolation="linear"): ), n, ) - if np.issubdtype(a.dtype, np.datetime64): + if a.dtype.kind == "M": result = a.quantile( [i / 100.0 for i in q], interpolation=interpolation ) From 1e220b708582c73d128c53f3279d4588167a310f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 15 Aug 2024 13:58:45 -1000 Subject: [PATCH 67/72] Return Interval object in pandas compat mode for IntervalIndex reductions (#16523) xref https://github.com/rapidsai/cudf/issues/16507 In non pandas compat mode, I think this still makes sense to return a `dict` since that's the "scalar" type of a cudf struct/interval type, but in pandas compat mode we should match pandas and return an Interval. 
Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16523 --- python/cudf/cudf/_lib/reduce.pyx | 6 +++++- python/cudf/cudf/core/column/interval.py | 14 ++++++++++++++ python/cudf/cudf/tests/test_interval.py | 11 +++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 64634b7a6f9..511bba20ef5 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -61,7 +61,11 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): result, dtype=col_dtype.__class__(precision, scale), ).value - return DeviceScalar.from_pylibcudf(result).value + scalar = DeviceScalar.from_pylibcudf(result).value + if isinstance(col_dtype, cudf.StructDtype): + # TODO: Utilize column_metadata in libcudf to maintain field labels + return dict(zip(col_dtype.fields.keys(), scalar.values())) + return scalar @acquire_spill_lock() diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index b2f79ef0c65..d9fc96a9f3e 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -11,6 +11,7 @@ from cudf.core.dtypes import IntervalDtype if TYPE_CHECKING: + from cudf._typing import ScalarLike from cudf.core.column import ColumnBase @@ -186,3 +187,16 @@ def element_indexing(self, index: int): if cudf.get_option("mode.pandas_compatible"): return pd.Interval(**result, closed=self.dtype.closed) return result + + def _reduce( + self, + op: str, + skipna: bool | None = None, + min_count: int = 0, + *args, + **kwargs, + ) -> ScalarLike: + result = super()._reduce(op, skipna, min_count, *args, **kwargs) + if cudf.get_option("mode.pandas_compatible"): + return pd.Interval(**result, closed=self.dtype.closed) + return result diff --git 
a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 5eeea87d8e0..2d194107658 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -194,3 +194,14 @@ def test_intervaldtype_eq_string_with_attributes(): dtype = cudf.IntervalDtype("int64", closed="left") assert dtype == "interval" assert dtype == "interval[int64, left]" + + +def test_reduction_return_interval_pandas_compatible(): + ii = pd.IntervalIndex.from_tuples( + [("2017-01-03", "2017-01-04")], dtype="interval[datetime64[ns], right]" + ) + cudf_ii = cudf.IntervalIndex.from_pandas(ii) + with cudf.option_context("mode.pandas_compatible", True): + result = cudf_ii.min() + expected = ii.min() + assert result == expected From 50841355812685e0e48d1577b8384399cdad5a0f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 15 Aug 2024 13:59:58 -1000 Subject: [PATCH 68/72] Make NumericalColumn.__init__ strict (#16457) This PR makes `NumericalBaseColumn.__init__` and its subclasses strict putting restrictions on `data`, `dtype`, `size` and `children` so these columns cannot be constructed into to an invalid state. It also aligns the signature with the base class. 
xref https://github.com/rapidsai/cudf/issues/16469 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16457 --- python/cudf/cudf/_lib/column.pyx | 2 + python/cudf/cudf/core/column/column.py | 22 ++--- python/cudf/cudf/core/column/decimal.py | 92 ++++++++++++++++++- python/cudf/cudf/core/column/numerical.py | 13 ++- .../cudf/cudf/core/column/numerical_base.py | 29 +++++- 5 files changed, 134 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index e030147fdd3..f0c07dfbc1b 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -88,6 +88,8 @@ cdef class Column: object null_count=None, object children=() ): + if size < 0: + raise ValueError("size must be >=0") self._size = size self._distinct_count = {} self._dtype = dtype diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a7d2cb441dd..9785c3e5517 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1652,23 +1652,19 @@ def build_column( null_count=null_count, ) elif isinstance(dtype, StructDtype): - if size is None: - raise TypeError("Must specify size") return cudf.core.column.StructColumn( data=data, dtype=dtype, - size=size, + size=size, # type: ignore[arg-type] offset=offset, mask=mask, null_count=null_count, children=children, ) elif isinstance(dtype, cudf.Decimal64Dtype): - if size is None: - raise TypeError("Must specify size") return cudf.core.column.Decimal64Column( - data=data, - size=size, + data=data, # type: ignore[arg-type] + size=size, # type: ignore[arg-type] offset=offset, dtype=dtype, mask=mask, @@ -1676,11 +1672,9 @@ def build_column( children=children, ) elif isinstance(dtype, cudf.Decimal32Dtype): - if size is None: - raise TypeError("Must specify size") return cudf.core.column.Decimal32Column( - 
data=data, - size=size, + data=data, # type: ignore[arg-type] + size=size, # type: ignore[arg-type] offset=offset, dtype=dtype, mask=mask, @@ -1688,11 +1682,9 @@ def build_column( children=children, ) elif isinstance(dtype, cudf.Decimal128Dtype): - if size is None: - raise TypeError("Must specify size") return cudf.core.column.Decimal128Column( - data=data, - size=size, + data=data, # type: ignore[arg-type] + size=size, # type: ignore[arg-type] offset=offset, dtype=dtype, mask=mask, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 6a7f338b065..3b979ef2e97 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -31,14 +31,38 @@ if TYPE_CHECKING: from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike + from cudf.core.buffer import Buffer class DecimalBaseColumn(NumericalBaseColumn): """Base column for decimal32, decimal64 or decimal128 columns""" - dtype: DecimalDtype _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS + def __init__( + self, + data: Buffer, + size: int, + dtype: DecimalDtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple = (), + ): + if not isinstance(size, int): + raise ValueError("Must specify an integer size") + if not isinstance(dtype, DecimalDtype): + raise ValueError(f"{dtype=} must be a DecimalDtype instance") + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + ) + @property def __cuda_array_interface__(self): raise NotImplementedError( @@ -205,7 +229,27 @@ def as_numerical_column( class Decimal32Column(DecimalBaseColumn): - dtype: Decimal32Dtype + def __init__( + self, + data: Buffer, + size: int, + dtype: Decimal32Dtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple = (), + ): + if not isinstance(dtype, 
Decimal32Dtype): + raise ValueError(f"{dtype=} must be a Decimal32Dtype instance") + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + ) @classmethod def from_arrow(cls, data: pa.Array): @@ -266,7 +310,27 @@ def _with_type_metadata( class Decimal128Column(DecimalBaseColumn): - dtype: Decimal128Dtype + def __init__( + self, + data: Buffer, + size: int, + dtype: Decimal128Dtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple = (), + ): + if not isinstance(dtype, Decimal128Dtype): + raise ValueError(f"{dtype=} must be a Decimal128Dtype instance") + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + ) @classmethod def from_arrow(cls, data: pa.Array): @@ -287,7 +351,27 @@ def _with_type_metadata( class Decimal64Column(DecimalBaseColumn): - dtype: Decimal64Dtype + def __init__( + self, + data: Buffer, + size: int, + dtype: Decimal64Dtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple = (), + ): + if not isinstance(dtype, Decimal64Dtype): + raise ValueError(f"{dtype=} must be a Decimal64Dtype instance") + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + ) def __setitem__(self, key, value): if isinstance(value, np.integer): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index bbc74ef349e..16e78ef35ef 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -61,25 +61,30 @@ class NumericalColumn(NumericalBaseColumn): def __init__( self, data: Buffer, - dtype: DtypeObj, + size: int | None, + dtype: np.dtype, mask: Buffer | None = None, - size: int | None = None, # TODO: make this non-optional offset: int = 0, 
null_count: int | None = None, + children: tuple = (), ): - dtype = cudf.dtype(dtype) + if not (isinstance(dtype, np.dtype) and dtype.kind in "iufb"): + raise ValueError( + "dtype must be a floating, integer or boolean numpy dtype." + ) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: size = (data.size // dtype.itemsize) - offset super().__init__( - data, + data=data, size=size, dtype=dtype, mask=mask, offset=offset, null_count=null_count, + children=children, ) def _clear_cache(self): diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index f41010062c8..3b8dd05c13a 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -9,16 +9,19 @@ import cudf from cudf import _lib as libcudf +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase from cudf.core.missing import NA from cudf.core.mixins import Scannable if TYPE_CHECKING: from cudf._typing import ScalarLike + from cudf.core.column.decimal import DecimalDtype class NumericalBaseColumn(ColumnBase, Scannable): - """A column composed of numerical data. + """ + A column composed of numerical (bool, integer, float, decimal) data. This class encodes a standard interface for different types of columns containing numerical types of data. 
In particular, mathematical operations @@ -42,6 +45,30 @@ class NumericalBaseColumn(ColumnBase, Scannable): "cummax", } + def __init__( + self, + data: Buffer, + size: int, + dtype: DecimalDtype | np.dtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple = (), + ): + if not isinstance(data, Buffer): + raise ValueError("data must be a Buffer instance.") + if len(children) != 0: + raise ValueError(f"{type(self).__name__} must have no children.") + super().__init__( + data=data, + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + ) + def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls() From 155eddedc0e2b68d203cfbc318172396f4293d98 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 15 Aug 2024 14:00:57 -1000 Subject: [PATCH 69/72] Make Timedelta/DatetimeColumn.__init__ strict (#16464) This PR makes Datetime/TimedeltaColumn.__init__ and its subclasses strict putting restrictions on data, dtype, size and children so these columns cannot be constructed into to an invalid state. It also aligns the signature with the base class. 
xref https://github.com/rapidsai/cudf/issues/16469 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16464 --- python/cudf/cudf/core/column/column.py | 12 ++----- python/cudf/cudf/core/column/datetime.py | 43 ++++++++++++++++------- python/cudf/cudf/core/column/timedelta.py | 17 +++++---- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9785c3e5517..b0e33e8b9ce 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1592,10 +1592,8 @@ def build_column( children=children, ) elif dtype.type is np.datetime64: - if data is None: - raise TypeError("Must specify data buffer") return cudf.core.column.DatetimeColumn( - data=data, + data=data, # type: ignore[arg-type] dtype=dtype, mask=mask, size=size, @@ -1603,10 +1601,8 @@ def build_column( null_count=null_count, ) elif isinstance(dtype, pd.DatetimeTZDtype): - if data is None: - raise TypeError("Must specify data buffer") return cudf.core.column.datetime.DatetimeTZColumn( - data=data, + data=data, # type: ignore[arg-type] dtype=dtype, mask=mask, size=size, @@ -1614,10 +1610,8 @@ def build_column( null_count=null_count, ) elif dtype.type is np.timedelta64: - if data is None: - raise TypeError("Must specify data buffer") return cudf.core.column.TimeDeltaColumn( - data=data, + data=data, # type: ignore[arg-type] dtype=dtype, mask=mask, size=size, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 1dbc94384d3..d0ea4612a1b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -24,6 +24,7 @@ get_compatible_timezone, get_tz_data, ) +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import 
_unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype @@ -34,10 +35,8 @@ ColumnBinaryOperand, DatetimeLikeScalar, Dtype, - DtypeObj, ScalarLike, ) - from cudf.core.buffer import Buffer from cudf.core.column.numerical import NumericalColumn if PANDAS_GE_220: @@ -207,30 +206,39 @@ class DatetimeColumn(column.ColumnBase): def __init__( self, data: Buffer, - dtype: DtypeObj, + size: int | None, + dtype: np.dtype | pd.DatetimeTZDtype, mask: Buffer | None = None, - size: int | None = None, # TODO: make non-optional offset: int = 0, null_count: int | None = None, + children: tuple = (), ): - dtype = cudf.dtype(dtype) - if dtype.kind != "M": - raise TypeError(f"{self.dtype} is not a supported datetime type") - + if not isinstance(data, Buffer): + raise ValueError("data must be a Buffer.") + dtype = self._validate_dtype_instance(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: size = data.size // dtype.itemsize size = size - offset + if len(children) != 0: + raise ValueError(f"{type(self).__name__} must have no children.") super().__init__( - data, + data=data, size=size, dtype=dtype, mask=mask, offset=offset, null_count=null_count, + children=children, ) + @staticmethod + def _validate_dtype_instance(dtype: np.dtype) -> np.dtype: + if not (isinstance(dtype, np.dtype) and dtype.kind == "M"): + raise ValueError("dtype must be a datetime, numpy dtype") + return dtype + def __contains__(self, item: ScalarLike) -> bool: try: ts = pd.Timestamp(item).as_unit(self.time_unit) @@ -858,21 +866,30 @@ class DatetimeTZColumn(DatetimeColumn): def __init__( self, data: Buffer, + size: int | None, dtype: pd.DatetimeTZDtype, mask: Buffer | None = None, - size: int | None = None, offset: int = 0, null_count: int | None = None, + children: tuple = (), ): super().__init__( data=data, - dtype=_get_base_dtype(dtype), - mask=mask, size=size, + dtype=dtype, + mask=mask, offset=offset, 
null_count=null_count, + children=children, ) - self._dtype = get_compatible_timezone(dtype) + + @staticmethod + def _validate_dtype_instance( + dtype: pd.DatetimeTZDtype, + ) -> pd.DatetimeTZDtype: + if not isinstance(dtype, pd.DatetimeTZDtype): + raise ValueError("dtype must be a pandas.DatetimeTZDtype") + return get_compatible_timezone(dtype) def to_pandas( self, diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index ba0dc4779bb..6b6f3e517a8 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -75,28 +75,33 @@ class TimeDeltaColumn(ColumnBase): def __init__( self, data: Buffer, - dtype: Dtype, - size: int | None = None, # TODO: make non-optional + size: int | None, + dtype: np.dtype, mask: Buffer | None = None, offset: int = 0, null_count: int | None = None, + children: tuple = (), ): - dtype = cudf.dtype(dtype) - if dtype.kind != "m": - raise TypeError(f"{self.dtype} is not a supported duration type") + if not isinstance(data, Buffer): + raise ValueError("data must be a Buffer.") + if not (isinstance(dtype, np.dtype) and dtype.kind == "m"): + raise ValueError("dtype must be a timedelta numpy dtype.") if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: size = data.size // dtype.itemsize size = size - offset + if len(children) != 0: + raise ValueError("TimedeltaColumn must have no children.") super().__init__( - data, + data=data, size=size, dtype=dtype, mask=mask, offset=offset, null_count=null_count, + children=children, ) def __contains__(self, item: DatetimeLikeScalar) -> bool: From f955dd76b47779d4f527efe25de417b1acbff4a7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 15 Aug 2024 17:13:58 -0700 Subject: [PATCH 70/72] Rewrite remaining Python Arrow interop conversions using the C Data Interface (#16548) This PR rewrites all remaining parts of the Python interop code previously using 
Arrow C++ types to instead use the C Data Interface. With this change, we no longer require pyarrow in that part of the Cython code. There are further improvements that we should make to streamline the internals, but I would like to keep this changeset minimal since getting it merged unblocks progress on multiple fronts so that we can progress further in parallel. Contributes to #15193 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16548 --- cpp/src/interop/arrow_utilities.cpp | 1 + cpp/src/interop/to_arrow_schema.cpp | 5 +- python/cudf/cudf/_lib/CMakeLists.txt | 6 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 5 +- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 188 +++++++++--------- .../cudf/_lib/pylibcudf/libcudf/interop.pxd | 53 +++-- .../cudf/cudf/pylibcudf_tests/common/utils.py | 6 +- 7 files changed, 146 insertions(+), 118 deletions(-) diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index 4292552a800..3776daf41aa 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -98,6 +98,7 @@ ArrowType id_to_arrow_type(cudf::type_id id) ArrowType id_to_arrow_storage_type(cudf::type_id id) { switch (id) { + case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_INT32; case cudf::type_id::TIMESTAMP_SECONDS: case cudf::type_id::TIMESTAMP_MILLISECONDS: case cudf::type_id::TIMESTAMP_MICROSECONDS: diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp index b98ca8a7bed..5afed772656 100644 --- a/cpp/src/interop/to_arrow_schema.cpp +++ b/cpp/src/interop/to_arrow_schema.cpp @@ -170,8 +170,9 @@ int dispatch_to_arrow_type::operator()(column_view input, NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, NANOARROW_TYPE_LIST)); auto child = input.child(cudf::lists_column_view::child_column_index); 
ArrowSchemaInit(out->children[0]); - auto child_meta = - metadata.children_meta.empty() ? column_metadata{"element"} : metadata.children_meta[0]; + auto child_meta = metadata.children_meta.empty() + ? column_metadata{"element"} + : metadata.children_meta[cudf::lists_column_view::child_column_index]; out->flags = input.has_nulls() ? ARROW_FLAG_NULLABLE : 0; NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(out->children[0], child_meta.name.c_str())); diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 38b7e9ebe04..d32a2d8e3f8 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -64,9 +64,13 @@ rapids_cython_create_modules( target_link_libraries(strings_udf PUBLIC cudf_strings_udf) -set(targets_using_arrow_headers interop avro csv orc json parquet) +set(targets_using_arrow_headers avro csv orc json parquet) link_to_pyarrow_headers("${targets_using_arrow_headers}") +include(${rapids-cmake-dir}/export/find_package_root.cmake) +include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) +target_link_libraries(interop PUBLIC nanoarrow) + add_subdirectory(io) add_subdirectory(nvtext) add_subdirectory(pylibcudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index df4591baa71..da32d530928 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -52,7 +52,10 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) -link_to_pyarrow_headers(pylibcudf_interop) + +include(${rapids-cmake-dir}/export/find_package_root.cmake) +include(../../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) +target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) add_subdirectory(libcudf) add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx 
b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index adf7e1fd7e8..caa19724786 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,11 +1,10 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from cpython cimport pycapsule -from cython.operator cimport dereference -from libcpp.memory cimport shared_ptr, unique_ptr +from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New +from libc.stdlib cimport free +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow cimport lib as pa from dataclasses import dataclass, field from functools import singledispatch @@ -18,23 +17,14 @@ from cudf._lib.pylibcudf.libcudf.interop cimport ( ArrowArrayStream, ArrowSchema, column_metadata, - from_arrow as cpp_from_arrow, from_arrow_column as cpp_from_arrow_column, from_arrow_stream as cpp_from_arrow_stream, - to_arrow as cpp_to_arrow, -) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( - fixed_point_scalar, - scalar, + to_arrow_host_raw, + to_arrow_schema_raw, ) from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) +from . cimport copying from .column cimport Column from .scalar cimport Scalar from .table cimport Table @@ -109,7 +99,9 @@ def from_arrow(pyarrow_object, *, DataType data_type=None): Union[Table, Scalar] The converted object of type corresponding to the input type in cudf. 
""" - raise TypeError("from_arrow only accepts Table and Scalar objects") + raise TypeError( + f"Unsupported type {type(pyarrow_object)} for conversion from arrow" + ) @from_arrow.register(pa.DataType) @@ -133,7 +125,7 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None): raise ValueError("data_type may not be passed for tables") stream = pyarrow_object.__arrow_c_stream__() cdef ArrowArrayStream* c_stream = ( - pycapsule.PyCapsule_GetPointer(stream, "arrow_array_stream") + PyCapsule_GetPointer(stream, "arrow_array_stream") ) cdef unique_ptr[table] c_result @@ -146,54 +138,17 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None): @from_arrow.register(pa.Scalar) def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): - cdef shared_ptr[pa.CScalar] arrow_scalar = pa.pyarrow_unwrap_scalar(pyarrow_object) - - cdef unique_ptr[scalar] c_result - with nogil: - c_result = move(cpp_from_arrow(dereference(arrow_scalar))) - - cdef Scalar result = Scalar.from_libcudf(move(c_result)) - - if result.type().id() != type_id.DECIMAL128: - if data_type is not None: - raise ValueError( - "dtype may not be passed for non-decimal types" - ) - return result - - if data_type is None: - raise ValueError( - "Decimal scalars must be constructed with a dtype" - ) - - cdef type_id tid = data_type.id() - - if tid == type_id.DECIMAL32: - result.c_obj.reset( - new fixed_point_scalar[decimal32]( - ( - result.c_obj.get() - ).value(), - scale_type(-pyarrow_object.type.scale), - result.c_obj.get().is_valid() - ) - ) - elif tid == type_id.DECIMAL64: - result.c_obj.reset( - new fixed_point_scalar[decimal64]( - ( - result.c_obj.get() - ).value(), - scale_type(-pyarrow_object.type.scale), - result.c_obj.get().is_valid() - ) - ) - elif tid != type_id.DECIMAL128: - raise ValueError( - "Decimal scalars may only be cast to decimals" - ) - - return result + if isinstance(pyarrow_object.type, pa.ListType) and pyarrow_object.as_py() is None: + # pyarrow doesn't correctly 
handle None values for list types, so + # we have to create this one manually. + # https://github.com/apache/arrow/issues/40319 + pa_array = pa.array([None], type=pyarrow_object.type) + else: + pa_array = pa.array([pyarrow_object]) + return copying.get_element( + from_arrow(pa_array, data_type=data_type), + 0, + ) @from_arrow.register(pa.Array) @@ -204,10 +159,10 @@ def _from_arrow_column(pyarrow_object, *, DataType data_type=None): schema, array = pyarrow_object.__arrow_c_array__() cdef ArrowSchema* c_schema = ( - pycapsule.PyCapsule_GetPointer(schema, "arrow_schema") + PyCapsule_GetPointer(schema, "arrow_schema") ) cdef ArrowArray* c_array = ( - pycapsule.PyCapsule_GetPointer(array, "arrow_array") + PyCapsule_GetPointer(array, "arrow_array") ) cdef unique_ptr[column] c_result @@ -238,7 +193,7 @@ def to_arrow(cudf_object, metadata=None): Union[pyarrow.Array, pyarrow.Table, pyarrow.Scalar] The converted object of type corresponding to the input type in PyArrow. """ - raise TypeError("to_arrow only accepts Table and Scalar objects") + raise TypeError(f"Unsupported type {type(cudf_object)} for conversion to arrow") @to_arrow.register(DataType) @@ -281,46 +236,83 @@ def _to_arrow_datatype(cudf_object, **kwargs): ) -@to_arrow.register(Table) -def _to_arrow_table(cudf_object, metadata=None): +cdef void _release_schema(object schema_capsule) noexcept: + """Release the ArrowSchema object stored in a PyCapsule.""" + cdef ArrowSchema* schema = PyCapsule_GetPointer( + schema_capsule, 'arrow_schema' + ) + if schema.release != NULL: + schema.release(schema) + + free(schema) + + +cdef void _release_array(object array_capsule) noexcept: + """Release the ArrowArray object stored in a PyCapsule.""" + cdef ArrowArray* array = PyCapsule_GetPointer( + array_capsule, 'arrow_array' + ) + if array.release != NULL: + array.release(array) + + free(array) + + +def _table_to_schema(Table tbl, metadata): if metadata is None: - metadata = [ColumnMetadata() for _ in 
range(len(cudf_object.columns()))] + metadata = [ColumnMetadata() for _ in range(len(tbl.columns()))] metadata = [ColumnMetadata(m) if isinstance(m, str) else m for m in metadata] - cdef vector[column_metadata] c_table_metadata - cdef shared_ptr[pa.CTable] c_table_result + + cdef vector[column_metadata] c_metadata + c_metadata.reserve(len(metadata)) for meta in metadata: - c_table_metadata.push_back(_metadata_to_libcudf(meta)) + c_metadata.push_back(_metadata_to_libcudf(meta)) + + cdef ArrowSchema* raw_schema_ptr with nogil: - c_table_result = move( - cpp_to_arrow((
cudf_object).view(), c_table_metadata) - ) + raw_schema_ptr = to_arrow_schema_raw(tbl.view(), c_metadata) - return pa.pyarrow_wrap_table(c_table_result) + return PyCapsule_New(raw_schema_ptr, 'arrow_schema', _release_schema) -@to_arrow.register(Scalar) -def _to_arrow_scalar(cudf_object, metadata=None): - # Note that metadata for scalars is primarily important for preserving - # information on nested types since names are otherwise irrelevant. - if metadata is None: - metadata = ColumnMetadata() - metadata = ColumnMetadata(metadata) if isinstance(metadata, str) else metadata - cdef column_metadata c_scalar_metadata = _metadata_to_libcudf(metadata) - cdef shared_ptr[pa.CScalar] c_scalar_result +def _table_to_host_array(Table tbl): + cdef ArrowArray* raw_host_array_ptr with nogil: - c_scalar_result = move( - cpp_to_arrow( - dereference(( cudf_object).c_obj), c_scalar_metadata - ) - ) + raw_host_array_ptr = to_arrow_host_raw(tbl.view()) + + return PyCapsule_New(raw_host_array_ptr, "arrow_array", _release_array) + + +class _TableWithArrowMetadata: + def __init__(self, tbl, metadata=None): + self.tbl = tbl + self.metadata = metadata - return pa.pyarrow_wrap_scalar(c_scalar_result) + def __arrow_c_array__(self, requested_schema=None): + return _table_to_schema(self.tbl, self.metadata), _table_to_host_array(self.tbl) + + +# TODO: In the long run we should get rid of the `to_arrow` functions in favor of using +# the protocols directly via `pa.table(cudf_object, schema=...)` directly. We can do the +# same for columns. We cannot do this for scalars since there is no corresponding +# protocol. Since this will require broader changes throughout the codebase, the current +# approach is to leverage the protocol internally but to continue exposing `to_arrow`. 
+@to_arrow.register(Table) +def _to_arrow_table(cudf_object, metadata=None): + test_table = _TableWithArrowMetadata(cudf_object, metadata) + return pa.table(test_table) @to_arrow.register(Column) def _to_arrow_array(cudf_object, metadata=None): """Create a PyArrow array from a pylibcudf column.""" - if metadata is None: - metadata = ColumnMetadata() - metadata = ColumnMetadata(metadata) if isinstance(metadata, str) else metadata - return to_arrow(Table([cudf_object]), [metadata])[0] + if metadata is not None: + metadata = [metadata] + return to_arrow(Table([cudf_object]), metadata)[0] + + +@to_arrow.register(Scalar) +def _to_arrow_scalar(cudf_object, metadata=None): + # Note that metadata for scalars is primarily important for preserving + # information on nested types since names are otherwise irrelevant. + return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0] diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd index 2151da28d4b..24d96b602dc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd @@ -3,11 +3,11 @@ from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from pyarrow.lib cimport CScalar, CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view @@ -29,6 +29,9 @@ cdef extern from "cudf/interop.hpp" nogil: cdef struct ArrowArrayStream: void (*release)(ArrowArrayStream*) noexcept nogil + cdef struct ArrowDeviceArray: + ArrowArray array + cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: @@ -38,27 
+41,49 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ DLManagedTensor* to_dlpack(table_view input_table ) except + - cdef unique_ptr[table] from_arrow(CTable input) except + - cdef unique_ptr[scalar] from_arrow(CScalar input) except + - cdef cppclass column_metadata: column_metadata() except + column_metadata(string name_) except + string name vector[column_metadata] children_meta - cdef shared_ptr[CTable] to_arrow( - table_view input, - vector[column_metadata] metadata, - ) except + - - cdef shared_ptr[CScalar] to_arrow( - const scalar& input, - column_metadata metadata, - ) except + - cdef unique_ptr[table] from_arrow_stream(ArrowArrayStream* input) except + cdef unique_ptr[column] from_arrow_column( const ArrowSchema* schema, const ArrowArray* input ) except + + + +cdef extern from *: + # Rather than exporting the underlying functions directly to Cython, we expose + # these wrappers that handle the release to avoid needing to teach Cython how + # to handle unique_ptrs with custom deleters that aren't default constructible. + # This will go away once we introduce cudf::arrow_column (need a + # cudf::arrow_schema as well), see + # https://github.com/rapidsai/cudf/issues/16104. + """ + #include + #include + + ArrowSchema* to_arrow_schema_raw( + cudf::table_view const& input, + cudf::host_span metadata) { + return to_arrow_schema(input, metadata).release(); + } + + ArrowArray* to_arrow_host_raw( + cudf::table_view const& tbl, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { + // Assumes the sync event is null and the data is already on the host. 
+ ArrowArray *arr = new ArrowArray(); + auto device_arr = cudf::to_arrow_host(tbl, stream, mr); + ArrowArrayMove(&device_arr->array, arr); + return arr; + } + """ + cdef ArrowSchema *to_arrow_schema_raw( + const table_view& tbl, + const vector[column_metadata]& metadata, + ) except + nogil + cdef ArrowArray* to_arrow_host_raw(const table_view& tbl) except + nogil diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index e19ff58927f..acb2b5be85c 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -44,7 +44,7 @@ def metadata_from_arrow_type( def assert_column_eq( lhs: pa.Array | plc.Column, rhs: pa.Array | plc.Column, - check_field_nullability=True, + check_field_nullability=False, ) -> None: """Verify that a pylibcudf array and PyArrow array are equal. @@ -59,7 +59,9 @@ def assert_column_eq( on child fields are equal. Useful for checking roundtripping of lossy formats like JSON that may not - preserve this information. + preserve this information. Also, our Arrow interop functions make different + choices by default than pyarrow field constructors since the interop functions + may make data-dependent choices. """ # Nested types require children metadata to be passed to the conversion function. if isinstance(lhs, (pa.Array, pa.ChunkedArray)) and isinstance( From 1c63e1ee31a07fb4999d7356919280ba3d528741 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 15 Aug 2024 21:51:47 -0400 Subject: [PATCH 71/72] Initial investigation into NumPy proxying in `cudf.pandas` (#16286) Apart of #15397. Closes #14537. Creates `ProxyNDarray` which inherits from `np.ndarray`. 
Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16286 --- python/cudf/cudf/pandas/_wrappers/numpy.py | 3 +++ python/cudf/cudf/pandas/fast_slow_proxy.py | 20 +++++++++++++++- python/cudf/cudf/pandas/proxy_base.py | 23 +++++++++++++++++++ .../cudf_pandas_tests/test_cudf_pandas.py | 8 +++++++ 4 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/pandas/proxy_base.py diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index 3b012169676..eabea9713f1 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -14,6 +14,7 @@ make_final_proxy_type, make_intermediate_proxy_type, ) +from ..proxy_base import ProxyNDarrayBase from .common import ( array_interface, array_method, @@ -111,12 +112,14 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor): numpy.ndarray, fast_to_slow=cupy.ndarray.get, slow_to_fast=cupy.asarray, + bases=(ProxyNDarrayBase,), additional_attributes={ "__array__": array_method, # So that pa.array(wrapped-numpy-array) works "__arrow_array__": arrow_array_method, "__cuda_array_interface__": cuda_array_interface, "__array_interface__": array_interface, + "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), # ndarrays are unhashable "__hash__": None, # iter(cupy-array) produces an iterable of zero-dim device diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index bb678fd1efe..61aa6310082 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -19,6 +19,7 @@ from ..options import _env_get_bool from ..testing import assert_eq from .annotation import nvtx +from .proxy_base import ProxyNDarrayBase 
def call_operator(fn, args, kwargs): @@ -564,7 +565,11 @@ def _fsproxy_wrap(cls, value, func): _FinalProxy subclasses can override this classmethod if they need particular behaviour when wrapped up. """ - proxy = object.__new__(cls) + base_class = _get_proxy_base_class(cls) + if base_class is object: + proxy = base_class.__new__(cls) + else: + proxy = base_class.__new__(cls, value) proxy._fsproxy_wrapped = value return proxy @@ -1193,6 +1198,19 @@ def is_proxy_object(obj: Any) -> bool: return False +def _get_proxy_base_class(cls): + """Returns the proxy base class if one exists""" + for proxy_class in PROXY_BASE_CLASSES: + if proxy_class in cls.__mro__: + return proxy_class + return object + + +PROXY_BASE_CLASSES: set[type] = { + ProxyNDarrayBase, +} + + NUMPY_TYPES: set[str] = set(np.sctypeDict.values()) diff --git a/python/cudf/cudf/pandas/proxy_base.py b/python/cudf/cudf/pandas/proxy_base.py new file mode 100644 index 00000000000..61d9cde127c --- /dev/null +++ b/python/cudf/cudf/pandas/proxy_base.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import cupy as cp +import numpy as np + + +class ProxyNDarrayBase(np.ndarray): + def __new__(cls, arr): + if isinstance(arr, cp.ndarray): + obj = np.asarray(arr.get()).view(cls) + return obj + elif isinstance(arr, np.ndarray): + obj = np.asarray(arr).view(cls) + return obj + else: + raise TypeError( + "Unsupported array type. 
Must be numpy.ndarray or cupy.ndarray" + ) + + def __array_finalize__(self, obj): + self._fsproxy_wrapped = getattr(obj, "_fsproxy_wrapped", None) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 6292022d8e4..e5483fff913 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1632,3 +1632,11 @@ def test_change_index_name(index): assert s.index.name == name assert df.index.name == name + + +def test_numpy_ndarray_isinstancecheck(series): + s1, s2 = series + arr1 = s1.values + arr2 = s2.values + assert isinstance(arr1, np.ndarray) + assert isinstance(arr2, np.ndarray) From e690d9d25b4fadbd553f7ef14ac4918e95d98b0e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 15 Aug 2024 16:48:49 -1000 Subject: [PATCH 72/72] Ensure size is always passed to NumericalColumn (#16576) https://github.com/rapidsai/cudf/pull/16457 requires `NumericalColumn` to be constructed with `size`. It appears another PR got in after this PR was created so there are currently a few usages where `size` isn't passed in. 
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16576 --- python/cudf/cudf/core/_internals/where.py | 4 +-- python/cudf/cudf/core/column/categorical.py | 37 +++++---------------- python/cudf/cudf/core/column/column.py | 1 + python/cudf/cudf/core/column/numerical.py | 1 + python/cudf/cudf/core/dataframe.py | 5 +-- python/cudf/cudf/core/index.py | 1 + 6 files changed, 13 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 9f36499586b..0c754317185 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -110,9 +110,7 @@ def _make_categorical_like(result, column): if isinstance(column, cudf.core.column.CategoricalColumn): result = cudf.core.column.build_categorical_column( categories=column.categories, - codes=cudf.core.column.NumericalColumn( - result.base_data, dtype=result.dtype - ), + codes=result, mask=result.base_mask, size=result.size, offset=result.offset, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index d25983842f9..66aed38bffd 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -659,10 +659,7 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self: Self, cudf.core.column.build_categorical_column( categories=self.categories, - codes=cudf.core.column.NumericalColumn( - codes.base_data, # type: ignore[arg-type] - dtype=codes.dtype, - ), + codes=codes, mask=codes.base_mask, ordered=self.ordered, size=codes.size, @@ -734,10 +731,7 @@ def sort_values( codes = self.codes.sort_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories._values, - codes=cudf.core.column.NumericalColumn( - 
codes.base_data, # type: ignore[arg-type] - dtype=codes.dtype, - ), + codes=codes, mask=codes.base_mask, size=codes.size, ordered=self.dtype.ordered, @@ -845,10 +839,7 @@ def unique(self) -> CategoricalColumn: codes = self.codes.unique() return column.build_categorical_column( categories=self.categories, - codes=cudf.core.column.NumericalColumn( - codes.base_data, # type: ignore[arg-type] - dtype=codes.dtype, - ), + codes=codes, mask=codes.base_mask, offset=codes.offset, size=codes.size, @@ -986,9 +977,7 @@ def find_and_replace( result = column.build_categorical_column( categories=new_cats["cats"], - codes=cudf.core.column.NumericalColumn( - output.base_data, dtype=output.dtype - ), + codes=output, mask=output.base_mask, offset=output.offset, size=output.size, @@ -1184,10 +1173,7 @@ def _concat( return column.build_categorical_column( categories=column.as_column(cats), - codes=cudf.core.column.NumericalColumn( - codes_col.base_data, # type: ignore[arg-type] - dtype=codes_col.dtype, - ), + codes=codes_col, mask=codes_col.base_mask, size=codes_col.size, offset=codes_col.offset, @@ -1199,10 +1185,7 @@ def _with_type_metadata( if isinstance(dtype, CategoricalDtype): return column.build_categorical_column( categories=dtype.categories._values, - codes=cudf.core.column.NumericalColumn( - self.codes.base_data, # type: ignore[arg-type] - dtype=self.codes.dtype, - ), + codes=self.codes, mask=self.codes.base_mask, ordered=dtype.ordered, size=self.codes.size, @@ -1345,9 +1328,7 @@ def _set_categories( Self, column.build_categorical_column( categories=new_cats, - codes=cudf.core.column.NumericalColumn( - new_codes.base_data, dtype=new_codes.dtype - ), + codes=new_codes, mask=new_codes.base_mask, size=new_codes.size, offset=new_codes.offset, @@ -1478,9 +1459,7 @@ def pandas_categorical_as_column( return column.build_categorical_column( categories=categorical.categories, - codes=cudf.core.column.NumericalColumn( - codes.base_data, dtype=codes.dtype - ), + codes=codes, 
size=codes.size, mask=mask, ordered=categorical.ordered, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b0e33e8b9ce..090c02da990 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1513,6 +1513,7 @@ def column_empty( * cudf.dtype(libcudf.types.size_type_dtype).itemsize ) ), + size=None, dtype=libcudf.types.size_type_dtype, ), ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 16e78ef35ef..ac36813202a 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -654,6 +654,7 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: categories=dtype.categories._values, codes=cudf.core.column.NumericalColumn( self.base_data, # type: ignore[arg-type] + self.size, dtype=self.dtype, ), mask=self.base_mask, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3033abd53f5..f935217f4f9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -46,7 +46,6 @@ from cudf.core.column import ( CategoricalColumn, ColumnBase, - NumericalColumn, StructColumn, as_column, build_categorical_column, @@ -8541,9 +8540,7 @@ def _reassign_categories(categories, cols, col_idxs): if idx in categories: cols[name] = build_categorical_column( categories=categories[idx], - codes=NumericalColumn( - cols[name].base_data, dtype=cols[name].dtype - ), + codes=cols[name], mask=cols[name].base_mask, offset=cols[name].offset, size=cols[name].size, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d02633a97fa..ee2f0317f8d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2501,6 +2501,7 @@ def _get_dt_field(self, field: str) -> Index: out_column = self._column.get_dt_field(field) out_column = NumericalColumn( data=out_column.base_data, + 
size=out_column.size, dtype=out_column.dtype, mask=out_column.base_mask, offset=out_column.offset,