diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index d6d3e3fdd33..b5d17022a3a 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-libcudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
@@ -81,7 +81,7 @@ jobs:
   wheel-publish-libcudf:
     needs: wheel-build-libcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -92,7 +92,7 @@ jobs:
   wheel-build-pylibcudf:
     needs: [wheel-publish-libcudf]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -102,7 +102,7 @@ jobs:
   wheel-publish-pylibcudf:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -113,7 +113,7 @@ jobs:
   wheel-build-cudf:
     needs: wheel-publish-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -123,7 +123,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -134,7 +134,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -146,7 +146,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -157,7 +157,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-publish-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -169,7 +169,7 @@ jobs:
   wheel-publish-cudf-polars:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml
index d670132cca9..10c803f7921 100644
--- a/.github/workflows/pandas-tests.yaml
+++ b/.github/workflows/pandas-tests.yaml
@@ -17,7 +17,7 @@ jobs:
   pandas-tests:
       # run the Pandas unit tests
       secrets: inherit
-      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
       with:
         # This selects "ARCH=amd64 + the latest supported Python + CUDA".
         matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a4a8f036174..b515dbff9f3 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -37,7 +37,7 @@ jobs:
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -52,7 +52,7 @@ jobs:
     steps:
       - name: Get PR info
         id: get-pr-info
-        uses: rapidsai/shared-actions/get-pr-info@main
+        uses: nv-gha-runners/get-pr-info@main
       - name: Checkout code repo
         uses: actions/checkout@v4
         with:
@@ -104,39 +104,39 @@ jobs:
               - '!notebooks/**'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
     if: needs.changed-files.outputs.test_cpp == 'true'
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -145,7 +145,7 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -153,7 +153,7 @@ jobs:
   conda-java-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     if: needs.changed-files.outputs.test_java == 'true'
     with:
       build_type: pull-request
@@ -164,7 +164,7 @@ jobs:
   static-configure:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -174,7 +174,7 @@ jobs:
   conda-notebook-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     if: needs.changed-files.outputs.test_notebooks == 'true'
     with:
       build_type: pull-request
@@ -185,7 +185,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -195,7 +195,7 @@ jobs:
   wheel-build-libcudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
@@ -204,21 +204,21 @@ jobs:
   wheel-build-pylibcudf:
     needs: [checks, wheel-build-libcudf]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_pylibcudf.sh"
   wheel-build-cudf:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -226,7 +226,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -235,7 +235,7 @@ jobs:
   wheel-tests-cudf-polars:
     needs: [wheel-build-cudf-polars, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -247,7 +247,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -256,7 +256,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: [wheel-build-dask-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -265,7 +265,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
@@ -276,7 +276,7 @@ jobs:
   unit-tests-cudf-pandas:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -287,7 +287,7 @@ jobs:
     # run the Pandas unit tests using PR branch
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -299,7 +299,7 @@ jobs:
   pandas-tests-diff:
     # diff the results of running the Pandas unit tests and publish a job summary
     needs: pandas-tests
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
         node_type: cpu4
         build_type: pull-request
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index fe77ad4b6b2..45e5191eb54 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -23,7 +23,7 @@ on:
 
 jobs:
     get-project-id:
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@python-3.12
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10
       if: github.event.pull_request.state == 'open'
       secrets: inherit
       permissions:
@@ -34,7 +34,7 @@ jobs:
 
     update-status:
       # This job sets the PR and its linked issues to "In Progress" status
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@python-3.12
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
@@ -50,7 +50,7 @@ jobs:
 
     update-sprint:
       # This job sets the PR and its linked issues to the current "Weekly Sprint"
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@python-3.12
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 4af6a0d690d..8605fa46f68 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-cpp-memcheck-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -45,7 +45,7 @@ jobs:
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -54,7 +54,7 @@ jobs:
       run_script: "ci/configure_cpp_static.sh"
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -64,7 +64,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -85,7 +85,7 @@ jobs:
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -97,7 +97,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -106,7 +106,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -117,7 +117,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   unit-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -126,7 +126,7 @@ jobs:
       script: ci/cudf_pandas_scripts/run_tests.sh
   third-party-integration-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/README.md b/README.md
index f62f7885d63..8f8c2adac2f 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12
 
 ### Conda
 
-cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel:
+cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:
 
 ```bash
 conda install -c rapidsai -c conda-forge -c nvidia \
diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 93a815838b7..7a12db927e5 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -68,8 +68,18 @@ def emoji_failed(x):
 pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
 diff_df = pr_df - main_df
+total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call']
+pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1)
+pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1)
 
-pr_df = pr_df[["total", "passed", "failed", "skipped"]]
+cpu_usage_mean = pr_df['CPU Usage'].mean().round(2)
+gpu_usage_mean = pr_df['GPU Usage'].mean().round(2)
+
+# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns
+pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%'
+pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%'
+
+pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']]
 diff_df = diff_df[["total", "passed", "failed", "skipped"]]
 diff_df.columns = diff_df.columns + "_diff"
 diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
@@ -95,6 +105,8 @@ def emoji_failed(x):
 
 print(comment)
 print()
+print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%")
+print()
 print("Here are the results of running the Pandas tests against this PR:")
 print()
 print(df.to_markdown())
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index 28ded2f8e0f..a701bfe15e0 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -39,6 +39,7 @@ rapids-logger "pytest pylibcudf"
 pushd python/pylibcudf/pylibcudf/tests
 python -m pytest \
   --cache-clear \
+  --numprocesses=8 \
   --dist=worksteal \
   .
 popd
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index 0d39807d56c..361a42ccda9 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -41,6 +41,7 @@ pushd python/dask_cudf/dask_cudf
 DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
+  --dist=worksteal \
   .
 popd
 
@@ -50,5 +51,6 @@ pushd python/dask_cudf/dask_cudf
 DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
   --numprocesses=8 \
+  --dist=worksteal \
   .
 popd
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 3bf9d02b384..abc6f74fccf 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -177,11 +177,11 @@ ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp)
 
 # ##################################################################################################
 # * nds-h benchmark --------------------------------------------------------------------------------
-ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp)
-ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp)
-ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp)
-ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp)
-ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp)
+ConfigureNVBench(NDSH_Q01_NVBENCH ndsh/q01.cpp ndsh/utilities.cpp)
+ConfigureNVBench(NDSH_Q05_NVBENCH ndsh/q05.cpp ndsh/utilities.cpp)
+ConfigureNVBench(NDSH_Q06_NVBENCH ndsh/q06.cpp ndsh/utilities.cpp)
+ConfigureNVBench(NDSH_Q09_NVBENCH ndsh/q09.cpp ndsh/utilities.cpp)
+ConfigureNVBench(NDSH_Q10_NVBENCH ndsh/q10.cpp ndsh/utilities.cpp)
 
 # ##################################################################################################
 # * stream_compaction benchmark -------------------------------------------------------------------
@@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)
 
 ConfigureNVBench(
   TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
-  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
+  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp
index 61e79a47a50..e4ff0c8c4a7 100644
--- a/cpp/benchmarks/hashing/hash.cpp
+++ b/cpp/benchmarks/hashing/hash.cpp
@@ -50,7 +50,7 @@ static void bench_hash(nvbench::state& state)
   state.add_global_memory_reads<nvbench::int64_t>(num_rows);
   // add memory read from bitmaks
   if (!no_nulls) {
-    state.add_global_memory_reads<nvbench::int8_t>(2 *
+    state.add_global_memory_reads<nvbench::int8_t>(2L *
                                                    cudf::bitmask_allocation_size_bytes(num_rows));
   }
   // memory written depends on used hash
@@ -63,37 +63,37 @@ static void bench_hash(nvbench::state& state)
     });
   } else if (hash_name == "md5") {
     // md5 creates a 32-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(32 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(32L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); });
   } else if (hash_name == "sha1") {
     // sha1 creates a 40-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(40 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(40L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); });
   } else if (hash_name == "sha224") {
     // sha224 creates a 56-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(56 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(56L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); });
   } else if (hash_name == "sha256") {
     // sha256 creates a 64-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(64 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(64L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); });
   } else if (hash_name == "sha384") {
     // sha384 creates a 96-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(96 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(96L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); });
   } else if (hash_name == "sha512") {
     // sha512 creates a 128-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(128 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(128L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); });
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
index 7563c823454..ce115fd7723 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64;
 void parquet_read_common(cudf::size_type num_rows_to_read,
                          cudf::size_type num_cols_to_read,
                          cuio_source_sink_pair& source_sink,
-                         nvbench::state& state)
+                         nvbench::state& state,
+                         size_t table_data_size = data_size)
 {
   cudf::io::parquet_reader_options read_opts =
     cudf::io::parquet_reader_options::builder(source_sink.make_source_info());
@@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read,
     });
 
   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
-  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_element_count(static_cast<double>(table_data_size) / time, "bytes_per_second");
   state.add_buffer_size(
     mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
@@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list<nvbench::e
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
+template <data_type DataType>
+void BM_parquet_read_wide_tables(nvbench::state& state,
+                                 nvbench::type_list<nvbench::enum_type<DataType>> type_list)
+{
+  auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
+
+  auto const n_col           = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20);
+  auto const cardinality     = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length      = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  auto const source_type     = io_type::DEVICE_BUFFER;
+  cuio_source_sink_pair source_sink(source_type);
+
+  auto const num_rows_written = [&]() {
+    auto const tbl = create_random_table(
+      cycle_dtypes(d_type, n_col),
+      table_size_bytes{data_size_bytes},
+      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+    auto const view = tbl->view();
+
+    cudf::io::parquet_writer_options write_opts =
+      cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
+        .compression(cudf::io::compression_type::NONE);
+    cudf::io::write_parquet(write_opts);
+    return view.num_rows();
+  }();
+
+  parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes);
+}
+
+void BM_parquet_read_wide_tables_mixed(nvbench::state& state)
+{
+  auto const d_type = []() {
+    auto d_type1 = get_type_or_group(static_cast<int32_t>(data_type::INTEGRAL));
+    auto d_type2 = get_type_or_group(static_cast<int32_t>(data_type::FLOAT));
+    d_type1.reserve(d_type1.size() + d_type2.size());
+    std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1));
+    return d_type1;
+  }();
+
+  auto const n_col           = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20);
+  auto const cardinality     = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length      = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  auto const source_type     = io_type::DEVICE_BUFFER;
+  cuio_source_sink_pair source_sink(source_type);
+
+  auto const num_rows_written = [&]() {
+    auto const tbl = create_random_table(
+      cycle_dtypes(d_type, n_col),
+      table_size_bytes{data_size_bytes},
+      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+    auto const view = tbl->view();
+
+    cudf::io::parquet_writer_options write_opts =
+      cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
+        .compression(cudf::io::compression_type::NONE);
+    cudf::io::write_parquet(write_opts);
+    return view.num_rows();
+  }();
+
+  parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes);
+}
+
 using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
                                             data_type::FLOAT,
                                             data_type::DECIMAL,
@@ -272,6 +337,24 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
   .add_int64_axis("run_length", {1, 32})
   .add_int64_axis("num_string_cols", {1, 2, 3});
 
+using d_type_list_wide_table = nvbench::enum_type_list<data_type::DECIMAL, data_type::STRING>;
+NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table))
+  .set_name("parquet_read_wide_tables")
+  .set_min_samples(4)
+  .set_type_axes_names({"data_type"})
+  .add_int64_axis("data_size_mb", {1024, 2048, 4096})
+  .add_int64_axis("num_cols", {256, 512, 1024})
+  .add_int64_axis("cardinality", {0, 1000})
+  .add_int64_axis("run_length", {1, 32});
+
+NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed)
+  .set_name("parquet_read_wide_tables_mixed")
+  .set_min_samples(4)
+  .add_int64_axis("data_size_mb", {1024, 2048, 4096})
+  .add_int64_axis("num_cols", {256, 512, 1024})
+  .add_int64_axis("cardinality", {0, 1000})
+  .add_int64_axis("run_length", {1, 32});
+
 // a benchmark for structs that only contain fixed-width types
 using d_type_list_struct_only = nvbench::enum_type_list<data_type::STRUCT>;
 NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only))
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
index 3abd4280081..7121cb9f034 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -50,7 +50,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state)
 }
 
 std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
-  nvbench::state& state, std::vector<cudf::type_id> const& d_types)
+  nvbench::state& state, std::vector<cudf::type_id> const& d_types, io_type io_source_type)
 {
   cudf::size_type const cardinality = state.get_int64("cardinality");
   cudf::size_type const run_length  = state.get_int64("run_length");
@@ -63,7 +63,7 @@ std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
   size_t total_file_size = 0;
 
   for (size_t i = 0; i < num_files; ++i) {
-    cuio_source_sink_pair source_sink{io_type::HOST_BUFFER};
+    cuio_source_sink_pair source_sink{io_source_type};
 
     auto const tbl = create_random_table(
       cycle_dtypes(d_types, num_cols),
@@ -92,11 +92,13 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
 {
   size_t const data_size = state.get_int64("total_data_size");
   auto const num_threads = state.get_int64("num_threads");
+  auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
   BS::thread_pool threads(num_threads);
 
-  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+  auto [source_sink_vector, total_file_size, num_files] =
+    write_file_data(state, d_types, source_type);
   std::vector<cudf::io::source_info> source_info_vector;
   std::transform(source_sink_vector.begin(),
                  source_sink_vector.end(),
@@ -173,10 +175,12 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
   auto const num_threads    = state.get_int64("num_threads");
   size_t const input_limit  = state.get_int64("input_limit");
   size_t const output_limit = state.get_int64("output_limit");
+  auto const source_type    = retrieve_io_type_enum(state.get_string("io_type"));
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
   BS::thread_pool threads(num_threads);
-  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+  auto [source_sink_vector, total_file_size, num_files] =
+    write_file_data(state, d_types, source_type);
   std::vector<cudf::io::source_info> source_info_vector;
   std::transform(source_sink_vector.begin(),
                  source_sink_vector.end(),
@@ -264,7 +268,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed)
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
   .add_int64_axis("num_cols", {4})
-  .add_int64_axis("run_length", {8});
+  .add_int64_axis("run_length", {8})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width)
   .set_name("parquet_multithreaded_read_decode_fixed_width")
@@ -273,7 +278,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width)
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
   .add_int64_axis("num_cols", {4})
-  .add_int64_axis("run_length", {8});
+  .add_int64_axis("run_length", {8})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_string)
   .set_name("parquet_multithreaded_read_decode_string")
@@ -282,7 +288,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string)
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
   .add_int64_axis("num_cols", {4})
-  .add_int64_axis("run_length", {8});
+  .add_int64_axis("run_length", {8})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_list)
   .set_name("parquet_multithreaded_read_decode_list")
@@ -291,7 +298,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list)
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
   .add_int64_axis("num_cols", {4})
-  .add_int64_axis("run_length", {8});
+  .add_int64_axis("run_length", {8})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 // mixed data types: fixed width, strings
 NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed)
@@ -303,7 +311,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed)
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
-  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+  .add_int64_axis("output_limit", {640 * 1024 * 1024})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width)
   .set_name("parquet_multithreaded_read_decode_chunked_fixed_width")
@@ -314,7 +323,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width)
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
-  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+  .add_int64_axis("output_limit", {640 * 1024 * 1024})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string)
   .set_name("parquet_multithreaded_read_decode_chunked_string")
@@ -325,7 +335,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string)
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
-  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+  .add_int64_axis("output_limit", {640 * 1024 * 1024})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list)
   .set_name("parquet_multithreaded_read_decode_chunked_list")
@@ -336,4 +347,5 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list)
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
-  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+  .add_int64_axis("output_limit", {640 * 1024 * 1024})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp
new file mode 100644
index 00000000000..adc3dddc59c
--- /dev/null
+++ b/cpp/benchmarks/text/word_minhash.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/filling.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <nvtext/minhash.hpp>
+
+#include <rmm/device_buffer.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_word_minhash(nvbench::state& state)
+{
+  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
+  auto const base64     = state.get_int64("hash_type") == 64;
+
+  data_profile const strings_profile =
+    data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
+  auto strings_table =
+    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
+
+  auto const num_offsets = (num_rows / row_width) + 1;
+  auto offsets           = cudf::sequence(num_offsets,
+                                cudf::numeric_scalar<cudf::size_type>(0),
+                                cudf::numeric_scalar<cudf::size_type>(row_width));
+
+  auto source = cudf::make_lists_column(num_offsets - 1,
+                                        std::move(offsets),
+                                        std::move(strings_table->release().front()),
+                                        0,
+                                        rmm::device_buffer{});
+
+  data_profile const seeds_profile = data_profile_builder().no_validity().distribution(
+    cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, 256);
+  auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
+  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
+  auto seeds             = seeds_table->get_column(0);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+
+  cudf::strings_column_view input(cudf::lists_column_view(source->view()).child());
+  auto chars_size = input.chars_size(cudf::get_default_stream());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view())
+                         : nvtext::word_minhash(source->view(), seeds.view());
+  });
+}
+
+NVBENCH_BENCH(bench_word_minhash)
+  .set_name("word_minhash")
+  .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152})
+  .add_int64_axis("row_width", {10, 100, 1000})
+  .add_int64_axis("seed_count", {2, 25})
+  .add_int64_axis("hash_type", {32, 64});
diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md
index 8d206f245dc..6d1c91a5752 100644
--- a/cpp/doxygen/regex.md
+++ b/cpp/doxygen/regex.md
@@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions.
 
 **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen.
 
+By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include:
+- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`)
+- Line separator (Unicode: `2028`, UTF-8: `E280A8`)
+- Next line (Unicode: `0085`, UTF-8: `C285`)
+- Carriage return (Unicode: `000D`, UTF-8: `0D`)
+
 **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following:
 - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals.
 - Unmatched paired special characters like `()`, `[]`, and `{}`.
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index b257eef1e9e..4255faea702 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1497,8 +1497,7 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation);
  *
  * @tparam F Type of callable
  * @param k The `aggregation::Kind` value to dispatch
- * aram f The callable that accepts an `aggregation::Kind` non-type template
- * argument.
+ * @param f The callable that accepts an `aggregation::Kind` callable function object.
  * @param args Parameter pack forwarded to the `operator()` invocation
  * @return Forwards the return value of the callable.
  */
@@ -1626,6 +1625,7 @@ struct dispatch_source {
  * parameter of the callable `F`
  * @param k The `aggregation::Kind` used to dispatch an `aggregation::Kind`
  * non-type template parameter for the second template parameter of the callable
+ * @param f The callable that accepts `data_type` and `aggregation::Kind` function object.
  * @param args Parameter pack forwarded to the `operator()` invocation
  * `F`.
  */
@@ -1644,8 +1644,8 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(d
  * @brief Returns the target `data_type` for the specified aggregation  k
  * performed on elements of type  source_type.
  *
- * aram source_type The element type to be aggregated
- * aram k The aggregation
+ * @param source_type The element type to be aggregated
+ * @param k The aggregation kind
  * @return data_type The target_type of  k performed on  source_type
  * elements
  */
diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
index 672b95e2d01..80a4460023f 100644
--- a/cpp/include/cudf/detail/tdigest/tdigest.hpp
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -143,29 +143,28 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
                                             rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create a tdigest column of empty clusters.
+ * @brief Create an empty tdigest column.
  *
- * The column created contains the specified number of rows of empty clusters.
+ * An empty tdigest column contains a single row of length 0
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns A tdigest column of empty clusters.
+ * @returns An empty tdigest column.
  */
 CUDF_EXPORT
-std::unique_ptr<column> make_tdigest_column_of_empty_clusters(size_type num_rows,
-                                                              rmm::cuda_stream_view stream,
-                                                              rmm::device_async_resource_ref mr);
+std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
+                                                  rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create a scalar of an empty tdigest cluster.
+ * @brief Create an empty tdigest scalar.
  *
- * The returned scalar is a struct_scalar that contains a single row of an empty cluster.
+ * An empty tdigest scalar is a struct_scalar that contains a single row of length 0
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns A scalar of an empty tdigest cluster.
+ * @returns An empty tdigest scalar.
  */
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr);
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 73ff17b2b93..940d03cdb41 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -69,11 +69,21 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& inda
  * @brief Normalize unquoted whitespace (space and tab characters) using FST
  *
  * @param indata Input device buffer
+ * @param col_offsets Offsets to column contents in input buffer
+ * @param col_lengths Length of contents of each row in column
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
+ *
+ * @returns Tuple of the normalized column, offsets to each row in column, and lengths of contents
+ * of each row
  */
-void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
-                          rmm::cuda_stream_view stream,
-                          rmm::device_async_resource_ref mr);
+std::
+  tuple<rmm::device_uvector<char>, rmm::device_uvector<size_type>, rmm::device_uvector<size_type>>
+  normalize_whitespace(device_span<char const> d_input,
+                       device_span<size_type const> col_offsets,
+                       device_span<size_type const> col_lengths,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr);
+
 }  // namespace io::json::detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index ed7b2ac0850..ee03a382bec 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -39,8 +39,9 @@ namespace io {
  * @file
  */
 
-constexpr size_t default_row_group_size_bytes   = 128 * 1024 * 1024;  ///< 128MB per row group
-constexpr size_type default_row_group_size_rows = 1000000;     ///< 1 million rows per row group
+constexpr size_t default_row_group_size_bytes =
+  std::numeric_limits<size_t>::max();                          ///< Infinite bytes per row group
+constexpr size_type default_row_group_size_rows = 1'000'000;   ///< 1 million rows per row group
 constexpr size_t default_max_page_size_bytes    = 512 * 1024;  ///< 512KB per page
 constexpr size_type default_max_page_size_rows  = 20000;       ///< 20k rows per page
 constexpr int32_t default_column_index_truncate_length = 64;   ///< truncate to 64 bytes
diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp
index f7108129dee..4f3fc7086f2 100644
--- a/cpp/include/cudf/strings/regex/flags.hpp
+++ b/cpp/include/cudf/strings/regex/flags.hpp
@@ -35,10 +35,11 @@ namespace strings {
  * and to match the Python flag values.
  */
 enum regex_flags : uint32_t {
-  DEFAULT   = 0,   ///< default
-  MULTILINE = 8,   ///< the '^' and '$' honor new-line characters
-  DOTALL    = 16,  ///< the '.' matching includes new-line characters
-  ASCII     = 256  ///< use only ASCII when matching built-in character classes
+  DEFAULT     = 0,    ///< default
+  MULTILINE   = 8,    ///< the '^' and '$' honor new-line characters
+  DOTALL      = 16,   ///< the '.' matching includes new-line characters
+  ASCII       = 256,  ///< use only ASCII when matching built-in character classes
+  EXT_NEWLINE = 512   ///< new-line matches extended characters
 };
 
 /**
@@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f)
   return (f & regex_flags::ASCII) == regex_flags::ASCII;
 }
 
+/**
+ * @brief Returns true if the given flags contain EXT_NEWLINE
+ *
+ * @param f Regex flags to check
+ * @return true if `f` includes EXT_NEWLINE
+ */
+constexpr bool is_ext_newline(regex_flags const f)
+{
+  return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE;
+}
+
 /**
  * @brief Capture groups setting
  *
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index abb26d7ccb4..14695c3bb27 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper
 
 __device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
 {
-  if (byte_pos > 0)
-    while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
-      ;
+  if (byte_pos > 0) {
+    if (byte_pos == char_pos) {
+      --byte_pos;
+    } else {
+      while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
+        ;
+    }
+  }
   --char_pos;
   return *this;
 }
diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh
index be7d19b2227..1758790cd64 100644
--- a/cpp/include/cudf_test/tdigest_utilities.cuh
+++ b/cpp/include/cudf_test/tdigest_utilities.cuh
@@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op)
     static_cast<column_view>(values).type(), tdigest_gen{}, op, values, delta);
 
   // NOTE: an empty tdigest column still has 1 row.
-  auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
 }
@@ -562,12 +562,12 @@ template <typename MergeFunc>
 void tdigest_merge_empty(MergeFunc merge_op)
 {
   // 3 empty tdigests all in the same group
-  auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto b = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto b = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   std::vector<column_view> cols;
   cols.push_back(*a);
   cols.push_back(*b);
@@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op)
   auto const delta = 1000;
   auto result      = merge_op(*values, delta);
 
-  auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
 }
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index c83a4260c19..7c909f1a948 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -17,6 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/hashing.hpp>
+#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
@@ -72,7 +73,7 @@ std::unique_ptr<cudf::column> minhash(
  *
  * @throw std::invalid_argument if the width < 2
  * @throw std::invalid_argument if seeds is empty
- * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
+ * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
  * @param seeds Seed values used for the hash algorithm
@@ -133,7 +134,7 @@ std::unique_ptr<cudf::column> minhash64(
  *
  * @throw std::invalid_argument if the width < 2
  * @throw std::invalid_argument if seeds is empty
- * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
+ * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
  * @param seeds Seed values used for the hash algorithm
@@ -150,5 +151,61 @@ std::unique_ptr<cudf::column> minhash64(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the minhash values for each row of strings per seed
+ *
+ * Hash values are computed from each string in each row and the
+ * minimum hash value is returned for each row for each seed.
+ * Each row of the output list column are seed results for the corresponding
+ * input row. The order of the elements in each row match the order of
+ * the seeds provided in the `seeds` parameter.
+ *
+ * This function uses MurmurHash3_x86_32 for the hash algorithm.
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if seeds is empty
+ * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Lists column of strings to compute minhash
+ * @param seeds Seed values used for the hash algorithm
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ */
+std::unique_ptr<cudf::column> word_minhash(
+  cudf::lists_column_view const& input,
+  cudf::device_span<uint32_t const> seeds,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
+/**
+ * @brief Returns the minhash values for each row of strings per seed
+ *
+ * Hash values are computed from each string in each row and the
+ * minimum hash value is returned for each row for each seed.
+ * Each row of the output list column are seed results for the corresponding
+ * input row. The order of the elements in each row match the order of
+ * the seeds provided in the `seeds` parameter.
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm though
+ * only the first 64-bits of the hash are used in computing the output.
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if seeds is empty
+ * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Lists column of strings to compute minhash
+ * @param seeds Seed values used for the hash algorithm
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ */
+std::unique_ptr<cudf::column> word_minhash64(
+  cudf::lists_column_view const& input,
+  cudf::device_span<uint64_t const> seeds,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 8890c786287..756047d383a 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
+#include <cudf/io/detail/json.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
@@ -625,6 +626,8 @@ void make_device_json_column(device_span<SymbolT const> input,
   auto ignore_vals = cudf::detail::make_host_vector<uint8_t>(num_columns, stream);
   std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
   std::vector<uint8_t> is_pruned(num_columns, 0);
+  // for columns that are not mixed type but have been forced as string
+  std::vector<bool> forced_as_string_column(num_columns);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
 
   std::function<void(NodeIndexT, device_json_column&)> remove_child_columns =
@@ -695,11 +698,14 @@ void make_device_json_column(device_span<SymbolT const> input,
     // Struct, List, String, Value
     auto [name, parent_col_id] = name_and_parent_index(this_col_id);
 
-    // if parent is mixed type column or this column is pruned, ignore this column.
+    // if parent is mixed type column or this column is pruned or if parent
+    // has been forced as string, ignore this column.
     if (parent_col_id != parent_node_sentinel &&
-        (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) {
+          (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) ||
+        forced_as_string_column[parent_col_id]) {
       ignore_vals[this_col_id] = 1;
       if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; }
+      if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; }
       continue;
     }
 
@@ -765,22 +771,26 @@ void make_device_json_column(device_span<SymbolT const> input,
     }
 
     auto this_column_category = column_categories[this_col_id];
-    if (is_enabled_mixed_types_as_string) {
-      // get path of this column, check if it is a struct/list forced as string, and enforce it
-      auto const nt                             = tree_path.get_path(this_col_id);
-      std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
-      if ((column_categories[this_col_id] == NC_STRUCT or
-           column_categories[this_col_id] == NC_LIST) and
-          user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
-        is_mixed_type_column[this_col_id] = 1;
-        this_column_category              = NC_STR;
-      }
+    // get path of this column, check if it is a struct/list forced as string, and enforce it
+    auto const nt                             = tree_path.get_path(this_col_id);
+    std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
+    if ((column_categories[this_col_id] == NC_STRUCT or
+         column_categories[this_col_id] == NC_LIST) and
+        user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
+      this_column_category = NC_STR;
     }
 
     CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name);
     // move into parent
     device_json_column col(stream, mr);
     initialize_json_columns(this_col_id, col, this_column_category);
+    if ((column_categories[this_col_id] == NC_STRUCT or
+         column_categories[this_col_id] == NC_LIST) and
+        user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
+      col.forced_as_string_column          = true;
+      forced_as_string_column[this_col_id] = true;
+    }
+
     auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second;
     CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent");
     if (not replaced) parent_col.column_order.push_back(name);
@@ -802,12 +812,30 @@ void make_device_json_column(device_span<SymbolT const> input,
           is_mixed_type_column[this_col_id] == 1)
         column_categories[this_col_id] = NC_STR;
     }
-    cudaMemcpyAsync(d_column_tree.node_categories.begin(),
-                    column_categories.data(),
-                    column_categories.size() * sizeof(column_categories[0]),
-                    cudaMemcpyDefault,
-                    stream.value());
+    cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
+                                    column_categories.data(),
+                                    column_categories.size() * sizeof(column_categories[0]),
+                                    cudf::detail::host_memory_kind::PAGEABLE,
+                                    stream);
+  }
+
+  // ignore all children of columns forced as string
+  for (auto const this_col_id : unique_col_ids) {
+    auto parent_col_id = column_parent_ids[this_col_id];
+    if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) {
+      forced_as_string_column[this_col_id] = true;
+      ignore_vals[this_col_id]             = 1;
+    }
+    // Convert only mixed type columns as string (so to copy), but not its children
+    if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and
+        forced_as_string_column[this_col_id])
+      column_categories[this_col_id] = NC_STR;
   }
+  cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
+                                  column_categories.data(),
+                                  column_categories.size() * sizeof(column_categories[0]),
+                                  cudf::detail::host_memory_kind::PAGEABLE,
+                                  stream);
 
   // restore unique_col_ids order
   std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
@@ -982,39 +1010,58 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
                    "string offset, string length mismatch");
       rmm::device_uvector<char_length_pair_t> d_string_data(col_size, stream);
       // TODO how about directly storing pair<char*, size_t> in json_column?
-      auto offset_length_it =
-        thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin());
 
-      data_type target_type{};
+      auto [result_bitmask, null_count] = make_validity(json_col);
 
-      if (schema.has_value()) {
+      data_type target_type{};
+      std::unique_ptr<column> col{};
+      if (options.normalize_whitespace && json_col.forced_as_string_column) {
+        CUDF_EXPECTS(prune_columns || options.mixed_types_as_string,
+                     "Whitespace normalization of nested columns requested as string requires "
+                     "either prune_columns or mixed_types_as_string to be enabled");
+        auto [normalized_d_input, col_offsets, col_lengths] =
+          cudf::io::json::detail::normalize_whitespace(
+            d_input, json_col.string_offsets, json_col.string_lengths, stream, mr);
+        auto offset_length_it = thrust::make_zip_iterator(col_offsets.begin(), col_lengths.begin());
+        target_type           = data_type{type_id::STRING};
+        // Convert strings to the inferred data type
+        col = parse_data(normalized_d_input.data(),
+                         offset_length_it,
+                         col_size,
+                         target_type,
+                         std::move(result_bitmask),
+                         null_count,
+                         options.view(),
+                         stream,
+                         mr);
+      } else {
+        auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(),
+                                                          json_col.string_lengths.begin());
+        if (schema.has_value()) {
 #ifdef NJP_DEBUG_PRINT
-        std::cout << "-> explicit type: "
-                  << (schema.has_value() ? std::to_string(static_cast<int>(schema->type.id()))
-                                         : "n/a");
+          std::cout << "-> explicit type: "
+                    << (schema.has_value() ? std::to_string(static_cast<int>(schema->type.id()))
+                                           : "n/a");
 #endif
-        target_type = schema.value().type;
-      } else if (json_col.forced_as_string_column) {
-        target_type = data_type{type_id::STRING};
-      }
-      // Infer column type, if we don't have an explicit type for it
-      else {
-        target_type = cudf::io::detail::infer_data_type(
-          options.json_view(), d_input, offset_length_it, col_size, stream);
+          target_type = schema.value().type;
+        }
+        // Infer column type, if we don't have an explicit type for it
+        else {
+          target_type = cudf::io::detail::infer_data_type(
+            options.json_view(), d_input, offset_length_it, col_size, stream);
+        }
+        // Convert strings to the inferred data type
+        col = parse_data(d_input.data(),
+                         offset_length_it,
+                         col_size,
+                         target_type,
+                         std::move(result_bitmask),
+                         null_count,
+                         options.view(),
+                         stream,
+                         mr);
       }
 
-      auto [result_bitmask, null_count] = make_validity(json_col);
-      // Convert strings to the inferred data type
-      auto col = parse_data(d_input.data(),
-                            offset_length_it,
-                            col_size,
-                            target_type,
-                            std::move(result_bitmask),
-                            null_count,
-                            options.view(),
-                            stream,
-                            mr);
-
       // Reset nullable if we do not have nulls
       // This is to match the existing JSON reader's behaviour:
       // - Non-string columns will always be returned as nullable
@@ -1120,11 +1167,15 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
     const auto [tokens_gpu, token_indices_gpu] =
       get_token_stream(d_input, options, stream, cudf::get_current_device_resource_ref());
     // gpu tree generation
-    return get_tree_representation(tokens_gpu,
-                                   token_indices_gpu,
-                                   options.is_enabled_mixed_types_as_string(),
-                                   stream,
-                                   cudf::get_current_device_resource_ref());
+    // Note that to normalize whitespaces in nested columns coerced to be string, we need the column
+    // to either be of mixed type or we need to request the column to be returned as string by
+    // pruning it with the STRING dtype
+    return get_tree_representation(
+      tokens_gpu,
+      token_indices_gpu,
+      options.is_enabled_mixed_types_as_string() || options.is_enabled_prune_columns(),
+      stream,
+      cudf::get_current_device_resource_ref());
   }();  // IILE used to free memory of token data.
 #ifdef NJP_DEBUG_PRINT
   auto h_input = cudf::detail::make_host_vector_async(d_input, stream);
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index 97d5884fef1..2d435dc8e1a 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -17,6 +17,7 @@
 #include "io/fst/lookup_tables.cuh"
 
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -25,8 +26,17 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
-
+#include <rmm/exec_policy.hpp>
+
+#include <cub/device/device_copy.cuh>
+#include <cuda/atomic>
+#include <thrust/binary_search.h>
+#include <thrust/distance.h>
+#include <thrust/gather.h>
+#include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/remove.h>
 
 #include <cstdlib>
 #include <string>
@@ -215,14 +225,6 @@ std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
  *        |   state is necessary to process escaped double-quote characters. Without this
  *        |   state, whitespaces following escaped double quotes inside strings may be removed.
  *
- * NOTE: An important case NOT handled by this FST is that of whitespace following newline
- * characters within a string. Consider the following example
- * Input:           {"a":"x\n y"}
- * FST output:      {"a":"x\ny"}
- * Expected output: {"a":"x\n y"}
- * Such strings are not part of the JSON standard (characters allowed within quotes should
- * have ASCII at least 0x20 i.e. space character and above) but may be encountered while
- * reading JSON files
  */
 enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
 // Aliases for readability of the transition table
@@ -255,17 +257,17 @@ struct TransduceToNormalizedWS {
     //      Let the alphabet set be Sigma
     // ---------------------------------------
     // ---------- NON-SPECIAL CASES: ----------
-    //      Output symbol same as input symbol <s>
+    //    Input symbol translates to output symbol
     // state | read_symbol <s>  -> output_symbol <s>
-    // DQS   | Sigma            -> Sigma
-    // OOS   | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
-    // DEC   | Sigma            -> Sigma
+    // DQS   | Sigma            -> <nop>
+    // OOS   | Sigma\{<SPC>,\t} -> <nop>
+    // DEC   | Sigma            -> <nop>
     // ---------- SPECIAL CASES: --------------
-    //    Input symbol translates to output symbol
-    // OOS   | {<SPC>}          -> <nop>
-    // OOS   | {\t}             -> <nop>
+    //      Output symbol same as input symbol <s>
+    // OOS   | {<SPC>}          -> {<SPC>}
+    // OOS   | {\t}             -> {\t}
 
-    // Case when read symbol is a space or tab but is unquoted
+    // Case when read symbol is not an unquoted space or tab
     // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function
     // However, since there is no output in this case i.e. the count returned by
     // operator()(state_id, match_id, read_symbol) is zero, this function is never called.
@@ -287,8 +289,8 @@ struct TransduceToNormalizedWS {
                                                  SymbolT const read_symbol) const
   {
     // Case when read symbol is a space or tab but is unquoted
-    if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
-        state_id == static_cast<StateT>(dfa_states::TT_OOS)) {
+    if (!(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
+          state_id == static_cast<StateT>(dfa_states::TT_OOS))) {
       return 0;
     }
     return 1;
@@ -328,33 +330,126 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& inda
   std::swap(indata, outdata);
 }
 
-void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
-                          rmm::cuda_stream_view stream,
-                          rmm::device_async_resource_ref mr)
+std::
+  tuple<rmm::device_uvector<char>, rmm::device_uvector<size_type>, rmm::device_uvector<size_type>>
+  normalize_whitespace(device_span<char const> d_input,
+                       device_span<size_type const> col_offsets,
+                       device_span<size_type const> col_lengths,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr)
 {
-  CUDF_FUNC_RANGE();
-  static constexpr std::int32_t min_out = 0;
-  static constexpr std::int32_t max_out = 2;
+  /*
+   * Algorithm:
+    1. Create a single buffer by concatenating the rows of the string column. Create segment offsets
+   and lengths array for concatenated buffer
+    2. Run a whitespace normalization FST that performs NOP for non-whitespace and quoted
+   whitespace characters, and outputs indices of unquoted whitespace characters
+    3. Update segment lengths based on the number of output indices between segment offsets
+    4. Remove characters at output indices from concatenated buffer.
+    5. Return updated buffer, segment lengths and updated segment offsets
+   */
+  auto inbuf_lengths = cudf::detail::make_device_uvector_async(
+    col_lengths, stream, cudf::get_current_device_resource_ref());
+  size_t inbuf_lengths_size = inbuf_lengths.size();
+  size_type inbuf_size =
+    thrust::reduce(rmm::exec_policy_nosync(stream), inbuf_lengths.begin(), inbuf_lengths.end());
+  rmm::device_uvector<char> inbuf(inbuf_size, stream);
+  rmm::device_uvector<size_type> inbuf_offsets(inbuf_lengths_size, stream);
+  thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
+                         inbuf_lengths.begin(),
+                         inbuf_lengths.end(),
+                         inbuf_offsets.begin(),
+                         0);
+
+  auto input_it = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    cuda::proclaim_return_type<char const*>(
+      [d_input = d_input.begin(), col_offsets = col_offsets.begin()] __device__(
+        size_t i) -> char const* { return &d_input[col_offsets[i]]; }));
+  auto output_it = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    cuda::proclaim_return_type<char*>(
+      [inbuf = inbuf.begin(), inbuf_offsets = inbuf_offsets.cbegin()] __device__(
+        size_t i) -> char* { return &inbuf[inbuf_offsets[i]]; }));
+
+  {
+    // cub device batched copy
+    size_t temp_storage_bytes = 0;
+    cub::DeviceCopy::Batched(nullptr,
+                             temp_storage_bytes,
+                             input_it,
+                             output_it,
+                             inbuf_lengths.begin(),
+                             inbuf_lengths_size,
+                             stream.value());
+    rmm::device_buffer temp_storage(temp_storage_bytes, stream);
+    cub::DeviceCopy::Batched(temp_storage.data(),
+                             temp_storage_bytes,
+                             input_it,
+                             output_it,
+                             inbuf_lengths.begin(),
+                             inbuf_lengths_size,
+                             stream.value());
+  }
+
+  // whitespace normalization : get the indices of the unquoted whitespace characters
   auto parser =
     fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
                           fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
-                          fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
+                          fst::detail::make_translation_functor<SymbolT, 0, 2>(
                             normalize_whitespace::TransduceToNormalizedWS{}),
                           stream);
 
-  rmm::device_buffer outbuf(indata.size(), stream, mr);
-  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(reinterpret_cast<SymbolT const*>(indata.data()),
-                   static_cast<SymbolOffsetT>(indata.size()),
-                   static_cast<SymbolT*>(outbuf.data()),
+  rmm::device_uvector<size_type> outbuf_indices(inbuf.size(), stream, mr);
+  rmm::device_scalar<SymbolOffsetT> outbuf_indices_size(stream, mr);
+  parser.Transduce(inbuf.data(),
+                   static_cast<SymbolOffsetT>(inbuf.size()),
                    thrust::make_discard_iterator(),
-                   outbuf_size.data(),
+                   outbuf_indices.data(),
+                   outbuf_indices_size.data(),
                    normalize_whitespace::start_state,
                    stream);
 
-  outbuf.resize(outbuf_size.value(stream), stream);
-  datasource::owning_buffer<rmm::device_buffer> outdata(std::move(outbuf));
-  std::swap(indata, outdata);
+  auto const num_deletions = outbuf_indices_size.value(stream);
+  outbuf_indices.resize(num_deletions, stream);
+
+  // now these indices need to be removed
+  // TODO: is there a better way to do this?
+  thrust::for_each(
+    rmm::exec_policy_nosync(stream),
+    outbuf_indices.begin(),
+    outbuf_indices.end(),
+    [inbuf_offsets_begin = inbuf_offsets.begin(),
+     inbuf_offsets_end   = inbuf_offsets.end(),
+     inbuf_lengths       = inbuf_lengths.begin()] __device__(size_type idx) {
+      auto it  = thrust::upper_bound(thrust::seq, inbuf_offsets_begin, inbuf_offsets_end, idx);
+      auto pos = thrust::distance(inbuf_offsets_begin, it) - 1;
+      cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{*(inbuf_lengths + pos)};
+      ref.fetch_add(-1, cuda::std::memory_order_relaxed);
+    });
+
+  auto stencil = cudf::detail::make_zeroed_device_uvector_async<bool>(
+    static_cast<std::size_t>(inbuf_size), stream, cudf::get_current_device_resource_ref());
+  thrust::scatter(rmm::exec_policy_nosync(stream),
+                  thrust::make_constant_iterator(true),
+                  thrust::make_constant_iterator(true) + num_deletions,
+                  outbuf_indices.begin(),
+                  stencil.begin());
+  thrust::remove_if(rmm::exec_policy_nosync(stream),
+                    inbuf.begin(),
+                    inbuf.end(),
+                    stencil.begin(),
+                    thrust::identity<int>());
+  inbuf.resize(inbuf_size - num_deletions, stream);
+
+  thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
+                         inbuf_lengths.begin(),
+                         inbuf_lengths.end(),
+                         inbuf_offsets.begin(),
+                         0);
+
+  stream.synchronize();
+  return std::tuple{std::move(inbuf), std::move(inbuf_offsets), std::move(inbuf_lengths)};
 }
 
 }  // namespace detail
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 4e513d3495c..1c15e147b13 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -2079,10 +2079,12 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
 {
   auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'};
 
-  parse_opts.dayfirst   = options.is_enabled_dayfirst();
-  parse_opts.keepquotes = options.is_enabled_keep_quotes();
-  parse_opts.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
-  parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
+  parse_opts.dayfirst              = options.is_enabled_dayfirst();
+  parse_opts.keepquotes            = options.is_enabled_keep_quotes();
+  parse_opts.normalize_whitespace  = options.is_enabled_normalize_whitespace();
+  parse_opts.mixed_types_as_string = options.is_enabled_mixed_types_as_string();
+  parse_opts.trie_true             = cudf::detail::create_serialized_trie({"true"}, stream);
+  parse_opts.trie_false            = cudf::detail::create_serialized_trie({"false"}, stream);
   std::vector<std::string> na_values{"", "null"};
   na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end());
   parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream);
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index bd82b040359..99a5b17bce8 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -232,12 +232,6 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
     normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref());
   }
 
-  // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
-  // enabled, invoke pre-processing FST
-  if (reader_opts.is_enabled_normalize_whitespace()) {
-    normalize_whitespace(bufview, stream, cudf::get_current_device_resource_ref());
-  }
-
   auto buffer =
     cudf::device_span<char const>(reinterpret_cast<char const*>(bufview.data()), bufview.size());
   stream.synchronize();
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index a3f91f6859b..9ed2929a70e 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -893,7 +893,7 @@ __device__ void gpuDecodeLevels(page_state_s* s,
 {
   bool has_repetition = s->col.max_level[level_type::REPETITION] > 0;
 
-  constexpr int batch_size = 32;
+  constexpr int batch_size = cudf::detail::warp_size;
   int cur_leaf_count       = target_leaf_count;
   while (s->error == 0 && s->nz_count < target_leaf_count &&
          s->input_value_count < s->num_input_values) {
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 5d10472b0ae..7c985643887 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -203,10 +203,9 @@ struct SchemaElement {
   bool operator==(SchemaElement const& other) const
   {
     return type == other.type && converted_type == other.converted_type &&
-           type_length == other.type_length && repetition_type == other.repetition_type &&
-           name == other.name && num_children == other.num_children &&
-           decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision &&
-           field_id == other.field_id;
+           type_length == other.type_length && name == other.name &&
+           num_children == other.num_children && decimal_scale == other.decimal_scale &&
+           decimal_precision == other.decimal_precision && field_id == other.field_id;
   }
 
   // the parquet format is a little squishy when it comes to interpreting
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 125d35f6499..1390339c1ae 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -400,7 +400,8 @@ struct ColumnChunkDesc {
                            int32_t src_col_schema_,
                            column_chunk_info const* chunk_info_,
                            float list_bytes_per_row_est_,
-                           bool strings_to_categorical_)
+                           bool strings_to_categorical_,
+                           int32_t src_file_idx_)
     : compressed_data(compressed_data_),
       compressed_size(compressed_size_),
       num_values(num_values_),
@@ -419,7 +420,8 @@ struct ColumnChunkDesc {
       src_col_schema(src_col_schema_),
       h_chunk_info(chunk_info_),
       list_bytes_per_row_est(list_bytes_per_row_est_),
-      is_strings_to_cat(strings_to_categorical_)
+      is_strings_to_cat(strings_to_categorical_),
+      src_file_idx(src_file_idx_)
 
   {
   }
@@ -456,6 +458,7 @@ struct ColumnChunkDesc {
 
   bool is_strings_to_cat{};    // convert strings to hashes
   bool is_large_string_col{};  // `true` if string data uses 64-bit offsets
+  int32_t src_file_idx{};      // source file index
 };
 
 /**
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 245e1829c72..c588fedb85c 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -1511,10 +1511,13 @@ void reader::impl::create_global_chunk_info()
     std::transform(
       _input_columns.begin(), _input_columns.end(), column_mapping.begin(), [&](auto const& col) {
         // translate schema_idx into something we can use for the page indexes
-        if (auto it = std::find_if(
-              columns.begin(),
-              columns.end(),
-              [&col](auto const& col_chunk) { return col_chunk.schema_idx == col.schema_idx; });
+        if (auto it = std::find_if(columns.begin(),
+                                   columns.end(),
+                                   [&](auto const& col_chunk) {
+                                     return col_chunk.schema_idx ==
+                                            _metadata->map_schema_index(col.schema_idx,
+                                                                        rg.source_index);
+                                   });
             it != columns.end()) {
           return std::distance(columns.begin(), it);
         }
@@ -1535,7 +1538,8 @@ void reader::impl::create_global_chunk_info()
       auto col = _input_columns[i];
       // look up metadata
       auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx);
-      auto& schema   = _metadata->get_schema(col.schema_idx);
+      auto& schema   = _metadata->get_schema(
+        _metadata->map_schema_index(col.schema_idx, rg.source_index), rg.source_index);
 
       auto [clock_rate, logical_type] =
         conversion_info(to_type_id(schema, _strings_to_categorical, _options.timestamp_type.id()),
@@ -1574,9 +1578,9 @@ void reader::impl::create_global_chunk_info()
                                        col.schema_idx,
                                        chunk_info,
                                        list_bytes_per_row_est,
-                                       schema.type == BYTE_ARRAY and _strings_to_categorical));
+                                       schema.type == BYTE_ARRAY and _strings_to_categorical,
+                                       rg.source_index));
     }
-
     // Adjust for skip_rows when updating the remaining rows after the first group
     remaining_rows -=
       (skip_rows) ? std::min<int>(rg.start_row + row_group.num_rows - skip_rows, remaining_rows)
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 8b5678f202b..6d566b5815e 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -423,8 +423,13 @@ void aggregate_reader_metadata::column_info_for_row_group(row_group_info& rg_inf
   std::vector<column_chunk_info> chunks(rg.columns.size());
 
   for (size_t col_idx = 0; col_idx < rg.columns.size(); col_idx++) {
-    auto const& col_chunk    = rg.columns[col_idx];
-    auto& schema             = get_schema(col_chunk.schema_idx);
+    auto const& col_chunk = rg.columns[col_idx];
+    auto const is_schema_idx_mapped =
+      is_schema_index_mapped(col_chunk.schema_idx, rg_info.source_index);
+    auto const mapped_schema_idx = is_schema_idx_mapped
+                                     ? map_schema_index(col_chunk.schema_idx, rg_info.source_index)
+                                     : col_chunk.schema_idx;
+    auto& schema = get_schema(mapped_schema_idx, is_schema_idx_mapped ? rg_info.source_index : 0);
     auto const max_def_level = schema.max_definition_level;
     auto const max_rep_level = schema.max_repetition_level;
 
@@ -559,22 +564,40 @@ aggregate_reader_metadata::aggregate_reader_metadata(
     num_rows(calc_num_rows()),
     num_row_groups(calc_num_row_groups())
 {
-  // Validate that all sources have the same schema unless we are reading select columns
-  // from mismatched sources, in which case, we will only check the projected columns later.
-  if (per_file_metadata.size() > 1 and not has_cols_from_mismatched_srcs) {
-    auto const& first_meta = per_file_metadata.front();
+  if (per_file_metadata.size() > 1) {
+    auto& first_meta = per_file_metadata.front();
     auto const num_cols =
       first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0;
-    auto const& schema = first_meta.schema;
-
-    // Verify that the input files have matching numbers of columns and schema.
-    for (auto const& pfm : per_file_metadata) {
-      if (pfm.row_groups.size() > 0) {
-        CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(),
-                     "All sources must have the same number of columns");
+    auto& schema = first_meta.schema;
+
+    // Validate that all sources have the same schema unless we are reading select columns
+    // from mismatched sources, in which case, we will only check the projected columns later.
+    if (not has_cols_from_mismatched_srcs) {
+      // Verify that the input files have matching numbers of columns and schema.
+      for (auto const& pfm : per_file_metadata) {
+        if (pfm.row_groups.size() > 0) {
+          CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(),
+                       "All sources must have the same number of columns");
+        }
+        CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema");
       }
-      CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema");
     }
+
+    // Mark the column schema in the first (default) source as nullable if it is nullable in any of
+    // the input sources. This avoids recomputing this within build_column() and
+    // populate_metadata().
+    std::for_each(
+      thrust::make_counting_iterator(static_cast<size_t>(1)),
+      thrust::make_counting_iterator(schema.size()),
+      [&](auto const schema_idx) {
+        if (schema[schema_idx].repetition_type == REQUIRED and
+            std::any_of(
+              per_file_metadata.begin() + 1, per_file_metadata.end(), [&](auto const& pfm) {
+                return pfm.schema[schema_idx].repetition_type != REQUIRED;
+              })) {
+          schema[schema_idx].repetition_type = OPTIONAL;
+        }
+      });
   }
 
   // Collect and apply arrow:schema from Parquet's key value metadata section
@@ -884,15 +907,8 @@ ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_t
                                                                           size_type src_idx,
                                                                           int schema_idx) const
 {
-  // schema_idx_maps will only have > 0 size when we are reading matching column projection from
-  // mismatched Parquet sources.
-  if (src_idx and not schema_idx_maps.empty()) {
-    auto const& schema_idx_map = schema_idx_maps[src_idx - 1];
-    CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(),
-                 "Unmapped schema index encountered in the specified source tree",
-                 std::range_error);
-    schema_idx = schema_idx_map.at(schema_idx);
-  }
+  // Map schema index to the provided source file index
+  schema_idx = map_schema_index(schema_idx, src_idx);
 
   auto col =
     std::find_if(per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(),
@@ -924,6 +940,46 @@ aggregate_reader_metadata::get_rowgroup_metadata() const
   return rg_metadata;
 }
 
+bool aggregate_reader_metadata::is_schema_index_mapped(int schema_idx, int pfm_idx) const
+{
+  // Check if schema_idx or pfm_idx is invalid
+  CUDF_EXPECTS(
+    schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast<int>(per_file_metadata.size()),
+    "Parquet reader encountered an invalid schema_idx or pfm_idx",
+    std::out_of_range);
+
+  // True if root index requested or zeroth file index or schema_idx maps doesn't exist. (i.e.
+  // schemas are identical).
+  if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return true; }
+
+  // Check if mapped
+  auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1];
+  return schema_idx_map.find(schema_idx) != schema_idx_map.end();
+}
+
+int aggregate_reader_metadata::map_schema_index(int schema_idx, int pfm_idx) const
+{
+  // Check if schema_idx or pfm_idx is invalid
+  CUDF_EXPECTS(
+    schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast<int>(per_file_metadata.size()),
+    "Parquet reader encountered an invalid schema_idx or pfm_idx",
+    std::out_of_range);
+
+  // Check if pfm_idx is zero or root index requested or schema_idx_maps doesn't exist (i.e.
+  // schemas are identical).
+  if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return schema_idx; }
+
+  // schema_idx_maps will only have > 0 size when we are reading matching column projection from
+  // mismatched Parquet sources.
+  auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1];
+  CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(),
+               "Unmapped schema index encountered in the specified source tree",
+               std::out_of_range);
+
+  // Return the mapped schema idx.
+  return schema_idx_map.at(schema_idx);
+}
+
 std::string aggregate_reader_metadata::get_pandas_index() const
 {
   // Assumes that all input files have the same metadata
@@ -1185,8 +1241,8 @@ aggregate_reader_metadata::select_columns(
   // Compares two schema elements to be equal except their number of children
   auto const equal_to_except_num_children = [](SchemaElement const& lhs, SchemaElement const& rhs) {
     return lhs.type == rhs.type and lhs.converted_type == rhs.converted_type and
-           lhs.type_length == rhs.type_length and lhs.repetition_type == rhs.repetition_type and
-           lhs.name == rhs.name and lhs.decimal_scale == rhs.decimal_scale and
+           lhs.type_length == rhs.type_length and lhs.name == rhs.name and
+           lhs.decimal_scale == rhs.decimal_scale and
            lhs.decimal_precision == rhs.decimal_precision and lhs.field_id == rhs.field_id;
   };
 
@@ -1209,6 +1265,11 @@ aggregate_reader_metadata::select_columns(
                    "the selected path",
                    std::invalid_argument);
 
+      // Get the schema_idx_map for this data source (pfm)
+      auto& schema_idx_map = schema_idx_maps[pfm_idx - 1];
+      // Map the schema index from 0th tree (src) to the one in the current (dst) tree.
+      schema_idx_map[src_schema_idx] = dst_schema_idx;
+
       // If src_schema_elem is a stub, it does not exist in the column_name_info and column_buffer
       // hierarchy. So continue on with mapping.
       if (src_schema_elem.is_stub()) {
@@ -1262,15 +1323,6 @@ aggregate_reader_metadata::select_columns(
                        pfm_idx);
           });
       }
-
-      // We're at a leaf and this is an input column (one with actual data stored) so map it.
-      if (src_schema_elem.num_children == 0) {
-        // Get the schema_idx_map for this data source (pfm)
-        auto& schema_idx_map = schema_idx_maps[pfm_idx - 1];
-
-        // Map the schema index from 0th tree (src) to the one in the current (dst) tree.
-        schema_idx_map[src_schema_idx] = dst_schema_idx;
-      }
     };
 
   std::vector<int> output_column_schemas;
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 6f2863136b2..6487c92f48f 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -234,6 +234,26 @@ class aggregate_reader_metadata {
 
   [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; }
 
+  /**
+   * @brief Checks if a schema index from 0th source is mapped to the specified file index
+   *
+   * @param schema_idx The index of the SchemaElement in the zeroth file.
+   * @param pfm_idx The index of the file (per_file_metadata) to check mappings for.
+   *
+   * @return True if schema index is mapped
+   */
+  [[nodiscard]] bool is_schema_index_mapped(int schema_idx, int pfm_idx) const;
+
+  /**
+   * @brief Maps schema index from 0th source file to the specified file index
+   *
+   * @param schema_idx The index of the SchemaElement in the zeroth file.
+   * @param pfm_idx The index of the file (per_file_metadata) to map the schema_idx to.
+   *
+   * @return Mapped schema index
+   */
+  [[nodiscard]] int map_schema_index(int schema_idx, int pfm_idx) const;
+
   /**
    * @brief Extracts the schema_idx'th SchemaElement from the pfm_idx'th file
    *
@@ -248,7 +268,7 @@ class aggregate_reader_metadata {
     CUDF_EXPECTS(
       schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast<int>(per_file_metadata.size()),
       "Parquet reader encountered an invalid schema_idx or pfm_idx",
-      std::invalid_argument);
+      std::out_of_range);
     return per_file_metadata[pfm_idx].schema[schema_idx];
   }
 
@@ -256,7 +276,10 @@ class aggregate_reader_metadata {
   [[nodiscard]] auto&& get_key_value_metadata() && { return std::move(keyval_maps); }
 
   /**
-   * @brief Gets the concrete nesting depth of output cudf columns
+   * @brief Gets the concrete nesting depth of output cudf columns.
+   *
+   * Gets the nesting depth of the output cudf column for the given schema.
+   * The nesting depth must be equal for the given schema_index across all sources.
    *
    * @param schema_index Schema index of the input column
    *
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 52918f5bc80..8e67f233213 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -79,23 +79,30 @@ void print_pages(cudf::detail::hostdevice_vector<PageInfo>& pages, rmm::cuda_str
  * is indicated when adding new values.  This function generates the mappings of
  * the R/D levels to those start/end bounds
  *
- * @param remap Maps column schema index to the R/D remapping vectors for that column
- * @param src_col_schema The column schema to generate the new mapping for
+ * @param remap Maps column schema index to the R/D remapping vectors for that column for a
+ *              particular input source file
+ * @param src_col_schema The source column schema to generate the new mapping for
+ * @param mapped_src_col_schema Mapped column schema for src_file_idx'th file
+ * @param src_file_idx The input source file index for the column schema
  * @param md File metadata information
  */
-void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::vector<int>>>& remap,
-                               int src_col_schema,
-                               aggregate_reader_metadata const& md)
+void generate_depth_remappings(
+  std::map<std::pair<int, int>, std::pair<std::vector<int>, std::vector<int>>>& remap,
+  int const src_col_schema,
+  int const mapped_src_col_schema,
+  int const src_file_idx,
+  aggregate_reader_metadata const& md)
 {
   // already generated for this level
-  if (remap.find(src_col_schema) != remap.end()) { return; }
-  auto schema   = md.get_schema(src_col_schema);
-  int max_depth = md.get_output_nesting_depth(src_col_schema);
+  if (remap.find({src_col_schema, src_file_idx}) != remap.end()) { return; }
+  auto const& schema   = md.get_schema(mapped_src_col_schema, src_file_idx);
+  auto const max_depth = md.get_output_nesting_depth(src_col_schema);
 
-  CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(),
+  CUDF_EXPECTS(remap.find({src_col_schema, src_file_idx}) == remap.end(),
                "Attempting to remap a schema more than once");
   auto inserted =
-    remap.insert(std::pair<int, std::pair<std::vector<int>, std::vector<int>>>{src_col_schema, {}});
+    remap.insert(std::pair<std::pair<int, int>, std::pair<std::vector<int>, std::vector<int>>>{
+      {src_col_schema, src_file_idx}, {}});
   auto& depth_remap = inserted.first->second;
 
   std::vector<int>& rep_depth_remap = (depth_remap.first);
@@ -136,15 +143,15 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
     auto find_shallowest = [&](int r) {
       int shallowest = -1;
       int cur_depth  = max_depth - 1;
-      int schema_idx = src_col_schema;
+      int schema_idx = mapped_src_col_schema;
       while (schema_idx > 0) {
-        auto cur_schema = md.get_schema(schema_idx);
+        auto& cur_schema = md.get_schema(schema_idx, src_file_idx);
         if (cur_schema.max_repetition_level == r) {
           // if this is a repeated field, map it one level deeper
           shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth;
         }
         // if it's one-level encoding list
-        else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx))) {
+        else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx, src_file_idx))) {
           shallowest = cur_depth - 1;
         }
         if (!cur_schema.is_stub()) { cur_depth--; }
@@ -159,10 +166,10 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
   for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) {
     auto find_deepest = [&](int d) {
       SchemaElement prev_schema;
-      int schema_idx = src_col_schema;
+      int schema_idx = mapped_src_col_schema;
       int r1         = 0;
       while (schema_idx > 0) {
-        SchemaElement cur_schema = md.get_schema(schema_idx);
+        SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx);
         if (cur_schema.max_definition_level == d) {
           // if this is a repeated field, map it one level deeper
           r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level
@@ -175,10 +182,10 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
 
       // we now know R1 from above. return the deepest nesting level that has the
       // same repetition level
-      schema_idx = src_col_schema;
+      schema_idx = mapped_src_col_schema;
       int depth  = max_depth - 1;
       while (schema_idx > 0) {
-        SchemaElement cur_schema = md.get_schema(schema_idx);
+        SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx);
         if (cur_schema.max_repetition_level == r1) {
           // if this is a repeated field, map it one level deeper
           depth = cur_schema.is_stub() ? depth + 1 : depth;
@@ -783,9 +790,20 @@ void reader::impl::allocate_nesting_info()
   std::vector<int> per_page_nesting_info_size(num_columns);
   auto iter = thrust::make_counting_iterator(size_type{0});
   std::transform(iter, iter + num_columns, per_page_nesting_info_size.begin(), [&](size_type i) {
+    // Schema index of the current input column
     auto const schema_idx = _input_columns[i].schema_idx;
-    auto const& schema    = _metadata->get_schema(schema_idx);
-    return max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx));
+    // Get the max_definition_level of this column across all sources.
+    auto max_definition_level = _metadata->get_schema(schema_idx).max_definition_level + 1;
+    std::for_each(thrust::make_counting_iterator(static_cast<size_t>(1)),
+                  thrust::make_counting_iterator(_sources.size()),
+                  [&](auto const src_file_idx) {
+                    auto const& schema = _metadata->get_schema(
+                      _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx);
+                    max_definition_level =
+                      std::max(max_definition_level, schema.max_definition_level + 1);
+                  });
+
+    return std::max(max_definition_level, _metadata->get_output_nesting_depth(schema_idx));
   });
 
   // compute total # of page_nesting infos needed and allocate space. doing this in one
@@ -813,6 +831,8 @@ void reader::impl::allocate_nesting_info()
         page_nesting_decode_info.device_ptr() + src_info_index;
 
       pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size[idx];
+      // Set the number of output nesting levels from the zeroth source as nesting must be
+      // identical across sources.
       pages[target_page_index + p_idx].num_output_nesting_levels =
         _metadata->get_output_nesting_depth(src_col_schema);
 
@@ -821,25 +841,36 @@ void reader::impl::allocate_nesting_info()
     target_page_index += subpass.column_page_count[idx];
   }
 
+  // Reset the target_page_index
+  target_page_index = 0;
+
   // fill in
   int nesting_info_index = 0;
-  std::map<int, std::pair<std::vector<int>, std::vector<int>>> depth_remapping;
   for (size_t idx = 0; idx < _input_columns.size(); idx++) {
     auto const src_col_schema = _input_columns[idx].schema_idx;
 
-    // schema of the input column
-    auto& schema = _metadata->get_schema(src_col_schema);
     // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc)
+    // nesting depth must be same across sources so getting it from the zeroth source is ok
     int const max_output_depth = _metadata->get_output_nesting_depth(src_col_schema);
 
+    // Map to store depths if this column has lists
+    std::map<std::pair<int, int>, std::pair<std::vector<int>, std::vector<int>>> depth_remapping;
     // if this column has lists, generate depth remapping
-    std::map<int, std::pair<std::vector<int>, std::vector<int>>> depth_remapping;
-    if (schema.max_repetition_level > 0) {
-      generate_depth_remappings(depth_remapping, src_col_schema, *_metadata);
-    }
+    std::for_each(
+      thrust::make_counting_iterator(static_cast<size_t>(0)),
+      thrust::make_counting_iterator(_sources.size()),
+      [&](auto const src_file_idx) {
+        auto const mapped_schema_idx = _metadata->map_schema_index(src_col_schema, src_file_idx);
+        if (_metadata->get_schema(mapped_schema_idx, src_file_idx).max_repetition_level > 0) {
+          generate_depth_remappings(
+            depth_remapping, src_col_schema, mapped_schema_idx, src_file_idx, *_metadata);
+        }
+      });
 
     // fill in host-side nesting info
-    int schema_idx  = src_col_schema;
+    int schema_idx = src_col_schema;
+    // This is okay as we only use this to check stubness of cur_schema and
+    // to get its parent's indices, both of which are one to one mapped.
     auto cur_schema = _metadata->get_schema(schema_idx);
     int cur_depth   = max_output_depth - 1;
     while (schema_idx > 0) {
@@ -848,6 +879,9 @@ void reader::impl::allocate_nesting_info()
       if (!cur_schema.is_stub()) {
         // initialize each page within the chunk
         for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) {
+          // Source file index for the current page.
+          auto const src_file_idx =
+            pass.chunks[pages[target_page_index + p_idx].chunk_idx].src_file_idx;
           PageNestingInfo* pni =
             &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])];
 
@@ -855,9 +889,11 @@ void reader::impl::allocate_nesting_info()
             &page_nesting_decode_info[nesting_info_index +
                                       (p_idx * per_page_nesting_info_size[idx])];
 
+          auto const mapped_src_col_schema =
+            _metadata->map_schema_index(src_col_schema, src_file_idx);
           // if we have lists, set our start and end depth remappings
-          if (schema.max_repetition_level > 0) {
-            auto remap = depth_remapping.find(src_col_schema);
+          if (_metadata->get_schema(mapped_src_col_schema, src_file_idx).max_repetition_level > 0) {
+            auto remap = depth_remapping.find({src_col_schema, src_file_idx});
             CUDF_EXPECTS(remap != depth_remapping.end(),
                          "Could not find depth remapping for schema");
             std::vector<int> const& rep_depth_remap = (remap->second.first);
@@ -871,11 +907,15 @@ void reader::impl::allocate_nesting_info()
             }
           }
 
+          // Get the schema from the current input source.
+          auto& actual_cur_schema = _metadata->get_schema(
+            _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx);
+
           // values indexed by output column index
-          nesting_info[cur_depth].max_def_level = cur_schema.max_definition_level;
+          nesting_info[cur_depth].max_def_level = actual_cur_schema.max_definition_level;
           pni[cur_depth].size                   = 0;
           pni[cur_depth].type =
-            to_type_id(cur_schema, _strings_to_categorical, _options.timestamp_type.id());
+            to_type_id(actual_cur_schema, _strings_to_categorical, _options.timestamp_type.id());
           pni[cur_depth].nullable = cur_schema.repetition_type == OPTIONAL;
         }
 
@@ -888,6 +928,8 @@ void reader::impl::allocate_nesting_info()
       cur_schema = _metadata->get_schema(schema_idx);
     }
 
+    // Offset the page and nesting info indices
+    target_page_index += subpass.column_page_count[idx];
     nesting_info_index += (per_page_nesting_info_size[idx] * subpass.column_page_count[idx]);
   }
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 81fd4ab9f82..ec05f35d405 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1819,8 +1819,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
     auto const table_size  = std::reduce(column_sizes.begin(), column_sizes.end());
     auto const avg_row_len = util::div_rounding_up_safe<size_t>(table_size, input.num_rows());
     if (avg_row_len > 0) {
-      auto const rg_frag_size = util::div_rounding_up_safe(max_row_group_size, avg_row_len);
-      max_page_fragment_size  = std::min<size_type>(rg_frag_size, max_page_fragment_size);
+      // Ensure `rg_frag_size` is not bigger than size_type::max for default max_row_group_size
+      // value (=uint64::max) to avoid a sign overflow when comparing
+      auto const rg_frag_size =
+        std::min<size_t>(std::numeric_limits<size_type>::max(),
+                         util::div_rounding_up_safe(max_row_group_size, avg_row_len));
+      // Safe comparison as rg_frag_size fits in size_type
+      max_page_fragment_size =
+        std::min<size_type>(static_cast<size_type>(rg_frag_size), max_page_fragment_size);
     }
 
     // dividing page size by average row length will tend to overshoot the desired
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index bc2722441d0..734067582f7 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -67,6 +67,8 @@ struct parse_options_view {
   bool doublequote;
   bool dayfirst;
   bool skipblanklines;
+  bool normalize_whitespace;
+  bool mixed_types_as_string;
   cudf::detail::trie_view trie_true;
   cudf::detail::trie_view trie_false;
   cudf::detail::trie_view trie_na;
@@ -85,6 +87,8 @@ struct parse_options {
   bool doublequote;
   bool dayfirst;
   bool skipblanklines;
+  bool normalize_whitespace;
+  bool mixed_types_as_string;
   cudf::detail::optional_trie trie_true;
   cudf::detail::optional_trie trie_false;
   cudf::detail::optional_trie trie_na;
@@ -111,6 +115,8 @@ struct parse_options {
             doublequote,
             dayfirst,
             skipblanklines,
+            normalize_whitespace,
+            mixed_types_as_string,
             cudf::detail::make_trie_view(trie_true),
             cudf::detail::make_trie_view(trie_false),
             cudf::detail::make_trie_view(trie_na),
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index 76cd55bf994..0d017cf1f13 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -292,33 +292,32 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
   return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr);
 }
 
-std::unique_ptr<column> make_tdigest_column_of_empty_clusters(size_type num_rows,
-                                                              rmm::cuda_stream_view stream,
-                                                              rmm::device_async_resource_ref mr)
+std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
+                                                  rmm::device_async_resource_ref mr)
 {
   auto offsets = cudf::make_fixed_width_column(
-    data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr);
+    data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                offsets->mutable_view().begin<size_type>(),
                offsets->mutable_view().end<size_type>(),
                0);
 
-  auto min_col = cudf::make_numeric_column(
-    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
+  auto min_col =
+    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                min_col->mutable_view().begin<double>(),
                min_col->mutable_view().end<double>(),
                0);
-  auto max_col = cudf::make_numeric_column(
-    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
+  auto max_col =
+    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                max_col->mutable_view().begin<double>(),
                max_col->mutable_view().end<double>(),
                0);
 
-  return make_tdigest_column(num_rows,
-                             cudf::make_empty_column(type_id::FLOAT64),
-                             cudf::make_empty_column(type_id::FLOAT64),
+  return make_tdigest_column(1,
+                             make_empty_column(type_id::FLOAT64),
+                             make_empty_column(type_id::FLOAT64),
                              std::move(offsets),
                              std::move(min_col),
                              std::move(max_col),
@@ -339,7 +338,7 @@ std::unique_ptr<column> make_tdigest_column_of_empty_clusters(size_type num_rows
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr)
 {
-  auto contents = make_tdigest_column_of_empty_clusters(1, stream, mr)->release();
+  auto contents = make_empty_tdigest_column(stream, mr)->release();
   return std::make_unique<struct_scalar>(
     std::move(*std::make_unique<table>(std::move(contents.children))), true, stream, mr);
 }
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index d591fb5c171..2dd25a7b890 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -366,8 +366,8 @@ std::unique_ptr<scalar> to_tdigest_scalar(std::unique_ptr<column>&& tdigest,
  * @param group_cluster_wl    Output.  The set of cluster weight limits for each group.
  * @param group_num_clusters  Output.  The number of output clusters for each input group.
  * @param group_cluster_offsets  Offsets per-group to the start of it's clusters
- * @param may_have_empty_clusters Whether or not there could be empty clusters. Must only be
- * set to false when there is no empty cluster, true otherwise.
+ * @param has_nulls Whether or not the input contains nulls
+ *
  */
 
 template <typename GroupInfo, typename NearestWeightFunc, typename CumulativeWeight>
@@ -379,7 +379,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
                                                 double* group_cluster_wl,
                                                 size_type* group_num_clusters,
                                                 size_type const* group_cluster_offsets,
-                                                bool may_have_empty_clusters)
+                                                bool has_nulls)
 {
   int const tid = threadIdx.x + blockIdx.x * blockDim.x;
 
@@ -399,12 +399,11 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
   // a group with nothing in it.
   group_num_clusters[group_index] = 0;
   if (total_weight <= 0) {
-    // If the input contains empty clusters, we can potentially have a group that also generates
-    // empty clusters because -all- of the input values are null or empty cluster. In that case, the
-    // `reduce_by_key` call in the tdigest generation step will need a location to store the unused
-    // reduction value for that group of nulls and empty clusters. These "stubs" will be
-    // postprocessed out afterwards.
-    if (may_have_empty_clusters) { group_num_clusters[group_index] = 1; }
+    // if the input contains nulls we can potentially have a group that generates no
+    // clusters because -all- of the input values are null.  in that case, the reduce_by_key call
+    // in the tdigest generation step will need a location to store the unused reduction value for
+    // that group of nulls. these "stubs" will be postprocessed out afterwards.
+    if (has_nulls) { group_num_clusters[group_index] = 1; }
     return;
   }
 
@@ -503,8 +502,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
  * stream that falls before our current cluster limit
  * @param group_info         A functor which returns the info for the specified group (total weight,
  * size and start offset)
- * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be
- * set to false only when there is no empty cluster.
+ * @param has_nulls          Whether or not the input data contains nulls
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory
  *
@@ -518,7 +516,7 @@ generate_group_cluster_info(int delta,
                             NearestWeight nearest_weight,
                             GroupInfo group_info,
                             CumulativeWeight cumulative_weight,
-                            bool may_have_empty_clusters,
+                            bool has_nulls,
                             rmm::cuda_stream_view stream,
                             rmm::device_async_resource_ref mr)
 {
@@ -537,7 +535,7 @@ generate_group_cluster_info(int delta,
     nullptr,
     group_num_clusters.begin(),
     nullptr,
-    may_have_empty_clusters);
+    has_nulls);
 
   // generate group cluster offsets (where the clusters for a given group start and end)
   auto group_cluster_offsets = cudf::make_numeric_column(
@@ -569,7 +567,7 @@ generate_group_cluster_info(int delta,
     group_cluster_wl.begin(),
     group_num_clusters.begin(),
     group_cluster_offsets->view().begin<size_type>(),
-    may_have_empty_clusters);
+    has_nulls);
 
   return {std::move(group_cluster_wl),
           std::move(group_cluster_offsets),
@@ -582,7 +580,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                                             std::unique_ptr<column>&& offsets,
                                             std::unique_ptr<column>&& min_col,
                                             std::unique_ptr<column>&& max_col,
-                                            bool may_have_empty_clusters,
+                                            bool has_nulls,
                                             rmm::cuda_stream_view stream,
                                             rmm::device_async_resource_ref mr)
 {
@@ -597,7 +595,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                           size_type i) { return is_stub_weight(offsets[i]) ? 1 : 0; };
 
   size_type const num_stubs = [&]() {
-    if (!may_have_empty_clusters) { return 0; }
+    if (!has_nulls) { return 0; }
     auto iter = cudf::detail::make_counting_transform_iterator(
       0, cuda::proclaim_return_type<size_type>(is_stub_digest));
     return thrust::reduce(rmm::exec_policy(stream), iter, iter + num_rows);
@@ -663,10 +661,6 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                                                     mr);
 }
 
-/**
- * @brief A functor which returns the cluster index within a group that the value at
- * the given value index falls into.
- */
 template <typename CumulativeWeight>
 struct compute_tdigests_keys_fn {
   int const delta;
@@ -712,8 +706,8 @@ struct compute_tdigests_keys_fn {
  * boundaries.
  *
  * @param delta              tdigest compression level
- * @param centroids_begin    Beginning of the range of centroids.
- * @param centroids_end      End of the range of centroids.
+ * @param values_begin       Beginning of the range of input values.
+ * @param values_end         End of the range of input values.
  * @param cumulative_weight  Functor which returns cumulative weight and group information for
  * an absolute input value index.
  * @param min_col            Column containing the minimum value per group.
@@ -721,8 +715,7 @@ struct compute_tdigests_keys_fn {
  * @param group_cluster_wl   Cluster weight limits for each group.
  * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits.
  * @param total_clusters     Total number of clusters in all groups.
- * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be
- * set to false only when there is no empty cluster.
+ * @param has_nulls          Whether or not the input contains nulls
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory
  *
@@ -738,7 +731,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
                                          rmm::device_uvector<double> const& group_cluster_wl,
                                          std::unique_ptr<column>&& group_cluster_offsets,
                                          size_type total_clusters,
-                                         bool may_have_empty_clusters,
+                                         bool has_nulls,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
@@ -757,9 +750,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
   //   double       // max
   // }
   //
-  if (total_clusters == 0) {
-    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr);
-  }
+  if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
 
   // each input group represents an individual tdigest.  within each tdigest, we want the keys
   // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall
@@ -802,7 +793,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
                              std::move(group_cluster_offsets),
                              std::move(min_col),
                              std::move(max_col),
-                             may_have_empty_clusters,
+                             has_nulls,
                              stream,
                              mr);
 }
@@ -1154,13 +1145,8 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
   auto merged =
     cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref());
 
-  auto merged_weights = merged->get_column(1).view();
-  // If there are no values, we can simply return a column that has only empty tdigests.
-  if (merged_weights.size() == 0) {
-    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(num_groups, stream, mr);
-  }
-
   // generate cumulative weights
+  auto merged_weights     = merged->get_column(1).view();
   auto cumulative_weights = cudf::make_numeric_column(
     data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream);
   auto keys = cudf::detail::make_counting_transform_iterator(
@@ -1175,10 +1161,6 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
 
   auto const delta = max_centroids;
 
-  // We do not know whether there is any empty cluster in the input without actually reading the
-  // data, which could be expensive. So, we just assume that there could be empty clusters.
-  auto const may_have_empty_clusters = true;
-
   // generate cluster info
   auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info(
     delta,
@@ -1195,7 +1177,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
       group_labels,
       group_offsets,
       {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
-    may_have_empty_clusters,
+    false,
     stream,
     mr);
 
@@ -1220,7 +1202,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
     group_cluster_wl,
     std::move(group_cluster_offsets),
     total_clusters,
-    may_have_empty_clusters,
+    false,
     stream,
     mr);
 }
@@ -1285,9 +1267,7 @@ std::unique_ptr<column> group_tdigest(column_view const& col,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  if (col.size() == 0) {
-    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr);
-  }
+  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
 
   auto const delta = max_centroids;
   return cudf::type_dispatcher(col.type(),
@@ -1313,7 +1293,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
   tdigest_column_view tdv(input);
 
   if (num_groups == 0 || input.size() == 0) {
-    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr);
+    return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr);
   }
 
   // bring group offsets back to the host
diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index adf650a4f27..7c4c89bd3fb 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -539,15 +539,26 @@ class regex_parser {
                                                          : static_cast<int32_t>(LBRA);
       case ')': return RBRA;
       case '^': {
-        _chr = is_multiline(_flags) ? chr : '\n';
+        if (is_ext_newline(_flags)) {
+          _chr = is_multiline(_flags) ? 'S' : 'N';
+        } else {
+          _chr = is_multiline(_flags) ? chr : '\n';
+        }
         return BOL;
       }
       case '$': {
-        _chr = is_multiline(_flags) ? chr : '\n';
+        if (is_ext_newline(_flags)) {
+          _chr = is_multiline(_flags) ? 'S' : 'N';
+        } else {
+          _chr = is_multiline(_flags) ? chr : '\n';
+        }
         return EOL;
       }
       case '[': return build_cclass();
-      case '.': return dot_type;
+      case '.': {
+        _chr = is_ext_newline(_flags) ? 'N' : chr;
+        return dot_type;
+      }
     }
 
     if (std::find(quantifiers.begin(), quantifiers.end(), static_cast<char>(chr)) ==
@@ -959,7 +970,7 @@ class regex_compiler {
       _prog.inst_at(inst_id).u1.cls_id = class_id;
     } else if (token == CHAR) {
       _prog.inst_at(inst_id).u1.c = yy;
-    } else if (token == BOL || token == EOL) {
+    } else if (token == BOL || token == EOL || token == ANY) {
       _prog.inst_at(inst_id).u1.c = yy;
     }
     push_and(inst_id, inst_id);
@@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags)
       case STAR: printf("   STAR next=%d", inst.u2.next_id); break;
       case PLUS: printf("   PLUS next=%d", inst.u2.next_id); break;
       case QUEST: printf("  QUEST next=%d", inst.u2.next_id); break;
-      case ANY: printf("    ANY next=%d", inst.u2.next_id); break;
+      case ANY: printf("    ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break;
       case ANYNL: printf("  ANYNL next=%d", inst.u2.next_id); break;
       case NOP: printf("    NOP next=%d", inst.u2.next_id); break;
       case BOL: {
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index 3b899e4edc1..e34a1e12015 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist()
   list2    = tmp;
 }
 
+/**
+ * @brief Check for supported new-line characters
+ *
+ * '\n, \r, \u0085, \u2028, or \u2029'
+ */
+constexpr bool is_newline(char32_t const ch)
+{
+  return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9);
+}
+
 /**
  * @brief Utility to check a specific character against this class instance.
  *
@@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
     if (checkstart) {
       auto startchar = static_cast<char_utf8>(jnk.startchar);
       switch (jnk.starttype) {
-        case BOL:
-          if (pos == 0) break;
-          if (jnk.startchar != '^') { return cuda::std::nullopt; }
+        case BOL: {
+          if (pos == 0) { break; }
+          if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; }
+          if (startchar != '\n') { break; }
           --itr;
           startchar = static_cast<char_utf8>('\n');
+          [[fallthrough]];
+        }
         case CHAR: {
           auto const find_itr = find_char(startchar, dstr, itr);
           if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; }
@@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
             id_activate = inst.u2.next_id;
             expanded    = true;
             break;
-          case BOL:
-            if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) {
+          case BOL: {
+            auto titr         = itr;
+            auto const prev_c = pos > 0 ? *(--titr) : 0;
+            if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) ||
+                ((inst.u1.c == 'S') && (is_newline(prev_c)))) {
               id_activate = inst.u2.next_id;
               expanded    = true;
             }
             break;
-          case EOL:
+          }
+          case EOL: {
             // after the last character OR:
             // - for MULTILINE, if current character is new-line
             // - for non-MULTILINE, the very last character of the string can also be a new-line
+            bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n');
             if (last_character ||
-                ((c == '\n') && (inst.u1.c != 'Z') &&
-                 ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) {
+                (nl && (inst.u1.c != 'Z') &&
+                 ((inst.u1.c == '$' || inst.u1.c == 'S') ||
+                  (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) {
               id_activate = inst.u2.next_id;
               expanded    = true;
             }
             break;
+          }
           case BOW:
           case NBOW: {
-            auto const prev_c       = pos > 0 ? dstr[pos - 1] : 0;
+            auto titr               = itr;
+            auto const prev_c       = pos > 0 ? *(--titr) : 0;
             auto const word_class   = reclass_device{CCLASS_W};
             bool const curr_is_word = word_class.is_match(c, _codepoint_flags);
             bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags);
@@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
         case CHAR:
           if (inst.u1.c == c) id_activate = inst.u2.next_id;
           break;
-        case ANY:
-          if (c != '\n') id_activate = inst.u2.next_id;
-          break;
+        case ANY: {
+          if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; }
+          [[fallthrough]];
+        }
         case ANYNL: id_activate = inst.u2.next_id; break;
         case NCCLASS:
         case CCLASS: {
diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index 978a844c476..4c39fc96397 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -122,26 +122,28 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings,
       break;
     }
     size_type const cc = (itr < end) && is_begin_utf8_char(*itr);
-    size_type const bc = (itr < end);
+    size_type const bc = (itr < end) ? bytes_in_utf8_byte(*itr) : 0;
     char_count += cg::reduce(warp, cc, cg::plus<int>());
     byte_count += cg::reduce(warp, bc, cg::plus<int>());
     itr += cudf::detail::warp_size;
   }
 
+  __syncwarp();
+
   if (warp.thread_rank() == 0) {
     if (start >= char_count) {
       d_output[str_idx] = string_index_pair{"", 0};
       return;
     }
 
-    // we are just below start/stop and must now increment up to it from here
+    // we are just below start/stop and must now increment up to them from here
     auto first_byte = start_counts.second;
     if (start_counts.first < start) {
       auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte);
       first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first));
     }
 
-    stop           = max(stop, char_count);
+    stop           = min(stop, char_count);
     auto last_byte = stop_counts.second;
     if (stop_counts.first < stop) {
       auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte);
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 605582f28a6..a03a34f5fa7 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -25,6 +25,8 @@
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/hashing/detail/murmurhash3_x64_128.cuh>
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
+#include <cudf/lists/list_device_view.cuh>
+#include <cudf/lists/lists_column_device_view.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -151,15 +153,111 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                           mr);
   auto d_hashes = hashes->mutable_view().data<hash_value_type>();
 
-  constexpr int block_size = 256;
-  cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
+  constexpr cudf::thread_index_type block_size = 256;
+  cudf::detail::grid_1d grid{
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
   minhash_kernel<HashFunction><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     *d_strings, seeds, width, d_hashes);
 
   return hashes;
 }
 
-std::unique_ptr<cudf::column> build_list_result(cudf::strings_column_view const& input,
+/**
+ * @brief Compute the minhash of each list row of strings for each seed
+ *
+ * This is a warp-per-row algorithm where parallel threads within a warp
+ * work on strings in a single list row.
+ *
+ * @tparam HashFunction hash function to use on each string
+ *
+ * @param d_input List of strings to process
+ * @param seeds Seeds for hashing each string
+ * @param d_hashes Minhash output values (one per row)
+ */
+template <
+  typename HashFunction,
+  typename hash_value_type = std::
+    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
+CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input,
+                                     cudf::device_span<hash_value_type const> seeds,
+                                     hash_value_type* d_hashes)
+{
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const row_idx = idx / cudf::detail::warp_size;
+
+  if (row_idx >= d_input.size()) { return; }
+  if (d_input.is_null(row_idx)) { return; }
+
+  auto const d_row    = cudf::list_device_view(d_input, row_idx);
+  auto const d_output = d_hashes + (row_idx * seeds.size());
+
+  // initialize hashes output for this row
+  auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);
+  if (lane_idx == 0) {
+    auto const init = d_row.size() == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
+    thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init);
+  }
+  __syncwarp();
+
+  // each lane hashes a string from the input row
+  for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) {
+    auto const hash_str =
+      d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element<cudf::string_view>(str_idx);
+    for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) {
+      auto const hasher = HashFunction(seeds[seed_idx]);
+      // hash string and store the min value
+      hash_value_type hv;
+      if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
+        hv = hasher(hash_str);
+      } else {
+        // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values
+        // but only uses the first uint64 value as requested by the LLM team.
+        hv = thrust::get<0>(hasher(hash_str));
+      }
+      cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{*(d_output + seed_idx)};
+      ref.fetch_min(hv, cuda::std::memory_order_relaxed);
+    }
+  }
+}
+
+template <
+  typename HashFunction,
+  typename hash_value_type = std::
+    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
+std::unique_ptr<cudf::column> word_minhash_fn(cudf::lists_column_view const& input,
+                                              cudf::device_span<hash_value_type const> seeds,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument);
+  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * seeds.size()) <
+                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+               "The number of seeds times the number of input rows exceeds the column size limit",
+               std::overflow_error);
+
+  auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
+  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
+
+  auto const d_input = cudf::column_device_view::create(input.parent(), stream);
+
+  auto hashes   = cudf::make_numeric_column(output_type,
+                                          input.size() * static_cast<cudf::size_type>(seeds.size()),
+                                          cudf::mask_state::UNALLOCATED,
+                                          stream,
+                                          mr);
+  auto d_hashes = hashes->mutable_view().data<hash_value_type>();
+  auto lcdv     = cudf::detail::lists_column_device_view(*d_input);
+
+  constexpr cudf::thread_index_type block_size = 256;
+  cudf::detail::grid_1d grid{
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
+  minhash_word_kernel<HashFunction>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(lcdv, seeds, d_hashes);
+
+  return hashes;
+}
+
+std::unique_ptr<cudf::column> build_list_result(cudf::column_view const& input,
                                                 std::unique_ptr<cudf::column>&& hashes,
                                                 cudf::size_type seeds_size,
                                                 rmm::cuda_stream_view stream,
@@ -176,7 +274,7 @@ std::unique_ptr<cudf::column> build_list_result(cudf::strings_column_view const&
                                   std::move(offsets),
                                   std::move(hashes),
                                   input.null_count(),
-                                  cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                  cudf::detail::copy_bitmask(input, stream, mr),
                                   stream,
                                   mr);
   // expect this condition to be very rare
@@ -208,7 +306,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
   auto hashes        = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
-  return build_list_result(input, std::move(hashes), seeds.size(), stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
@@ -232,7 +330,27 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
   auto hashes        = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
-  return build_list_result(input, std::move(hashes), seeds.size(), stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
+}
+
+std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
+                                           cudf::device_span<uint32_t const> seeds,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+{
+  using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
+  auto hashes        = detail::word_minhash_fn<HashFunction>(input, seeds, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
+}
+
+std::unique_ptr<cudf::column> word_minhash64(cudf::lists_column_view const& input,
+                                             cudf::device_span<uint64_t const> seeds,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
+  auto hashes        = detail::word_minhash_fn<HashFunction>(input, seeds, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 }  // namespace detail
 
@@ -276,4 +394,21 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return detail::minhash64(input, seeds, width, stream, mr);
 }
 
+std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
+                                           cudf::device_span<uint32_t const> seeds,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::word_minhash(input, seeds, stream, mr);
+}
+
+std::unique_ptr<cudf::column> word_minhash64(cudf::lists_column_view const& input,
+                                             cudf::device_span<uint64_t const> seeds,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::word_minhash64(input, seeds, stream, mr);
+}
 }  // namespace nvtext
diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp
index 9d3a7ce5a4e..9824c472b20 100644
--- a/cpp/src/utilities/stream_pool.cpp
+++ b/cpp/src/utilities/stream_pool.cpp
@@ -132,6 +132,13 @@ struct cuda_event {
   cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); }
   virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); }
 
+  // Moveable but not copyable.
+  cuda_event(const cuda_event&)            = delete;
+  cuda_event& operator=(const cuda_event&) = delete;
+
+  cuda_event(cuda_event&&)            = default;
+  cuda_event& operator=(cuda_event&&) = default;
+
   operator cudaEvent_t() { return e_; }
 
  private:
@@ -147,11 +154,12 @@ struct cuda_event {
  */
 cudaEvent_t event_for_thread()
 {
-  thread_local std::vector<std::unique_ptr<cuda_event>> thread_events(get_num_cuda_devices());
+  // The program may crash if this function is called from the main thread and user application
+  // subsequently calls cudaDeviceReset().
+  // As a workaround, here we intentionally disable RAII and leak cudaEvent_t.
+  thread_local std::vector<cuda_event*> thread_events(get_num_cuda_devices());
   auto const device_id = get_current_cuda_device();
-  if (not thread_events[device_id.value()]) {
-    thread_events[device_id.value()] = std::make_unique<cuda_event>();
-  }
+  if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); }
   return *thread_events[device_id.value()];
 }
 
diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu
index 3780dbb1d95..baa59026b07 100644
--- a/cpp/tests/groupby/tdigest_tests.cu
+++ b/cpp/tests/groupby/tdigest_tests.cu
@@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups)
   cudf::test::fixed_width_column_wrapper<int> keys{0, 0, 0, 0, 0, 0, 0};
   int const delta = 1000;
 
-  auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto b = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta);
-  auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta);
-  auto e = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto e = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   std::vector<cudf::column_view> cols;
   cols.push_back(*a);
@@ -507,81 +507,3 @@ TEST_F(TDigestMergeTest, EmptyGroups)
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]);
 }
-
-std::unique_ptr<cudf::table> do_agg(
-  cudf::column_view key,
-  cudf::column_view val,
-  std::function<std::unique_ptr<cudf::groupby_aggregation>()> make_agg)
-{
-  std::vector<cudf::column_view> keys;
-  keys.push_back(key);
-  cudf::table_view const key_table(keys);
-
-  cudf::groupby::groupby gb(key_table);
-  std::vector<cudf::groupby::aggregation_request> requests;
-  cudf::groupby::aggregation_request req;
-  req.values = val;
-  req.aggregations.push_back(make_agg());
-  requests.push_back(std::move(req));
-
-  auto result = gb.aggregate(std::move(requests));
-
-  std::vector<std::unique_ptr<cudf::column>> result_columns;
-  for (auto&& c : result.first->release()) {
-    result_columns.push_back(std::move(c));
-  }
-
-  EXPECT_EQ(result.second.size(), 1);
-  EXPECT_EQ(result.second[0].results.size(), 1);
-  result_columns.push_back(std::move(result.second[0].results[0]));
-
-  return std::make_unique<cudf::table>(std::move(result_columns));
-}
-
-TEST_F(TDigestMergeTest, AllGroupsHaveEmptyClusters)
-{
-  // The input must be sorted by the key.
-  // See `aggregate_result_functor::operator()<aggregation::TDIGEST>` for details.
-  auto const keys      = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 0, 1, 1, 2}};
-  auto const keys_view = cudf::column_view(keys);
-  auto val_elems  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
-  auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    // All values are null
-    return false;
-  });
-  auto const vals = cudf::test::fixed_width_column_wrapper<int32_t>{
-    val_elems, val_elems + keys_view.size(), val_valids};
-
-  auto const delta = 10000;
-
-  // Compute tdigest. The result should have 3 empty clusters, one per group.
-  auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() {
-    return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
-  });
-
-  auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
-  cudf::column_view const expected_computed_keys_view{expected_computed_keys};
-  auto const expected_computed_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    expected_computed_keys_view.size(),
-    cudf::get_default_stream(),
-    rmm::mr::get_current_device_resource());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view());
-  // The computed values are nullable even though the input values are not.
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(),
-                                 compute_result->get_column(1).view());
-
-  // Merge tdigest. The result should have 3 empty clusters, one per group.
-  auto const merge_result =
-    do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() {
-      return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
-    });
-
-  auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
-  cudf::column_view const expected_merged_keys_view{expected_merged_keys};
-  auto const expected_merged_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    expected_merged_keys_view.size(),
-    cudf::get_default_stream(),
-    rmm::mr::get_current_device_resource());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view());
-}
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 960c19fce2e..48bc982d0e3 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2856,4 +2856,47 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren)
   }
 }
 
+TEST_F(JsonReaderTest, JsonDtypeSchema)
+{
+  std::string data = R"(
+    {"a": 1, "b": {"0": "abc", "1": ["a", "b"]}, "c": true}
+    {"a": 1, "b": {"0": "abc"          }, "c": false}
+    {"a": 1, "b": {"0": "lolol  "}, "c": true}
+    )";
+
+  std::map<std::string, cudf::io::schema_element> dtype_schema{{"c", {data_type{type_id::STRING}}},
+                                                               {"b", {data_type{type_id::STRING}}},
+                                                               {"a", {dtype<double>()}}};
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .dtypes(dtype_schema)
+      .prune_columns(true)
+      .lines(true);
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 3);
+  EXPECT_EQ(result.tbl->num_rows(), 3);
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT64);
+  EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING);
+  EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING);
+
+  EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+  EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+  EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+
+  // cudf::column::contents contents = result.tbl->get_column(1).release();
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), float64_wrapper{{1, 1, 1}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
+    result.tbl->get_column(1),
+    cudf::test::strings_column_wrapper({"{\"0\": \"abc\", \"1\": [\"a\", \"b\"]}",
+                                        "{\"0\": \"abc\"          }",
+                                        "{\"0\": \"lolol  \"}"}),
+    cudf::test::debug_output_level::ALL_ERRORS);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2),
+                                 cudf::test::strings_column_wrapper({"true", "false", "true"}),
+                                 cudf::test::debug_output_level::ALL_ERRORS);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu
index 6d79fdc98ef..6a3bd69de81 100644
--- a/cpp/tests/io/json/json_whitespace_normalization_test.cu
+++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu
@@ -34,129 +34,127 @@
 // Base test fixture for tests
 struct JsonWSNormalizationTest : public cudf::test::BaseFixture {};
 
-void run_test(std::string const& host_input, std::string const& expected_host_output)
-{
-  // Prepare cuda stream for data transfers & kernels
-  auto stream_view = cudf::test::get_default_stream();
-
-  auto device_input = rmm::device_buffer(
-    host_input.c_str(), host_input.size(), stream_view, cudf::get_current_device_resource_ref());
-
-  // Preprocessing FST
-  cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
-  cudf::io::json::detail::normalize_whitespace(
-    device_data, stream_view, cudf::get_current_device_resource_ref());
-
-  std::string preprocessed_host_output(device_data.size(), 0);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
-                                device_data.data(),
-                                preprocessed_host_output.size(),
-                                cudaMemcpyDeviceToHost,
-                                stream_view.value()));
-
-  stream_view.synchronize();
-  ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size());
-  CUDF_TEST_EXPECT_VECTOR_EQUAL(
-    preprocessed_host_output, expected_host_output, preprocessed_host_output.size());
-}
-
-TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces)
+TEST_F(JsonWSNormalizationTest, ReadJsonOption)
 {
-  std::string input  = R"({ "A" : "TEST" })";
-  std::string output = R"({"A":"TEST"})";
-  run_test(input, output);
-}
+  // When mixed type fields are read as strings, the table read will differ depending the
+  // value of normalize_whitespace
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_MoreSpaces)
-{
-  std::string input  = R"({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": {"c": "d"}})";
-  std::string output = R"({"a":[1,2,3,4,5,6,7,8],"b":{"c":"d"}})";
-  run_test(input, output);
-}
+  // Test input
+  std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}";
+  cudf::io::json_reader_options input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{host_input.data(), host_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(true);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesInString)
-{
-  std::string input  = R"({" a ":50})";
-  std::string output = R"({" a ":50})";
-  run_test(input, output);
-}
+  cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_NewlineInString)
-{
-  std::string input  = "{\"a\" : \"x\ny\"}\n{\"a\" : \"x\\ny\"}";
-  std::string output = "{\"a\":\"x\ny\"}\n{\"a\":\"x\\ny\"}";
-  run_test(input, output);
-}
+  // Expected table
+  std::string const expected_input = R"({ "a" : {"b":"c"}})";
+  cudf::io::json_reader_options expected_input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(false);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_Tabs)
-{
-  std::string input  = "{\"a\":\t\"b\"}";
-  std::string output = R"({"a":"b"})";
-  run_test(input, output);
+  cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
 }
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesAndTabs)
+TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows)
 {
-  std::string input  = "{\"A\" : \t\"TEST\" }";
-  std::string output = R"({"A":"TEST"})";
-  run_test(input, output);
-}
+  // When mixed type fields are read as strings, the table read will differ depending the
+  // value of normalize_whitespace
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_MultilineJSONWithSpacesAndTabs)
-{
-  std::string input =
-    "{ \"foo rapids\": [1,2,3], \"bar\trapids\": 123 }\n\t{ \"foo rapids\": { \"a\": 1 }, "
-    "\"bar\trapids\": 456 }";
-  std::string output =
-    "{\"foo rapids\":[1,2,3],\"bar\trapids\":123}\n{\"foo rapids\":{\"a\":1},\"bar\trapids\":456}";
-  run_test(input, output);
-}
+  // Test input
+  std::string const host_input = R"(
+  { "Root": { "Key": [ { "EE": tr ue } ] } }
+  { "Root": { "Key": "abc" } }
+  { "Root": { "Key": [ { "EE": 12 34 } ] } }
+  { "Root": { "Key": [{ "YY": 1}] } }
+  { "Root": { "Key": [ { "EE": 12. 34 } ] } }
+  { "Root": { "Key": [ { "EE": "efg" } ] } }
+  )";
+  cudf::io::json_reader_options input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{host_input.data(), host_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(true)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_PureJSONExample)
-{
-  std::string input  = R"([{"a":50}, {"a" : 60}])";
-  std::string output = R"([{"a":50},{"a":60}])";
-  run_test(input, output);
-}
+  cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_NoNormalizationRequired)
-{
-  std::string input  = R"({"a\\n\r\a":50})";
-  std::string output = R"({"a\\n\r\a":50})";
-  run_test(input, output);
-}
+  // Expected table
+  std::string const expected_input = R"(
+  { "Root": { "Key": [ { "EE": tr ue } ] } }
+  { "Root": { "Key": "abc" } }
+  { "Root": { "Key": [ { "EE": 12 34 } ] } }
+  { "Root": { "Key": [{"YY":1}] } }
+  { "Root": { "Key": [ { "EE": 12. 34 } ] } }
+  { "Root": { "Key": [{"EE":"efg"}] } }
+  )";
+  cudf::io::json_reader_options expected_input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(false)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
 
-TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput)
-{
-  std::string input  = "{\"a\" : \"b }\n{ \"c \" :\t\"d\"}";
-  std::string output = "{\"a\":\"b }\n{\"c \":\"d\"}";
-  run_test(input, output);
+  cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
 }
 
-TEST_F(JsonWSNormalizationTest, ReadJsonOption)
+TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows_NoMixedType)
 {
   // When mixed type fields are read as strings, the table read will differ depending the
   // value of normalize_whitespace
 
   // Test input
-  std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}";
+  std::string const host_input = R"(
+  { "Root": { "Key": [ { "EE": tr ue } ] } }
+  { "Root": { "Key": [ { "EE": 12 34 } ] } }
+  { "Root": { "Key": [{ "YY": 1}] } }
+  { "Root": { "Key": [ { "EE": 12. 34 } ] } }
+  { "Root": { "Key": [ { "EE": "efg" }, { "YY" :   "abc" }    ] } }
+  { "Root": { "Key": [  { "YY" :   "abc" }    ] } }
+  )";
+
+  std::map<std::string, cudf::io::schema_element> dtype_schema{
+    {"Key", {cudf::data_type{cudf::type_id::STRING}}}};
+
   cudf::io::json_reader_options input_options =
     cudf::io::json_reader_options::builder(
       cudf::io::source_info{host_input.data(), host_input.size()})
+      .dtypes(dtype_schema)
       .lines(true)
-      .mixed_types_as_string(true)
-      .normalize_whitespace(true);
+      .prune_columns(true)
+      .normalize_whitespace(true)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
 
   cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options);
 
   // Expected table
-  std::string const expected_input = R"({ "a" : {"b":"c"}})";
+  std::string const expected_input = R"(
+  { "Root": { "Key": [ { "EE": tr ue } , { "YY" :    2 } ] } }
+  { "Root": { "Key": [ { "EE": 12 34 } ] } }
+  { "Root": { "Key": [{"YY":1}] } }
+  { "Root": { "Key": [ { "EE": 12. 34 } ] } }
+  { "Root": { "Key": [{"EE":"efg"},{"YY":"abc"}] } }
+  { "Root": { "Key": [{"YY":"abc"}] } }
+  )";
+
   cudf::io::json_reader_options expected_input_options =
     cudf::io::json_reader_options::builder(
       cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .dtypes(dtype_schema)
       .lines(true)
-      .mixed_types_as_string(true)
-      .normalize_whitespace(false);
+      .prune_columns(true)
+      .normalize_whitespace(false)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
 
   cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options);
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp
index 7359f0406fc..915717713df 100644
--- a/cpp/tests/quantiles/percentile_approx_test.cpp
+++ b/cpp/tests/quantiles/percentile_approx_test.cpp
@@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {};
 
 TEST_F(PercentileApproxTest, EmptyInput)
 {
-  auto empty_ = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::test::fixed_width_column_wrapper<double> percentiles{0.0, 0.25, 0.3};
 
   std::vector<cudf::column_view> input;
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index c816316d0ff..acf850c7a66 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "special_chars.h"
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count);
 }
 
+TEST_F(StringsContainsTests, SpecialNewLines)
+{
+  auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé",
+                                                   "qqq\rzzé" LINE_SEPARATOR "lll",
+                                                   "zzé",
+                                                   "",
+                                                   "zzé" PARAGRAPH_SEPARATOR,
+                                                   "abc\nzzé" NEXT_LINE});
+  auto view  = cudf::strings_column_view(input);
+
+  auto pattern = std::string("^zzé$");
+  auto prog =
+    cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE);
+  auto ml_flags = static_cast<cudf::strings::regex_flags>(cudf::strings::regex_flags::EXT_NEWLINE |
+                                                          cudf::strings::regex_flags::MULTILINE);
+  auto prog_ml  = cudf::strings::regex_program::create(pattern, ml_flags);
+
+  auto expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 0, 1, 0});
+  auto results  = cudf::strings::contains_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  expected = cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1, 0, 1, 1});
+  results  = cudf::strings::contains_re(view, *prog_ml);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 0, 1, 0});
+  results  = cudf::strings::matches_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  expected = cudf::test::fixed_width_column_wrapper<bool>({1, 0, 1, 0, 1, 0});
+  results  = cudf::strings::matches_re(view, *prog_ml);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  auto counts = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 1, 0, 1, 0});
+  results     = cudf::strings::count_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts);
+  counts  = cudf::test::fixed_width_column_wrapper<int32_t>({2, 1, 1, 0, 1, 1});
+  results = cudf::strings::count_re(view, *prog_ml);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts);
+
+  pattern  = std::string("q.*l");
+  prog     = cudf::strings::regex_program::create(pattern);
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 1, 0, 0, 0, 0});
+  results  = cudf::strings::contains_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  // inst ANY will stop matching on first 'newline' and so should not match anything here
+  prog     = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE);
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0, 0, 0, 0});
+  results  = cudf::strings::contains_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  // including the DOTALL flag accepts the newline characters
+  auto dot_flags = static_cast<cudf::strings::regex_flags>(cudf::strings::regex_flags::EXT_NEWLINE |
+                                                           cudf::strings::regex_flags::DOTALL);
+  prog           = cudf::strings::regex_program::create(pattern, dot_flags);
+  expected       = cudf::test::fixed_width_column_wrapper<bool>({0, 1, 0, 0, 0, 0});
+  results        = cudf::strings::contains_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
 TEST_F(StringsContainsTests, EndOfString)
 {
   auto input = cudf::test::strings_column_wrapper(
diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp
index b26cbd5a549..1491da758d5 100644
--- a/cpp/tests/strings/extract_tests.cpp
+++ b/cpp/tests/strings/extract_tests.cpp
@@ -14,9 +14,12 @@
  * limitations under the License.
  */
 
+#include "special_chars.h"
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/debug_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/detail/iterator.cuh>
@@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected);
 }
 
+TEST_F(StringsExtractTests, SpecialNewLines)
+{
+  auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé",
+                                                   "qqq" LINE_SEPARATOR "zzé\rlll",
+                                                   "zzé",
+                                                   "",
+                                                   "zzé" NEXT_LINE,
+                                                   "abc" PARAGRAPH_SEPARATOR "zzé\n"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto prog =
+    cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE);
+  auto results = cudf::strings::extract(view, *prog);
+  auto expected =
+    cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
+
+  auto both_flags = static_cast<cudf::strings::regex_flags>(
+    cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE);
+  auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags);
+  results      = cudf::strings::extract(view, *prog_ml);
+  expected =
+    cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
+
+  prog = cudf::strings::regex_program::create("q(q.*l)l");
+  expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""},
+                                                {0, 1, 0, 0, 0, 0});
+  results = cudf::strings::extract(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
+  // expect no matches here since the newline(s) interrupts the pattern
+  prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE);
+  expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0});
+  results  = cudf::strings::extract(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
+}
+
 TEST_F(StringsExtractTests, EmptyExtractTest)
 {
   std::vector<char const*> h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""};
diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp
index 4582dcb1e38..47606b9b3ed 100644
--- a/cpp/tests/strings/findall_tests.cpp
+++ b/cpp/tests/strings/findall_tests.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "special_chars.h"
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 
+TEST_F(StringsFindallTests, SpecialNewLines)
+{
+  auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé",
+                                                   "qqq\nzzé" PARAGRAPH_SEPARATOR "lll",
+                                                   "zzé",
+                                                   "",
+                                                   "zzé\r",
+                                                   "zzé" LINE_SEPARATOR "zzé" NEXT_LINE});
+  auto view  = cudf::strings_column_view(input);
+
+  auto prog =
+    cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE);
+  auto results = cudf::strings::findall(view, *prog);
+  using LCW    = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+
+  auto both_flags = static_cast<cudf::strings::regex_flags>(
+    cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE);
+  auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags);
+  results      = cudf::strings::findall(view, *prog_ml);
+  LCW expected_ml(
+    {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml);
+}
+
 TEST_F(StringsFindallTests, MediumRegex)
 {
   // This results in 15 regex instructions and falls in the 'medium' range.
diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp
index 8c0482653fb..9847d8d6bb5 100644
--- a/cpp/tests/strings/replace_regex_tests.cpp
+++ b/cpp/tests/strings/replace_regex_tests.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "special_chars.h"
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -245,6 +247,53 @@ TEST_F(StringsReplaceRegexTest, Multiline)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected);
 }
 
+TEST_F(StringsReplaceRegexTest, SpecialNewLines)
+{
+  auto input   = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé",
+                                                     "qqq" NEXT_LINE "zzé" NEXT_LINE "lll",
+                                                     "zzé",
+                                                     "",
+                                                     "zzé" PARAGRAPH_SEPARATOR,
+                                                     "abc\rzzé\r"});
+  auto view    = cudf::strings_column_view(input);
+  auto repl    = cudf::string_scalar("_");
+  auto pattern = std::string("^zzé$");
+  auto prog =
+    cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE);
+  auto results  = cudf::strings::replace_re(view, *prog, repl);
+  auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé",
+                                                      "qqq" NEXT_LINE "zzé" NEXT_LINE "lll",
+                                                      "_",
+                                                      "",
+                                                      "_" PARAGRAPH_SEPARATOR,
+                                                      "abc\rzzé\r"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+
+  auto both_flags = static_cast<cudf::strings::regex_flags>(
+    cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE);
+  auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags);
+  results      = cudf::strings::replace_re(view, *prog_ml, repl);
+  expected     = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_",
+                                                     "qqq" NEXT_LINE "_" NEXT_LINE "lll",
+                                                     "_",
+                                                     "",
+                                                     "_" PARAGRAPH_SEPARATOR,
+                                                     "abc\r_\r"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+
+  auto repl_template = std::string("[\\1]");
+  pattern            = std::string("(^zzé$)");
+  prog               = cudf::strings::regex_program::create(pattern, both_flags);
+  results            = cudf::strings::replace_with_backrefs(view, *prog, repl_template);
+  expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]",
+                                                 "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll",
+                                                 "[zzé]",
+                                                 "",
+                                                 "[zzé]" PARAGRAPH_SEPARATOR,
+                                                 "abc\r[zzé]\r"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+}
+
 TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest)
 {
   std::vector<char const*> h_strings{"the quick brown fox jumps over the lazy dog",
diff --git a/cpp/tests/strings/slice_tests.cpp b/cpp/tests/strings/slice_tests.cpp
index 52e439bd93f..7f7fd9d521b 100644
--- a/cpp/tests/strings/slice_tests.cpp
+++ b/cpp/tests/strings/slice_tests.cpp
@@ -268,6 +268,25 @@ TEST_F(StringsSliceTest, MaxPositions)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
+TEST_F(StringsSliceTest, MultiByteChars)
+{
+  auto input = cudf::test::strings_column_wrapper({
+    // clang-format off
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving "
+    "the following code snippet demonstrates how to use search for values in an ordered range  "
+            // this placement tests proper multi-byte chars handling  ------vvvvv
+    "it returns the last position where value could be inserted without the ééééé ordering ",
+    "algorithms execution is parallelized as determined by an execution policy; this is a 12345"
+    "continuation of previous row to make sure string boundaries are honored 012345678901234567"
+           //   v--- this one also
+    "01234567890é34567890012345678901234567890"
+    // clang-format on
+  });
+
+  auto results = cudf::strings::slice_strings(cudf::strings_column_view(input), 0);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input);
+}
+
 TEST_F(StringsSliceTest, Error)
 {
   cudf::test::strings_column_wrapper strings{"this string intentionally left blank"};
diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h
new file mode 100644
index 00000000000..0d630f6bb52
--- /dev/null
+++ b/cpp/tests/strings/special_chars.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+namespace cudf::test {
+
+// special new-line characters for use with regex_flags::EXT_NEWLINE
+#define NEXT_LINE           "\xC2\x85"
+#define LINE_SEPARATOR      "\xE2\x80\xA8"
+#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9"
+
+}  // namespace cudf::test
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index 7575a3ba846..e23f3f6e7d8 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -139,6 +139,41 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
+TEST_F(MinHashTest, WordsMinHash)
+{
+  using LCWS    = cudf::test::lists_column_wrapper<cudf::string_view>;
+  auto validity = cudf::test::iterators::null_at(1);
+
+  LCWS input(
+    {LCWS({"hello", "abcdéfgh"}),
+     LCWS{},
+     LCWS({"rapids", "moré", "test", "text"}),
+     LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})},
+    validity);
+
+  auto view = cudf::lists_column_view(input);
+
+  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2});
+  auto results = nvtext::word_minhash(view, cudf::column_view(seeds));
+  using LCW32  = cudf::test::lists_column_wrapper<uint32_t>;
+  LCW32 expected({LCW32{2069617641u, 1975382903u},
+                  LCW32{},
+                  LCW32{657297235u, 1010955999u},
+                  LCW32{644643885u, 310002789u}},
+                 validity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({11, 22});
+  auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64));
+  using LCW64    = cudf::test::lists_column_wrapper<uint64_t>;
+  LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul},
+                    LCW64{},
+                    LCW64{5331949571924938590ul, 2088583894581919741ul},
+                    LCW64{3400468157617183341ul, 2398577492366130055ul}},
+                   validity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
+}
+
 TEST_F(MinHashTest, EmptyTest)
 {
   auto input   = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
diff --git a/dependencies.yaml b/dependencies.yaml
index 483335c02ff..7a13043cc5f 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -710,7 +710,16 @@ dependencies:
               - numpy==1.23.*
               - pandas==2.0.*
               - pyarrow==14.0.0
-              - cupy==12.0.0  # ignored as pip constraint
+          - matrix:
+            packages:
+      - output_types: conda
+        matrices:
+          - matrix: {dependencies: "oldest", arch: "aarch64", cuda: "12.*"}
+            packages:
+              - cupy==12.2.0  # cupy 12.2.0 is the earliest with CUDA 12 ARM packages.
+          - matrix: {dependencies: "oldest"}
+            packages:
+              - cupy==12.0.0
           - matrix:
             packages:
       - output_types: requirements
diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md
index 0398a8d7086..41838e01dd9 100644
--- a/docs/cudf/source/cudf_pandas/usage.md
+++ b/docs/cudf/source/cudf_pandas/usage.md
@@ -120,3 +120,23 @@ To profile a script being run from the command line, pass the
 ```bash
 python -m cudf.pandas --profile script.py
 ```
+
+### cudf.pandas CLI Features
+
+Several of the ways to provide input to the `python` interpreter also work with `python -m cudf.pandas`, such as the REPL, the `-c` flag, and reading from stdin.
+
+Executing `python -m cudf.pandas` with no script name will enter a REPL (read-eval-print loop) similar to the behavior of the normal `python` interpreter.
+
+The `-c` flag accepts a code string to run, like this:
+
+```bash
+$ python -m cudf.pandas -c "import pandas; print(pandas)"
+<module 'pandas' (ModuleAccelerator(fast=cudf, slow=pandas))>
+```
+
+Users can also provide code to execute from stdin, like this:
+
+```bash
+$ echo "import pandas; print(pandas)" | python -m cudf.pandas
+<module 'pandas' (ModuleAccelerator(fast=cudf, slow=pandas))>
+```
diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb
index 2eaa75b3189..95f5f9734dd 100644
--- a/docs/cudf/source/user_guide/10min.ipynb
+++ b/docs/cudf/source/user_guide/10min.ipynb
@@ -5,9 +5,9 @@
    "id": "4c6c548b",
    "metadata": {},
    "source": [
-    "# 10 Minutes to cuDF and Dask-cuDF\n",
+    "# 10 Minutes to cuDF and Dask cuDF\n",
     "\n",
-    "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly towards new users.\n",
+    "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask cuDF, geared mainly towards new users.\n",
     "\n",
     "## What are these Libraries?\n",
     "\n",
@@ -18,13 +18,14 @@
     "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n",
     "\n",
     "\n",
-    "> [!NOTE]  \n",
-    "> This notebook uses the explicit Dask cuDF API (`dask_cudf`) for clarity. However, we strongly recommend that you use Dask's [configuration infrastructure](https://docs.dask.org/en/latest/configuration.html) to set the `\"dataframe.backend\"` to `\"cudf\"`, and work with the `dask.dataframe` API directly. Please see the [Dask cuDF documentation](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) for more information.\n",
+    "<div class=\"alert alert-block alert-info\">\n",
+    "<b>Note:</b> This notebook uses the explicit Dask cuDF API (dask_cudf) for clarity. However, we strongly recommend that you use Dask's <a href=\"https://docs.dask.org/en/latest/configuration.html\">configuration infrastructure</a> to set the \"dataframe.backend\" option to \"cudf\", and work with the Dask DataFrame API directly. Please see the <a href=\"https://github.com/rapidsai/cudf/tree/main/python/dask_cudf\">Dask cuDF documentation</a> for more information.\n",
+    "</div>\n",
     "\n",
     "\n",
-    "## When to use cuDF and Dask-cuDF\n",
+    "## When to use cuDF and Dask cuDF\n",
     "\n",
-    "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF."
+    "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask cuDF."
    ]
   },
   {
@@ -115,7 +116,7 @@
    "source": [
     "ds = dask_cudf.from_cudf(s, npartitions=2)\n",
     "# Note the call to head here to show the first few entries, unlike\n",
-    "# cuDF objects, dask-cuDF objects do not have a printing\n",
+    "# cuDF objects, Dask-cuDF objects do not have a printing\n",
     "# representation that shows values since they may not be in local\n",
     "# memory.\n",
     "ds.head(n=3)"
@@ -331,11 +332,11 @@
    "id": "b17db919",
    "metadata": {},
    "source": [
-    "Now we will convert our cuDF dataframe into a dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n",
+    "Now we will convert our cuDF dataframe into a Dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n",
     "\n",
-    "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n",
+    "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the Dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a Dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n",
     "\n",
-    "*To understand more of the differences between how cuDF and dask-cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*"
+    "*To understand more of the differences between how cuDF and Dask cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*"
    ]
   },
   {
@@ -1680,7 +1681,7 @@
    "id": "7aa0089f",
    "metadata": {},
    "source": [
-    "Note here we call `compute()` rather than `head()` on the dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)."
+    "Note here we call `compute()` rather than `head()` on the Dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)."
    ]
   },
   {
@@ -2393,7 +2394,7 @@
    "id": "f6094cbe",
    "metadata": {},
    "source": [
-    "Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe."
+    "Applying functions to a `Series`. Note that applying user defined functions directly with Dask cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe."
    ]
   },
   {
@@ -3492,7 +3493,7 @@
    "id": "5ac3b004",
    "metadata": {},
    "source": [
-    "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF."
+    "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask cuDF."
    ]
   },
   {
@@ -4181,7 +4182,7 @@
    "id": "aa8a445b",
    "metadata": {},
    "source": [
-    "To convert the first few entries to pandas, we similarly call `.head()` on the dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert."
+    "To convert the first few entries to pandas, we similarly call `.head()` on the Dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert."
    ]
   },
   {
@@ -4899,7 +4900,7 @@
    "id": "787eae14",
    "metadata": {},
    "source": [
-    "Note that for the dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU."
+    "Note that for the Dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU."
    ]
   },
   {
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst
new file mode 100644
index 00000000000..06f74a38709
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst
@@ -0,0 +1,6 @@
+=======
+extract
+=======
+
+.. automodule:: pylibcudf.strings.extract
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index 462a756a092..2518afc80a7 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -7,8 +7,10 @@ strings
     capitalize
     char_types
     contains
+    extract
     find
     regex_flags
     regex_program
+    repeat
     replace
     slice
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst
new file mode 100644
index 00000000000..0041fe4c3da
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst
@@ -0,0 +1,6 @@
+======
+repeat
+======
+
+.. automodule:: pylibcudf.strings.repeat
+   :members:
diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst
index 9a216690384..7fe6cbd45fa 100644
--- a/docs/dask_cudf/source/index.rst
+++ b/docs/dask_cudf/source/index.rst
@@ -3,39 +3,42 @@
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
 
-Welcome to dask-cudf's documentation!
+Welcome to Dask cuDF's documentation!
 =====================================
 
-**Dask-cuDF** (pronounced "DASK KOO-dee-eff") is an extension
+**Dask cuDF** (pronounced "DASK KOO-dee-eff") is an extension
 library for the `Dask <https://dask.org>`__ parallel computing
-framework that provides a `cuDF
-<https://docs.rapids.ai/api/cudf/stable/>`__-backed distributed
-dataframe with the same API as `Dask dataframes
-<https://docs.dask.org/en/stable/dataframe.html>`__.
+framework. When installed, Dask cuDF is automatically registered
+as the ``"cudf"`` dataframe backend for
+`Dask DataFrame <https://docs.dask.org/en/stable/dataframe.html>`__.
+
+.. note::
+  Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU
+  or multi-node execution on their own. You must also deploy a
+  `dask.distributed <https://distributed.dask.org/en/stable/>` cluster
+  to leverage multiple GPUs. We strongly recommend using `Dask-CUDA
+  <https://docs.rapids.ai/api/dask-cuda/stable/>`__ to simplify the
+  setup of the cluster, taking advantage of all features of the GPU
+  and networking hardware.
 
 If you are familiar with Dask and `pandas <pandas.pydata.org>`__ or
-`cuDF <https://docs.rapids.ai/api/cudf/stable/>`__, then Dask-cuDF
+`cuDF <https://docs.rapids.ai/api/cudf/stable/>`__, then Dask cuDF
 should feel familiar to you. If not, we recommend starting with `10
 minutes to Dask
 <https://docs.dask.org/en/stable/10-minutes-to-dask.html>`__ followed
-by `10 minutes to cuDF and Dask-cuDF
+by `10 minutes to cuDF and Dask cuDF
 <https://docs.rapids.ai/api/cudf/stable/user_guide/10min.html>`__.
 
-When running on multi-GPU systems, `Dask-CUDA
-<https://docs.rapids.ai/api/dask-cuda/stable/>`__ is recommended to
-simplify the setup of the cluster, taking advantage of all features of
-the GPU and networking hardware.
 
-Using Dask-cuDF
+Using Dask cuDF
 ---------------
 
-When installed, Dask-cuDF registers itself as a dataframe backend for
-Dask. This means that in many cases, using cuDF-backed dataframes requires
-only small changes to an existing workflow. The minimal change is to
-select cuDF as the dataframe backend in :doc:`Dask's
-configuration <dask:configuration>`. To do so, we must set the option
-``dataframe.backend`` to ``cudf``. From Python, this can be achieved
-like so::
+The Dask DataFrame API (Recommended)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Simply use the `Dask configuration <dask:configuration>` system to
+set the ``"dataframe.backend"`` option to ``"cudf"``. From Python,
+this can be achieved like so::
 
   import dask
 
@@ -44,52 +47,157 @@ like so::
 Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the
 environment before running your code.
 
-Dataframe creation from on-disk formats
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If your workflow creates Dask dataframes from on-disk formats
-(for example using :func:`dask.dataframe.read_parquet`), then setting
-the backend may well be enough to migrate your workflow.
-
-For example, consider reading a dataframe from parquet::
+Once this is done, the public Dask DataFrame API will leverage
+``cudf`` automatically when a new DataFrame collection is created
+from an on-disk format using any of the following ``dask.dataframe``
+functions::
 
-   import dask.dataframe as dd
+* :func:`dask.dataframe.read_parquet`
+* :func:`dask.dataframe.read_json`
+* :func:`dask.dataframe.read_csv`
+* :func:`dask.dataframe.read_orc`
+* :func:`dask.dataframe.read_hdf`
+* :func:`dask.dataframe.from_dict`
 
-   # By default, we obtain a pandas-backed dataframe
-   df = dd.read_parquet("data.parquet", ...)
+For example::
 
+  import dask.dataframe as dd
 
-To obtain a cuDF-backed dataframe, we must set the
-``dataframe.backend`` configuration option::
+  # By default, we obtain a pandas-backed dataframe
+  df = dd.read_parquet("data.parquet", ...)
 
   import dask
-  import dask.dataframe as dd
 
   dask.config.set({"dataframe.backend": "cudf"})
-  # This gives us a cuDF-backed dataframe
+  # This now gives us a cuDF-backed dataframe
   df = dd.read_parquet("data.parquet", ...)
 
-This code will use cuDF's GPU-accelerated :func:`parquet reader
-<cudf.read_parquet>` to read partitions of the data.
+When other functions are used to create a new collection
+(e.g. :func:`from_map`, :func:`from_pandas`, :func:`from_delayed`,
+and :func:`from_array`), the backend of the new collection will
+depend on the inputs to those functions. For example::
+
+  import pandas as pd
+  import cudf
+
+  # This gives us a pandas-backed dataframe
+  dd.from_pandas(pd.DataFrame({"a": range(10)}))
+
+  # This gives us a cuDF-backed dataframe
+  dd.from_pandas(cudf.DataFrame({"a": range(10)}))
+
+An existing collection can always be moved to a specific backend
+using the :func:`dask.dataframe.DataFrame.to_backend` API::
+
+  # This ensures that we have a cuDF-backed dataframe
+  df = df.to_backend("cudf")
+
+  # This ensures that we have a pandas-backed dataframe
+  df = df.to_backend("pandas")
+
+The explicit Dask cuDF API
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In addition to providing the ``"cudf"`` backend for Dask DataFrame,
+Dask cuDF also provides an explicit ``dask_cudf`` API::
+
+  import dask_cudf
+
+  # This always gives us a cuDF-backed dataframe
+  df = dask_cudf.read_parquet("data.parquet", ...)
+
+This API is used implicitly by the Dask DataFrame API when the ``"cudf"``
+backend is enabled. Therefore, using it directly will not provide any
+performance benefit over the CPU/GPU-portable ``dask.dataframe`` API.
+Also, using some parts of the explicit API are incompatible with
+automatic query planning (see the next section).
+
+The explicit Dask cuDF API
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+).
+As long as the ``"dataframe.query-planning"`` configuration is set to
+``True`` (the default) when ``dask.dataframe`` is first imported, `Dask
+Expressions <https://github.com/dask/dask-expr>`__ will be used under the hood.
+
+For example, the following code will automatically benefit from predicate
+pushdown when the result is computed::
+
+  df = dd.read_parquet("/my/parquet/dataset/")
+  result = df.sort_values('B')['A']
+
+Unoptimized expression graph (``df.pprint()``)::
+
+  Projection: columns='A'
+    SortValues: by=['B'] shuffle_method='tasks' options={}
+      ReadParquetFSSpec: path='/my/parquet/dataset/' ...
+
+Simplified expression graph (``df.simplify().pprint()``)::
+
+  Projection: columns='A'
+    SortValues: by=['B'] shuffle_method='tasks' options={}
+      ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ...
+
+.. note::
+  Dask will automatically simplify the expression graph (within
+  :func:`optimize`) when the result is converted to a task graph
+  (via :func:`compute` or :func:`persist`). You do not need to call
+  :func:`simplify` yourself.
+
+
+Using Multiple GPUs and Multiple Nodes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try
+to partition your data into small-enough tasks to fit comfortably in the
+memory of a single GPU. This means the necessary compute tasks needed to
+compute a query can often be streamed to a single GPU process for
+out-of-core computing. This also means that the compute tasks can be
+executed in parallel over a multi-GPU cluster.
+
+In order to execute your Dask workflow on multiple GPUs, you will
+typically need to use `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__
+to deploy distributed Dask cluster, and
+`Distributed <https://distributed.dask.org/en/stable/client.html>`__
+to define a client object. For example::
+
+  from dask_cuda import LocalCUDACluster
+  from distributed import Client
+
+  if __name__ == "__main__":
+
+    client = Client(
+      LocalCUDACluster(
+        CUDA_VISIBLE_DEVICES="0,1",  # Use two workers (on devices 0 and 1)
+        rmm_pool_size=0.9,  # Use 90% of GPU memory as a pool for faster allocations
+        enable_cudf_spill=True,  # Improve device memory stability
+        local_directory="/fast/scratch/",  # Use fast local storage for spilling
+      )
+    )
+
+    df = dd.read_parquet("/my/parquet/dataset/")
+    agg = df.groupby('B').sum()
+    agg.compute()  # This will use the cluster defined above
+
+.. note::
+  This example uses :func:`compute` to materialize a concrete
+  ``cudf.DataFrame`` object in local memory. Never call :func:`compute`
+  on a large collection that cannot fit comfortably in the memory of a
+  single GPU! See Dask's `documentation on managing computation
+  <https://distributed.dask.org/en/stable/manage-computation.html>`__
+  for more details.
 
-Dataframe creation from in-memory formats
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Please see the `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__
+documentation for more information about deploying GPU-aware clusters
+(including `best practices
+<https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/>`__).
 
-If you already have a dataframe in memory and want to convert it to a
-cuDF-backend one, there are two options depending on whether the
-dataframe is already a Dask one or not. If you have a Dask dataframe,
-then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"``
-as the backend; if you have a pandas dataframe then you can either
-call :func:`dask.dataframe.from_pandas` followed by
-:func:`~dask.dataframe.to_backend` or first convert the dataframe with
-:func:`cudf.from_pandas` and then parallelise this with
-:func:`dask_cudf.from_cudf`.
 
 API Reference
 -------------
 
-Generally speaking, Dask-cuDF tries to offer exactly the same API as
-Dask itself. There are, however, some minor differences mostly because
+Generally speaking, Dask cuDF tries to offer exactly the same API as
+Dask DataFrame. There are, however, some minor differences mostly because
 cuDF does not :doc:`perfectly mirror <cudf:user_guide/PandasCompat>`
 the pandas API, or because cuDF provides additional configuration
 flags (these mostly occur in data reading and writing interfaces).
@@ -97,7 +205,7 @@ flags (these mostly occur in data reading and writing interfaces).
 As a result, straightforward workflows can be migrated without too
 much trouble, but more complex ones that utilise more features may
 need a bit of tweaking. The API documentation describes details of the
-differences and all functionality that Dask-cuDF supports.
+differences and all functionality that Dask cuDF supports.
 
 .. toctree::
    :maxdepth: 2
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
index 5a0fbd224ad..6a0f0f6f169 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
@@ -218,7 +218,13 @@ static long initViewHandle(DType type, int numRows, int nullCount,
         od, vd, nullCount, numRows, childHandles);
   }
 
-  static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) {
+  /**
+   * Creates a ColumnVector from a native column_view using a contiguous device allocation.
+   *
+   * @param columnViewAddress address of the native column_view
+   * @param buffer device buffer containing the data referenced by the column view
+   */
+  public static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) {
     return new ColumnVector(columnViewAddress, buffer);
   }
 
diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java
index 7b58817550d..8c8180436e6 100644
--- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -24,9 +24,13 @@
  */
 public final class ParquetWriterOptions extends CompressionMetadataWriterOptions {
   private final StatisticsFrequency statsGranularity;
+  private int rowGroupSizeRows;
+  private long rowGroupSizeBytes;
 
   private ParquetWriterOptions(Builder builder) {
     super(builder);
+    this.rowGroupSizeRows = builder.rowGroupSizeRows;
+    this.rowGroupSizeBytes = builder.rowGroupSizeBytes;
     this.statsGranularity = builder.statsGranularity;
   }
 
@@ -51,18 +55,38 @@ public static Builder builder() {
     return new Builder();
   }
 
+  public int getRowGroupSizeRows() {
+    return rowGroupSizeRows;
+  }
+
+  public long getRowGroupSizeBytes() {
+    return rowGroupSizeBytes;
+  }
+
   public StatisticsFrequency getStatisticsFrequency() {
     return statsGranularity;
   }
 
   public static class Builder extends CompressionMetadataWriterOptions.Builder
         <Builder, ParquetWriterOptions> {
+    private int rowGroupSizeRows = 1000000; //Max of 1 million rows per row group
+    private long rowGroupSizeBytes = 128 * 1024 * 1024; //Max of 128MB per row group
     private StatisticsFrequency statsGranularity = StatisticsFrequency.ROWGROUP;
 
     public Builder() {
       super();
     }
 
+    public Builder withRowGroupSizeRows(int rowGroupSizeRows) {
+      this.rowGroupSizeRows = rowGroupSizeRows;
+      return this;
+    }
+
+    public Builder withRowGroupSizeBytes(long rowGroupSizeBytes) {
+      this.rowGroupSizeBytes = rowGroupSizeBytes;
+      return this;
+    }
+
     public Builder withStatisticsFrequency(StatisticsFrequency statsGranularity) {
       this.statsGranularity = statsGranularity;
       return this;
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index b250d2d5e8b..115aa73ed95 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -334,20 +334,22 @@ private static native long[] readAvroFromDataSource(String[] filterColumnNames,
 
   /**
    * Setup everything to write parquet formatted data to a file.
-   * @param columnNames     names that correspond to the table columns
-   * @param numChildren     Children of the top level
-   * @param flatNumChildren flattened list of children per column
-   * @param nullable        true if the column can have nulls else false
-   * @param metadataKeys    Metadata key names to place in the Parquet file
-   * @param metadataValues  Metadata values corresponding to metadataKeys
-   * @param compression     native compression codec ID
-   * @param statsFreq       native statistics frequency ID
-   * @param isInt96         true if timestamp type is int96
-   * @param precisions      precision list containing all the precisions of the decimal types in
-   *                        the columns
-   * @param isMapValues     true if a column is a map
-   * @param isBinaryValues  true if a column is a binary
-   * @param filename        local output path
+   * @param columnNames       names that correspond to the table columns
+   * @param numChildren       Children of the top level
+   * @param flatNumChildren   flattened list of children per column
+   * @param nullable          true if the column can have nulls else false
+   * @param metadataKeys      Metadata key names to place in the Parquet file
+   * @param metadataValues    Metadata values corresponding to metadataKeys
+   * @param compression       native compression codec ID
+   * @param rowGroupSizeRows  max #rows in a row group
+   * @param rowGroupSizeBytes max #bytes in a row group
+   * @param statsFreq         native statistics frequency ID
+   * @param isInt96           true if timestamp type is int96
+   * @param precisions        precision list containing all the precisions of the decimal types in
+   *                          the columns
+   * @param isMapValues       true if a column is a map
+   * @param isBinaryValues    true if a column is a binary
+   * @param filename          local output path
    * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd.
    */
   private static native long writeParquetFileBegin(String[] columnNames,
@@ -357,6 +359,8 @@ private static native long writeParquetFileBegin(String[] columnNames,
                                                    String[] metadataKeys,
                                                    String[] metadataValues,
                                                    int compression,
+                                                   int rowGroupSizeRows,
+                                                   long rowGroupSizeBytes,
                                                    int statsFreq,
                                                    boolean[] isInt96,
                                                    int[] precisions,
@@ -368,20 +372,22 @@ private static native long writeParquetFileBegin(String[] columnNames,
 
   /**
    * Setup everything to write parquet formatted data to a buffer.
-   * @param columnNames     names that correspond to the table columns
-   * @param numChildren     Children of the top level
-   * @param flatNumChildren flattened list of children per column
-   * @param nullable        true if the column can have nulls else false
-   * @param metadataKeys    Metadata key names to place in the Parquet file
-   * @param metadataValues  Metadata values corresponding to metadataKeys
-   * @param compression     native compression codec ID
-   * @param statsFreq       native statistics frequency ID
-   * @param isInt96         true if timestamp type is int96
-   * @param precisions      precision list containing all the precisions of the decimal types in
-   *                        the columns
-   * @param isMapValues     true if a column is a map
-   * @param isBinaryValues  true if a column is a binary
-   * @param consumer        consumer of host buffers produced.
+   * @param columnNames       names that correspond to the table columns
+   * @param numChildren       Children of the top level
+   * @param flatNumChildren   flattened list of children per column
+   * @param nullable          true if the column can have nulls else false
+   * @param metadataKeys      Metadata key names to place in the Parquet file
+   * @param metadataValues    Metadata values corresponding to metadataKeys
+   * @param compression       native compression codec ID
+   * @param rowGroupSizeRows  max #rows in a row group
+   * @param rowGroupSizeBytes max #bytes in a row group
+   * @param statsFreq         native statistics frequency ID
+   * @param isInt96           true if timestamp type is int96
+   * @param precisions        precision list containing all the precisions of the decimal types in
+   *                          the columns
+   * @param isMapValues       true if a column is a map
+   * @param isBinaryValues    true if a column is a binary
+   * @param consumer          consumer of host buffers produced.
    * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd.
    */
   private static native long writeParquetBufferBegin(String[] columnNames,
@@ -391,6 +397,8 @@ private static native long writeParquetBufferBegin(String[] columnNames,
                                                      String[] metadataKeys,
                                                      String[] metadataValues,
                                                      int compression,
+                                                     int rowGroupSizeRows,
+                                                     long rowGroupSizeBytes,
                                                      int statsFreq,
                                                      boolean[] isInt96,
                                                      int[] precisions,
@@ -1834,6 +1842,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) {
           options.getMetadataKeys(),
           options.getMetadataValues(),
           options.getCompressionType().nativeId,
+          options.getRowGroupSizeRows(),
+          options.getRowGroupSizeBytes(),
           options.getStatisticsFrequency().nativeId,
           options.getFlatIsTimeTypeInt96(),
           options.getFlatPrecision(),
@@ -1854,6 +1864,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons
           options.getMetadataKeys(),
           options.getMetadataValues(),
           options.getCompressionType().nativeId,
+          options.getRowGroupSizeRows(),
+          options.getRowGroupSizeBytes(),
           options.getStatisticsFrequency().nativeId,
           options.getFlatIsTimeTypeInt96(),
           options.getFlatPrecision(),
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index fa10ebf03b8..edc40a315ac 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -2156,6 +2156,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env,
                                                   jobjectArray j_metadata_keys,
                                                   jobjectArray j_metadata_values,
                                                   jint j_compression,
+                                                  jint j_row_group_size_rows,
+                                                  jlong j_row_group_size_bytes,
                                                   jint j_stats_freq,
                                                   jbooleanArray j_isInt96,
                                                   jintArray j_precisions,
@@ -2211,6 +2213,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env,
       chunked_parquet_writer_options::builder(sink)
         .metadata(std::move(metadata))
         .compression(static_cast<compression_type>(j_compression))
+        .row_group_size_rows(j_row_group_size_rows)
+        .row_group_size_bytes(j_row_group_size_bytes)
         .stats_level(static_cast<statistics_freq>(j_stats_freq))
         .key_value_metadata({kv_metadata})
         .compression_statistics(stats)
@@ -2233,6 +2237,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env,
                                                 jobjectArray j_metadata_keys,
                                                 jobjectArray j_metadata_values,
                                                 jint j_compression,
+                                                jint j_row_group_size_rows,
+                                                jlong j_row_group_size_bytes,
                                                 jint j_stats_freq,
                                                 jbooleanArray j_isInt96,
                                                 jintArray j_precisions,
@@ -2286,6 +2292,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env,
       chunked_parquet_writer_options::builder(sink)
         .metadata(std::move(metadata))
         .compression(static_cast<compression_type>(j_compression))
+        .row_group_size_rows(j_row_group_size_rows)
+        .row_group_size_bytes(j_row_group_size_bytes)
         .stats_level(static_cast<statistics_freq>(j_stats_freq))
         .key_value_metadata({kv_metadata})
         .compression_statistics(stats)
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 56fe63598d9..830f2b33b32 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -9122,7 +9122,11 @@ void testParquetWriteToBufferChunked() {
     columns.add(Columns.STRUCT.name);
     WriteUtils.buildWriterOptions(optBuilder, columns);
     ParquetWriterOptions options = optBuilder.build();
-    ParquetWriterOptions optionsNoCompress = optBuilder.withCompressionType(CompressionType.NONE).build();
+    ParquetWriterOptions optionsNoCompress =
+      optBuilder.withCompressionType(CompressionType.NONE)
+      .withRowGroupSizeRows(10000)
+      .withRowGroupSizeBytes(10000)
+      .build();
     try (Table table0 = getExpectedFileTable(columns);
          MyBufferConsumer consumer = new MyBufferConsumer()) {
       try (TableWriter writer = Table.writeParquetChunked(options, consumer)) {
@@ -9208,6 +9212,8 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException {
           .withDecimalColumn("_c7", 4)
           .withDecimalColumn("_c8", 6)
           .withCompressionType(CompressionType.NONE)
+          .withRowGroupSizeRows(10000)
+          .withRowGroupSizeBytes(10000)
           .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
           .build();
       try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
diff --git a/python/cudf/benchmarks/pytest.ini b/python/cudf/benchmarks/pytest.ini
index db24415ef9e..187d91996b2 100644
--- a/python/cudf/benchmarks/pytest.ini
+++ b/python/cudf/benchmarks/pytest.ini
@@ -6,3 +6,4 @@ python_classes = Bench
 python_functions = bench_*
 markers =
     pandas_incompatible: mark a benchmark that cannot be run with pandas
+addopts = --tb=native
diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx
index e661059faa3..e6c2d136f0d 100644
--- a/python/cudf/cudf/_lib/concat.pyx
+++ b/python/cudf/cudf/_lib/concat.pyx
@@ -23,9 +23,9 @@ def concat_columns(object columns):
 def concat_tables(object tables, bool ignore_index=False):
     plc_tables = []
     for table in tables:
-        cols = table._data.columns
+        cols = table._columns
         if not ignore_index:
-            cols = table._index._data.columns + cols
+            cols = table._index._columns + cols
         plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols]))
 
     return data_from_pylibcudf_table(
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 16182e31c08..49714091f46 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -384,7 +384,7 @@ cdef class _CPackedColumns:
 
         p.column_names = input_table._column_names
         p.column_dtypes = {}
-        for name, col in input_table._data.items():
+        for name, col in input_table._column_labels_and_values:
             if isinstance(col.dtype, cudf.core.dtypes._BaseDtype):
                 p.column_dtypes[name] = col.dtype
 
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 058e884e08b..9ad96f610b3 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -273,7 +273,7 @@ def read_csv(
         elif isinstance(dtype, abc.Collection):
             for index, col_dtype in enumerate(dtype):
                 if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype):
-                    col_name = df._data.names[index]
+                    col_name = df._column_names[index]
                     df._data[col_name] = df._data[col_name].astype(col_dtype)
 
     if names is not None and len(names) and isinstance(names[0], int):
diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx
index b1900138d94..564daefbae2 100644
--- a/python/cudf/cudf/_lib/io/utils.pyx
+++ b/python/cudf/cudf/_lib/io/utils.pyx
@@ -179,7 +179,7 @@ cdef update_struct_field_names(
 ):
     # Deprecated, remove in favor of add_col_struct_names
     # when a reader is ported to pylibcudf
-    for i, (name, col) in enumerate(table._data.items()):
+    for i, (name, col) in enumerate(table._column_labels_and_values):
         table._data[name] = update_column_struct_field_names(
             col, schema_info[i]
         )
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index 5ee15d0e409..59cb8d51440 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
+    word_minhash as cpp_word_minhash,
+    word_minhash64 as cpp_word_minhash64,
 )
 from pylibcudf.libcudf.types cimport size_type
 
@@ -54,3 +56,39 @@ def minhash64(Column strings, Column seeds, int width):
         )
 
     return Column.from_unique_ptr(move(c_result))
+
+
+@acquire_spill_lock()
+def word_minhash(Column input, Column seeds):
+
+    cdef column_view c_input = input.view()
+    cdef column_view c_seeds = seeds.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_word_minhash(
+                c_input,
+                c_seeds
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+@acquire_spill_lock()
+def word_minhash64(Column input, Column seeds):
+
+    cdef column_view c_input = input.view()
+    cdef column_view c_seeds = seeds.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_word_minhash64(
+                c_input,
+                c_seeds
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index a0155671a26..fa2690c7f21 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -235,16 +235,16 @@ cdef object _process_metadata(object df,
             df._index = idx
         elif set(index_col).issubset(names):
             index_data = df[index_col]
-            actual_index_names = list(index_col_names.values())
-            if len(index_data._data) == 1:
+            actual_index_names = iter(index_col_names.values())
+            if index_data._num_columns == 1:
                 idx = cudf.Index._from_column(
-                    index_data._data.columns[0],
-                    name=actual_index_names[0]
+                    index_data._columns[0],
+                    name=next(actual_index_names)
                 )
             else:
                 idx = cudf.MultiIndex.from_frame(
                     index_data,
-                    names=actual_index_names
+                    names=list(actual_index_names)
                 )
             df.drop(columns=index_col, inplace=True)
             df._index = idx
@@ -252,7 +252,7 @@ cdef object _process_metadata(object df,
             if use_pandas_metadata:
                 df.index.names = index_col
 
-    if len(df._data.names) == 0 and column_index_type is not None:
+    if df._num_columns == 0 and column_index_type is not None:
         df._data.label_dtype = cudf.dtype(column_index_type)
 
     return df
@@ -438,7 +438,7 @@ def write_parquet(
     object statistics="ROWGROUP",
     object metadata_file_path=None,
     object int96_timestamps=False,
-    object row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
+    object row_group_size_bytes=None,
     object row_group_size_rows=None,
     object max_page_size_bytes=None,
     object max_page_size_rows=None,
@@ -616,9 +616,9 @@ cdef class ParquetWriter:
         Name of the compression to use. Use ``None`` for no compression.
     statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
         Level at which column statistics should be included in file.
-    row_group_size_bytes: int, default 134217728
+    row_group_size_bytes: int, default ``uint64 max``
         Maximum size of each stripe of the output.
-        By default, 134217728 (128MB) will be used.
+        By default, a virtually infinite size equal to ``uint64 max`` will be used.
     row_group_size_rows: int, default 1000000
         Maximum number of rows of each stripe of the output.
         By default, 1000000 (10^6 rows) will be used.
@@ -661,11 +661,11 @@ cdef class ParquetWriter:
 
     def __cinit__(self, object filepath_or_buffer, object index=None,
                   object compression="snappy", str statistics="ROWGROUP",
-                  int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
-                  int row_group_size_rows=1000000,
-                  int max_page_size_bytes=524288,
-                  int max_page_size_rows=20000,
-                  int max_dictionary_size=1048576,
+                  size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
+                  size_type row_group_size_rows=1000000,
+                  size_t max_page_size_bytes=524288,
+                  size_type max_page_size_rows=20000,
+                  size_t max_dictionary_size=1048576,
                   bool use_dictionary=True,
                   bool store_schema=False):
         filepaths_or_buffers = (
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index 47a194c4fda..4bf8a9b1a8f 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix
 from cudf._lib.nvtext.generate_ngrams import (
     generate_character_ngrams,
@@ -6,7 +6,12 @@
     hash_character_ngrams,
 )
 from cudf._lib.nvtext.jaccard import jaccard_index
-from cudf._lib.nvtext.minhash import minhash, minhash64
+from cudf._lib.nvtext.minhash import (
+    minhash,
+    minhash64,
+    word_minhash,
+    word_minhash64,
+)
 from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize
 from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces
 from cudf._lib.nvtext.replace import filter_tokens, replace_tokens
diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx
index 82f5e06c547..03b4887f200 100644
--- a/python/cudf/cudf/_lib/strings/contains.pyx
+++ b/python/cudf/cudf/_lib/strings/contains.pyx
@@ -1,27 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cython.operator cimport dereference
 from libc.stdint cimport uint32_t
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.contains cimport (
-    count_re as cpp_count_re,
-    like as cpp_like,
-    matches_re as cpp_matches_re,
-)
-from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from pylibcudf.libcudf.strings.regex_program cimport regex_program
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
 
 from pylibcudf.strings import contains
 from pylibcudf.strings.regex_program import RegexProgram
@@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags):
     Returns a Column with count of occurrences of `reg_ex` in
     each string of `source_strings`
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string reg_ex_string = <string>str(reg_ex).encode()
-    cdef regex_flags c_flags = <regex_flags>flags
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(reg_ex_string, c_flags))
-        c_result = move(cpp_count_re(
-            source_view,
-            dereference(c_prog)
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    prog = RegexProgram.create(str(reg_ex), flags)
+    return Column.from_pylibcudf(
+        contains.count_re(source_strings.to_pylibcudf(mode="read"), prog)
+    )
 
 
 @acquire_spill_lock()
@@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags):
     Returns a Column with each value True if the string matches `reg_ex`
     regular expression with each record of `source_strings`
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string reg_ex_string = <string>str(reg_ex).encode()
-    cdef regex_flags c_flags = <regex_flags>flags
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(reg_ex_string, c_flags))
-        c_result = move(cpp_matches_re(
-            source_view,
-            dereference(c_prog)
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    prog = RegexProgram.create(str(reg_ex), flags)
+    return Column.from_pylibcudf(
+        contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog)
+    )
 
 
 @acquire_spill_lock()
@@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape):
     Returns a Column with each value True if the string matches the
     `py_pattern` like expression with each record of `source_strings`
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef DeviceScalar pattern = py_pattern.device_value
-    cdef DeviceScalar escape = py_escape.device_value
-
-    cdef const string_scalar* scalar_ptn = <const string_scalar*>(
-        pattern.get_raw_ptr()
-    )
-    cdef const string_scalar* scalar_esc = <const string_scalar*>(
-        escape.get_raw_ptr()
+    plc_column = contains.like(
+        source_strings.to_pylibcudf(mode="read"),
+        py_pattern.device_value.c_value,
+        py_escape.device_value.c_value,
     )
-
-    with nogil:
-        c_result = move(cpp_like(
-            source_view,
-            scalar_ptn[0],
-            scalar_esc[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx
index 63f4d57e562..5bf336f4f3c 100644
--- a/python/cudf/cudf/_lib/strings/extract.pyx
+++ b/python/cudf/cudf/_lib/strings/extract.pyx
@@ -1,21 +1,12 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cython.operator cimport dereference
 from libc.stdint cimport uint32_t
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract
-from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from pylibcudf.libcudf.strings.regex_program cimport regex_program
-from pylibcudf.libcudf.table.table cimport table
-
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport data_from_unique_ptr
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -26,21 +17,8 @@ def extract(Column source_strings, object pattern, uint32_t flags):
     The returning data contains one row for each subject string,
     and one column for each group.
     """
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef regex_flags c_flags = <regex_flags>flags
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_extract(
-            source_view,
-            dereference(c_prog)
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags)
+    plc_result = plc.strings.extract.extract(
+        source_strings.to_pylibcudf(mode="read"), prog
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns()))
diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx
index 42fcfa5d94e..43649d4defe 100644
--- a/python/cudf/cudf/_lib/strings/repeat.pyx
+++ b/python/cudf/cudf/_lib/strings/repeat.pyx
@@ -1,17 +1,12 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings cimport repeat as cpp_repeat
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
 
+import pylibcudf as plc
+
 
 @acquire_spill_lock()
 def repeat_scalar(Column source_strings,
@@ -21,16 +16,11 @@ def repeat_scalar(Column source_strings,
     each string in `source_strings`
     `repeats` number of times.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_repeat.repeat_strings(
-            source_view,
-            repeats
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_result = plc.strings.repeat.repeat_strings(
+        source_strings.to_pylibcudf(mode="read"),
+        repeats
+    )
+    return Column.from_pylibcudf(plc_result)
 
 
 @acquire_spill_lock()
@@ -41,14 +31,8 @@ def repeat_sequence(Column source_strings,
     each string in `source_strings`
     `repeats` number of times.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view repeats_view = repeats.view()
-
-    with nogil:
-        c_result = move(cpp_repeat.repeat_strings(
-            source_view,
-            repeats_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_result = plc.strings.repeat.repeat_strings(
+        source_strings.to_pylibcudf(mode="read"),
+        repeats.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_result)
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index cae28d02ef4..8660cca9322 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -49,9 +49,9 @@ cdef table_view table_view_from_table(tbl, ignore_index=False) except*:
         If True, don't include the index in the columns.
     """
     return table_view_from_columns(
-        tbl._index._data.columns + tbl._data.columns
+        tbl._index._columns + tbl._columns
         if not ignore_index and tbl._index is not None
-        else tbl._data.columns
+        else tbl._columns
     )
 
 
@@ -62,7 +62,7 @@ cpdef generate_pandas_metadata(table, index):
     index_descriptors = []
     columns_to_convert = list(table._columns)
     # Columns
-    for name, col in table._data.items():
+    for name, col in table._column_labels_and_values:
         if cudf.get_option("mode.pandas_compatible"):
             # in pandas-compat mode, non-string column names are stringified.
             col_names.append(str(name))
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index ff114474aa4..a6abd63d042 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -1951,7 +1951,7 @@ def drop_duplicates(
         return self._from_columns_like_self(
             drop_duplicates(
                 list(self._columns),
-                keys=range(len(self._data)),
+                keys=range(len(self._columns)),
                 keep=keep,
                 nulls_are_equal=nulls_are_equal,
             ),
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 16e6908f308..4463e3280df 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -623,11 +623,9 @@ def extract(
                 "unsupported value for `flags` parameter"
             )
 
-        data, _ = libstrings.extract(self._column, pat, flags)
+        data = libstrings.extract(self._column, pat, flags)
         if len(data) == 1 and expand is False:
-            data = next(iter(data.values()))
-        else:
-            data = data
+            _, data = data.popitem()
         return self._return_or_inplace(data, expand=expand)
 
     def contains(
@@ -5349,6 +5347,76 @@ def minhash64(
             libstrings.minhash64(self._column, seeds_column, width)
         )
 
+    def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
+        """
+        Compute the minhash of a list column of strings.
+        This uses the MurmurHash3_x86_32 algorithm for the hash function.
+
+        Parameters
+        ----------
+        seeds : ColumnLike
+            The seeds used for the hash algorithm.
+            Must be of type uint32.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
+        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
+        >>> ls.str.word_minhash(seeds=seeds)
+        0     [21141582, 1232889953, 1268336794]
+        1    [962346254, 2321233602, 1354839212]
+        dtype: list
+        """
+        if seeds is None:
+            seeds_column = column.as_column(0, dtype=np.uint32, length=1)
+        else:
+            seeds_column = column.as_column(seeds)
+            if seeds_column.dtype != np.uint32:
+                raise ValueError(
+                    f"Expecting a Series with dtype uint32, got {type(seeds)}"
+                )
+        return self._return_or_inplace(
+            libstrings.word_minhash(self._column, seeds_column)
+        )
+
+    def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
+        """
+        Compute the minhash of a list column of strings.
+        This uses the MurmurHash3_x64_128 algorithm for the hash function.
+        This function generates 2 uint64 values but only the first
+        uint64 value is used.
+
+        Parameters
+        ----------
+        seeds : ColumnLike
+            The seeds used for the hash algorithm.
+            Must be of type uint64.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
+        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
+        >>> ls.str.word_minhash64(seeds)
+        0    [2603139454418834912, 8644371945174847701, 5541030711534384340]
+        1    [5240044617220523711, 5847101123925041457, 153762819128779913]
+        dtype: list
+        """
+        if seeds is None:
+            seeds_column = column.as_column(0, dtype=np.uint64, length=1)
+        else:
+            seeds_column = column.as_column(seeds)
+            if seeds_column.dtype != np.uint64:
+                raise ValueError(
+                    f"Expecting a Series with dtype uint64, got {type(seeds)}"
+                )
+        return self._return_or_inplace(
+            libstrings.word_minhash64(self._column, seeds_column)
+        )
+
     def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex:
         """
         Compute the Jaccard index between this column and the given
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 09b0f453692..bc093fdaa9a 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -151,9 +151,9 @@ def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None:
         self.set_by_label(key, value)
 
     def __delitem__(self, key: abc.Hashable) -> None:
-        old_ncols = len(self._data)
+        old_ncols = len(self)
         del self._data[key]
-        new_ncols = len(self._data)
+        new_ncols = len(self)
         self._clear_cache(old_ncols, new_ncols)
 
     def __len__(self) -> int:
@@ -213,7 +213,7 @@ def level_names(self) -> tuple[abc.Hashable, ...]:
 
     @property
     def nlevels(self) -> int:
-        if len(self._data) == 0:
+        if len(self) == 0:
             return 0
         if not self.multiindex:
             return 1
@@ -226,7 +226,7 @@ def name(self) -> abc.Hashable:
 
     @cached_property
     def nrows(self) -> int:
-        if len(self._data) == 0:
+        if len(self) == 0:
             return 0
         else:
             return len(next(iter(self.values())))
@@ -257,9 +257,9 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None:
         Parameters
         ----------
         old_ncols: int
-            len(self._data) before self._data was modified
+            len(self) before self._data was modified
         new_ncols: int
-            len(self._data) after self._data was modified
+            len(self) after self._data was modified
         """
         cached_properties = ("columns", "names", "_grouped_data")
         for attr in cached_properties:
@@ -335,7 +335,7 @@ def insert(
         if name in self._data:
             raise ValueError(f"Cannot insert '{name}', already exists")
 
-        old_ncols = len(self._data)
+        old_ncols = len(self)
         if loc == -1:
             loc = old_ncols
         elif not (0 <= loc <= old_ncols):
@@ -414,7 +414,7 @@ def get_labels_by_index(self, index: Any) -> tuple:
         tuple
         """
         if isinstance(index, slice):
-            start, stop, step = index.indices(len(self._data))
+            start, stop, step = index.indices(len(self))
             return self.names[start:stop:step]
         elif pd.api.types.is_integer(index):
             return (self.names[index],)
@@ -526,9 +526,9 @@ def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None:
         if len(self) > 0 and len(value) != self.nrows:
             raise ValueError("All columns must be of equal length")
 
-        old_ncols = len(self._data)
+        old_ncols = len(self)
         self._data[key] = value
-        new_ncols = len(self._data)
+        new_ncols = len(self)
         self._clear_cache(old_ncols, new_ncols)
 
     def _select_by_label_list_like(self, key: tuple) -> Self:
@@ -718,12 +718,12 @@ def droplevel(self, level: int) -> None:
         if level < 0:
             level += self.nlevels
 
-        old_ncols = len(self._data)
+        old_ncols = len(self)
         self._data = {
             _remove_key_level(key, level): value  # type: ignore[arg-type]
             for key, value in self._data.items()
         }
-        new_ncols = len(self._data)
+        new_ncols = len(self)
         self._level_names = (
             self._level_names[:level] + self._level_names[level + 1 :]
         )
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 58a16a6d504..16b0aa95c35 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -176,7 +176,7 @@ def _can_downcast_to_series(self, df, arg):
         return False
 
     @_performance_tracking
-    def _downcast_to_series(self, df, arg):
+    def _downcast_to_series(self, df: DataFrame, arg):
         """
         "Downcast" from a DataFrame to a Series
         based on Pandas indexing rules
@@ -203,16 +203,16 @@ def _downcast_to_series(self, df, arg):
 
         # take series along the axis:
         if axis == 1:
-            return df[df._data.names[0]]
+            return df[df._column_names[0]]
         else:
             if df._num_columns > 0:
                 dtypes = df.dtypes.values.tolist()
                 normalized_dtype = np.result_type(*dtypes)
-                for name, col in df._data.items():
+                for name, col in df._column_labels_and_values:
                     df[name] = col.astype(normalized_dtype)
 
             sr = df.T
-            return sr[sr._data.names[0]]
+            return sr[sr._column_names[0]]
 
 
 class _DataFrameLocIndexer(_DataFrameIndexer):
@@ -258,7 +258,7 @@ def _getitem_tuple_arg(self, arg):
                     and len(arg) > 1
                     and is_scalar(arg[1])
                 ):
-                    return result._data.columns[0].element_indexing(0)
+                    return result._columns[0].element_indexing(0)
                 return result
         else:
             if isinstance(arg[0], slice):
@@ -310,7 +310,7 @@ def _getitem_tuple_arg(self, arg):
                 else:
                     tmp_col_name = str(uuid4())
                     cantor_name = "_" + "_".join(
-                        map(str, columns_df._data.names)
+                        map(str, columns_df._column_names)
                     )
                     if columns_df._data.multiindex:
                         # column names must be appropriate length tuples
@@ -1412,7 +1412,7 @@ def __setitem__(self, arg, value):
                             else column.column_empty_like(
                                 col, masked=True, newsize=length
                             )
-                            for key, col in self._data.items()
+                            for key, col in self._column_labels_and_values
                         )
                         self._data = self._data._from_columns_like_self(
                             new_columns, verify=False
@@ -1494,8 +1494,8 @@ def __delitem__(self, name):
 
     @_performance_tracking
     def memory_usage(self, index=True, deep=False) -> cudf.Series:
-        mem_usage = [col.memory_usage for col in self._data.columns]
-        names = [str(name) for name in self._data.names]
+        mem_usage = [col.memory_usage for col in self._columns]
+        names = [str(name) for name in self._column_names]
         if index:
             mem_usage.append(self.index.memory_usage())
             names.append("Index")
@@ -1725,7 +1725,7 @@ def _concat(
                 []
                 if are_all_range_index
                 or (ignore_index and not empty_has_index)
-                else list(f.index._data.columns)
+                else list(f.index._columns)
             )
             + [f._data[name] if name in f._data else None for name in names]
             for f in objs
@@ -1808,7 +1808,7 @@ def _concat(
                 out.index.dtype, cudf.CategoricalDtype
             ):
                 out = out.set_index(out.index)
-        for name, col in out._data.items():
+        for name, col in out._column_labels_and_values:
             out._data[name] = col._with_type_metadata(
                 tables[0]._data[name].dtype
             )
@@ -1831,13 +1831,13 @@ def astype(
         errors: Literal["raise", "ignore"] = "raise",
     ):
         if is_dict_like(dtype):
-            if len(set(dtype.keys()) - set(self._data.names)) > 0:
+            if len(set(dtype.keys()) - set(self._column_names)) > 0:
                 raise KeyError(
                     "Only a column name can be used for the "
                     "key in a dtype mappings argument."
                 )
         else:
-            dtype = {cc: dtype for cc in self._data.names}
+            dtype = {cc: dtype for cc in self._column_names}
         return super().astype(dtype, copy, errors)
 
     def _clean_renderable_dataframe(self, output):
@@ -2601,7 +2601,7 @@ def equals(self, other) -> bool:
         # If all other checks matched, validate names.
         if ret:
             for self_name, other_name in zip(
-                self._data.names, other._data.names
+                self._column_names, other._column_names
             ):
                 if self_name != other_name:
                     ret = False
@@ -2676,7 +2676,7 @@ def columns(self, columns):
             )
 
         self._data = ColumnAccessor(
-            data=dict(zip(pd_columns, self._data.columns)),
+            data=dict(zip(pd_columns, self._columns)),
             multiindex=multiindex,
             level_names=level_names,
             label_dtype=label_dtype,
@@ -2698,7 +2698,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None:
                 f"got {len(self)} elements"
             )
         self._data = ColumnAccessor(
-            data=dict(zip(other.names, self._data.columns)),
+            data=dict(zip(other.names, self._columns)),
             multiindex=other.multiindex,
             rangeindex=other.rangeindex,
             level_names=other.level_names,
@@ -2983,7 +2983,7 @@ def set_index(
             elif isinstance(col, (MultiIndex, pd.MultiIndex)):
                 if isinstance(col, pd.MultiIndex):
                     col = MultiIndex.from_pandas(col)
-                data_to_add.extend(col._data.columns)
+                data_to_add.extend(col._columns)
                 names.extend(col.names)
             elif isinstance(
                 col, (cudf.Series, cudf.Index, pd.Series, pd.Index)
@@ -3110,7 +3110,9 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None):
             )
 
         out = []
-        for (name, col), other_col in zip(self._data.items(), other_cols):
+        for (name, col), other_col in zip(
+            self._column_labels_and_values, other_cols
+        ):
             source_col, other_col = _check_and_cast_columns_with_other(
                 source_col=col,
                 other=other_col,
@@ -3314,7 +3316,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
                             column.column_empty_like(
                                 col_data, masked=True, newsize=length
                             )
-                            for col_data in self._data.values()
+                            for col_data in self._columns
                         ),
                         verify=False,
                     )
@@ -3664,7 +3666,7 @@ def rename(
                             name: col.find_and_replace(
                                 to_replace, vals, is_all_na
                             )
-                            for name, col in self.index._data.items()
+                            for name, col in self.index._column_labels_and_values
                         }
                     )
                 except OverflowError:
@@ -3686,9 +3688,7 @@ def add_prefix(self, prefix, axis=None):
             raise NotImplementedError("axis is currently not implemented.")
         # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
-        out.columns = [
-            prefix + col_name for col_name in list(self._data.keys())
-        ]
+        out.columns = [prefix + col_name for col_name in self._column_names]
         return out
 
     @_performance_tracking
@@ -3697,9 +3697,7 @@ def add_suffix(self, suffix, axis=None):
             raise NotImplementedError("axis is currently not implemented.")
         # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
-        out.columns = [
-            col_name + suffix for col_name in list(self._data.keys())
-        ]
+        out.columns = [col_name + suffix for col_name in self._column_names]
         return out
 
     @_performance_tracking
@@ -4805,7 +4803,7 @@ def _func(x):  # pragma: no cover
         # TODO: naive implementation
         # this could be written as a single kernel
         result = {}
-        for name, col in self._data.items():
+        for name, col in self._column_labels_and_values:
             apply_sr = Series._from_column(col)
             result[name] = apply_sr.apply(_func)._column
 
@@ -5444,7 +5442,7 @@ def to_pandas(
         out_index = self.index.to_pandas()
         out_data = {
             i: col.to_pandas(nullable=nullable, arrow_type=arrow_type)
-            for i, col in enumerate(self._data.columns)
+            for i, col in enumerate(self._columns)
         }
 
         out_df = pd.DataFrame(out_data, index=out_index)
@@ -5665,14 +5663,16 @@ def to_arrow(self, preserve_index=None) -> pa.Table:
                     index = index._as_int_index()
                     index.name = "__index_level_0__"
                 if isinstance(index, MultiIndex):
-                    index_descr = list(index._data.names)
+                    index_descr = index._column_names
                     index_levels = index.levels
                 else:
                     index_descr = (
                         index.names if index.name is not None else ("index",)
                     )
                 data = data.copy(deep=False)
-                for gen_name, col_name in zip(index_descr, index._data.names):
+                for gen_name, col_name in zip(
+                    index_descr, index._column_names
+                ):
                     data._insert(
                         data.shape[1],
                         gen_name,
@@ -5681,7 +5681,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table:
 
         out = super(DataFrame, data).to_arrow()
         metadata = pa.pandas_compat.construct_metadata(
-            columns_to_convert=[self[col] for col in self._data.names],
+            columns_to_convert=[self[col] for col in self._column_names],
             df=self,
             column_names=out.schema.names,
             index_levels=index_levels,
@@ -5724,12 +5724,12 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None):
                 "column_dtypes is currently not supported."
             )
         members = [("index", self.index.dtype)] if index else []
-        members += [(col, self[col].dtype) for col in self._data.names]
+        members += list(self._dtypes)
         dtype = np.dtype(members)
         ret = np.recarray(len(self), dtype=dtype)
         if index:
             ret["index"] = self.index.to_numpy()
-        for col in self._data.names:
+        for col in self._column_names:
             ret[col] = self[col].to_numpy()
         return ret
 
@@ -6059,7 +6059,7 @@ def quantile(
             )
 
         if columns is None:
-            columns = data_df._data.names
+            columns = set(data_df._column_names)
 
         if isinstance(q, numbers.Number):
             q_is_number = True
@@ -6084,7 +6084,7 @@ def quantile(
             # Ensure that qs is non-scalar so that we always get a column back.
             interpolation = interpolation or "linear"
             result = {}
-            for k in data_df._data.names:
+            for k in data_df._column_names:
                 if k in columns:
                     ser = data_df[k]
                     res = ser.quantile(
@@ -6198,7 +6198,7 @@ def make_false_column_like_self():
                 if isinstance(values, DataFrame)
                 else {name: values._column for name in self._data}
             )
-            for col, self_col in self._data.items():
+            for col, self_col in self._column_labels_and_values:
                 if col in other_cols:
                     other_col = other_cols[col]
                     self_is_cat = isinstance(self_col, CategoricalColumn)
@@ -6231,13 +6231,13 @@ def make_false_column_like_self():
                 else:
                     result[col] = make_false_column_like_self()
         elif is_dict_like(values):
-            for name, col in self._data.items():
+            for name, col in self._column_labels_and_values:
                 if name in values:
                     result[name] = col.isin(values[name])
                 else:
                     result[name] = make_false_column_like_self()
         elif is_list_like(values):
-            for name, col in self._data.items():
+            for name, col in self._column_labels_and_values:
                 result[name] = col.isin(values)
         else:
             raise TypeError(
@@ -6292,7 +6292,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
                     name: filtered._data[name]._get_mask_as_column()
                     if filtered._data[name].nullable
                     else as_column(True, length=len(filtered._data[name]))
-                    for name in filtered._data.names
+                    for name in filtered._column_names
                 }
             )
             mask = mask.all(axis=1)
@@ -6342,7 +6342,7 @@ def count(self, axis=0, numeric_only=False):
         length = len(self)
         return Series._from_column(
             as_column([length - col.null_count for col in self._columns]),
-            index=cudf.Index(self._data.names),
+            index=cudf.Index(self._column_names),
         )
 
     _SUPPORT_AXIS_LOOKUP = {
@@ -6409,7 +6409,7 @@ def _reduce(
             return source._apply_cupy_method_axis_1(op, **kwargs)
         else:
             axis_0_results = []
-            for col_label, col in source._data.items():
+            for col_label, col in source._column_labels_and_values:
                 try:
                     axis_0_results.append(getattr(col, op)(**kwargs))
                 except AttributeError as err:
@@ -6634,7 +6634,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
         prepared, mask, common_dtype = self._prepare_for_rowwise_op(
             method, skipna, numeric_only
         )
-        for col in prepared._data.names:
+        for col in prepared._column_names:
             if prepared._data[col].nullable:
                 prepared._data[col] = (
                     prepared._data[col]
@@ -6820,7 +6820,7 @@ def select_dtypes(self, include=None, exclude=None):
         # remove all exclude types
         inclusion = inclusion - exclude_subtypes
 
-        for k, col in self._data.items():
+        for k, col in self._column_labels_and_values:
             infered_type = cudf_dtype_from_pydata_dtype(col.dtype)
             if infered_type in inclusion:
                 df._insert(len(df._data), k, col)
@@ -6840,7 +6840,7 @@ def to_parquet(
         statistics="ROWGROUP",
         metadata_file_path=None,
         int96_timestamps=False,
-        row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
+        row_group_size_bytes=None,
         row_group_size_rows=None,
         max_page_size_bytes=None,
         max_page_size_rows=None,
@@ -7192,7 +7192,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
         # Compute the column indices that serves as the input for
         # `interleave_columns`
         column_idx_df = pd.DataFrame(
-            data=range(len(self._data)), index=named_levels
+            data=range(self._num_columns), index=named_levels
         )
 
         column_indices: list[list[int]] = []
@@ -7392,17 +7392,17 @@ def to_struct(self, name=None):
         -----
         Note: a copy of the columns is made.
         """
-        if not all(isinstance(name, str) for name in self._data.names):
+        if not all(isinstance(name, str) for name in self._column_names):
             warnings.warn(
                 "DataFrame contains non-string column name(s). Struct column "
                 "requires field name to be string. Non-string column names "
                 "will be casted to string as the field name."
             )
-        fields = {str(name): col.dtype for name, col in self._data.items()}
+        fields = {str(name): dtype for name, dtype in self._dtypes}
         col = StructColumn(
             data=None,
             dtype=cudf.StructDtype(fields=fields),
-            children=tuple(col.copy(deep=True) for col in self._data.columns),
+            children=tuple(col.copy(deep=True) for col in self._columns),
             size=len(self),
             offset=0,
         )
@@ -7984,7 +7984,7 @@ def value_counts(
             diff = set(subset) - set(self._data)
             if len(diff) != 0:
                 raise KeyError(f"columns {diff} do not exist")
-        columns = list(self._data.names) if subset is None else subset
+        columns = list(self._column_names) if subset is None else subset
         result = (
             self.groupby(
                 by=columns,
@@ -8105,7 +8105,7 @@ def func(left, right, output):
                 right._column_names
             )
         elif _is_scalar_or_zero_d_array(right):
-            for name, col in output._data.items():
+            for name, col in output._column_labels_and_values:
                 output._data[name] = col.fillna(value)
             return output
         else:
@@ -8387,7 +8387,7 @@ def extract_col(df, col):
             and col not in df.index._data
             and not isinstance(df.index, MultiIndex)
         ):
-            return df.index._data.columns[0]
+            return df.index._column
         return df.index._data[col]
 
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 7b2bc85b13b..98af006f6e5 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -75,8 +75,15 @@ def _columns(self) -> tuple[ColumnBase, ...]:
         return self._data.columns
 
     @property
-    def _dtypes(self) -> abc.Iterable:
-        return zip(self._data.names, (col.dtype for col in self._data.columns))
+    def _column_labels_and_values(
+        self,
+    ) -> abc.Iterable[tuple[abc.Hashable, ColumnBase]]:
+        return zip(self._column_names, self._columns)
+
+    @property
+    def _dtypes(self) -> abc.Generator[tuple[abc.Hashable, Dtype], None, None]:
+        for label, col in self._column_labels_and_values:
+            yield label, col.dtype
 
     @property
     def ndim(self) -> int:
@@ -87,7 +94,7 @@ def serialize(self):
         # TODO: See if self._data can be serialized outright
         header = {
             "type-serialized": pickle.dumps(type(self)),
-            "column_names": pickle.dumps(tuple(self._data.names)),
+            "column_names": pickle.dumps(self._column_names),
             "column_rangeindex": pickle.dumps(self._data.rangeindex),
             "column_multiindex": pickle.dumps(self._data.multiindex),
             "column_label_dtype": pickle.dumps(self._data.label_dtype),
@@ -156,7 +163,7 @@ def _mimic_inplace(
         self, result: Self, inplace: bool = False
     ) -> Self | None:
         if inplace:
-            for col in self._data:
+            for col in self._column_names:
                 if col in result._data:
                     self._data[col]._mimic_inplace(
                         result._data[col], inplace=True
@@ -267,7 +274,7 @@ def __len__(self) -> int:
     def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self:
         casted = (
             col.astype(dtype.get(col_name, col.dtype), copy=copy)
-            for col_name, col in self._data.items()
+            for col_name, col in self._column_labels_and_values
         )
         ca = self._data._from_columns_like_self(casted, verify=False)
         return self._from_data_like_self(ca)
@@ -338,9 +345,7 @@ def equals(self, other) -> bool:
 
         return all(
             self_col.equals(other_col, check_dtypes=True)
-            for self_col, other_col in zip(
-                self._data.values(), other._data.values()
-            )
+            for self_col, other_col in zip(self._columns, other._columns)
         )
 
     @_performance_tracking
@@ -434,11 +439,9 @@ def to_array(
 
         if dtype is None:
             if ncol == 1:
-                dtype = next(iter(self._data.values())).dtype
+                dtype = next(self._dtypes)[1]
             else:
-                dtype = find_common_type(
-                    [col.dtype for col in self._data.values()]
-                )
+                dtype = find_common_type([dtype for _, dtype in self._dtypes])
 
             if not isinstance(dtype, numpy.dtype):
                 raise NotImplementedError(
@@ -446,12 +449,12 @@ def to_array(
                 )
 
         if self.ndim == 1:
-            return to_array(self._data.columns[0], dtype)
+            return to_array(self._columns[0], dtype)
         else:
             matrix = module.empty(
                 shape=(len(self), ncol), dtype=dtype, order="F"
             )
-            for i, col in enumerate(self._data.values()):
+            for i, col in enumerate(self._columns):
                 # TODO: col.values may fail if there is nullable data or an
                 # unsupported dtype. We may want to catch and provide a more
                 # suitable error.
@@ -751,7 +754,7 @@ def fillna(
 
         filled_columns = [
             col.fillna(value[name], method) if name in value else col.copy()
-            for name, col in self._data.items()
+            for name, col in self._column_labels_and_values
         ]
 
         return self._mimic_inplace(
@@ -988,7 +991,10 @@ def to_arrow(self):
         index: [[1,2,3]]
         """
         return pa.Table.from_pydict(
-            {str(name): col.to_arrow() for name, col in self._data.items()}
+            {
+                str(name): col.to_arrow()
+                for name, col in self._column_labels_and_values
+            }
         )
 
     @_performance_tracking
@@ -1012,7 +1018,9 @@ def _copy_type_metadata(self: Self, other: Self) -> Self:
 
         See `ColumnBase._with_type_metadata` for more information.
         """
-        for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes):
+        for (name, col), (_, dtype) in zip(
+            self._column_labels_and_values, other._dtypes
+        ):
             self._data.set_by_label(name, col._with_type_metadata(dtype))
 
         return self
@@ -1422,7 +1430,7 @@ def _split(self, splits):
         """
         return [
             self._from_columns_like_self(
-                libcudf.copying.columns_split([*self._data.columns], splits)[
+                libcudf.copying.columns_split(list(self._columns), splits)[
                     split_idx
                 ],
                 self._column_names,
@@ -1432,7 +1440,7 @@ def _split(self, splits):
 
     @_performance_tracking
     def _encode(self):
-        columns, indices = libcudf.transform.table_encode([*self._columns])
+        columns, indices = libcudf.transform.table_encode(list(self._columns))
         keys = self._from_columns_like_self(columns)
         return keys, indices
 
@@ -1578,7 +1586,7 @@ def __neg__(self):
                     col.unary_operator("not")
                     if col.dtype.kind == "b"
                     else -1 * col
-                    for col in self._data.columns
+                    for col in self._columns
                 )
             )
         )
@@ -1840,9 +1848,7 @@ def __copy__(self):
     def __invert__(self):
         """Bitwise invert (~) for integral dtypes, logical NOT for bools."""
         return self._from_data_like_self(
-            self._data._from_columns_like_self(
-                (~col for col in self._data.columns)
-            )
+            self._data._from_columns_like_self((~col for col in self._columns))
         )
 
     @_performance_tracking
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 6424c8af877..cb8cd0cd28b 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -751,10 +751,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
             ) and not libgroupby._is_all_scan_aggregate(normalized_aggs):
                 # Even with `sort=False`, pandas guarantees that
                 # groupby preserves the order of rows within each group.
-                left_cols = list(
-                    self.grouping.keys.drop_duplicates()._data.columns
-                )
-                right_cols = list(result_index._data.columns)
+                left_cols = list(self.grouping.keys.drop_duplicates()._columns)
+                right_cols = list(result_index._columns)
                 join_keys = [
                     _match_join_keys(lcol, rcol, "left")
                     for lcol, rcol in zip(left_cols, right_cols)
@@ -1483,7 +1481,7 @@ def _post_process_chunk_results(
                     # the column name should be, especially if we applied
                     # a nameless UDF.
                     result = result.to_frame(
-                        name=grouped_values._data.names[0]
+                        name=grouped_values._column_names[0]
                     )
                 else:
                     index_data = group_keys._data.copy(deep=True)
@@ -1632,7 +1630,7 @@ def mult(df):
             if func in {"sum", "product"}:
                 # For `sum` & `product`, boolean types
                 # will need to result in `int64` type.
-                for name, col in res._data.items():
+                for name, col in res._column_labels_and_values:
                     if col.dtype.kind == "b":
                         res._data[name] = col.astype("int")
             return res
@@ -2715,11 +2713,8 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
     def _reduce_numeric_only(self, op: str):
         columns = list(
             name
-            for name in self.obj._data.names
-            if (
-                is_numeric_dtype(self.obj._data[name].dtype)
-                and name not in self.grouping.names
-            )
+            for name, dtype in self.obj._dtypes
+            if (is_numeric_dtype(dtype) and name not in self.grouping.names)
         )
         return self[columns].agg(op)
 
@@ -3209,7 +3204,7 @@ def values(self) -> cudf.core.frame.Frame:
         """
         # If the key columns are in `obj`, filter them out
         value_column_names = [
-            x for x in self._obj._data.names if x not in self._named_columns
+            x for x in self._obj._column_names if x not in self._named_columns
         ]
         value_columns = self._obj._data.select_by_label(value_column_names)
         return self._obj.__class__._from_data(value_columns)
@@ -3224,8 +3219,8 @@ def _handle_series(self, by):
         self.names.append(by.name)
 
     def _handle_index(self, by):
-        self._key_columns.extend(by._data.columns)
-        self.names.extend(by._data.names)
+        self._key_columns.extend(by._columns)
+        self.names.extend(by._column_names)
 
     def _handle_mapping(self, by):
         by = cudf.Series(by.values(), index=by.keys())
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index b2bd20c4982..cd07c58c5d9 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -122,13 +122,13 @@ def _lexsorted_equal_range(
         sort_inds = None
         sort_vals = idx
     lower_bound = search_sorted(
-        [*sort_vals._data.columns],
+        list(sort_vals._columns),
         keys,
         side="left",
         ascending=sort_vals.is_monotonic_increasing,
     ).element_indexing(0)
     upper_bound = search_sorted(
-        [*sort_vals._data.columns],
+        list(sort_vals._columns),
         keys,
         side="right",
         ascending=sort_vals.is_monotonic_increasing,
@@ -286,6 +286,20 @@ def name(self):
     def name(self, value):
         self._name = value
 
+    @property
+    @_performance_tracking
+    def _column_names(self) -> tuple[Any]:
+        return (self.name,)
+
+    @property
+    @_performance_tracking
+    def _columns(self) -> tuple[ColumnBase]:
+        return (self._values,)
+
+    @property
+    def _column_labels_and_values(self) -> Iterable:
+        return zip(self._column_names, self._columns)
+
     @property  # type: ignore
     @_performance_tracking
     def start(self) -> int:
@@ -1068,7 +1082,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
             else:
                 inputs = {
                     name: (col, None, False, None)
-                    for name, col in self._data.items()
+                    for name, col in self._column_labels_and_values
                 }
 
             data = self._apply_cupy_ufunc_to_operands(
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index fd6bf37f0e6..810d4ad74e7 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -294,7 +294,7 @@ def _num_rows(self) -> int:
 
     @property
     def _index_names(self) -> tuple[Any, ...]:  # TODO: Tuple[str]?
-        return self.index._data.names
+        return self.index._column_names
 
     @classmethod
     def _from_data(
@@ -307,6 +307,7 @@ def _from_data(
             raise ValueError(
                 f"index must be None or a cudf.Index not {type(index).__name__}"
             )
+        # out._num_rows requires .index to be defined
         out._index = RangeIndex(out._data.nrows) if index is None else index
         return out
 
@@ -882,7 +883,7 @@ def replace(
                 columns_dtype_map=dict(self._dtypes),
             )
             copy_data = []
-            for name, col in self._data.items():
+            for name, col in self._column_labels_and_values:
                 try:
                     replaced = col.find_and_replace(
                         to_replace_per_column[name],
@@ -2703,11 +2704,11 @@ def sort_index(
                         by.extend(
                             filter(
                                 lambda n: n not in handled,
-                                self.index._data.names,
+                                self.index._column_names,
                             )
                         )
                 else:
-                    by = list(idx._data.names)
+                    by = list(idx._column_names)
 
                 inds = idx._get_sorted_inds(
                     by=by, ascending=ascending, na_position=na_position
@@ -3013,7 +3014,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
 
         columns_to_slice = [
             *(
-                self.index._data.columns
+                self.index._columns
                 if keep_index and not has_range_index
                 else []
             ),
@@ -3210,7 +3211,7 @@ def _empty_like(self, keep_index=True) -> Self:
         result = self._from_columns_like_self(
             libcudf.copying.columns_empty_like(
                 [
-                    *(self.index._data.columns if keep_index else ()),
+                    *(self.index._columns if keep_index else ()),
                     *self._columns,
                 ]
             ),
@@ -3227,7 +3228,7 @@ def _split(self, splits, keep_index=True):
 
         columns_split = libcudf.copying.columns_split(
             [
-                *(self.index._data.columns if keep_index else []),
+                *(self.index._columns if keep_index else []),
                 *self._columns,
             ],
             splits,
@@ -3763,8 +3764,8 @@ def _reindex(
             idx_dtype_match = (df.index.nlevels == index.nlevels) and all(
                 _is_same_dtype(left_dtype, right_dtype)
                 for left_dtype, right_dtype in zip(
-                    (col.dtype for col in df.index._data.columns),
-                    (col.dtype for col in index._data.columns),
+                    (dtype for _, dtype in df.index._dtypes),
+                    (dtype for _, dtype in index._dtypes),
                 )
             )
 
@@ -3783,7 +3784,7 @@ def _reindex(
                         (name or 0)
                         if isinstance(self, cudf.Series)
                         else name: col
-                        for name, col in df._data.items()
+                        for name, col in df._column_labels_and_values
                     },
                     index=df.index,
                 )
@@ -3794,7 +3795,7 @@ def _reindex(
         index = index if index is not None else df.index
 
         if column_names is None:
-            names = list(df._data.names)
+            names = list(df._column_names)
             level_names = self._data.level_names
             multiindex = self._data.multiindex
             rangeindex = self._data.rangeindex
@@ -3948,7 +3949,7 @@ def round(self, decimals=0, how="half_even"):
             col.round(decimals[name], how=how)
             if name in decimals and col.dtype.kind in "fiu"
             else col.copy(deep=True)
-            for name, col in self._data.items()
+            for name, col in self._column_labels_and_values
         )
         return self._from_data_like_self(
             self._data._from_columns_like_self(cols)
@@ -4270,7 +4271,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):
             else:
                 thresh = len(df)
 
-        for name, col in df._data.items():
+        for name, col in df._column_labels_and_values:
             check_col = col.nans_to_nulls()
             no_threshold_valid_count = (
                 len(col) - check_col.null_count
@@ -4305,7 +4306,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None):
 
         return self._from_columns_like_self(
             libcudf.stream_compaction.drop_nulls(
-                [*self.index._data.columns, *data_columns],
+                [*self.index._columns, *data_columns],
                 how=how,
                 keys=self._positions_from_column_names(
                     subset, offset_by_index_columns=True
@@ -4853,7 +4854,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
                 # This works for Index too
                 inputs = {
                     name: (col, None, False, None)
-                    for name, col in self._data.items()
+                    for name, col in self._column_labels_and_values
                 }
                 index = self.index
 
@@ -4933,7 +4934,7 @@ def repeat(self, repeats, axis=None):
         """
         res = self._from_columns_like_self(
             Frame._repeat(
-                [*self.index._data.columns, *self._columns], repeats, axis
+                [*self.index._columns, *self._columns], repeats, axis
             ),
             self._column_names,
             self._index_names,
@@ -6224,7 +6225,7 @@ def _preprocess_subset(self, subset):
             not np.iterable(subset)
             or isinstance(subset, str)
             or isinstance(subset, tuple)
-            and subset in self._data.names
+            and subset in self._column_names
         ):
             subset = (subset,)
         diff = set(subset) - set(self._data)
@@ -6306,8 +6307,8 @@ def rank(
                 )
             numeric_cols = (
                 name
-                for name in self._data.names
-                if _is_non_decimal_numeric_dtype(self._data[name])
+                for name, dtype in self._dtypes
+                if _is_non_decimal_numeric_dtype(dtype)
             )
             source = self._get_columns_by_label(numeric_cols)
             if source.empty:
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index b65bc7af832..cfeaca00888 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -140,11 +140,15 @@ def __init__(
         # right_on.
         self._using_left_index = bool(left_index)
         left_on = (
-            lhs.index._data.names if left_index else left_on if left_on else on
+            lhs.index._column_names
+            if left_index
+            else left_on
+            if left_on
+            else on
         )
         self._using_right_index = bool(right_index)
         right_on = (
-            rhs.index._data.names
+            rhs.index._column_names
             if right_index
             else right_on
             if right_on
@@ -334,18 +338,18 @@ def _merge_results(
         # All columns from the left table make it into the output. Non-key
         # columns that share a name with a column in the right table are
         # suffixed with the provided suffix.
-        common_names = set(left_result._data.names) & set(
-            right_result._data.names
+        common_names = set(left_result._column_names) & set(
+            right_result._column_names
         )
         cols_to_suffix = common_names - self._key_columns_with_same_name
         data = {
             (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col
-            for name, col in left_result._data.items()
+            for name, col in left_result._column_labels_and_values
         }
 
         # The right table follows the same rule as the left table except that
         # key columns from the right table are removed.
-        for name, col in right_result._data.items():
+        for name, col in right_result._column_labels_and_values:
             if name in common_names:
                 if name not in self._key_columns_with_same_name:
                     data[f"{name}{self.rsuffix}"] = col
@@ -399,7 +403,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame:
         # producing the input result.
         by: list[Any] = []
         if self._using_left_index and self._using_right_index:
-            by.extend(result.index._data.columns)
+            by.extend(result.index._columns)
         if not self._using_left_index:
             by.extend([result._data[col.name] for col in self._left_keys])
         if not self._using_right_index:
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index e00890ac5c3..6de3981ba66 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -36,7 +36,7 @@
 from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name
 
 if TYPE_CHECKING:
-    from collections.abc import Generator
+    from collections.abc import Generator, Hashable
 
     from typing_extensions import Self
 
@@ -233,8 +233,8 @@ def names(self, value):
             # to unexpected behavior in some cases. This is
             # definitely buggy, but we can't disallow non-unique
             # names either...
-            self._data = self._data.__class__(
-                dict(zip(value, self._data.values())),
+            self._data = type(self._data)(
+                dict(zip(value, self._columns)),
                 level_names=self._data.level_names,
                 verify=False,
             )
@@ -693,19 +693,25 @@ def where(self, cond, other=None, inplace=False):
     @_performance_tracking
     def _compute_validity_mask(self, index, row_tuple, max_length):
         """Computes the valid set of indices of values in the lookup"""
-        lookup = cudf.DataFrame()
+        lookup_dict = {}
         for i, row in enumerate(row_tuple):
             if isinstance(row, slice) and row == slice(None):
                 continue
-            lookup[i] = cudf.Series(row)
-        frame = cudf.DataFrame(dict(enumerate(index._data.columns)))
+            lookup_dict[i] = row
+        lookup = cudf.DataFrame(lookup_dict)
+        frame = cudf.DataFrame._from_data(
+            ColumnAccessor(dict(enumerate(index._columns)), verify=False)
+        )
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", FutureWarning)
             data_table = cudf.concat(
                 [
                     frame,
                     cudf.DataFrame._from_data(
-                        {"idx": column.as_column(range(len(frame)))}
+                        ColumnAccessor(
+                            {"idx": column.as_column(range(len(frame)))},
+                            verify=False,
+                        )
                     ),
                 ],
                 axis=1,
@@ -716,7 +722,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
         # TODO: Remove this after merge/join
         # obtain deterministic ordering.
         if cudf.get_option("mode.pandas_compatible"):
-            lookup_order = "_" + "_".join(map(str, lookup._data.names))
+            lookup_order = "_" + "_".join(map(str, lookup._column_names))
             lookup[lookup_order] = column.as_column(range(len(lookup)))
             postprocess = operator.methodcaller(
                 "sort_values", by=[lookup_order, "idx"]
@@ -784,7 +790,7 @@ def _index_and_downcast(self, result, index, index_key):
             out_index.insert(
                 out_index._num_columns,
                 k,
-                cudf.Series._from_column(index._data.columns[k]),
+                cudf.Series._from_column(index._columns[k]),
             )
 
         # determine if we should downcast from a DataFrame to a Series
@@ -800,19 +806,19 @@ def _index_and_downcast(self, result, index, index_key):
         )
         if need_downcast:
             result = result.T
-            return result[result._data.names[0]]
+            return result[result._column_names[0]]
 
         if len(result) == 0 and not slice_access:
             # Pandas returns an empty Series with a tuple as name
             # the one expected result column
             result = cudf.Series._from_data(
-                {}, name=tuple(col[0] for col in index._data.columns)
+                {}, name=tuple(col[0] for col in index._columns)
             )
         elif out_index._num_columns == 1:
             # If there's only one column remaining in the output index, convert
             # it into an Index and name the final index values according
             # to that column's name.
-            *_, last_column = index._data.columns
+            last_column = index._columns[-1]
             out_index = cudf.Index._from_column(
                 last_column, name=index.names[-1]
             )
@@ -894,7 +900,7 @@ def __eq__(self, other):
                 [
                     self_col.equals(other_col)
                     for self_col, other_col in zip(
-                        self._data.values(), other._data.values()
+                        self._columns, other._columns
                     )
                 ]
             )
@@ -1041,9 +1047,11 @@ def to_frame(
         )
 
     @_performance_tracking
-    def get_level_values(self, level) -> cudf.Index:
+    def _level_to_ca_label(self, level) -> tuple[Hashable, int]:
         """
-        Return the values at the requested level
+        Convert a level to a ColumAccessor label and an integer position.
+
+        Useful if self._column_names != self.names.
 
         Parameters
         ----------
@@ -1051,10 +1059,13 @@ def get_level_values(self, level) -> cudf.Index:
 
         Returns
         -------
-        An Index containing the values at the requested level.
+        tuple[Hashable, int]
+            (ColumnAccessor label corresponding to level, integer position of the level)
         """
-        colnames = self._data.names
-        if level not in colnames:
+        colnames = self._column_names
+        try:
+            level_idx = colnames.index(level)
+        except ValueError:
             if isinstance(level, int):
                 if level < 0:
                     level = level + len(colnames)
@@ -1067,8 +1078,22 @@ def get_level_values(self, level) -> cudf.Index:
                 level = colnames[level_idx]
             else:
                 raise KeyError(f"Level not found: '{level}'")
-        else:
-            level_idx = colnames.index(level)
+        return level, level_idx
+
+    @_performance_tracking
+    def get_level_values(self, level) -> cudf.Index:
+        """
+        Return the values at the requested level
+
+        Parameters
+        ----------
+        level : int or label
+
+        Returns
+        -------
+        An Index containing the values at the requested level.
+        """
+        level, level_idx = self._level_to_ca_label(level)
         level_values = cudf.Index._from_column(
             self._data[level], name=self.names[level_idx]
         )
@@ -1420,57 +1445,6 @@ def from_arrays(
             codes=codes, levels=levels, sortorder=sortorder, names=names
         )
 
-    @_performance_tracking
-    def _poplevels(self, level) -> None | MultiIndex | cudf.Index:
-        """
-        Remove and return the specified levels from self.
-
-        Parameters
-        ----------
-        level : level name or index, list
-            One or more levels to remove
-
-        Returns
-        -------
-        Index composed of the removed levels. If only a single level
-        is removed, a flat index is returned. If no levels are specified
-        (empty list), None is returned.
-        """
-        if not pd.api.types.is_list_like(level):
-            level = (level,)
-
-        ilevels = sorted(self._level_index_from_level(lev) for lev in level)
-
-        if not ilevels:
-            return None
-
-        popped_data = {}
-        popped_names = []
-        names = list(self.names)
-
-        # build the popped data and names
-        for i in ilevels:
-            n = self._data.names[i]
-            popped_data[n] = self._data[n]
-            popped_names.append(self.names[i])
-
-        # pop the levels out from self
-        # this must be done iterating backwards
-        for i in reversed(ilevels):
-            n = self._data.names[i]
-            names.pop(i)
-            popped_data[n] = self._data.pop(n)
-
-        # construct the popped result
-        popped = cudf.core.index._index_from_data(popped_data)
-        popped.names = popped_names
-
-        # update self
-        self.names = names
-        self._levels, self._codes = _compute_levels_and_codes(self._data)
-
-        return popped
-
     @_performance_tracking
     def swaplevel(self, i=-2, j=-1) -> Self:
         """
@@ -1507,10 +1481,10 @@ def swaplevel(self, i=-2, j=-1) -> Self:
             ('aa', 'b')],
            )
         """
-        name_i = self._data.names[i] if isinstance(i, int) else i
-        name_j = self._data.names[j] if isinstance(j, int) else j
+        name_i = self._column_names[i] if isinstance(i, int) else i
+        name_j = self._column_names[j] if isinstance(j, int) else j
         new_data = {}
-        for k, v in self._data.items():
+        for k, v in self._column_labels_and_values:
             if k not in (name_i, name_j):
                 new_data[k] = v
             elif k == name_i:
@@ -1523,7 +1497,7 @@ def swaplevel(self, i=-2, j=-1) -> Self:
         return midx
 
     @_performance_tracking
-    def droplevel(self, level=-1) -> MultiIndex | cudf.Index:
+    def droplevel(self, level=-1) -> Self | cudf.Index:
         """
         Removes the specified levels from the MultiIndex.
 
@@ -1578,11 +1552,24 @@ def droplevel(self, level=-1) -> MultiIndex | cudf.Index:
         >>> idx.droplevel(["first", "second"])
         Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third')
         """
-        mi = self.copy(deep=False)
-        mi._poplevels(level)
-        if mi.nlevels == 1:
-            return mi.get_level_values(mi.names[0])
+        if is_scalar(level):
+            level = (level,)
+        elif len(level) == 0:
+            return self
+
+        new_names = list(self.names)
+        new_data = self._data.copy(deep=False)
+        for i in sorted(
+            (self._level_index_from_level(lev) for lev in level), reverse=True
+        ):
+            new_names.pop(i)
+            new_data.pop(self._data.names[i])
+
+        if len(new_data) == 1:
+            return cudf.core.index._index_from_data(new_data)
         else:
+            mi = MultiIndex._from_data(new_data)
+            mi.names = new_names
             return mi
 
     @_performance_tracking
@@ -1886,7 +1873,7 @@ def __array_function__(self, func, types, args, kwargs):
         else:
             return NotImplemented
 
-    def _level_index_from_level(self, level):
+    def _level_index_from_level(self, level) -> int:
         """
         Return level index from given level name or index
         """
@@ -1935,7 +1922,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
 
         join_keys = [
             _match_join_keys(lcol, rcol, "inner")
-            for lcol, rcol in zip(target._data.columns, self._data.columns)
+            for lcol, rcol in zip(target._columns, self._columns)
         ]
         join_keys = map(list, zip(*join_keys))
         scatter_map, indices = libcudf.join.join(
@@ -2132,7 +2119,7 @@ def _split_columns_by_levels(
             lv if isinstance(lv, int) else level_names.index(lv)
             for lv in levels
         }
-        for i, (name, col) in enumerate(zip(self.names, self._data.columns)):
+        for i, (name, col) in enumerate(zip(self.names, self._columns)):
             if in_levels and i in level_indices:
                 name = f"level_{i}" if name is None else name
                 yield name, col
@@ -2173,9 +2160,7 @@ def _columns_for_reset_index(
     ) -> Generator[tuple[Any, column.ColumnBase], None, None]:
         """Return the columns and column names for .reset_index"""
         if levels is None:
-            for i, (col, name) in enumerate(
-                zip(self._data.columns, self.names)
-            ):
+            for i, (col, name) in enumerate(zip(self._columns, self.names)):
                 yield f"level_{i}" if name is None else name, col
         else:
             yield from self._split_columns_by_levels(levels, in_levels=True)
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 3d205957126..401fef67ee6 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -12,6 +12,7 @@
 from cudf._lib.transform import one_hot_encode
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
+from cudf.api.types import is_scalar
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.column import ColumnBase, as_column, column_empty_like
 from cudf.core.column_accessor import ColumnAccessor
@@ -409,7 +410,7 @@ def concat(
         result_columns = None
         if keys_objs is None:
             for o in objs:
-                for name, col in o._data.items():
+                for name, col in o._column_labels_and_values:
                     if name in result_data:
                         raise NotImplementedError(
                             f"A Column with duplicate name found: {name}, cuDF "
@@ -437,7 +438,7 @@ def concat(
         else:
             # All levels in the multiindex label must have the same type
             has_multiple_level_types = (
-                len({type(name) for o in objs for name in o._data.keys()}) > 1
+                len({type(name) for o in objs for name in o._column_names}) > 1
             )
             if has_multiple_level_types:
                 raise NotImplementedError(
@@ -446,7 +447,7 @@ def concat(
                     "the labels to the same type."
                 )
             for k, o in zip(keys_objs, objs):
-                for name, col in o._data.items():
+                for name, col in o._column_labels_and_values:
                     # if only series, then only keep keys_objs as column labels
                     # if the existing column is multiindex, prepend it
                     # to handle cases where dfs and srs are concatenated
@@ -738,7 +739,8 @@ def get_dummies(
     sparse : boolean, optional
         Right now this is NON-FUNCTIONAL argument in rapids.
     drop_first : boolean, optional
-        Right now this is NON-FUNCTIONAL argument in rapids.
+        Whether to get k-1 dummies out of k categorical levels by removing the
+        first level.
     columns : sequence of str, optional
         Names of columns to encode. If not provided, will attempt to encode all
         columns. Note this is different from pandas default behavior, which
@@ -806,9 +808,6 @@ def get_dummies(
     if sparse:
         raise NotImplementedError("sparse is not supported yet")
 
-    if drop_first:
-        raise NotImplementedError("drop_first is not supported yet")
-
     if isinstance(data, cudf.DataFrame):
         encode_fallback_dtypes = ["object", "category"]
 
@@ -844,7 +843,7 @@ def get_dummies(
         else:
             result_data = {
                 col_name: col
-                for col_name, col in data._data.items()
+                for col_name, col in data._column_labels_and_values
                 if col_name not in columns
             }
 
@@ -862,6 +861,7 @@ def get_dummies(
                     prefix=prefix_map.get(name, prefix),
                     prefix_sep=prefix_sep_map.get(name, prefix_sep),
                     dtype=dtype,
+                    drop_first=drop_first,
                 )
                 result_data.update(col_enc_data)
             return cudf.DataFrame._from_data(result_data, index=data.index)
@@ -874,6 +874,7 @@ def get_dummies(
             prefix=prefix,
             prefix_sep=prefix_sep,
             dtype=dtype,
+            drop_first=drop_first,
         )
         return cudf.DataFrame._from_data(data, index=ser.index)
 
@@ -942,7 +943,7 @@ def _merge_sorted(
 
     columns = [
         [
-            *(obj.index._data.columns if not ignore_index else ()),
+            *(obj.index._columns if not ignore_index else ()),
             *obj._columns,
         ]
         for obj in objs
@@ -984,7 +985,7 @@ def as_tuple(x):
             return x if isinstance(x, tuple) else (x,)
 
         nrows = len(index_labels)
-        for col_label, col in df._data.items():
+        for col_label, col in df._column_labels_and_values:
             names = [
                 as_tuple(col_label) + as_tuple(name) for name in column_labels
             ]
@@ -1008,7 +1009,7 @@ def as_tuple(x):
     ca = ColumnAccessor(
         result,
         multiindex=True,
-        level_names=(None,) + columns._data.names,
+        level_names=(None,) + columns._column_names,
         verify=False,
     )
     return cudf.DataFrame._from_data(
@@ -1086,11 +1087,7 @@ def pivot(data, columns=None, index=no_default, values=no_default):
     # Create a DataFrame composed of columns from both
     # columns and index
     ca = ColumnAccessor(
-        dict(
-            enumerate(
-                itertools.chain(index._data.columns, columns._data.columns)
-            )
-        ),
+        dict(enumerate(itertools.chain(index._columns, columns._columns))),
         verify=False,
     )
     columns_index = cudf.DataFrame._from_data(ca)
@@ -1227,13 +1224,24 @@ def unstack(df, level, fill_value=None, sort: bool = True):
         )
         return res
     else:
-        df = df.copy(deep=False)
-        columns = df.index._poplevels(level)
-        index = df.index
-    result = _pivot(df, index, columns)
-    if result.index.nlevels == 1:
-        result.index = result.index.get_level_values(result.index.names[0])
-    return result
+        index = df.index.droplevel(level)
+        if is_scalar(level):
+            columns = df.index.get_level_values(level)
+        else:
+            new_names = []
+            ca_data = {}
+            for lev in level:
+                ca_level, level_idx = df.index._level_to_ca_label(lev)
+                new_names.append(df.index.names[level_idx])
+                ca_data[ca_level] = df.index._data[ca_level]
+            columns = type(df.index)._from_data(
+                ColumnAccessor(ca_data, verify=False)
+            )
+            columns.names = new_names
+        result = _pivot(df, index, columns)
+        if result.index.nlevels == 1:
+            result.index = result.index.get_level_values(result.index.names[0])
+        return result
 
 
 def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase:
@@ -1256,6 +1264,7 @@ def _one_hot_encode_column(
     prefix: str | None,
     prefix_sep: str | None,
     dtype: Dtype | None,
+    drop_first: bool,
 ) -> dict[str, ColumnBase]:
     """Encode a single column with one hot encoding. The return dictionary
     contains pairs of (category, encodings). The keys may be prefixed with
@@ -1276,6 +1285,8 @@ def _one_hot_encode_column(
         )
     data = one_hot_encode(column, categories)
 
+    if drop_first and len(data):
+        data.pop(next(iter(data)))
     if prefix is not None and prefix_sep is not None:
         data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()}
     if dtype:
@@ -1545,7 +1556,7 @@ def pivot_table(
     if values_passed and not values_multi and table._data.multiindex:
         column_names = table._data.level_names[1:]
         table_columns = tuple(
-            map(lambda column: column[1:], table._data.names)
+            map(lambda column: column[1:], table._column_names)
         )
         table.columns = pd.MultiIndex.from_tuples(
             tuples=table_columns, names=column_names
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 7197560b5a4..68f34fa28ff 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -186,7 +186,7 @@ def to_datetime(
         if isinstance(arg, cudf.DataFrame):
             # we require at least Ymd
             required = ["year", "month", "day"]
-            req = list(set(required) - set(arg._data.names))
+            req = list(set(required) - set(arg._column_names))
             if len(req):
                 err_req = ",".join(req)
                 raise ValueError(
@@ -196,7 +196,7 @@ def to_datetime(
                 )
 
             # replace passed column name with values in _unit_map
-            got_units = {k: get_units(k) for k in arg._data.names}
+            got_units = {k: get_units(k) for k in arg._column_names}
             unit_rev = {v: k for k, v in got_units.items()}
 
             # keys we don't recognize
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index 265b87350ae..3af662b62ea 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -210,7 +210,7 @@ def _can_be_jitted(frame, func, args):
         # See https://github.com/numba/numba/issues/4587
         return False
 
-    if any(col.has_nulls() for col in frame._data.values()):
+    if any(col.has_nulls() for col in frame._columns):
         return False
     np_field_types = np.dtype(
         list(
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index 6d7362952c9..bfe716f0afc 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -126,25 +126,23 @@ def _get_udf_return_type(argty, func: Callable, args=()):
 
 def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES):
     return {
-        colname: col.dtype
-        if str(col.dtype) in supported_types
-        else np.dtype("O")
-        for colname, col in frame._data.items()
+        colname: dtype if str(dtype) in supported_types else np.dtype("O")
+        for colname, dtype in frame._dtypes
     }
 
 
 def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES):
     return {
-        colname: col.dtype
-        for colname, col in frame._data.items()
-        if str(col.dtype) in supported_types
+        colname: dtype
+        for colname, dtype in frame._dtypes
+        if str(dtype) in supported_types
     }
 
 
 def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES):
     return {
         colname: col
-        for colname, col in frame._data.items()
+        for colname, col in frame._column_labels_and_values
         if str(col.dtype) in supported_types
     }
 
@@ -232,8 +230,8 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"):
         *cudautils.make_cache_key(
             func, tuple(_all_dtypes_from_frame(frame).values())
         ),
-        *(col.mask is None for col in frame._data.values()),
-        *frame._data.keys(),
+        *(col.mask is None for col in frame._columns),
+        *frame._column_names,
         scalar_argtypes,
         suffix,
     )
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index a9c20150930..3dc8915bfd1 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -186,13 +186,13 @@ def to_csv(
                 "Dataframe doesn't have the labels provided in columns"
             )
 
-    for col in df._data.columns:
-        if isinstance(col, cudf.core.column.ListColumn):
+    for _, dtype in df._dtypes:
+        if isinstance(dtype, cudf.ListDtype):
             raise NotImplementedError(
                 "Writing to csv format is not yet supported with "
                 "list columns."
             )
-        elif isinstance(col, cudf.core.column.StructColumn):
+        elif isinstance(dtype, cudf.StructDtype):
             raise NotImplementedError(
                 "Writing to csv format is not yet supported with "
                 "Struct columns."
@@ -203,12 +203,11 @@ def to_csv(
     # workaround once following issue is fixed:
     # https://github.com/rapidsai/cudf/issues/6661
     if any(
-        isinstance(col, cudf.core.column.CategoricalColumn)
-        for col in df._data.columns
+        isinstance(dtype, cudf.CategoricalDtype) for _, dtype in df._dtypes
     ) or isinstance(df.index, cudf.CategoricalIndex):
         df = df.copy(deep=False)
-        for col_name, col in df._data.items():
-            if isinstance(col, cudf.core.column.CategoricalColumn):
+        for col_name, col in df._column_labels_and_values:
+            if isinstance(col.dtype, cudf.CategoricalDtype):
                 df._data[col_name] = col.astype(col.categories.dtype)
 
         if isinstance(df.index, cudf.CategoricalIndex):
diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py
index 1347b2cc38f..fe8e446f9c0 100644
--- a/python/cudf/cudf/io/dlpack.py
+++ b/python/cudf/cudf/io/dlpack.py
@@ -79,13 +79,13 @@ def to_dlpack(cudf_obj):
         )
 
     if any(
-        not cudf.api.types._is_non_decimal_numeric_dtype(col.dtype)
-        for col in gdf._data.columns
+        not cudf.api.types._is_non_decimal_numeric_dtype(dtype)
+        for _, dtype in gdf._dtypes
     ):
         raise TypeError("non-numeric data not yet supported")
 
     dtype = cudf.utils.dtypes.find_common_type(
-        [col.dtype for col in gdf._data.columns]
+        [dtype for _, dtype in gdf._dtypes]
     )
     gdf = gdf.astype(dtype)
 
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index fd246c6215f..c54293badbe 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -396,8 +396,8 @@ def to_orc(
 ):
     """{docstring}"""
 
-    for col in df._data.columns:
-        if isinstance(col, cudf.core.column.CategoricalColumn):
+    for _, dtype in df._dtypes:
+        if isinstance(dtype, cudf.CategoricalDtype):
             raise NotImplementedError(
                 "Writing to ORC format is not yet supported with "
                 "Categorical columns."
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 62be7378e9e..ce99f98b559 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -64,7 +64,7 @@ def _write_parquet(
     statistics="ROWGROUP",
     metadata_file_path=None,
     int96_timestamps=False,
-    row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
+    row_group_size_bytes=None,
     row_group_size_rows=None,
     max_page_size_bytes=None,
     max_page_size_rows=None,
@@ -149,7 +149,7 @@ def write_to_dataset(
     return_metadata=False,
     statistics="ROWGROUP",
     int96_timestamps=False,
-    row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
+    row_group_size_bytes=None,
     row_group_size_rows=None,
     max_page_size_bytes=None,
     max_page_size_rows=None,
@@ -205,7 +205,7 @@ def write_to_dataset(
         If ``False``, timestamps will not be altered.
     row_group_size_bytes: integer or None, default None
         Maximum size of each stripe of the output.
-        If None, 134217728 (128MB) will be used.
+        If None, no limit on row group stripe size will be used.
     row_group_size_rows: integer or None, default None
         Maximum number of rows of each stripe of the output.
         If None, 1000000 will be used.
@@ -980,7 +980,7 @@ def to_parquet(
     statistics="ROWGROUP",
     metadata_file_path=None,
     int96_timestamps=False,
-    row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
+    row_group_size_bytes=None,
     row_group_size_rows=None,
     max_page_size_bytes=None,
     max_page_size_rows=None,
diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py
index 3a82829eb7a..e0d3d9101a9 100644
--- a/python/cudf/cudf/pandas/__main__.py
+++ b/python/cudf/cudf/pandas/__main__.py
@@ -10,6 +10,7 @@
 """
 
 import argparse
+import code
 import runpy
 import sys
 import tempfile
@@ -21,6 +22,8 @@
 
 @contextmanager
 def profile(function_profile, line_profile, fn):
+    if fn is None and (line_profile or function_profile):
+        raise RuntimeError("Enabling the profiler requires a script name.")
     if line_profile:
         with open(fn) as f:
             lines = f.readlines()
@@ -54,6 +57,11 @@ def main():
         dest="module",
         nargs=1,
     )
+    parser.add_argument(
+        "-c",
+        dest="cmd",
+        nargs=1,
+    )
     parser.add_argument(
         "--profile",
         action="store_true",
@@ -72,9 +80,18 @@ def main():
 
     args = parser.parse_args()
 
+    if args.cmd:
+        f = tempfile.NamedTemporaryFile(mode="w+b", suffix=".py")
+        f.write(args.cmd[0].encode())
+        f.seek(0)
+        args.args.insert(0, f.name)
+
     install()
-    with profile(args.profile, args.line_profile, args.args[0]) as fn:
-        args.args[0] = fn
+
+    script_name = args.args[0] if len(args.args) > 0 else None
+    with profile(args.profile, args.line_profile, script_name) as fn:
+        if script_name is not None:
+            args.args[0] = fn
         if args.module:
             (module,) = args.module
             # run the module passing the remaining arguments
@@ -85,6 +102,21 @@ def main():
             # Remove ourself from argv and continue
             sys.argv[:] = args.args
             runpy.run_path(args.args[0], run_name="__main__")
+        else:
+            if sys.stdin.isatty():
+                banner = f"Python {sys.version} on {sys.platform}"
+                site_import = not sys.flags.no_site
+                if site_import:
+                    cprt = 'Type "help", "copyright", "credits" or "license" for more information.'
+                    banner += "\n" + cprt
+            else:
+                # Don't show prompts or banners if stdin is not a TTY
+                sys.ps1 = ""
+                sys.ps2 = ""
+                banner = ""
+
+            # Launch an interactive interpreter
+            code.interact(banner=banner, exitmsg="")
 
 
 if __name__ == "__main__":
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index afa1ce5f86c..bf2ee6ae624 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -881,6 +881,20 @@ def _assert_fast_slow_eq(left, right):
         assert_eq(left, right)
 
 
+def _fast_function_call():
+    """
+    Placeholder fast function for pytest profiling purposes.
+    """
+    return None
+
+
+def _slow_function_call():
+    """
+    Placeholder slow function for pytest profiling purposes.
+    """
+    return None
+
+
 def _fast_slow_function_call(
     func: Callable,
     /,
@@ -910,6 +924,7 @@ def _fast_slow_function_call(
                 # try slow path
                 raise Exception()
             fast = True
+            _fast_function_call()
             if _env_get_bool("CUDF_PANDAS_DEBUGGING", False):
                 try:
                     with nvtx.annotate(
@@ -952,6 +967,7 @@ def _fast_slow_function_call(
                 from ._logger import log_fallback
 
                 log_fallback(slow_args, slow_kwargs, err)
+            _slow_function_call()
             with disable_module_accelerator():
                 result = func(*slow_args, **slow_kwargs)
     return _maybe_wrap_result(result, func, *args, **kwargs), fast
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index 505a40b0bfa..d12d2697729 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -1,10 +1,13 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import contextlib
+import json
 import os
 import sys
+import traceback
+from collections import defaultdict
 from functools import wraps
 
 import pytest
@@ -36,4 +39,58 @@ def patch_testing_functions():
     pytest.raises = replace_kwargs({"match": None})(pytest.raises)
 
 
+# Dictionary to store function call counts
+function_call_counts = {}  # type: ignore
+
+# The specific functions to track
+FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"}
+
+
+def find_pytest_file(frame):
+    stack = traceback.extract_stack()
+    absolute_paths = [frame.filename for frame in stack]
+    for file in absolute_paths:
+        if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[
+            -1
+        ].startswith("test_"):
+            return str(file).rsplit("pandas-tests/", 1)[-1]
+    return None
+
+
+def trace_calls(frame, event, arg):
+    if event != "call":
+        return
+    code = frame.f_code
+    func_name = code.co_name
+
+    if func_name in FUNCTION_NAME:
+        filename = find_pytest_file(frame)
+        if filename is None:
+            return
+        if filename not in function_call_counts:
+            function_call_counts[filename] = defaultdict(int)
+        function_call_counts[filename][func_name] += 1
+
+
+def pytest_sessionstart(session):
+    # Set the profile function to trace calls
+    sys.setprofile(trace_calls)
+
+
+def pytest_sessionfinish(session, exitstatus):
+    # Remove the profile function
+    sys.setprofile(None)
+
+
+@pytest.hookimpl(trylast=True)
+def pytest_unconfigure(config):
+    if hasattr(config, "workerinput"):
+        # Running in xdist worker, write the counts before exiting
+        worker_id = config.workerinput["workerid"]
+        output_file = f"function_call_counts_worker_{worker_id}.json"
+        with open(output_file, "w") as f:
+            json.dump(function_call_counts, f, indent=4)
+        print(f"Function call counts have been written to {output_file}")
+
+
 sys.path.append(os.path.dirname(__file__))
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 9c65b74d081..9b9ce026571 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -64,8 +64,6 @@ markers = [
   "skip_ubsan: Tests known to fail UBSAN check",
 ]
 EOF
-    # append the contents of patch-confest.py to conftest.py
-    cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py
 
     # Substitute `pandas.tests` with a relative import.
     # This will depend on the location of the test module relative to
@@ -137,7 +135,7 @@ and not test_eof_states \
 and not test_array_tz"
 
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
-PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \
+PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \
     -v -m "not single_cpu and not db" \
     -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \
     --import-mode=importlib \
@@ -146,5 +144,4 @@ PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \
 
 mv *.json ..
 cd ..
-
 rm -rf pandas-testing/pandas-tests/
diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index ffd2abb960d..4ea0b3b4413 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -12,7 +12,9 @@
 """
 
 import argparse
+import glob
 import json
+import os
 
 from rich.console import Console
 from rich.table import Table
@@ -57,6 +59,44 @@ def get_per_module_results(log_file_name):
                 per_module_results[module_name].setdefault(outcome, 0)
                 per_module_results[module_name]["total"] += 1
                 per_module_results[module_name][outcome] += 1
+
+    directory = os.path.dirname(log_file_name)
+    pattern = os.path.join(directory, "function_call_counts_worker_*.json")
+    matching_files = glob.glob(pattern)
+    function_call_counts = {}
+
+    for file in matching_files:
+        with open(file) as f:
+            function_call_count = json.load(f)
+        if not function_call_counts:
+            function_call_counts.update(function_call_count)
+        else:
+            for key, value in function_call_count.items():
+                if key not in function_call_counts:
+                    function_call_counts[key] = value
+                else:
+                    if "_slow_function_call" not in function_call_counts[key]:
+                        function_call_counts[key]["_slow_function_call"] = 0
+                    if "_fast_function_call" not in function_call_counts[key]:
+                        function_call_counts[key]["_fast_function_call"] = 0
+                    function_call_counts[key]["_slow_function_call"] += (
+                        value.get("_slow_function_call", 0)
+                    )
+                    function_call_counts[key]["_fast_function_call"] += (
+                        value.get("_fast_function_call", 0)
+                    )
+
+    for key, value in per_module_results.items():
+        if key in function_call_counts:
+            per_module_results[key]["_slow_function_call"] = (
+                function_call_counts[key].get("_slow_function_call", 0)
+            )
+            per_module_results[key]["_fast_function_call"] = (
+                function_call_counts[key].get("_fast_function_call", 0)
+            )
+        else:
+            per_module_results[key]["_slow_function_call"] = 0
+            per_module_results[key]["_fast_function_call"] = 0
     return per_module_results
 
 
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index 31ad24a4664..668e7a77454 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -676,7 +676,7 @@ def assert_frame_equal(
 
     if check_like:
         left, right = left.reindex(index=right.index), right
-        right = right[list(left._data.names)]
+        right = right[list(left._column_names)]
 
     # index comparison
     assert_index_equal(
diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini
index 2136bca0e28..8a594794fac 100644
--- a/python/cudf/cudf/tests/pytest.ini
+++ b/python/cudf/cudf/tests/pytest.ini
@@ -14,3 +14,4 @@ filterwarnings =
     ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning
     # PerformanceWarning from cupy warming up the JIT cache
     ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning
+addopts = --tb=native
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index b1e095e8853..c41be3e4428 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -813,8 +813,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep):
         mi1 = gdf.groupby(["Date", "Symbol"]).mean().index
         mi2 = mi1.copy(deep=deep)
 
-        lchildren = [col.children for _, col in mi1._data.items()]
-        rchildren = [col.children for _, col in mi2._data.items()]
+        lchildren = [col.children for col in mi1._columns]
+        rchildren = [col.children for col in mi2._columns]
 
         # Flatten
         lchildren = reduce(operator.add, lchildren)
@@ -849,12 +849,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep):
         assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))
 
         # Assert ._data identity
-        lptrs = [
-            d.base_data.get_ptr(mode="read") for _, d in mi1._data.items()
-        ]
-        rptrs = [
-            d.base_data.get_ptr(mode="read") for _, d in mi2._data.items()
-        ]
+        lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns]
+        rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns]
 
         assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))
     cudf.set_option("copy_on_write", original_cow_setting)
diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py
index cc17dc46e0a..e054143b438 100644
--- a/python/cudf/cudf/tests/test_onehot.py
+++ b/python/cudf/cudf/tests/test_onehot.py
@@ -161,3 +161,20 @@ def test_get_dummies_cats_deprecated():
     df = cudf.DataFrame(range(3))
     with pytest.warns(FutureWarning):
         cudf.get_dummies(df, cats={0: [0, 1, 2]})
+
+
+def test_get_dummies_drop_first_series():
+    result = cudf.get_dummies(cudf.Series(list("abcaa")), drop_first=True)
+    expected = pd.get_dummies(pd.Series(list("abcaa")), drop_first=True)
+    assert_eq(result, expected)
+
+
+def test_get_dummies_drop_first_dataframe():
+    result = cudf.get_dummies(
+        cudf.DataFrame({"A": list("abcaa"), "B": list("bcaab")}),
+        drop_first=True,
+    )
+    expected = pd.get_dummies(
+        pd.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), drop_first=True
+    )
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 8b59a7eef08..7f1b0b1cd46 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -3822,8 +3822,8 @@ def test_parquet_reader_with_mismatched_tables(store_schema):
     df1 = cudf.DataFrame(
         {
             "i32": cudf.Series([None, None, None], dtype="int32"),
-            "i64": cudf.Series([1234, None, 123], dtype="int64"),
-            "list": list([[1, 2], [None, 4], [5, 6]]),
+            "i64": cudf.Series([1234, 467, 123], dtype="int64"),
+            "list": list([[1, 2], None, [None, 6]]),
             "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"),
             "str": ["vfd", None, "ghu"],
             "d_list": list(
@@ -3838,14 +3838,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema):
 
     df2 = cudf.DataFrame(
         {
-            "str": ["abc", "def", None],
+            "str": ["abc", "def", "ghi"],
             "i64": cudf.Series([None, 65, 98], dtype="int64"),
             "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"),
-            "list": list([[7, 8], [9, 10], [None, 12]]),
+            "list": list([[7, 8], [9, 10], [11, 12]]),
             "d_list": list(
                 [
                     [pd.Timedelta(minutes=4), None],
-                    [None, None],
+                    None,
                     [pd.Timedelta(minutes=6), None],
                 ]
             ),
@@ -3900,38 +3900,27 @@ def test_parquet_reader_with_mismatched_structs():
         {
             "a": 1,
             "b": {
-                "inner_a": 10,
-                "inner_b": {"inner_inner_b": 1, "inner_inner_a": 2},
+                "a_a": 10,
+                "b_b": {"b_b_b": 1, "b_b_a": 2},
             },
             "c": 2,
         },
         {
             "a": 3,
-            "b": {"inner_a": 30, "inner_b": {"inner_inner_a": 210}},
+            "b": {"b_a": 30, "b_b": {"b_b_a": 210}},
             "c": 4,
         },
-        {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6},
+        {"a": 5, "b": {"b_a": 50, "b_b": None}, "c": 6},
         {"a": 7, "b": None, "c": 8},
-        {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None},
-        None,
-        {
-            "a": None,
-            "b": {
-                "inner_a": None,
-                "inner_b": {"inner_inner_b": None, "inner_inner_a": 10},
-            },
-            "c": 10,
-        },
+        {"a": 5, "b": {"b_a": None, "b_b": None}, "c": None},
     ]
 
     data2 = [
-        {"a": 1, "b": {"inner_b": {"inner_inner_a": None}}},
-        {"a": 3, "b": {"inner_b": {"inner_inner_a": 1}}},
-        {"a": 5, "b": {"inner_b": None}},
-        {"a": 7, "b": {"inner_b": {"inner_inner_b": 1, "inner_inner_a": 0}}},
-        {"a": None, "b": {"inner_b": None}},
+        {"a": 1, "b": {"b_b": {"b_b_a": None}}},
+        {"a": 5, "b": {"b_b": None}},
+        {"a": 7, "b": {"b_b": {"b_b_b": 1, "b_b_a": 0}}},
+        {"a": None, "b": {"b_b": None}},
         None,
-        {"a": None, "b": {"inner_b": {"inner_inner_a": 1}}},
     ]
 
     # cuDF tables from struct data
@@ -3949,20 +3938,20 @@ def test_parquet_reader_with_mismatched_structs():
     # Read the struct.b.inner_b.inner_inner_a column from parquet
     got = cudf.read_parquet(
         [buf1, buf2],
-        columns=["struct.b.inner_b.inner_inner_a"],
+        columns=["struct.b.b_b.b_b_a"],
         allow_mismatched_pq_schemas=True,
     )
     got = (
         cudf.Series(got["struct"])
         .struct.field("b")
-        .struct.field("inner_b")
-        .struct.field("inner_inner_a")
+        .struct.field("b_b")
+        .struct.field("b_b_a")
     )
 
     # Read with chunked reader
     got_chunked = read_parquet_chunked(
         [buf1, buf2],
-        columns=["struct.b.inner_b.inner_inner_a"],
+        columns=["struct.b.b_b.b_b_a"],
         chunk_read_limit=240,
         pass_read_limit=240,
         allow_mismatched_pq_schemas=True,
@@ -3970,8 +3959,8 @@ def test_parquet_reader_with_mismatched_structs():
     got_chunked = (
         cudf.Series(got_chunked["struct"])
         .struct.field("b")
-        .struct.field("inner_b")
-        .struct.field("inner_inner_a")
+        .struct.field("b_b")
+        .struct.field("b_b_a")
     )
 
     # Construct the expected series
@@ -3979,12 +3968,12 @@ def test_parquet_reader_with_mismatched_structs():
         [
             cudf.Series(df1["struct"])
             .struct.field("b")
-            .struct.field("inner_b")
-            .struct.field("inner_inner_a"),
+            .struct.field("b_b")
+            .struct.field("b_b_a"),
             cudf.Series(df2["struct"])
             .struct.field("b")
-            .struct.field("inner_b")
-            .struct.field("inner_inner_a"),
+            .struct.field("b_b")
+            .struct.field("b_b_a"),
         ]
     ).reset_index(drop=True)
 
@@ -4023,12 +4012,12 @@ def test_parquet_reader_with_mismatched_schemas_error():
         )
 
     data1 = [
-        {"a": 1, "b": {"inner_a": 1, "inner_b": 6}},
-        {"a": 3, "b": {"inner_a": None, "inner_b": 2}},
+        {"a": 1, "b": {"b_a": 1, "b_b": 6}},
+        {"a": 3, "b": {"b_a": None, "b_b": 2}},
     ]
     data2 = [
-        {"b": {"inner_a": 1}, "c": "str"},
-        {"b": {"inner_a": None}, "c": None},
+        {"b": {"b_a": 1}, "c": "str"},
+        {"b": {"b_a": None}, "c": None},
     ]
 
     # cuDF tables from struct data
@@ -4059,6 +4048,191 @@ def test_parquet_reader_with_mismatched_schemas_error():
     ):
         cudf.read_parquet(
             [buf1, buf2],
-            columns=["struct.b.inner_b"],
+            columns=["struct.b.b_b"],
             allow_mismatched_pq_schemas=True,
         )
+
+
+def test_parquet_reader_mismatched_nullability():
+    # Ensure that we can faithfully read the tables with mismatched nullabilities
+    df1 = cudf.DataFrame(
+        {
+            "timedelta": cudf.Series([12, 54, 1231], dtype="timedelta64[ms]"),
+            "duration_list": list(
+                [
+                    [
+                        [
+                            [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                            None,
+                            [pd.Timedelta(minutes=8), None],
+                        ],
+                        None,
+                    ],
+                    None,
+                    [
+                        [
+                            [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                            [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)],
+                            [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)],
+                        ]
+                    ],
+                ]
+            ),
+            "int64": cudf.Series([1234, None, 4123], dtype="int64"),
+            "int32": cudf.Series([1234, 123, 4123], dtype="int32"),
+            "list": list([[1, 2], [1, 2], [1, 2]]),
+            "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"),
+            "string": cudf.Series(["kitten", "puppy", "cub"]),
+        }
+    )
+
+    df2 = cudf.DataFrame(
+        {
+            "timedelta": cudf.Series(
+                [None, None, None], dtype="timedelta64[ms]"
+            ),
+            "duration_list": list(
+                [
+                    [
+                        [
+                            [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                            [pd.Timedelta(minutes=8), pd.Timedelta(minutes=1)],
+                        ],
+                    ],
+                    [
+                        [
+                            [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                            [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)],
+                            [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)],
+                        ]
+                    ],
+                    [
+                        [
+                            [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                            [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)],
+                            [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)],
+                        ]
+                    ],
+                ]
+            ),
+            "int64": cudf.Series([1234, 123, 4123], dtype="int64"),
+            "int32": cudf.Series([1234, None, 4123], dtype="int32"),
+            "list": list([[1, 2], None, [1, 2]]),
+            "datetime": cudf.Series(
+                [1234, None, 4123], dtype="datetime64[ms]"
+            ),
+            "string": cudf.Series(["kitten", None, "cub"]),
+        }
+    )
+
+    # Write tables to parquet with arrow schema for compatibility for duration column(s)
+    fname1 = BytesIO()
+    df1.to_parquet(fname1, store_schema=True)
+    fname2 = BytesIO()
+    df2.to_parquet(fname2, store_schema=True)
+
+    # Read tables back with cudf and arrow in either order and compare
+    assert_eq(
+        cudf.read_parquet([fname1, fname2]),
+        cudf.concat([df1, df2]).reset_index(drop=True),
+    )
+    assert_eq(
+        cudf.read_parquet([fname2, fname1]),
+        cudf.concat([df2, df1]).reset_index(drop=True),
+    )
+
+
+def test_parquet_reader_mismatched_nullability_structs(tmpdir):
+    data1 = [
+        {
+            "a": "a",
+            "b": {
+                "b_a": 10,
+                "b_b": {"b_b_b": 1, "b_b_a": 12},
+            },
+            "c": [1, 2],
+        },
+        {
+            "a": "b",
+            "b": {
+                "b_a": 30,
+                "b_b": {"b_b_b": 2, "b_b_a": 2},
+            },
+            "c": [3, 4],
+        },
+        {
+            "a": "c",
+            "b": {
+                "b_a": 50,
+                "b_b": {"b_b_b": 4, "b_b_a": 5},
+            },
+            "c": [5, 6],
+        },
+        {
+            "a": "d",
+            "b": {
+                "b_a": 135,
+                "b_b": {"b_b_b": 12, "b_b_a": 32},
+            },
+            "c": [7, 8],
+        },
+        {
+            "a": "e",
+            "b": {
+                "b_a": 1,
+                "b_b": {"b_b_b": 1, "b_b_a": 5},
+            },
+            "c": [9, 10],
+        },
+        {
+            "a": "f",
+            "b": {
+                "b_a": 32,
+                "b_b": {"b_b_b": 1, "b_b_a": 6},
+            },
+            "c": [11, 12],
+        },
+    ]
+
+    data2 = [
+        {
+            "a": "g",
+            "b": {
+                "b_a": 10,
+                "b_b": {"b_b_b": None, "b_b_a": 2},
+            },
+            "c": None,
+        },
+        {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]},
+        {"a": "j", "b": None, "c": [8, 10]},
+        {"a": None, "b": {"b_a": None, "b_b": None}, "c": None},
+        None,
+        {
+            "a": None,
+            "b": {"b_a": None, "b_b": {"b_b_b": 1}},
+            "c": [18, 19],
+        },
+        {"a": None, "b": None, "c": None},
+    ]
+
+    pa_table1 = pa.Table.from_pydict({"struct": data1})
+    df1 = cudf.DataFrame.from_arrow(pa_table1)
+
+    pa_table2 = pa.Table.from_pydict({"struct": data2})
+    df2 = cudf.DataFrame.from_arrow(pa_table2)
+
+    # Write tables to parquet
+    buf1 = BytesIO()
+    df1.to_parquet(buf1)
+    buf2 = BytesIO()
+    df2.to_parquet(buf2)
+
+    # Read tables back with cudf and compare with expected.
+    assert_eq(
+        cudf.read_parquet([buf1, buf2]),
+        cudf.concat([df1, df2]).reset_index(drop=True),
+    )
+    assert_eq(
+        cudf.read_parquet([buf2, buf1]),
+        cudf.concat([df2, df1]).reset_index(drop=True),
+    )
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 52179f55da3..997ca357986 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -946,6 +946,66 @@ def test_minhash():
         strings.str.minhash64(seeds=seeds)
 
 
+def test_word_minhash():
+    ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
+
+    expected = cudf.Series(
+        [
+            cudf.Series([21141582], dtype=np.uint32),
+            cudf.Series([962346254], dtype=np.uint32),
+        ]
+    )
+    actual = ls.str.word_minhash()
+    assert_eq(expected, actual)
+    seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
+    expected = cudf.Series(
+        [
+            cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32),
+            cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32),
+        ]
+    )
+    actual = ls.str.word_minhash(seeds=seeds)
+    assert_eq(expected, actual)
+
+    expected = cudf.Series(
+        [
+            cudf.Series([2603139454418834912], dtype=np.uint64),
+            cudf.Series([5240044617220523711], dtype=np.uint64),
+        ]
+    )
+    actual = ls.str.word_minhash64()
+    assert_eq(expected, actual)
+    seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
+    expected = cudf.Series(
+        [
+            cudf.Series(
+                [
+                    2603139454418834912,
+                    8644371945174847701,
+                    5541030711534384340,
+                ],
+                dtype=np.uint64,
+            ),
+            cudf.Series(
+                [5240044617220523711, 5847101123925041457, 153762819128779913],
+                dtype=np.uint64,
+            ),
+        ]
+    )
+    actual = ls.str.word_minhash64(seeds=seeds)
+    assert_eq(expected, actual)
+
+    # test wrong seed types
+    with pytest.raises(ValueError):
+        ls.str.word_minhash(seeds="a")
+    with pytest.raises(ValueError):
+        seeds = cudf.Series([0, 1, 2], dtype=np.int32)
+        ls.str.word_minhash(seeds=seeds)
+    with pytest.raises(ValueError):
+        seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
+        ls.str.word_minhash64(seeds=seeds)
+
+
 def test_jaccard_index():
     str1 = cudf.Series(["the brown dog", "jumped about"])
     str2 = cudf.Series(["the black cat", "jumped around"])
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 1627107b57d..1180da321e6 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -27,7 +27,7 @@
     fsspec_parquet = None
 
 _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024
-_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024
+_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max
 
 _docstring_remote_sources = """
 - cuDF supports local and remote data stores. See configuration details for
@@ -275,10 +275,9 @@
     timestamp[us] to the int96 format, which is the number of Julian
     days and the number of nanoseconds since midnight of 1970-01-01.
     If ``False``, timestamps will not be altered.
-row_group_size_bytes: integer, default {row_group_size_bytes_val}
+row_group_size_bytes: integer, default None
     Maximum size of each stripe of the output.
-    If None, {row_group_size_bytes_val}
-    ({row_group_size_bytes_val_in_mb} MB) will be used.
+    If None, no limit on row group stripe size will be used.
 row_group_size_rows: integer or None, default None
     Maximum number of rows of each stripe of the output.
     If None, 1000000 will be used.
@@ -346,10 +345,7 @@
 See Also
 --------
 cudf.read_parquet
-""".format(
-    row_group_size_bytes_val=_ROW_GROUP_SIZE_BYTES_DEFAULT,
-    row_group_size_bytes_val_in_mb=_ROW_GROUP_SIZE_BYTES_DEFAULT / 1024 / 1024,
-)
+"""
 doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet)
 
 _docstring_merge_parquet_filemetadata = """
diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py
new file mode 100644
index 00000000000..326224c8fc0
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/test_main.py
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import subprocess
+import tempfile
+import textwrap
+
+
+def _run_python(*, cudf_pandas, command):
+    executable = "python "
+    if cudf_pandas:
+        executable += "-m cudf.pandas "
+    return subprocess.run(
+        executable + command,
+        shell=True,
+        capture_output=True,
+        check=True,
+        text=True,
+    )
+
+
+def test_run_cudf_pandas_with_script():
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=True) as f:
+        code = textwrap.dedent(
+            """
+            import pandas as pd
+            df = pd.DataFrame({'a': [1, 2, 3]})
+            print(df['a'].sum())
+            """
+        )
+        f.write(code)
+        f.flush()
+
+        res = _run_python(cudf_pandas=True, command=f.name)
+        expect = _run_python(cudf_pandas=False, command=f.name)
+
+    assert res.stdout != ""
+    assert res.stdout == expect.stdout
+
+
+def test_run_cudf_pandas_with_script_with_cmd_args():
+    input_args_and_code = """-c 'import pandas as pd; df = pd.DataFrame({"a": [1, 2, 3]}); print(df["a"].sum())'"""
+
+    res = _run_python(cudf_pandas=True, command=input_args_and_code)
+    expect = _run_python(cudf_pandas=False, command=input_args_and_code)
+
+    assert res.stdout != ""
+    assert res.stdout == expect.stdout
+
+
+def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf():
+    """Verify that cudf is active with -m cudf.pandas."""
+    input_args_and_code = """-c 'import pandas as pd; print(pd)'"""
+
+    res = _run_python(cudf_pandas=True, command=input_args_and_code)
+    expect = _run_python(cudf_pandas=False, command=input_args_and_code)
+
+    assert "cudf" in res.stdout
+    assert "cudf" not in expect.stdout
+
+
+def test_cudf_pandas_script_repl():
+    def start_repl_process(cmd):
+        return subprocess.Popen(
+            cmd.split(),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            text=True,
+        )
+
+    def get_repl_output(process, commands):
+        for command in commands:
+            process.stdin.write(command)
+            process.stdin.flush()
+        return process.communicate()
+
+    p1 = start_repl_process("python -m cudf.pandas")
+    p2 = start_repl_process("python")
+    commands = [
+        "import pandas as pd\n",
+        "print(pd.Series(range(2)).sum())\n",
+        "print(pd.Series(range(5)).sum())\n",
+        "import sys\n",
+        "print(pd.Series(list('abcd')), out=sys.stderr)\n",
+    ]
+
+    res = get_repl_output(p1, commands)
+    expect = get_repl_output(p2, commands)
+
+    # Check stdout
+    assert res[0] != ""
+    assert res[0] == expect[0]
+
+    # Check stderr
+    assert res[1] != ""
+    assert res[1] == expect[1]
+
+    p1.kill()
+    p2.kill()
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
index 817d98e6ba2..98459035298 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
@@ -1,3 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
 [pytest]
 xfail_strict=true
 markers=
@@ -5,3 +7,4 @@ markers=
     xfail_gold: this test is expected to fail in the gold pass
     xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass
     xfail_compare: this test is expected to fail in the comparison pass
+addopts = --tb=native
diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini
new file mode 100644
index 00000000000..7b0a9f29fb1
--- /dev/null
+++ b/python/cudf_kafka/cudf_kafka/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini
new file mode 100644
index 00000000000..7b0a9f29fb1
--- /dev/null
+++ b/python/cudf_polars/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
diff --git a/python/custreamz/README.md b/python/custreamz/README.md
index 1509dac9e61..8da17ef09dc 100644
--- a/python/custreamz/README.md
+++ b/python/custreamz/README.md
@@ -54,7 +54,7 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids
 
 ### Conda
 
-cuStreamz is installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` or `rapidsai-nightly` channel:
+cuStraamz can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:
 
 Release:
 ```bash
diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini
new file mode 100644
index 00000000000..7b0a9f29fb1
--- /dev/null
+++ b/python/custreamz/custreamz/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md
index 6edb9f87d48..4655d2165f0 100644
--- a/python/dask_cudf/README.md
+++ b/python/dask_cudf/README.md
@@ -1,135 +1,63 @@
 # <div align="left"><img src="../../img/rapids_logo.png" width="90px"/>&nbsp;Dask cuDF - A GPU Backend for Dask DataFrame</div>
 
-Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html). When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame.
-
-## Using Dask cuDF
-
-### The Dask DataFrame API (Recommended)
-
-Simply set the `"dataframe.backend"` [configuration](https://docs.dask.org/en/stable/configuration.html) to `"cudf"` in Dask, and the public Dask DataFrame API will leverage `cudf` automatically:
-
-```python
-import dask
-dask.config.set({"dataframe.backend": "cudf"})
-
-import dask.dataframe as dd
-# This gives us a cuDF-backed dataframe
-df = dd.read_parquet("data.parquet", ...)
-```
+Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html) that provides a Pandas-like API for parallel and larger-than-memory DataFrame computing on GPUs. When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame.
 
 > [!IMPORTANT]
-> The `"dataframe.backend"` configuration will only be used for collection creation when the following APIs are used: `read_parquet`, `read_json`, `read_csv`, `read_orc`, `read_hdf`, and `from_dict`. For example, if `from_map`, `from_pandas`, `from_delayed`, or `from_array` are used, the backend of the new collection will depend on the input to the function:
-
-```python
-import pandas as pd
-import cudf
-
-# This gives us a Pandas-backed dataframe
-dd.from_pandas(pd.DataFrame({"a": range(10)}))
-
-# This gives us a cuDF-backed dataframe
-dd.from_pandas(cudf.DataFrame({"a": range(10)}))
-```
-
-A cuDF-backed DataFrame collection can be moved to the `"pandas"` backend:
-
-```python
-df = df.to_backend("pandas")
-```
-
-Similarly, a Pandas-backed DataFrame collection can be moved to the `"cudf"` backend:
-
-```python
-df = df.to_backend("cudf")
-```
-
-### The Explicit Dask cuDF API
-
-In addition to providing the `"cudf"` backend for Dask DataFrame, Dask cuDF also provides an explicit `dask_cudf` API:
-
-```python
-import dask_cudf
-
-# This always gives us a cuDF-backed dataframe
-df = dask_cudf.read_parquet("data.parquet", ...)
-```
-
-> [!NOTE]
-> This API is used implicitly by the Dask DataFrame API when the `"cudf"` backend is enabled. Therefore, using it directly will not provide any performance benefit over the CPU/GPU-portable `dask.dataframe` API. Also, using some parts of the explicit API are incompatible with automatic query planning (see the next section).
+> Dask cuDF does not provide support for multi-GPU or multi-node execution on its own. You must also deploy a distributed cluster (ideally with [Dask-CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs efficiently.
 
-See the [Dask cuDF's API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) for further information.
-
-## Query Planning
-
-Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the `"dataframe.query-planning"` configuration is set to `True` (the default) when `dask.dataframe` is first imported, [Dask Expressions](https://github.com/dask/dask-expr) will be used under the hood.
-
-For example, the following user code will automatically benefit from predicate pushdown when the result is computed.
-
-```python
-df = dd.read_parquet("/my/parquet/dataset/")
-result = df.sort_values('B')['A']
-```
-
-Unoptimized expression graph (`df.pprint()`):
-```
-Projection: columns='A'
-  SortValues: by=['B'] shuffle_method='tasks' options={}
-    ReadParquetFSSpec: path='/my/parquet/dataset/' ...
-```
+## Using Dask cuDF
 
-Simplified expression graph (`df.simplify().pprint()`):
-```
-Projection: columns='A'
-  SortValues: by=['B'] shuffle_method='tasks' options={}
-    ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ...
-```
+Please visit [the official documentation page](https://docs.rapids.ai/api/dask-cudf/stable/) for detailed information about using Dask cuDF.
 
-> [!NOTE]
-> Dask will automatically simplify the expression graph (within `optimize`) when the result is converted to a task graph (via `compute` or `persist`). The user does not need to call `simplify` themself.
+## Installation
 
+See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages.
 
-## Using Multiple GPUs and Multiple Nodes
+## Resources
 
-Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try to partition your data into small-enough tasks to fit comfortably in the memory of a single GPU. This means the necessary compute tasks needed to compute a query can often be streamed to a single GPU process for out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster.
+- [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/)
+- [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/)
+- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/)
+- [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/)
+- [Deployment](https://docs.rapids.ai/deployment/stable/)
+- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate.
 
-> [!IMPORTANT]
-> Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must deploy a distributed cluster (ideally with [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs.
+### Quick-start example
 
-In order to execute your Dask workflow on multiple GPUs, you will typically need to use [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) to deploy distributed Dask cluster, and [Distributed](https://distributed.dask.org/en/stable/client.html) to define a `client` object. For example:
+A very common Dask cuDF use case is single-node multi-GPU data processing. These workflows typically use the following pattern:
 
 ```python
-
+import dask
+import dask.dataframe as dd
 from dask_cuda import LocalCUDACluster
 from distributed import Client
 
-client = Client(
+if __name__ == "__main__":
+
+  # Define a GPU-aware cluster to leverage multiple GPUs
+  client = Client(
     LocalCUDACluster(
-        CUDA_VISIBLE_DEVICES="0,1",  # Use two workers (on devices 0 and 1)
-        rmm_pool_size=0.9,  # Use 90% of GPU memory as a pool for faster allocations
-        enable_cudf_spill=True,  # Improve device memory stability
-        local_directory="/fast/scratch/",  # Use fast local storage for spilling
+      CUDA_VISIBLE_DEVICES="0,1",  # Use two workers (on devices 0 and 1)
+      rmm_pool_size=0.9,  # Use 90% of GPU memory as a pool for faster allocations
+      enable_cudf_spill=True,  # Improve device memory stability
+      local_directory="/fast/scratch/",  # Use fast local storage for spilling
     )
-)
+  )
 
-df = dd.read_parquet("/my/parquet/dataset/")
-agg = df.groupby('B').sum()
-agg.compute()  # This will use the cluster defined above
-```
+  # Set the default dataframe backend to "cudf"
+  dask.config.set({"dataframe.backend": "cudf"})
 
-> [!NOTE]
-> This example uses `compute` to materialize a concrete `cudf.DataFrame` object in local memory. Never call `compute` on a large collection that cannot fit comfortably in the memory of a single GPU! See Dask's [documentation on managing computation](https://distributed.dask.org/en/stable/manage-computation.html) for more details.
+  # Create your DataFrame collection from on-disk
+  # or in-memory data
+  df = dd.read_parquet("/my/parquet/dataset/")
 
-Please see the [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) documentation for more information about deploying GPU-aware clusters (including [best practices](https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/)).
+  # Use cudf-like syntax to transform and/or query your data
+  query = df.groupby('item')['price'].mean()
 
-## Install
-
-See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages.
+  # Compute, persist, or write out the result
+  query.head()
+```
 
-## Resources
+If you do not have multiple GPUs available, using `LocalCUDACluster` is optional. However, it is still a good idea to [enable cuDF spilling](https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory).
 
-- [Dask cuDF API documentation](https://docs.rapids.ai/api/dask-cudf/stable/)
-- [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/)
-- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/)
-- [Dask CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/)
-- [Deployment](https://docs.rapids.ai/deployment/stable/)
-- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate.
+If you wish to scale across multiple nodes, you will need to use a different mechanism to deploy your Dask-CUDA workers. Please see [the RAPIDS deployment documentation](https://docs.rapids.ai/deployment/stable/) for more instructions.
diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py
index f60e4ff81ef..97e1dffc65b 100644
--- a/python/dask_cudf/dask_cudf/expr/_collection.py
+++ b/python/dask_cudf/dask_cudf/expr/_collection.py
@@ -49,8 +49,24 @@ def to_dask_dataframe(self, **kwargs):
 
         return self.to_backend("pandas", **kwargs)
 
+    def _prepare_cov_corr(self, min_periods, numeric_only):
+        # Upstream version of this method sets min_periods
+        # to 2 by default (which is not supported by cudf)
+        # TODO: Remove when cudf supports both min_periods
+        # and numeric_only
+        # See: https://github.com/rapidsai/cudf/issues/12626
+        # See: https://github.com/rapidsai/cudf/issues/9009
+        self._meta.cov(min_periods=min_periods)
+
+        frame = self
+        if numeric_only:
+            numerics = self._meta._get_numeric_data()
+            if len(numerics.columns) != len(self.columns):
+                frame = frame[list(numerics.columns)]
+        return frame, min_periods
+
     # var can be removed if cudf#15179 is addressed.
-    # See: https://github.com/rapidsai/cudf/issues/15179
+    # See: https://github.com/rapidsai/cudf/issues/14935
     def var(
         self,
         axis=0,
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index e793d4381d1..a781b8242fe 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -23,7 +23,6 @@
 from cudf.io import write_to_dataset
 from cudf.io.parquet import _apply_post_filters, _normalize_filters
 from cudf.utils.dtypes import cudf_dtype_from_pa_type
-from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT
 
 
 class CudfEngine(ArrowDatasetEngine):
@@ -341,9 +340,7 @@ def write_partition(
                 return_metadata=return_metadata,
                 statistics=kwargs.get("statistics", "ROWGROUP"),
                 int96_timestamps=kwargs.get("int96_timestamps", False),
-                row_group_size_bytes=kwargs.get(
-                    "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT
-                ),
+                row_group_size_bytes=kwargs.get("row_group_size_bytes", None),
                 row_group_size_rows=kwargs.get("row_group_size_rows", None),
                 max_page_size_bytes=kwargs.get("max_page_size_bytes", None),
                 max_page_size_rows=kwargs.get("max_page_size_rows", None),
@@ -365,7 +362,7 @@ def write_partition(
                     statistics=kwargs.get("statistics", "ROWGROUP"),
                     int96_timestamps=kwargs.get("int96_timestamps", False),
                     row_group_size_bytes=kwargs.get(
-                        "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT
+                        "row_group_size_bytes", None
                     ),
                     row_group_size_rows=kwargs.get(
                         "row_group_size_rows", None
diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini
new file mode 100644
index 00000000000..7b0a9f29fb1
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 905d8c08135..7aa0f6320f2 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -1007,3 +1007,20 @@ def test_to_backend_simplify():
         df2 = df.to_backend("cudf")[["y"]].simplify()
         df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify()
         assert df2._name == df3._name
+
+
+@pytest.mark.parametrize("numeric_only", [True, False])
+@pytest.mark.parametrize("op", ["corr", "cov"])
+def test_cov_corr(op, numeric_only):
+    df = cudf.DataFrame.from_dict(
+        {
+            "x": np.random.randint(0, 5, size=10),
+            "y": np.random.normal(size=10),
+        }
+    )
+    ddf = dd.from_pandas(df, npartitions=2)
+    res = getattr(ddf, op)(numeric_only=numeric_only)
+    # Use to_pandas until cudf supports numeric_only
+    # (See: https://github.com/rapidsai/cudf/issues/12626)
+    expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only)
+    dd.assert_eq(res, expect)
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index 0c352a5068b..f2dd22f43aa 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -19,3 +19,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
         const column_view &seeds,
         const size_type width,
     ) except +
+
+    cdef unique_ptr[column] word_minhash(
+        const column_view &input,
+        const column_view &seeds
+    ) except +
+
+    cdef unique_ptr[column] word_minhash64(
+        const column_view &input,
+        const column_view &seeds
+    ) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
index c2fb5f0dce4..eac0f748257 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
@@ -24,4 +24,9 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil:
     cdef unique_ptr[column] like(
         column_view source_strings,
         string_scalar pattern,
-        string_scalar escape) except +
+        string_scalar escape_character) except +
+
+    cdef unique_ptr[column] like(
+        column_view source_strings,
+        column_view patterns,
+        string_scalar escape_character) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
index 12cd628fc1f..b7166167cfd 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
@@ -10,5 +10,9 @@ from pylibcudf.libcudf.table.table cimport table
 cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil:
 
     cdef unique_ptr[table] extract(
-        column_view source_strings,
-        regex_program) except +
+        column_view input,
+        regex_program prog) except +
+
+    cdef unique_ptr[column] extract_all_record(
+        column_view input,
+        regex_program prog) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
index 410ff58f299..59262820411 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
@@ -10,9 +10,9 @@ cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \
         nogil:
 
     cdef unique_ptr[column] repeat_strings(
-        column_view strings,
-        size_type repeat) except +
+        column_view input,
+        size_type repeat_times) except +
 
     cdef unique_ptr[column] repeat_strings(
-        column_view strings,
-        column_view repeats) except +
+        column_view input,
+        column_view repeat_times) except +
diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
index b499a127541..d3065cf8667 100644
--- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
@@ -12,8 +12,8 @@
 # the License.
 # =============================================================================
 
-set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
-                   regex_program.pyx replace.pyx slice.pyx
+set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx
+                   regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx slice.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index d1f632d6d8e..6848c8e6e86 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -5,6 +5,7 @@ from . cimport (
     case,
     char_types,
     contains,
+    extract,
     find,
     regex_flags,
     regex_program,
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index ef102aff2af..bba86e818cc 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -5,9 +5,11 @@
     case,
     char_types,
     contains,
+    extract,
     find,
     regex_flags,
     regex_program,
+    repeat,
     replace,
     slice,
 )
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd
index 2cd4891a0ea..6146a1119d6 100644
--- a/python/pylibcudf/pylibcudf/strings/contains.pxd
+++ b/python/pylibcudf/pylibcudf/strings/contains.pxd
@@ -1,7 +1,21 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
 from pylibcudf.strings.regex_program cimport RegexProgram
 
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
 
 cpdef Column contains_re(Column input, RegexProgram prog)
+
+cpdef Column count_re(Column input, RegexProgram prog)
+
+cpdef Column matches_re(Column input, RegexProgram prog)
+
+cpdef Column like(
+    Column input,
+    ColumnOrScalar pattern,
+    Scalar escape_character = *
+)
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx
index 1a2446f6e2c..82bd1fbea32 100644
--- a/python/pylibcudf/pylibcudf/strings/contains.pyx
+++ b/python/pylibcudf/pylibcudf/strings/contains.pyx
@@ -1,8 +1,14 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
+from cython.operator import dereference
+
 from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
 from pylibcudf.libcudf.strings cimport contains as cpp_contains
 from pylibcudf.strings.regex_program cimport RegexProgram
 
@@ -32,9 +38,131 @@ cpdef Column contains_re(
     cdef unique_ptr[column] result
 
     with nogil:
-        result = cpp_contains.contains_re(
+        result = move(cpp_contains.contains_re(
+            input.view(),
+            prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column count_re(
+    Column input,
+    RegexProgram prog
+):
+    """Returns the number of times the given regex_program's pattern
+    matches in each string.
+
+    For details, see :cpp:func:`cudf::strings::count_re`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of match counts for each string
+    """
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = move(cpp_contains.count_re(
             input.view(),
             prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column matches_re(
+    Column input,
+    RegexProgram prog
+):
+    """Returns a boolean column identifying rows which
+    matching the given regex_program object but only at
+    the beginning the string.
+
+    For details, see :cpp:func:`cudf::strings::matches_re`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of boolean results for each string
+    """
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = move(cpp_contains.matches_re(
+            input.view(),
+            prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character=None):
+    """
+    Returns a boolean column identifying rows which
+    match the given like pattern.
+
+    For details, see :cpp:func:`cudf::strings::like`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    pattern : Column or Scalar
+        Like patterns to match within each string
+    escape_character : Scalar
+        Optional character specifies the escape prefix.
+        Default is no escape character.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of boolean results for each string
+    """
+    cdef unique_ptr[column] result
+
+    if escape_character is None:
+        escape_character = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
         )
 
+    cdef const string_scalar* c_escape_character = <const string_scalar*>(
+        escape_character.c_obj.get()
+    )
+    cdef const string_scalar* c_pattern
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            result = move(cpp_contains.like(
+                input.view(),
+                pattern.view(),
+                dereference(c_escape_character)
+            ))
+    elif ColumnOrScalar is Scalar:
+        c_pattern = <const string_scalar*>(pattern.c_obj.get())
+        with nogil:
+            result = move(cpp_contains.like(
+                input.view(),
+                dereference(c_pattern),
+                dereference(c_escape_character)
+            ))
+    else:
+        raise ValueError("pattern must be a Column or a Scalar")
+
     return Column.from_libcudf(move(result))
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/strings/extract.pxd
new file mode 100644
index 00000000000..3871f5a0e4e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table extract(Column input, RegexProgram prog)
+
+cpdef Column extract_all_record(Column input, RegexProgram prog)
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx
new file mode 100644
index 00000000000..dcb11ca10ce
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pyx
@@ -0,0 +1,76 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport extract as cpp_extract
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table extract(Column input, RegexProgram prog):
+    """
+    Returns a table of strings columns where each column
+    corresponds to the matching group specified in the given
+    egex_program object.
+
+    For details, see :cpp:func:`cudf::strings::extract`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    Table
+        Columns of strings extracted from the input column.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_extract.extract(
+                input.view(),
+                prog.c_obj.get()[0]
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Column extract_all_record(Column input, RegexProgram prog):
+    """
+    Returns a lists column of strings where each string column
+    row corresponds to the matching group specified in the given
+    regex_program object.
+
+    For details, see :cpp:func:`cudf::strings::extract_all_record`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    Column
+        Lists column containing strings extracted from the input column
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_extract.extract_all_record(
+                input.view(),
+                prog.c_obj.get()[0]
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/strings/repeat.pxd
new file mode 100644
index 00000000000..bc70926b6fa
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/repeat.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+
+ctypedef fused ColumnorSizeType:
+    Column
+    size_type
+
+cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times)
diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx
new file mode 100644
index 00000000000..5f627218f6e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx
@@ -0,0 +1,51 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport repeat as cpp_repeat
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times):
+    """
+    Repeat each string in the given strings column by the numbers
+    of times given in another numeric column.
+
+    For details, see :cpp:func:`cudf::strings::repeat`.
+
+    Parameters
+    ----------
+    input : Column
+        The column containing strings to repeat.
+    repeat_times : Column or int
+        Number(s) of times that the corresponding input strings
+        for each row are repeated.
+
+    Returns
+    -------
+    Column
+        New column containing the repeated strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    if ColumnorSizeType is Column:
+        with nogil:
+            c_result = move(
+                cpp_repeat.repeat_strings(
+                    input.view(),
+                    repeat_times.view()
+                )
+            )
+    elif ColumnorSizeType is size_type:
+        with nogil:
+            c_result = move(
+                cpp_repeat.repeat_strings(
+                    input.view(),
+                    repeat_times
+                )
+            )
+    else:
+        raise ValueError("repeat_times must be size_type or integer")
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini
index 1761c0f011c..f572f85ca49 100644
--- a/python/pylibcudf/pylibcudf/tests/pytest.ini
+++ b/python/pylibcudf/pylibcudf/tests/pytest.ini
@@ -6,3 +6,4 @@ filterwarnings =
     error
     ignore:::.*xdist.*
     ignore:::.*pytest.*
+addopts = --tb=native
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
index 4f88e09183f..4e4dd7cbb00 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
@@ -48,3 +48,40 @@ def test_contains_re(target_col, pa_target_scalar, plc_target_pat):
         pa_target_col, pa_target_scalar.as_py()
     )
     assert_column_eq(got, expected)
+
+
+def test_count_re():
+    pattern = "[1-9][a-z]"
+    arr = pa.array(["A1a2A3a4", "A1A2A3", None])
+    result = plc.strings.contains.count_re(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    expected = pc.count_substring_regex(arr, pattern)
+    assert_column_eq(result, expected)
+
+
+def test_match_re():
+    pattern = "[1-9][a-z]"
+    arr = pa.array(["1a2b", "b1a2", None])
+    result = plc.strings.contains.matches_re(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    expected = pc.match_substring_regex(arr, f"^{pattern}")
+    assert_column_eq(result, expected)
+
+
+def test_like():
+    pattern = "%a"
+    arr = pa.array(["1a2aa3aaa"])
+    result = plc.strings.contains.like(
+        plc.interop.from_arrow(arr),
+        plc.interop.from_arrow(pa.array([pattern])),
+    )
+    expected = pc.match_like(arr, pattern)
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py
new file mode 100644
index 00000000000..788b86423c4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+
+
+def test_extract():
+    pattern = "([ab])(\\d)"
+    pa_pattern = "(?P<letter>[ab])(?P<digit>\\d)"
+    arr = pa.array(["a1", "b2", "c3"])
+    plc_result = plc.strings.extract.extract(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pc.extract_regex(arr, pa_pattern)
+    for i, result_col in enumerate(result.itercolumns()):
+        expected_col = pa.chunked_array(expected.field(i))
+        assert result_col.fill_null("").equals(expected_col)
+
+
+def test_extract_all_record():
+    pattern = "([ab])(\\d)"
+    arr = pa.array(["a1", "b2", "c3"])
+    plc_result = plc.strings.extract.extract_all_record(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pa.chunked_array(
+        [pa.array([["a", "1"], ["b", "2"], None], type=result.type)]
+    )
+    assert result.equals(expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
new file mode 100644
index 00000000000..18b5d8bf4d0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+
+
+@pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2])
+def test_repeat_strings(repeats):
+    arr = pa.array(["1", None])
+    plc_result = plc.strings.repeat.repeat_strings(
+        plc.interop.from_arrow(arr),
+        plc.interop.from_arrow(repeats)
+        if not isinstance(repeats, int)
+        else repeats,
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pa.chunked_array(pc.binary_repeat(arr, repeats))
+    assert result.equals(expected)