From c31aaac30ae07f8b57c80380e6d517c48f591fe0 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 16 Jul 2024 16:03:06 -0400
Subject: [PATCH 01/32] Build and test with CUDA 12.5.1 (#1357)

This PR updates the latest CUDA build/test version from 12.2.2 to 12.5.1.

Contributes to https://github.com/rapidsai/build-planning/issues/73

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1357
---
 .github/workflows/build.yaml                         | 10 +++++-----
 .github/workflows/pr.yaml                            | 12 ++++++------
 .github/workflows/test.yaml                          |  2 +-
 ...rch-x86_64.yaml => all_cuda-125_arch-x86_64.yaml} |  4 ++--
 dependencies.yaml                                    |  6 +++++-
 docs/source/install.rst                              |  4 ++--
 6 files changed, 21 insertions(+), 17 deletions(-)
 rename conda/environments/{all_cuda-122_arch-x86_64.yaml => all_cuda-125_arch-x86_64.yaml} (94%)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 69b0de5f5..237f5595a 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -72,7 +72,7 @@ jobs:
   wheel-publish:
     needs: wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 4e56d24d2..e7fbb2926 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,26 +18,26 @@ jobs:
       - docs-build
       - wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1
   conda-python-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -46,7 +46,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       # Package is pure Python and only ever requires one build.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 7a884c5c6..f5bd04c5a 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
similarity index 94%
rename from conda/environments/all_cuda-122_arch-x86_64.yaml
rename to conda/environments/all_cuda-125_arch-x86_64.yaml
index 4db52a6d6..a27dea728 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -10,7 +10,7 @@ dependencies:
 - click >=8.1
 - cuda-nvcc-impl
 - cuda-nvrtc
-- cuda-version=12.2
+- cuda-version=12.5
 - cudf==24.8.*,>=0.0.0a0
 - dask-cudf==24.8.*,>=0.0.0a0
 - distributed-ucxx==0.39.*,>=0.0.0a0
@@ -35,4 +35,4 @@ dependencies:
 - ucx-py==0.39.*,>=0.0.0a0
 - ucxx==0.39.*,>=0.0.0a0
 - zict>=2.0.0
-name: all_cuda-122_arch-x86_64
+name: all_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index c7f552836..910edc086 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -3,7 +3,7 @@ files:
   all:
     output: conda
     matrix:
-      cuda: ["11.4", "11.8", "12.2"]
+      cuda: ["11.4", "11.8", "12.5"]
       arch: [x86_64]
     includes:
       - build_python
@@ -100,6 +100,10 @@ dependencies:
               cuda: "12.2"
             packages:
               - cuda-version=12.2
+          - matrix:
+              cuda: "12.5"
+            packages:
+              - cuda-version=12.5
   cuda:
     specific:
       - output_types: conda
diff --git a/docs/source/install.rst b/docs/source/install.rst
index e522ae3c1..43082a671 100644
--- a/docs/source/install.rst
+++ b/docs/source/install.rst
@@ -12,11 +12,11 @@ To use Dask-CUDA on your system, you will need:
 - A version of NVIDIA CUDA Toolkit compatible with the installed driver version; see Table 1 of `CUDA Compatibility -- Binary Compatibility <https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility>`_ for an overview of CUDA Toolkit driver requirements
 
 Once the proper CUDA Toolkit version has been determined, it can be installed using along with Dask-CUDA using ``conda``.
-To install the latest version of Dask-CUDA along with CUDA Toolkit 12.0:
+To install the latest version of Dask-CUDA along with CUDA Toolkit 12.5:
 
 .. code-block:: bash
 
-    conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cuda-version=12.0
+    conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cuda-version=12.5
 
 Pip
 ---

From b27920021a085de5bd12fe317fa2585248665f5d Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Fri, 19 Jul 2024 12:57:59 -0400
Subject: [PATCH 02/32] Use workflow branch 24.08 again (#1359)

After updating everything to CUDA 12.5.1, use `shared-workflows@branch-24.08` again.

Contributes to https://github.com/rapidsai/build-planning/issues/73

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1359
---
 .github/workflows/build.yaml | 10 +++++-----
 .github/workflows/pr.yaml    | 12 ++++++------
 .github/workflows/test.yaml  |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 237f5595a..69b0de5f5 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -72,7 +72,7 @@ jobs:
   wheel-publish:
     needs: wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index e7fbb2926..4e56d24d2 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,26 +18,26 @@ jobs:
       - docs-build
       - wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
   conda-python-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -46,7 +46,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       build_type: pull-request
       # Package is pure Python and only ever requires one build.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index f5bd04c5a..7a884c5c6 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}

From 6db4b71bbc4b8b04caae0a70b1c52118ef993be8 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Fri, 19 Jul 2024 15:00:43 -0400
Subject: [PATCH 03/32] DOC v24.10 Updates [skip ci]

---
 .github/workflows/build.yaml                  | 10 +++----
 .github/workflows/pr.yaml                     | 12 ++++-----
 .github/workflows/test.yaml                   |  2 +-
 VERSION                                       |  2 +-
 ci/build_docs.sh                              |  2 +-
 .../all_cuda-114_arch-x86_64.yaml             | 14 +++++-----
 .../all_cuda-118_arch-x86_64.yaml             | 14 +++++-----
 .../all_cuda-125_arch-x86_64.yaml             | 14 +++++-----
 dependencies.yaml                             | 26 +++++++++----------
 docs/source/explicit_comms.rst                |  2 +-
 pyproject.toml                                | 10 +++----
 11 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 69b0de5f5..67bbd027b 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -72,7 +72,7 @@ jobs:
   wheel-publish:
     needs: wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 4e56d24d2..76014652e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,26 +18,26 @@ jobs:
       - docs-build
       - wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
   conda-python-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -46,7 +46,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       # Package is pure Python and only ever requires one build.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 7a884c5c6..1a0e7d876 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/VERSION b/VERSION
index ec8489fda..7c7ba0443 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.08.00
+24.10.00
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index c2a65a414..42103004b 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -23,7 +23,7 @@ rapids-mamba-retry install \
     --channel "${PYTHON_CHANNEL}" \
     dask-cuda
 
-export RAPIDS_VERSION_NUMBER="24.08"
+export RAPIDS_VERSION_NUMBER="24.10"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build Python docs"
diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml
index c0fed8e57..785437589 100644
--- a/conda/environments/all_cuda-114_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-114_arch-x86_64.yaml
@@ -10,10 +10,10 @@ dependencies:
 - click >=8.1
 - cuda-version=11.4
 - cudatoolkit
-- cudf==24.8.*,>=0.0.0a0
-- dask-cudf==24.8.*,>=0.0.0a0
-- distributed-ucxx==0.39.*,>=0.0.0a0
-- kvikio==24.8.*,>=0.0.0a0
+- cudf==24.10.*,>=0.0.0a0
+- dask-cudf==24.10.*,>=0.0.0a0
+- distributed-ucxx==0.40.*,>=0.0.0a0
+- kvikio==24.10.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<2.0a0
@@ -25,13 +25,13 @@ dependencies:
 - pytest-cov
 - python>=3.9,<3.12
 - rapids-build-backend>=0.3.0,<0.4.0dev0
-- rapids-dask-dependency==24.8.*,>=0.0.0a0
+- rapids-dask-dependency==24.10.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.39.*,>=0.0.0a0
-- ucxx==0.39.*,>=0.0.0a0
+- ucx-py==0.40.*,>=0.0.0a0
+- ucxx==0.40.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-114_arch-x86_64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index d1f6933cd..7499af726 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -10,10 +10,10 @@ dependencies:
 - click >=8.1
 - cuda-version=11.8
 - cudatoolkit
-- cudf==24.8.*,>=0.0.0a0
-- dask-cudf==24.8.*,>=0.0.0a0
-- distributed-ucxx==0.39.*,>=0.0.0a0
-- kvikio==24.8.*,>=0.0.0a0
+- cudf==24.10.*,>=0.0.0a0
+- dask-cudf==24.10.*,>=0.0.0a0
+- distributed-ucxx==0.40.*,>=0.0.0a0
+- kvikio==24.10.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<2.0a0
@@ -25,13 +25,13 @@ dependencies:
 - pytest-cov
 - python>=3.9,<3.12
 - rapids-build-backend>=0.3.0,<0.4.0dev0
-- rapids-dask-dependency==24.8.*,>=0.0.0a0
+- rapids-dask-dependency==24.10.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.39.*,>=0.0.0a0
-- ucxx==0.39.*,>=0.0.0a0
+- ucx-py==0.40.*,>=0.0.0a0
+- ucxx==0.40.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index a27dea728..e5afaa6a3 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -11,10 +11,10 @@ dependencies:
 - cuda-nvcc-impl
 - cuda-nvrtc
 - cuda-version=12.5
-- cudf==24.8.*,>=0.0.0a0
-- dask-cudf==24.8.*,>=0.0.0a0
-- distributed-ucxx==0.39.*,>=0.0.0a0
-- kvikio==24.8.*,>=0.0.0a0
+- cudf==24.10.*,>=0.0.0a0
+- dask-cudf==24.10.*,>=0.0.0a0
+- distributed-ucxx==0.40.*,>=0.0.0a0
+- kvikio==24.10.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<2.0a0
@@ -26,13 +26,13 @@ dependencies:
 - pytest-cov
 - python>=3.9,<3.12
 - rapids-build-backend>=0.3.0,<0.4.0dev0
-- rapids-dask-dependency==24.8.*,>=0.0.0a0
+- rapids-dask-dependency==24.10.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.39.*,>=0.0.0a0
-- ucxx==0.39.*,>=0.0.0a0
+- ucx-py==0.40.*,>=0.0.0a0
+- ucxx==0.40.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index 910edc086..251606db9 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -158,7 +158,7 @@ dependencies:
           - numpy>=1.23,<2.0a0
           - pandas>=1.3
           - pynvml>=11.0.0,<11.5
-          - rapids-dask-dependency==24.8.*,>=0.0.0a0
+          - rapids-dask-dependency==24.10.*,>=0.0.0a0
           - zict>=2.0.0
   test_python:
     common:
@@ -168,13 +168,13 @@ dependencies:
           - pytest-cov
       - output_types: [conda]
         packages:
-          - &cudf_conda cudf==24.8.*,>=0.0.0a0
-          - &dask_cudf_conda dask-cudf==24.8.*,>=0.0.0a0
-          - distributed-ucxx==0.39.*,>=0.0.0a0
-          - &kvikio_conda kvikio==24.8.*,>=0.0.0a0
-          - &ucx_py_conda ucx-py==0.39.*,>=0.0.0a0
+          - &cudf_conda cudf==24.10.*,>=0.0.0a0
+          - &dask_cudf_conda dask-cudf==24.10.*,>=0.0.0a0
+          - distributed-ucxx==0.40.*,>=0.0.0a0
+          - &kvikio_conda kvikio==24.10.*,>=0.0.0a0
+          - &ucx_py_conda ucx-py==0.40.*,>=0.0.0a0
           - ucx-proc=*=gpu
-          - ucxx==0.39.*,>=0.0.0a0
+          - ucxx==0.40.*,>=0.0.0a0
     specific:
       - output_types: conda
         matrices:
@@ -192,14 +192,14 @@ dependencies:
           # ref: https://github.com/rapidsai/kvikio/pull/369
           - matrix: {cuda: "12.*"}
             packages:
-              - cudf-cu12==24.8.*,>=0.0.0a0
-              - dask-cudf-cu12==24.8.*,>=0.0.0a0
-              - ucx-py-cu12==0.39.*,>=0.0.0a0
+              - cudf-cu12==24.10.*,>=0.0.0a0
+              - dask-cudf-cu12==24.10.*,>=0.0.0a0
+              - ucx-py-cu12==0.40.*,>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
-              - cudf-cu11==24.8.*,>=0.0.0a0
-              - dask-cudf-cu11==24.8.*,>=0.0.0a0
-              - ucx-py-cu11==0.39.*,>=0.0.0a0
+              - cudf-cu11==24.10.*,>=0.0.0a0
+              - dask-cudf-cu11==24.10.*,>=0.0.0a0
+              - ucx-py-cu11==0.40.*,>=0.0.0a0
           - matrix:
             packages:
               - *cudf_conda
diff --git a/docs/source/explicit_comms.rst b/docs/source/explicit_comms.rst
index 9fde8756a..af3170565 100644
--- a/docs/source/explicit_comms.rst
+++ b/docs/source/explicit_comms.rst
@@ -14,4 +14,4 @@ Usage
 In order to use explicit-comms in Dask/Distributed automatically, simply define the environment variable ``DASK_EXPLICIT_COMMS=True`` or setting the ``"explicit-comms"``
 key in the `Dask configuration <https://docs.dask.org/en/latest/configuration.html>`_.
 
-It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle <https://github.com/rapidsai/dask-cuda/blob/branch-24.08/dask_cuda/explicit_comms/dataframe/shuffle.py>`_ for guidance.
+It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle <https://github.com/rapidsai/dask-cuda/blob/branch-24.10/dask_cuda/explicit_comms/dataframe/shuffle.py>`_ for guidance.
diff --git a/pyproject.toml b/pyproject.toml
index 126efba6d..64a36a558 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ dependencies = [
     "numpy>=1.23,<2.0a0",
     "pandas>=1.3",
     "pynvml>=11.0.0,<11.5",
-    "rapids-dask-dependency==24.8.*,>=0.0.0a0",
+    "rapids-dask-dependency==24.10.*,>=0.0.0a0",
     "zict>=2.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -50,12 +50,12 @@ docs = [
     "sphinx-rtd-theme>=0.5.1",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 test = [
-    "cudf==24.8.*,>=0.0.0a0",
-    "dask-cudf==24.8.*,>=0.0.0a0",
-    "kvikio==24.8.*,>=0.0.0a0",
+    "cudf==24.10.*,>=0.0.0a0",
+    "dask-cudf==24.10.*,>=0.0.0a0",
+    "kvikio==24.10.*,>=0.0.0a0",
     "pytest",
     "pytest-cov",
-    "ucx-py==0.39.*,>=0.0.0a0",
+    "ucx-py==0.40.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]

From fa226b1f1ea788c44db0de202098700f6a5a73e3 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Fri, 19 Jul 2024 18:27:08 -0400
Subject: [PATCH 04/32] Use verify-alpha-spec hook (#1360)

With the deployment of rapids-build-backend, we need to make sure our dependencies have alpha specs.

Contributes to https://github.com/rapidsai/build-planning/issues/31

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - https://github.com/jakirkham
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/dask-cuda/pull/1360
---
 .pre-commit-config.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b10be12af..335080816 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -32,6 +32,10 @@ repos:
                 additional_dependencies: [types-cachetools]
                 args: ["--module=dask_cuda", "--ignore-missing-imports"]
                 pass_filenames: false
+      - repo: https://github.com/rapidsai/pre-commit-hooks
+        rev: v0.3.0
+        hooks:
+            - id: verify-alpha-spec
       - repo: https://github.com/rapidsai/dependency-file-generator
         rev: v1.13.11
         hooks:

From d6cafc152f8dfb46201c644dfcbdcf11d7c14f3e Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Wed, 24 Jul 2024 13:50:54 +0200
Subject: [PATCH 05/32] Add arguments to enable cuDF spilling and set
 statistics (#1362)

Add arguments to enable cuDF spilling and set the spilling statistics level in `dask cuda worker`/`LocalCUDACluster`. This is implemented as a Dask plugin, so users no longer need to rely on `client.run` to do that.

Closes #1280

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Mads R. B. Kristensen (https://github.com/madsbk)

URL: https://github.com/rapidsai/dask-cuda/pull/1362
---
 dask_cuda/cli.py                           | 18 +++++++
 dask_cuda/cuda_worker.py                   | 11 +++-
 dask_cuda/local_cuda_cluster.py            | 21 +++++++-
 dask_cuda/plugins.py                       | 15 ++++++
 dask_cuda/tests/test_dask_cuda_worker.py   | 58 ++++++++++++++++++++++
 dask_cuda/tests/test_local_cuda_cluster.py | 48 ++++++++++++++++++
 6 files changed, 169 insertions(+), 2 deletions(-)

diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py
index ba58fe3e5..6a3518e07 100644
--- a/dask_cuda/cli.py
+++ b/dask_cuda/cli.py
@@ -101,6 +101,20 @@ def cuda():
     total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"`` or 0 to
     disable spilling to host (i.e. allow full device memory usage).""",
 )
+@click.option(
+    "--enable-cudf-spill/--disable-cudf-spill",
+    default=False,
+    show_default=True,
+    help="""Enable automatic cuDF spilling. WARNING: This should NOT be used with
+    JIT-Unspill.""",
+)
+@click.option(
+    "--cudf-spill-stats",
+    type=int,
+    default=0,
+    help="""Set the cuDF spilling statistics level. This option has no effect if
+    `--enable-cudf-spill` is not specified.""",
+)
 @click.option(
     "--rmm-pool-size",
     default=None,
@@ -330,6 +344,8 @@ def worker(
     name,
     memory_limit,
     device_memory_limit,
+    enable_cudf_spill,
+    cudf_spill_stats,
     rmm_pool_size,
     rmm_maximum_pool_size,
     rmm_managed_memory,
@@ -402,6 +418,8 @@ def worker(
             name,
             memory_limit,
             device_memory_limit,
+            enable_cudf_spill,
+            cudf_spill_stats,
             rmm_pool_size,
             rmm_maximum_pool_size,
             rmm_managed_memory,
diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py
index e25a7c142..b88c9bc98 100644
--- a/dask_cuda/cuda_worker.py
+++ b/dask_cuda/cuda_worker.py
@@ -20,7 +20,7 @@
 
 from .device_host_file import DeviceHostFile
 from .initialize import initialize
-from .plugins import CPUAffinity, PreImport, RMMSetup
+from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
 from .proxify_host_file import ProxifyHostFile
 from .utils import (
     cuda_visible_devices,
@@ -41,6 +41,8 @@ def __init__(
         name=None,
         memory_limit="auto",
         device_memory_limit="auto",
+        enable_cudf_spill=False,
+        cudf_spill_stats=0,
         rmm_pool_size=None,
         rmm_maximum_pool_size=None,
         rmm_managed_memory=False,
@@ -166,6 +168,12 @@ def del_pid_file():
         if device_memory_limit is None and memory_limit is None:
             data = lambda _: {}
         elif jit_unspill:
+            if enable_cudf_spill:
+                warnings.warn(
+                    "Enabling cuDF spilling and JIT-Unspill together is not "
+                    "safe, consider disabling JIT-Unspill."
+                )
+
             data = lambda i: (
                 ProxifyHostFile,
                 {
@@ -217,6 +225,7 @@ def del_pid_file():
                         track_allocations=rmm_track_allocations,
                     ),
                     PreImport(pre_import),
+                    CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats),
                 },
                 name=name if nprocs == 1 or name is None else str(name) + "-" + str(i),
                 local_directory=local_directory,
diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py
index 1b81c7703..202373e9d 100644
--- a/dask_cuda/local_cuda_cluster.py
+++ b/dask_cuda/local_cuda_cluster.py
@@ -10,7 +10,7 @@
 
 from .device_host_file import DeviceHostFile
 from .initialize import initialize
-from .plugins import CPUAffinity, PreImport, RMMSetup
+from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
 from .proxify_host_file import ProxifyHostFile
 from .utils import (
     cuda_visible_devices,
@@ -73,6 +73,14 @@ class LocalCUDACluster(LocalCluster):
         starts spilling to host memory. Can be an integer (bytes), float (fraction of
         total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"``, 0,
         or ``None`` to disable spilling to host (i.e. allow full device memory usage).
+    enable_cudf_spill : bool, default False
+        Enable automatic cuDF spilling.
+
+        .. warning::
+            This should NOT be used together with JIT-Unspill.
+    cudf_spill_stats : int, default 0
+        Set the cuDF spilling statistics level. This option has no effect if
+        ``enable_cudf_spill=False``.
     local_directory : str or None, default None
         Path on local machine to store temporary files. Can be a string (like
         ``"path/to/files"``) or ``None`` to fall back on the value of
@@ -209,6 +217,8 @@ def __init__(
         threads_per_worker=1,
         memory_limit="auto",
         device_memory_limit=0.8,
+        enable_cudf_spill=False,
+        cudf_spill_stats=0,
         data=None,
         local_directory=None,
         shared_filesystem=None,
@@ -259,6 +269,8 @@ def __init__(
         self.device_memory_limit = parse_device_memory_limit(
             device_memory_limit, device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES)
         )
+        self.enable_cudf_spill = enable_cudf_spill
+        self.cudf_spill_stats = cudf_spill_stats
 
         self.rmm_pool_size = rmm_pool_size
         self.rmm_maximum_pool_size = rmm_maximum_pool_size
@@ -302,6 +314,12 @@ def __init__(
             if device_memory_limit is None and memory_limit is None:
                 data = {}
             elif jit_unspill:
+                if enable_cudf_spill:
+                    warnings.warn(
+                        "Enabling cuDF spilling and JIT-Unspill together is not "
+                        "safe, consider disabling JIT-Unspill."
+                    )
+
                 data = (
                     ProxifyHostFile,
                     {
@@ -414,6 +432,7 @@ def new_worker_spec(self):
                         track_allocations=self.rmm_track_allocations,
                     ),
                     PreImport(self.pre_import),
+                    CUDFSetup(self.enable_cudf_spill, self.cudf_spill_stats),
                 },
             }
         )
diff --git a/dask_cuda/plugins.py b/dask_cuda/plugins.py
index 4eba97f2b..122f93ffa 100644
--- a/dask_cuda/plugins.py
+++ b/dask_cuda/plugins.py
@@ -14,6 +14,21 @@ def setup(self, worker=None):
         os.sched_setaffinity(0, self.cores)
 
 
+class CUDFSetup(WorkerPlugin):
+    def __init__(self, spill, spill_stats):
+        self.spill = spill
+        self.spill_stats = spill_stats
+
+    def setup(self, worker=None):
+        try:
+            import cudf
+
+            cudf.set_option("spill", self.spill)
+            cudf.set_option("spill_stats", self.spill_stats)
+        except ImportError:
+            pass
+
+
 class RMMSetup(WorkerPlugin):
     def __init__(
         self,
diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py
index 974ad1319..505af12f1 100644
--- a/dask_cuda/tests/test_dask_cuda_worker.py
+++ b/dask_cuda/tests/test_dask_cuda_worker.py
@@ -231,6 +231,64 @@ def test_rmm_logging(loop):  # noqa: F811
                     assert v is rmm.mr.LoggingResourceAdaptor
 
 
+def test_cudf_spill_disabled(loop):  # noqa: F811
+    cudf = pytest.importorskip("cudf")
+    with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
+        with popen(
+            [
+                "dask",
+                "cuda",
+                "worker",
+                "127.0.0.1:9369",
+                "--host",
+                "127.0.0.1",
+                "--no-dashboard",
+            ]
+        ):
+            with Client("127.0.0.1:9369", loop=loop) as client:
+                assert wait_workers(client, n_gpus=get_n_gpus())
+
+                cudf_spill = client.run(
+                    cudf.get_option,
+                    "spill",
+                )
+                for v in cudf_spill.values():
+                    assert v is False
+
+                cudf_spill_stats = client.run(cudf.get_option, "spill_stats")
+                for v in cudf_spill_stats.values():
+                    assert v == 0
+
+
+def test_cudf_spill(loop):  # noqa: F811
+    cudf = pytest.importorskip("cudf")
+    with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
+        with popen(
+            [
+                "dask",
+                "cuda",
+                "worker",
+                "127.0.0.1:9369",
+                "--host",
+                "127.0.0.1",
+                "--no-dashboard",
+                "--enable-cudf-spill",
+                "--cudf-spill-stats",
+                "2",
+            ]
+        ):
+            with Client("127.0.0.1:9369", loop=loop) as client:
+                assert wait_workers(client, n_gpus=get_n_gpus())
+
+                cudf_spill = client.run(cudf.get_option, "spill")
+                for v in cudf_spill.values():
+                    assert v is True
+
+                cudf_spill_stats = client.run(cudf.get_option, "spill_stats")
+                for v in cudf_spill_stats.values():
+                    assert v == 2
+
+
 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 def test_dashboard_address(loop):  # noqa: F811
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py
index b05389e4c..b144d1114 100644
--- a/dask_cuda/tests/test_local_cuda_cluster.py
+++ b/dask_cuda/tests/test_local_cuda_cluster.py
@@ -500,6 +500,54 @@ async def test_worker_fraction_limits():
             )
 
 
+@gen_test(timeout=20)
+async def test_cudf_spill_disabled():
+    cudf = pytest.importorskip("cudf")
+
+    async with LocalCUDACluster(
+        asynchronous=True,
+    ) as cluster:
+        async with Client(cluster, asynchronous=True) as client:
+            cudf_spill = await client.run(
+                cudf.get_option,
+                "spill",
+            )
+            for v in cudf_spill.values():
+                assert v is False
+
+            cudf_spill_stats = await client.run(
+                cudf.get_option,
+                "spill_stats",
+            )
+            for v in cudf_spill_stats.values():
+                assert v == 0
+
+
+@gen_test(timeout=20)
+async def test_cudf_spill():
+    cudf = pytest.importorskip("cudf")
+
+    async with LocalCUDACluster(
+        enable_cudf_spill=True,
+        cudf_spill_stats=2,
+        asynchronous=True,
+    ) as cluster:
+        async with Client(cluster, asynchronous=True) as client:
+            cudf_spill = await client.run(
+                cudf.get_option,
+                "spill",
+            )
+            for v in cudf_spill.values():
+                assert v is True
+
+            cudf_spill_stats = await client.run(
+                cudf.get_option,
+                "spill_stats",
+            )
+            for v in cudf_spill_stats.values():
+                assert v == 2
+
+
 @pytest.mark.parametrize(
     "protocol",
     ["ucx", "ucxx"],

From 13a5f474c02399a975b9f315ab6010d933bdf077 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 24 Jul 2024 10:03:56 -0500
Subject: [PATCH 06/32] split up CUDA-suffixed dependencies in
 dependencies.yaml (#1364)

Contributes to https://github.com/rapidsai/build-planning/issues/31

In short, RAPIDS DLFW builds want to produce wheels with unsuffixed dependencies, e.g. `cudf` depending on `rmm`, not `rmm-cu12`.

This PR is part of a series across all of RAPIDS to try to support that type of build by setting up CUDA-suffixed and CUDA-unsuffixed dependency lists in `dependencies.yaml`.

For more details, see:
* https://github.com/rapidsai/build-planning/issues/31#issuecomment-2245815818
* https://github.com/rapidsai/cudf/pull/16183

## Notes for Reviewers

### Why target 24.08?

This is targeting 24.08 because:

1. it should be very low-risk
2. getting these changes into 24.08 prevents the need to carry around patches for every library in DLFW builds using RAPIDS 24.08

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/dask-cuda/pull/1364
---
 dependencies.yaml | 22 ++++++++++++++++++++--
 pyproject.toml    |  1 +
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 910edc086..f547df6ba 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -190,16 +190,34 @@ dependencies:
         matrices:
           # kvikio should be added to the CUDA-version-specific matrices once there are wheels available
           # ref: https://github.com/rapidsai/kvikio/pull/369
-          - matrix: {cuda: "12.*"}
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
             packages:
               - cudf-cu12==24.8.*,>=0.0.0a0
               - dask-cudf-cu12==24.8.*,>=0.0.0a0
               - ucx-py-cu12==0.39.*,>=0.0.0a0
-          - matrix: {cuda: "11.*"}
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "false"
+            packages:
+              - *cudf_conda
+              - *dask_cudf_conda
+              - *ucx_py_conda
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
             packages:
               - cudf-cu11==24.8.*,>=0.0.0a0
               - dask-cudf-cu11==24.8.*,>=0.0.0a0
               - ucx-py-cu11==0.39.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "false"
+            packages:
+              - *cudf_conda
+              - *dask_cudf_conda
+              - *ucx_py_conda
           - matrix:
             packages:
               - *cudf_conda
diff --git a/pyproject.toml b/pyproject.toml
index 126efba6d..b6c431d61 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -134,6 +134,7 @@ filterwarnings = [
 build-backend = "setuptools.build_meta"
 dependencies-file = "dependencies.yaml"
 disable-cuda = true
+matrix-entry = "cuda_suffixed=true"
 
 [tool.setuptools]
 license-files = ["LICENSE"]

From 064e2544e3545a4f7478b41a1c7459212a7768ce Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 24 Jul 2024 18:31:32 -0500
Subject: [PATCH 07/32] consolidate cuda_suffixed=false blocks in
 dependencies.yaml, fix update-version.sh (#1367)

Contributes to https://github.com/rapidsai/build-planning/issues/31.

Follow-up to #1364.

Implements some of the suggestions made in https://github.com/rapidsai/cudf/pull/16183 (after #1364 was already merged):

* removing `cuda_suffixed: "false"` blocks in `dependencies.yaml` wherever they're identical to each other and the fallback matrix
* changing `dependencies.yaml` anchors with names like `*_conda` to `*_unsuffixed`, to reflect the fact that they're not conda-specific
* checking that `update-version.sh` catches all changes to versions

## Notes for Reviewers

### How I tested this

Looked for `update-version.sh` issues manually like this:

```shell
git fetch upstream --tags
ci/release/update-version.sh '24.10.0'
git grep -E '24\.8|24\.08|0\.39'
```

That did find a few problems (like UCX dependency versions not being updated). This PR fixes those issues.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/dask-cuda/pull/1367
---
 ci/release/update-version.sh | 17 +++++++++++++++--
 dependencies.yaml            | 30 ++++++++----------------------
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index ac834e5e8..a9fe1d02e 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -45,10 +45,23 @@ DEPENDENCIES=(
   kvikio
   rapids-dask-dependency
 )
-for FILE in dependencies.yaml conda/environments/*.yaml; do
-  for DEP in "${DEPENDENCIES[@]}"; do
+for DEP in "${DEPENDENCIES[@]}"; do
+  for FILE in dependencies.yaml conda/environments/*.yaml; do
     sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
+  sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" pyproject.toml
+done
+
+UCX_DEPENDENCIES=(
+  distributed-ucxx
+  ucx-py
+  ucxx
+)
+for DEP in "${UCX_DEPENDENCIES[@]}"; do
+  for FILE in dependencies.yaml conda/environments/*.yaml; do
+    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" "${FILE}"
+  done
+  sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0\"/g" pyproject.toml
 done
 
 # CI files
diff --git a/dependencies.yaml b/dependencies.yaml
index f547df6ba..c3b629654 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -168,11 +168,11 @@ dependencies:
           - pytest-cov
       - output_types: [conda]
         packages:
-          - &cudf_conda cudf==24.8.*,>=0.0.0a0
-          - &dask_cudf_conda dask-cudf==24.8.*,>=0.0.0a0
+          - &cudf_unsuffixed cudf==24.8.*,>=0.0.0a0
+          - &dask_cudf_unsuffixed dask-cudf==24.8.*,>=0.0.0a0
           - distributed-ucxx==0.39.*,>=0.0.0a0
-          - &kvikio_conda kvikio==24.8.*,>=0.0.0a0
-          - &ucx_py_conda ucx-py==0.39.*,>=0.0.0a0
+          - &kvikio_unsuffixed kvikio==24.8.*,>=0.0.0a0
+          - &ucx_py_unsuffixed ucx-py==0.39.*,>=0.0.0a0
           - ucx-proc=*=gpu
           - ucxx==0.39.*,>=0.0.0a0
     specific:
@@ -197,13 +197,6 @@ dependencies:
               - cudf-cu12==24.8.*,>=0.0.0a0
               - dask-cudf-cu12==24.8.*,>=0.0.0a0
               - ucx-py-cu12==0.39.*,>=0.0.0a0
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "false"
-            packages:
-              - *cudf_conda
-              - *dask_cudf_conda
-              - *ucx_py_conda
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
@@ -211,16 +204,9 @@ dependencies:
               - cudf-cu11==24.8.*,>=0.0.0a0
               - dask-cudf-cu11==24.8.*,>=0.0.0a0
               - ucx-py-cu11==0.39.*,>=0.0.0a0
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "false"
-            packages:
-              - *cudf_conda
-              - *dask_cudf_conda
-              - *ucx_py_conda
           - matrix:
             packages:
-              - *cudf_conda
-              - *dask_cudf_conda
-              - *kvikio_conda
-              - *ucx_py_conda
+              - *cudf_unsuffixed
+              - *dask_cudf_unsuffixed
+              - *kvikio_unsuffixed
+              - *ucx_py_unsuffixed

From c0cd4656037bf54fe34b45c283e31d97098b8c25 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Wed, 31 Jul 2024 09:39:41 +0200
Subject: [PATCH 08/32] Replace cuDF (de)serializer with cuDF spill-aware
 (de)serializer (#1369)

Replace the cuDF (de)serializer with a cuDF spill-aware (de)serializer. Using both together should be avoided, as that will cause excessive spilling.

Additionally add:

- A missing test of the cuDF internal spill mechanism with `LocalCUDACluster`;
- A `dask cuda worker` warning to alert the user that the cuDF spilling mechanism requires the client/scheduler to enable it as well.

Closes #1363 .

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)

URL: https://github.com/rapidsai/dask-cuda/pull/1369
---
 dask_cuda/__init__.py                    |  19 ++++
 dask_cuda/cuda_worker.py                 |   8 ++
 dask_cuda/local_cuda_cluster.py          |   7 ++
 dask_cuda/tests/test_dask_cuda_worker.py |  27 +++++
 dask_cuda/tests/test_spill.py            | 132 ++++++++++++++++++++---
 5 files changed, 177 insertions(+), 16 deletions(-)

diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py
index 516599da3..5711ac08b 100644
--- a/dask_cuda/__init__.py
+++ b/dask_cuda/__init__.py
@@ -9,6 +9,8 @@
 import dask.dataframe.shuffle
 import dask.dataframe.multi
 import dask.bag.core
+from distributed.protocol.cuda import cuda_deserialize, cuda_serialize
+from distributed.protocol.serialize import dask_deserialize, dask_serialize
 
 from ._version import __git_commit__, __version__
 from .cuda_worker import CUDAWorker
@@ -48,3 +50,20 @@
     dask.dataframe.shuffle.shuffle_group
 )
 dask.dataframe.core._concat = unproxify_decorator(dask.dataframe.core._concat)
+
+
+def _register_cudf_spill_aware():
+    import cudf
+
+    # Only enable Dask/cuDF spilling if cuDF spilling is disabled, see
+    # https://github.com/rapidsai/dask-cuda/issues/1363
+    if not cudf.get_option("spill"):
+        # This reproduces the implementation of `_register_cudf`, see
+        # https://github.com/dask/distributed/blob/40fcd65e991382a956c3b879e438be1b100dff97/distributed/protocol/__init__.py#L106-L115
+        from cudf.comm import serialize
+
+
+for registry in [cuda_serialize, cuda_deserialize, dask_serialize, dask_deserialize]:
+    for lib in ["cudf", "dask_cudf"]:
+        if lib in registry._lazy:
+            registry._lazy[lib] = _register_cudf_spill_aware
diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py
index b88c9bc98..3e03ed297 100644
--- a/dask_cuda/cuda_worker.py
+++ b/dask_cuda/cuda_worker.py
@@ -195,6 +195,14 @@ def del_pid_file():
                 },
             )
 
+        cudf_spill_warning = dask.config.get("cudf-spill-warning", default=True)
+        if enable_cudf_spill and cudf_spill_warning:
+            warnings.warn(
+                "cuDF spilling is enabled, please ensure the client and scheduler "
+                "processes set `CUDF_SPILL=on` as well. To disable this warning "
+                "set `DASK_CUDF_SPILL_WARNING=False`."
+            )
+
         self.nannies = [
             Nanny(
                 scheduler,
diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py
index 202373e9d..c037223b2 100644
--- a/dask_cuda/local_cuda_cluster.py
+++ b/dask_cuda/local_cuda_cluster.py
@@ -244,6 +244,13 @@ def __init__(
         # initialization happens before we can set CUDA_VISIBLE_DEVICES
         os.environ["RAPIDS_NO_INITIALIZE"] = "True"
 
+        if enable_cudf_spill:
+            import cudf
+
+            # cuDF spilling must be enabled in the client/scheduler process too.
+            cudf.set_option("spill", enable_cudf_spill)
+            cudf.set_option("spill_stats", cudf_spill_stats)
+
         if threads_per_worker < 1:
             raise ValueError("threads_per_worker must be higher than 0.")
 
diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py
index 505af12f1..049fe85f4 100644
--- a/dask_cuda/tests/test_dask_cuda_worker.py
+++ b/dask_cuda/tests/test_dask_cuda_worker.py
@@ -567,3 +567,30 @@ def test_worker_timeout():
         assert "reason: nanny-close" in ret.stderr.lower()
 
     assert ret.returncode == 0
+
+
+@pytest.mark.parametrize("enable_cudf_spill_warning", [False, True])
+def test_worker_cudf_spill_warning(enable_cudf_spill_warning):  # noqa: F811
+    pytest.importorskip("rmm")
+
+    environ = {"CUDA_VISIBLE_DEVICES": "0"}
+    if not enable_cudf_spill_warning:
+        environ["DASK_CUDF_SPILL_WARNING"] = "False"
+
+    with patch.dict(os.environ, environ):
+        ret = subprocess.run(
+            [
+                "dask",
+                "cuda",
+                "worker",
+                "127.0.0.1:9369",
+                "--enable-cudf-spill",
+                "--death-timeout",
+                "1",
+            ],
+            capture_output=True,
+        )
+        if enable_cudf_spill_warning:
+            assert b"UserWarning: cuDF spilling is enabled" in ret.stderr
+        else:
+            assert b"UserWarning: cuDF spilling is enabled" not in ret.stderr
diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py
index f8df7e04f..bdd012d50 100644
--- a/dask_cuda/tests/test_spill.py
+++ b/dask_cuda/tests/test_spill.py
@@ -11,6 +11,8 @@
 from distributed.sizeof import sizeof
 from distributed.utils_test import gen_cluster, gen_test, loop  # noqa: F401
 
+import dask_cudf
+
 from dask_cuda import LocalCUDACluster, utils
 from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 
@@ -18,6 +20,57 @@
     pytest.skip("Not enough GPU memory", allow_module_level=True)
 
 
+def _set_cudf_device_limit():
+    """Ensure spilling for objects of all sizes"""
+    import cudf
+
+    cudf.set_option("spill_device_limit", 0)
+
+
+def _assert_cudf_spill_stats(enable_cudf_spill, dask_worker=None):
+    """Ensure cuDF has spilled data with its internal mechanism"""
+    import cudf
+
+    global_manager = cudf.core.buffer.spill_manager.get_global_manager()
+
+    if enable_cudf_spill:
+        stats = global_manager.statistics
+        buffers = global_manager.buffers()
+        assert stats.spill_totals[("gpu", "cpu")][0] > 1000
+        assert stats.spill_totals[("cpu", "gpu")][0] > 1000
+        assert len(buffers) > 0
+    else:
+        assert global_manager is None
+
+
+@pytest.fixture(params=[False, True])
+def cudf_spill(request):
+    """Fixture to enable and clear cuDF spill manager in client process"""
+    cudf = pytest.importorskip("cudf")
+
+    enable_cudf_spill = request.param
+
+    if enable_cudf_spill:
+        # If the global spill manager was previously set, fail.
+        assert cudf.core.buffer.spill_manager._global_manager is None
+
+        cudf.set_option("spill", True)
+        cudf.set_option("spill_stats", True)
+
+        # This change is to prevent changing RMM resource stack in cuDF,
+        # workers do not need this because they are spawned as new
+        # processes for every new test that runs.
+        cudf.set_option("spill_on_demand", False)
+
+        _set_cudf_device_limit()
+
+    yield enable_cudf_spill
+
+    cudf.set_option("spill", False)
+    cudf.core.buffer.spill_manager._global_manager_uninitialized = True
+    cudf.core.buffer.spill_manager._global_manager = None
+
+
 def device_host_file_size_matches(
     dhf, total_bytes, device_chunk_overhead=0, serialized_chunk_overhead=1024
 ):
@@ -244,9 +297,11 @@ async def test_cupy_cluster_device_spill(params):
     ],
 )
 @gen_test(timeout=30)
-async def test_cudf_cluster_device_spill(params):
+async def test_cudf_cluster_device_spill(params, cudf_spill):
     cudf = pytest.importorskip("cudf")
 
+    enable_cudf_spill = cudf_spill
+
     with dask.config.set(
         {
             "distributed.comm.compression": False,
@@ -266,6 +321,7 @@ async def test_cudf_cluster_device_spill(params):
             device_memory_limit=params["device_memory_limit"],
             memory_limit=params["memory_limit"],
             worker_class=IncreasedCloseTimeoutNanny,
+            enable_cudf_spill=enable_cudf_spill,
         ) as cluster:
             async with Client(cluster, asynchronous=True) as client:
 
@@ -294,21 +350,28 @@ async def test_cudf_cluster_device_spill(params):
                 del cdf
                 gc.collect()
 
-                await client.run(
-                    assert_host_chunks,
-                    params["spills_to_disk"],
-                )
-                await client.run(
-                    assert_disk_chunks,
-                    params["spills_to_disk"],
-                )
-
-                await client.run(
-                    worker_assert,
-                    nbytes,
-                    32,
-                    2048,
-                )
+                if enable_cudf_spill:
+                    await client.run(
+                        worker_assert,
+                        0,
+                        0,
+                        0,
+                    )
+                else:
+                    await client.run(
+                        assert_host_chunks,
+                        params["spills_to_disk"],
+                    )
+                    await client.run(
+                        assert_disk_chunks,
+                        params["spills_to_disk"],
+                    )
+                    await client.run(
+                        worker_assert,
+                        nbytes,
+                        32,
+                        2048,
+                    )
 
                 del cdf2
 
@@ -324,3 +387,40 @@ async def test_cudf_cluster_device_spill(params):
                         gc.collect()
                     else:
                         break
+
+
+@gen_test(timeout=30)
+async def test_cudf_spill_cluster(cudf_spill):
+    cudf = pytest.importorskip("cudf")
+    enable_cudf_spill = cudf_spill
+
+    async with LocalCUDACluster(
+        n_workers=1,
+        scheduler_port=0,
+        silence_logs=False,
+        dashboard_address=None,
+        asynchronous=True,
+        device_memory_limit=None,
+        memory_limit=None,
+        worker_class=IncreasedCloseTimeoutNanny,
+        enable_cudf_spill=enable_cudf_spill,
+        cudf_spill_stats=enable_cudf_spill,
+    ) as cluster:
+        async with Client(cluster, asynchronous=True) as client:
+
+            await client.wait_for_workers(1)
+            await client.run(_set_cudf_device_limit)
+
+            cdf = cudf.DataFrame(
+                {
+                    "a": list(range(200)),
+                    "b": list(reversed(range(200))),
+                    "c": list(range(200)),
+                }
+            )
+
+            ddf = dask_cudf.from_cudf(cdf, npartitions=2).sum().persist()
+            await wait(ddf)
+
+            await client.run(_assert_cudf_spill_stats, enable_cudf_spill)
+            _assert_cudf_spill_stats(enable_cudf_spill)

From bf84f99de5f477c4a230a6b9890b49c06908f55f Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Wed, 7 Aug 2024 10:42:31 -0400
Subject: [PATCH 09/32] Update Changelog [skip ci]

---
 CHANGELOG.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ea704c1f..37c588511 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,29 @@
+# dask-cuda 24.08.00 (7 Aug 2024)
+
+## 🐛 Bug Fixes
+
+- Fix partitioning in explicit-comms shuffle ([#1356](https://github.com/rapidsai/dask-cuda/pull/1356)) [@rjzamora](https://github.com/rjzamora)
+- Update cuDF's `assert_eq` import ([#1353](https://github.com/rapidsai/dask-cuda/pull/1353)) [@pentschev](https://github.com/pentschev)
+
+## 🚀 New Features
+
+- Add arguments to enable cuDF spilling and set statistics ([#1362](https://github.com/rapidsai/dask-cuda/pull/1362)) [@pentschev](https://github.com/pentschev)
+- Allow disabling RMM in benchmarks ([#1352](https://github.com/rapidsai/dask-cuda/pull/1352)) [@pentschev](https://github.com/pentschev)
+
+## 🛠️ Improvements
+
+- consolidate cuda_suffixed=false blocks in dependencies.yaml, fix update-version.sh ([#1367](https://github.com/rapidsai/dask-cuda/pull/1367)) [@jameslamb](https://github.com/jameslamb)
+- split up CUDA-suffixed dependencies in dependencies.yaml ([#1364](https://github.com/rapidsai/dask-cuda/pull/1364)) [@jameslamb](https://github.com/jameslamb)
+- Use verify-alpha-spec hook ([#1360](https://github.com/rapidsai/dask-cuda/pull/1360)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Use workflow branch 24.08 again ([#1359](https://github.com/rapidsai/dask-cuda/pull/1359)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Build and test with CUDA 12.5.1 ([#1357](https://github.com/rapidsai/dask-cuda/pull/1357)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Drop `setup.py` ([#1354](https://github.com/rapidsai/dask-cuda/pull/1354)) [@jakirkham](https://github.com/jakirkham)
+- remove .gitattributes ([#1350](https://github.com/rapidsai/dask-cuda/pull/1350)) [@jameslamb](https://github.com/jameslamb)
+- make conda recipe data-loading stricter ([#1349](https://github.com/rapidsai/dask-cuda/pull/1349)) [@jameslamb](https://github.com/jameslamb)
+- Adopt CI/packaging codeowners ([#1347](https://github.com/rapidsai/dask-cuda/pull/1347)) [@bdice](https://github.com/bdice)
+- Remove text builds of documentation ([#1346](https://github.com/rapidsai/dask-cuda/pull/1346)) [@vyasr](https://github.com/vyasr)
+- use rapids-build-backend ([#1343](https://github.com/rapidsai/dask-cuda/pull/1343)) [@jameslamb](https://github.com/jameslamb)
+
 # dask-cuda 24.06.00 (5 Jun 2024)
 
 ## 🐛 Bug Fixes

From 00c37dc55bee1a34f7d9f6599a89a3f89c15651b Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Thu, 8 Aug 2024 11:50:49 -0400
Subject: [PATCH 10/32] Update pre-commit hooks (#1373)

This PR updates pre-commit hooks to the latest versions that are supported without causing style check errors.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1373
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 335080816..1def5e1aa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,7 +33,7 @@ repos:
                 args: ["--module=dask_cuda", "--ignore-missing-imports"]
                 pass_filenames: false
       - repo: https://github.com/rapidsai/pre-commit-hooks
-        rev: v0.3.0
+        rev: v0.3.1
         hooks:
             - id: verify-alpha-spec
       - repo: https://github.com/rapidsai/dependency-file-generator

From 49ebabc5b76db2c1f1b9bbf89774689d69d129ee Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 22 Aug 2024 18:17:51 -0500
Subject: [PATCH 11/32] Drop Python 3.9 support (#1377)

Contributes to https://github.com/rapidsai/build-planning/issues/88

Finishes the work of dropping Python 3.9 support.

This project stopped building / testing against Python 3.9 as of https://github.com/rapidsai/shared-workflows/pull/235.
This PR updates configuration and docs to reflect that.

## Notes for Reviewers

### How I tested this

Checked that there were no remaining uses like this:

```shell
git grep -E '3\.9'
git grep '39'
git grep 'py39'
```

And similarly for variations on Python 3.8 (to catch things that were missed the last time this was done).

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/dask-cuda/pull/1377
---
 conda/environments/all_cuda-114_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +-
 dependencies.yaml                                | 6 +-----
 pyproject.toml                                   | 3 +--
 5 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml
index 45d26cf62..45f7f5035 100644
--- a/conda/environments/all_cuda-114_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-114_arch-x86_64.yaml
@@ -23,7 +23,7 @@ dependencies:
 - pynvml>=11.0.0,<11.5
 - pytest
 - pytest-cov
-- python>=3.9,<3.12
+- python>=3.10,<3.12
 - rapids-build-backend>=0.3.0,<0.4.0dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
 - setuptools>=64.0.0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 804784547..e64776af2 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -23,7 +23,7 @@ dependencies:
 - pynvml>=11.0.0,<11.5
 - pytest
 - pytest-cov
-- python>=3.9,<3.12
+- python>=3.10,<3.12
 - rapids-build-backend>=0.3.0,<0.4.0dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
 - setuptools>=64.0.0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 21c2dc6d1..128cf9aa4 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -24,7 +24,7 @@ dependencies:
 - pynvml>=11.0.0,<11.5
 - pytest
 - pytest-cov
-- python>=3.9,<3.12
+- python>=3.10,<3.12
 - rapids-build-backend>=0.3.0,<0.4.0dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
 - setuptools>=64.0.0
diff --git a/dependencies.yaml b/dependencies.yaml
index 627756190..616d0a1dd 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -134,10 +134,6 @@ dependencies:
     specific:
       - output_types: conda
         matrices:
-          - matrix:
-              py: "3.9"
-            packages:
-              - python=3.9
           - matrix:
               py: "3.10"
             packages:
@@ -148,7 +144,7 @@ dependencies:
               - python=3.11
           - matrix:
             packages:
-              - python>=3.9,<3.12
+              - python>=3.10,<3.12
   run_python:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/pyproject.toml b/pyproject.toml
index 914d2cfe8..5e3657d87 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
     "click >=8.1",
     "numba>=0.57",
@@ -30,7 +30,6 @@ classifiers = [
     "Topic :: Scientific/Engineering",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
 ]

From 4a02fcca3a85f4fc6e2bf5700fba19084681753e Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastian@sipsolutions.net>
Date: Fri, 23 Aug 2024 02:55:36 +0200
Subject: [PATCH 12/32] Remove NumPy <2 pin (#1375)

This PR removes the NumPy<2 pin which is expected to work for
RAPIDS projects once CuPy 13.3.0 is released (CuPy 13.2.0 had
some issues preventing the use with NumPy 2).

Authors:
  - Sebastian Berg (https://github.com/seberg)
  - https://github.com/jakirkham

Approvers:
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/dask-cuda/pull/1375
---
 conda/environments/all_cuda-114_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +-
 dependencies.yaml                                | 2 +-
 pyproject.toml                                   | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml
index 45f7f5035..1ece978ed 100644
--- a/conda/environments/all_cuda-114_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-114_arch-x86_64.yaml
@@ -16,7 +16,7 @@ dependencies:
 - kvikio==24.10.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
-- numpy>=1.23,<2.0a0
+- numpy>=1.23,<3.0a0
 - numpydoc>=1.1.0
 - pandas>=1.3
 - pre-commit
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index e64776af2..542865b19 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -16,7 +16,7 @@ dependencies:
 - kvikio==24.10.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
-- numpy>=1.23,<2.0a0
+- numpy>=1.23,<3.0a0
 - numpydoc>=1.1.0
 - pandas>=1.3
 - pre-commit
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 128cf9aa4..adb858950 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - kvikio==24.10.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
-- numpy>=1.23,<2.0a0
+- numpy>=1.23,<3.0a0
 - numpydoc>=1.1.0
 - pandas>=1.3
 - pre-commit
diff --git a/dependencies.yaml b/dependencies.yaml
index 616d0a1dd..97cd5f48e 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -151,7 +151,7 @@ dependencies:
         packages:
           - click >=8.1
           - numba>=0.57
-          - numpy>=1.23,<2.0a0
+          - numpy>=1.23,<3.0a0
           - pandas>=1.3
           - pynvml>=11.0.0,<11.5
           - rapids-dask-dependency==24.10.*,>=0.0.0a0
diff --git a/pyproject.toml b/pyproject.toml
index 5e3657d87..9238ca665 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ requires-python = ">=3.10"
 dependencies = [
     "click >=8.1",
     "numba>=0.57",
-    "numpy>=1.23,<2.0a0",
+    "numpy>=1.23,<3.0a0",
     "pandas>=1.3",
     "pynvml>=11.0.0,<11.5",
     "rapids-dask-dependency==24.10.*,>=0.0.0a0",

From b519e39ac3fe6f441b8353b56f650c408aaaaa58 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 27 Aug 2024 13:01:59 -0400
Subject: [PATCH 13/32] Update rapidsai/pre-commit-hooks (#1379)

This PR updates rapidsai/pre-commit-hooks to version 0.4.0.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1379
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1def5e1aa..c1157be19 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,7 +33,7 @@ repos:
                 args: ["--module=dask_cuda", "--ignore-missing-imports"]
                 pass_filenames: false
       - repo: https://github.com/rapidsai/pre-commit-hooks
-        rev: v0.3.1
+        rev: v0.4.0
         hooks:
             - id: verify-alpha-spec
       - repo: https://github.com/rapidsai/dependency-file-generator

From 1cc4d0b84132c4d618de4ace954c4f3fc74ec5ef Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Fri, 30 Aug 2024 07:43:34 -0700
Subject: [PATCH 14/32] [Benchmark] Add  parquet read benchmark (#1371)

Adds new benchmark for parquet read performance using a `LocalCUDACluster`. The user can pass in `--key` and `--secret` options to specify S3 credentials.

E.g.
```
$ python ./read_parquet.py s3://dask-cudf-parquet-testing/dedup_parquet --devs 0,1,2,3,4,5,6,7 --filesystem fsspec --type gpu --file-count 48 --aggregate-files

Parquet read benchmark
--------------------------------------------------------------------------------
Path                      | s3://dask-cudf-parquet-testing/dedup_parquet
Columns                   | None
Backend                   | cudf
Filesystem                | fsspec
Blocksize                 | 244.14 MiB
Aggregate files           | True
Row count                 | 372066
Size on disk              | 1.03 GiB
Number of workers         | 8
================================================================================
Wall clock                | Throughput
--------------------------------------------------------------------------------
36.75 s                   | 28.78 MiB/s
21.29 s                   | 49.67 MiB/s
17.91 s                   | 59.05 MiB/s
================================================================================
Throughput                | 41.77 MiB/s +/- 7.81 MiB/s
Bandwidth                 | 0 B/s +/- 0 B/s
Wall clock                | 25.32 s +/- 8.20 s
================================================================================
...
```

**Notes**:
- S3 performance generally scales with the number of workers (multiplied by the number of threads per worker)
- The example shown above was not executed from an EC2 instance
- The example shown above *should* perform better after https://github.com/rapidsai/cudf/pull/16657
- Using `--filesystem arrow` together with `--type gpu` performs well, but depends on https://github.com/rapidsai/cudf/pull/16684

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/dask-cuda/pull/1371
---
 dask_cuda/benchmarks/local_cudf_groupby.py    |   9 +-
 dask_cuda/benchmarks/local_cudf_merge.py      |   9 +-
 dask_cuda/benchmarks/local_cudf_shuffle.py    |   7 -
 dask_cuda/benchmarks/local_cupy.py            |   9 +-
 .../benchmarks/local_cupy_map_overlap.py      |   9 +-
 dask_cuda/benchmarks/read_parquet.py          | 268 ++++++++++++++++++
 dask_cuda/benchmarks/utils.py                 |   7 +
 7 files changed, 279 insertions(+), 39 deletions(-)
 create mode 100644 dask_cuda/benchmarks/read_parquet.py

diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py
index 2f07e3df7..f094ff185 100644
--- a/dask_cuda/benchmarks/local_cudf_groupby.py
+++ b/dask_cuda/benchmarks/local_cudf_groupby.py
@@ -7,7 +7,7 @@
 import dask
 import dask.dataframe as dd
 from dask.distributed import performance_report, wait
-from dask.utils import format_bytes, parse_bytes
+from dask.utils import format_bytes
 
 from dask_cuda.benchmarks.common import Config, execute_benchmark
 from dask_cuda.benchmarks.utils import (
@@ -260,13 +260,6 @@ def parse_args():
             "type": str,
             "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')",
         },
-        {
-            "name": "--ignore-size",
-            "default": "1 MiB",
-            "metavar": "nbytes",
-            "type": parse_bytes,
-            "help": "Ignore messages smaller than this (default '1 MB')",
-        },
         {
             "name": "--runs",
             "default": 3,
diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py
index 6a68ad788..e2b035204 100644
--- a/dask_cuda/benchmarks/local_cudf_merge.py
+++ b/dask_cuda/benchmarks/local_cudf_merge.py
@@ -9,7 +9,7 @@
 import dask
 import dask.dataframe as dd
 from dask.distributed import performance_report, wait
-from dask.utils import format_bytes, parse_bytes
+from dask.utils import format_bytes
 
 from dask_cuda.benchmarks.common import Config, execute_benchmark
 from dask_cuda.benchmarks.utils import (
@@ -335,13 +335,6 @@ def parse_args():
             "action": "store_true",
             "help": "Use shuffle join (takes precedence over '--broadcast-join').",
         },
-        {
-            "name": "--ignore-size",
-            "default": "1 MiB",
-            "metavar": "nbytes",
-            "type": parse_bytes,
-            "help": "Ignore messages smaller than this (default '1 MB')",
-        },
         {
             "name": "--frac-match",
             "default": 0.3,
diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py
index a1129dd37..25f42e59d 100644
--- a/dask_cuda/benchmarks/local_cudf_shuffle.py
+++ b/dask_cuda/benchmarks/local_cudf_shuffle.py
@@ -228,13 +228,6 @@ def parse_args():
             "type": str,
             "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')",
         },
-        {
-            "name": "--ignore-size",
-            "default": "1 MiB",
-            "metavar": "nbytes",
-            "type": parse_bytes,
-            "help": "Ignore messages smaller than this (default '1 MB')",
-        },
         {
             "name": "--runs",
             "default": 3,
diff --git a/dask_cuda/benchmarks/local_cupy.py b/dask_cuda/benchmarks/local_cupy.py
index 22c51556f..c9c8fe1c1 100644
--- a/dask_cuda/benchmarks/local_cupy.py
+++ b/dask_cuda/benchmarks/local_cupy.py
@@ -8,7 +8,7 @@
 
 from dask import array as da
 from dask.distributed import performance_report, wait
-from dask.utils import format_bytes, parse_bytes
+from dask.utils import format_bytes
 
 from dask_cuda.benchmarks.common import Config, execute_benchmark
 from dask_cuda.benchmarks.utils import (
@@ -297,13 +297,6 @@ def parse_args():
             "type": int,
             "help": "Chunk size (default 2500).",
         },
-        {
-            "name": "--ignore-size",
-            "default": "1 MiB",
-            "metavar": "nbytes",
-            "type": parse_bytes,
-            "help": "Ignore messages smaller than this (default '1 MB').",
-        },
         {
             "name": "--runs",
             "default": 3,
diff --git a/dask_cuda/benchmarks/local_cupy_map_overlap.py b/dask_cuda/benchmarks/local_cupy_map_overlap.py
index 8250c9f9f..8b975a24a 100644
--- a/dask_cuda/benchmarks/local_cupy_map_overlap.py
+++ b/dask_cuda/benchmarks/local_cupy_map_overlap.py
@@ -10,7 +10,7 @@
 
 from dask import array as da
 from dask.distributed import performance_report, wait
-from dask.utils import format_bytes, parse_bytes
+from dask.utils import format_bytes
 
 from dask_cuda.benchmarks.common import Config, execute_benchmark
 from dask_cuda.benchmarks.utils import (
@@ -168,13 +168,6 @@ def parse_args():
             "type": int,
             "help": "Kernel size, 2*k+1, in each dimension (default 1)",
         },
-        {
-            "name": "--ignore-size",
-            "default": "1 MiB",
-            "metavar": "nbytes",
-            "type": parse_bytes,
-            "help": "Ignore messages smaller than this (default '1 MB')",
-        },
         {
             "name": "--runs",
             "default": 3,
diff --git a/dask_cuda/benchmarks/read_parquet.py b/dask_cuda/benchmarks/read_parquet.py
new file mode 100644
index 000000000..bce696737
--- /dev/null
+++ b/dask_cuda/benchmarks/read_parquet.py
@@ -0,0 +1,268 @@
+import contextlib
+from collections import ChainMap
+from time import perf_counter as clock
+
+import fsspec
+import pandas as pd
+
+import dask
+import dask.dataframe as dd
+from dask.base import tokenize
+from dask.distributed import performance_report
+from dask.utils import format_bytes, parse_bytes
+
+from dask_cuda.benchmarks.common import Config, execute_benchmark
+from dask_cuda.benchmarks.utils import (
+    parse_benchmark_args,
+    print_key_value,
+    print_separator,
+    print_throughput_bandwidth,
+)
+
+DISK_SIZE_CACHE = {}
+OPTIONS_CACHE = {}
+
+
+def _noop(df):
+    return df
+
+
+def read_data(paths, columns, backend, **kwargs):
+    with dask.config.set({"dataframe.backend": backend}):
+        return dd.read_parquet(
+            paths,
+            columns=columns,
+            **kwargs,
+        )
+
+
+def get_fs_paths_kwargs(args):
+    kwargs = {}
+
+    storage_options = {}
+    if args.key:
+        storage_options["key"] = args.key
+    if args.secret:
+        storage_options["secret"] = args.secret
+
+    if args.filesystem == "arrow":
+        import pyarrow.fs as pa_fs
+        from fsspec.implementations.arrow import ArrowFSWrapper
+
+        _mapping = {
+            "key": "access_key",
+            "secret": "secret_key",
+        }  # See: pyarrow.fs.S3FileSystem docs
+        s3_args = {}
+        for k, v in storage_options.items():
+            s3_args[_mapping[k]] = v
+
+        fs = pa_fs.FileSystem.from_uri(args.path)[0]
+        try:
+            region = {"region": fs.region}
+        except AttributeError:
+            region = {}
+        kwargs["filesystem"] = type(fs)(**region, **s3_args)
+        fsspec_fs = ArrowFSWrapper(kwargs["filesystem"])
+
+        if args.type == "gpu":
+            kwargs["blocksize"] = args.blocksize
+    else:
+        fsspec_fs = fsspec.core.get_fs_token_paths(
+            args.path, mode="rb", storage_options=storage_options
+        )[0]
+        kwargs["filesystem"] = fsspec_fs
+        kwargs["blocksize"] = args.blocksize
+        kwargs["aggregate_files"] = args.aggregate_files
+
+    # Collect list of paths
+    stripped_url_path = fsspec_fs._strip_protocol(args.path)
+    if stripped_url_path.endswith("/"):
+        stripped_url_path = stripped_url_path[:-1]
+    paths = fsspec_fs.glob(f"{stripped_url_path}/*.parquet")
+    if args.file_count:
+        paths = paths[: args.file_count]
+
+    return fsspec_fs, paths, kwargs
+
+
+def bench_once(client, args, write_profile=None):
+    global OPTIONS_CACHE
+    global DISK_SIZE_CACHE
+
+    # Construct kwargs
+    token = tokenize(args)
+    try:
+        fsspec_fs, paths, kwargs = OPTIONS_CACHE[token]
+    except KeyError:
+        fsspec_fs, paths, kwargs = get_fs_paths_kwargs(args)
+        OPTIONS_CACHE[token] = (fsspec_fs, paths, kwargs)
+
+    if write_profile is None:
+        ctx = contextlib.nullcontext()
+    else:
+        ctx = performance_report(filename=args.profile)
+
+    with ctx:
+        t1 = clock()
+        df = read_data(
+            paths,
+            columns=args.columns,
+            backend="cudf" if args.type == "gpu" else "pandas",
+            **kwargs,
+        )
+        num_rows = len(
+            # Use opaque `map_partitions` call to "block"
+            # dask-expr from using pq metadata to get length
+            df.map_partitions(
+                _noop,
+                meta=df._meta,
+                enforce_metadata=False,
+            )
+        )
+        t2 = clock()
+
+    # Extract total size of files on disk
+    token = tokenize(paths)
+    try:
+        disk_size = DISK_SIZE_CACHE[token]
+    except KeyError:
+        disk_size = sum(fsspec_fs.sizes(paths))
+        DISK_SIZE_CACHE[token] = disk_size
+
+    return (disk_size, num_rows, t2 - t1)
+
+
+def pretty_print_results(args, address_to_index, p2p_bw, results):
+    if args.markdown:
+        print("```")
+    print("Parquet read benchmark")
+    data_processed, row_count, durations = zip(*results)
+    print_separator(separator="-")
+    backend = "cudf" if args.type == "gpu" else "pandas"
+    print_key_value(key="Path", value=args.path)
+    print_key_value(key="Columns", value=f"{args.columns}")
+    print_key_value(key="Backend", value=f"{backend}")
+    print_key_value(key="Filesystem", value=f"{args.filesystem}")
+    print_key_value(key="Blocksize", value=f"{format_bytes(args.blocksize)}")
+    print_key_value(key="Aggregate files", value=f"{args.aggregate_files}")
+    print_key_value(key="Row count", value=f"{row_count[0]}")
+    print_key_value(key="Size on disk", value=f"{format_bytes(data_processed[0])}")
+    if args.markdown:
+        print("\n```")
+    args.no_show_p2p_bandwidth = True
+    print_throughput_bandwidth(
+        args, durations, data_processed, p2p_bw, address_to_index
+    )
+    print_separator(separator="=")
+
+
+def create_tidy_results(args, p2p_bw, results):
+    configuration = {
+        "path": args.path,
+        "columns": args.columns,
+        "backend": "cudf" if args.type == "gpu" else "pandas",
+        "filesystem": args.filesystem,
+        "blocksize": args.blocksize,
+        "aggregate_files": args.aggregate_files,
+    }
+    timing_data = pd.DataFrame(
+        [
+            pd.Series(
+                data=ChainMap(
+                    configuration,
+                    {
+                        "wallclock": duration,
+                        "data_processed": data_processed,
+                        "num_rows": num_rows,
+                    },
+                )
+            )
+            for data_processed, num_rows, duration in results
+        ]
+    )
+    return timing_data, p2p_bw
+
+
+def parse_args():
+    special_args = [
+        {
+            "name": "path",
+            "type": str,
+            "help": "Parquet directory to read from (must be a flat directory).",
+        },
+        {
+            "name": "--blocksize",
+            "default": "256MB",
+            "type": parse_bytes,
+            "help": "How to set the blocksize option",
+        },
+        {
+            "name": "--aggregate-files",
+            "default": False,
+            "action": "store_true",
+            "help": "How to set the aggregate_files option",
+        },
+        {
+            "name": "--file-count",
+            "type": int,
+            "help": "Maximum number of files to read.",
+        },
+        {
+            "name": "--columns",
+            "type": str,
+            "help": "Columns to read/select from data.",
+        },
+        {
+            "name": "--key",
+            "type": str,
+            "help": "Public S3 key.",
+        },
+        {
+            "name": "--secret",
+            "type": str,
+            "help": "Secret S3 key.",
+        },
+        {
+            "name": [
+                "-t",
+                "--type",
+            ],
+            "choices": ["cpu", "gpu"],
+            "default": "gpu",
+            "type": str,
+            "help": "Use GPU or CPU dataframes (default 'gpu')",
+        },
+        {
+            "name": "--filesystem",
+            "choices": ["arrow", "fsspec"],
+            "default": "fsspec",
+            "type": str,
+            "help": "Filesystem backend",
+        },
+        {
+            "name": "--runs",
+            "default": 3,
+            "type": int,
+            "help": "Number of runs",
+        },
+    ]
+
+    args = parse_benchmark_args(
+        description="Parquet read benchmark",
+        args_list=special_args,
+        check_explicit_comms=False,
+    )
+    args.no_show_p2p_bandwidth = True
+    return args
+
+
+if __name__ == "__main__":
+    execute_benchmark(
+        Config(
+            args=parse_args(),
+            bench_once=bench_once,
+            create_tidy_results=create_tidy_results,
+            pretty_print_results=pretty_print_results,
+        )
+    )
diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py
index 48e4755fb..5b9448d48 100644
--- a/dask_cuda/benchmarks/utils.py
+++ b/dask_cuda/benchmarks/utils.py
@@ -337,6 +337,13 @@ def parse_benchmark_args(
         "If the files already exist, new files are created with a uniquified "
         "BASENAME.",
     )
+    parser.add_argument(
+        "--ignore-size",
+        default="1 MiB",
+        metavar="nbytes",
+        type=parse_bytes,
+        help="Bandwidth statistics: ignore messages smaller than this (default '1 MB')",
+    )
 
     for args in args_list:
         name = args.pop("name")

From 5d9a4cccbc63a74f77ab14ba33ebcb7d645105de Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 5 Sep 2024 10:56:10 -0500
Subject: [PATCH 15/32] Add support for Python 3.12 (#1380)

Contributes to https://github.com/rapidsai/build-planning/issues/40

This PR adds support for Python 3.12.

## Notes for Reviewers

This is part of ongoing work to add Python 3.12 support across RAPIDS.
It temporarily introduces a build/test matrix including Python 3.12, from https://github.com/rapidsai/shared-workflows/pull/213.

A follow-up PR will revert back to pointing at the `branch-24.10` branch of `shared-workflows` once all
RAPIDS repos have added Python 3.12 support.

### This will fail until all dependencies have been updated to support Python 3.12

CI here is expected to fail until all of this project's upstream dependencies support Python 3.12.

This can be merged whenever all CI jobs are passing.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/dask-cuda/pull/1380
---
 .github/workflows/build.yaml                     | 10 +++++-----
 .github/workflows/pr.yaml                        | 15 +++++++++------
 .github/workflows/test.yaml                      |  2 +-
 conda/environments/all_cuda-114_arch-x86_64.yaml |  2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml |  2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml |  2 +-
 dependencies.yaml                                |  6 +++++-
 pyproject.toml                                   |  1 +
 8 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 67bbd027b..fd7ebff54 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -72,7 +72,7 @@ jobs:
   wheel-publish:
     needs: wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 76014652e..85ec787dd 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,26 +18,29 @@ jobs:
       - docs-build
       - wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12
   conda-python-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
     with:
       build_type: pull-request
+      # TODO: remove this matrix_filter once there are Python 3.12 ucxx and cudf wheels
+      #       (this helps publish dask-cuda wheels to resolve a circular dependency between those projects)
+      matrix_filter: map(select(.PY_VER != "3.12"))
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -46,7 +49,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       build_type: pull-request
       # Package is pure Python and only ever requires one build.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 1a0e7d876..9c3da8490 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml
index 1ece978ed..3cfd9cb28 100644
--- a/conda/environments/all_cuda-114_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-114_arch-x86_64.yaml
@@ -23,7 +23,7 @@ dependencies:
 - pynvml>=11.0.0,<11.5
 - pytest
 - pytest-cov
-- python>=3.10,<3.12
+- python>=3.10,<3.13
 - rapids-build-backend>=0.3.0,<0.4.0dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
 - setuptools>=64.0.0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 542865b19..b7b997513 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -23,7 +23,7 @@ dependencies:
 - pynvml>=11.0.0,<11.5
 - pytest
 - pytest-cov
-- python>=3.10,<3.12
+- python>=3.10,<3.13
 - rapids-build-backend>=0.3.0,<0.4.0dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
 - setuptools>=64.0.0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index adb858950..652a8f0c5 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -24,7 +24,7 @@ dependencies:
 - pynvml>=11.0.0,<11.5
 - pytest
 - pytest-cov
-- python>=3.10,<3.12
+- python>=3.10,<3.13
 - rapids-build-backend>=0.3.0,<0.4.0dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
 - setuptools>=64.0.0
diff --git a/dependencies.yaml b/dependencies.yaml
index 97cd5f48e..9e6b3a108 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -143,8 +143,12 @@ dependencies:
             packages:
               - python=3.11
           - matrix:
+              py: "3.12"
             packages:
-              - python>=3.10,<3.12
+              - python=3.12
+          - matrix:
+            packages:
+              - python>=3.10,<3.13
   run_python:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/pyproject.toml b/pyproject.toml
index 9238ca665..730225adc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ classifiers = [
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 
 [project.scripts]

From 72d51e91216a44f05f50da0d6d159089f4c1f90b Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Mon, 9 Sep 2024 09:26:31 -0700
Subject: [PATCH 16/32] enable Python 3.12 tests on PRs (#1382)

Follow-up to #1380.

Now that both `cudf` (https://github.com/rapidsai/cudf/pull/16745) and `ucxx` (https://github.com/rapidsai/ucxx/pull/276) have Python 3.12 wheels available, it should be possible to test `dask-cuda` against Python 3.12 in CI.

This proposes that.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/dask-cuda/pull/1382
---
 .github/workflows/pr.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 85ec787dd..b4b5ba021 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -34,9 +34,6 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
     with:
       build_type: pull-request
-      # TODO: remove this matrix_filter once there are Python 3.12 ucxx and cudf wheels
-      #       (this helps publish dask-cuda wheels to resolve a circular dependency between those projects)
-      matrix_filter: map(select(.PY_VER != "3.12"))
   docs-build:
     needs: conda-python-build
     secrets: inherit

From dc168d701721f3fb08a479e65511b9138205d05d Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Wed, 11 Sep 2024 17:01:45 -0500
Subject: [PATCH 17/32] Add notes on cudf spilling to docs (#1383)

Updates the dask-cuda documentation to include notes on native cuDF spilling, since it is often the best spilling approach for ETL with Dask-CUDA (please feel free to correct me if I'm wrong).

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/dask-cuda/pull/1383
---
 docs/source/examples/best-practices.rst |  8 +++
 docs/source/spilling.rst                | 79 +++++++++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/docs/source/examples/best-practices.rst b/docs/source/examples/best-practices.rst
index 2de3809c8..fbfd8f0c3 100644
--- a/docs/source/examples/best-practices.rst
+++ b/docs/source/examples/best-practices.rst
@@ -44,6 +44,14 @@ We also recommend allocating most, though not all, of the GPU memory space. We d
 
 Additionally, when using `Accelerated Networking`_ , we only need to register a single IPC handle for the whole pool (which is expensive, but only done once) since from the IPC point of viewer there's only a single allocation. As opposed to just using RMM without a pool where each new allocation must be registered with IPC.
 
+Spilling from Device
+~~~~~~~~~~~~~~~~~~~~
+
+Dask-CUDA offers several different ways to enable automatic spilling from device memory.
+The best method often depends on the specific workflow. For classic ETL workloads using
+`Dask cuDF <https://docs.rapids.ai/api/dask-cudf/stable/>`_, cuDF spilling is usually the
+best place to start. See :ref:`Spilling from device <spilling-from-device>` for more details.
+
 Accelerated Networking
 ~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/spilling.rst b/docs/source/spilling.rst
index a237adf74..cfc6cfcf8 100644
--- a/docs/source/spilling.rst
+++ b/docs/source/spilling.rst
@@ -1,3 +1,5 @@
+.. _spilling-from-device:
+
 Spilling from device
 ====================
 
@@ -105,3 +107,80 @@ type checking doesn't:
 Thus, if encountering problems remember that it is always possible to use ``unproxy()``
 to access the proxied object directly, or set ``DASK_JIT_UNSPILL_COMPATIBILITY_MODE=True``
 to enable compatibility mode, which automatically calls ``unproxy()`` on all function inputs.
+
+
+cuDF Spilling
+-------------
+
+When executing an ETL workflow with `Dask cuDF <https://docs.rapids.ai/api/dask-cudf/stable/>`_
+(i.e. Dask DataFrame), it is usually best to leverage `native spilling support in cuDF
+<https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory>`.
+
+Native cuDF spilling has an important advantage over the other methodologies mentioned
+above. When JIT-unspill or default spilling are used, the worker is only able to spill
+the input or output of a task. This means that any data that is created within the task
+is completely off limits until the task is done executing. When cuDF spilling is used,
+however, individual device buffers can be spilled/unspilled as needed while the task
+is executing.
+
+When deploying a ``LocalCUDACluster``, cuDF spilling can be enabled with the ``enable_cudf_spill`` argument:
+
+.. code-block::
+
+    >>> from distributed import Client
+    >>> from dask_cuda import LocalCUDACluster
+
+    >>> cluster = LocalCUDACluster(n_workers=10, enable_cudf_spill=True)
+    >>> client = Client(cluster)
+
+The same applies for ``dask cuda worker``:
+
+.. code-block::
+
+    $ dask scheduler
+    distributed.scheduler - INFO -   Scheduler at:  tcp://127.0.0.1:8786
+
+    $ dask cuda worker --enable-cudf-spill
+
+
+Statistics
+~~~~~~~~~~
+
+When cuDF spilling is enabled, it is also possible to have cuDF collect basic
+spill statistics. Collecting this information can be a useful way to understand
+the performance of memory-intensive workflows using cuDF.
+
+When deploying a ``LocalCUDACluster``, cuDF spill statistics can be enabled with the
+``cudf_spill_stats`` argument:
+
+.. code-block::
+
+    >>> cluster = LocalCUDACluster(n_workers=10, enable_cudf_spill=True, cudf_spill_stats=1)
+
+The same applies for ``dask cuda worker``:
+
+.. code-block::
+
+    $ dask cuda worker --enable-cudf-spill --cudf-spill-stats 1
+
+To have each dask-cuda worker print spill statistics within the workflow, do something like:
+
+.. code-block::
+
+    def spill_info():
+        from cudf.core.buffer.spill_manager import get_global_manager
+        print(get_global_manager().statistics)
+    client.submit(spill_info)
+
+See the `cuDF spilling documentation
+<https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#statistics>`_
+for more information on the available spill-statistics options.
+
+Limitations
+~~~~~~~~~~~
+
+Although cuDF spilling is the best option for most ETL workflows using Dask cuDF,
+it will be much less effective if that workflow converts between ``cudf.DataFrame``
+and other data formats (e.g. ``cupy.ndarray``). Once the underlying device buffers
+are "exposed" to external memory references, they become "unspillable" by cuDF.
+In cases like this (e.g., Dask-CUDA + XGBoost), JIT-Unspill is usually a better choice.

From d5b70f0fab8378b9cb1b85d0880a84742131e46e Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 16 Sep 2024 18:06:09 -0500
Subject: [PATCH 18/32] Fix typo in spilling documentation (#1384)

Small follow-up to #1383

- Fixes a typo in a link that references the "Spilling from device" page
- Small tweaks to the spilling discussion on the "best practices" page

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/dask-cuda/pull/1384
---
 docs/source/examples/best-practices.rst | 5 +++--
 docs/source/spilling.rst                | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/source/examples/best-practices.rst b/docs/source/examples/best-practices.rst
index fbfd8f0c3..d0ddc5108 100644
--- a/docs/source/examples/best-practices.rst
+++ b/docs/source/examples/best-practices.rst
@@ -49,8 +49,9 @@ Spilling from Device
 
 Dask-CUDA offers several different ways to enable automatic spilling from device memory.
 The best method often depends on the specific workflow. For classic ETL workloads using
-`Dask cuDF <https://docs.rapids.ai/api/dask-cudf/stable/>`_, cuDF spilling is usually the
-best place to start. See :ref:`Spilling from device <spilling-from-device>` for more details.
+`Dask cuDF <https://docs.rapids.ai/api/dask-cudf/stable/>`_, native cuDF spilling is usually
+the best place to start. See :ref:`Dask-CUDA's spilling documentation <spilling-from-device>`
+for more details.
 
 Accelerated Networking
 ~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/spilling.rst b/docs/source/spilling.rst
index cfc6cfcf8..c86b5ce42 100644
--- a/docs/source/spilling.rst
+++ b/docs/source/spilling.rst
@@ -114,7 +114,7 @@ cuDF Spilling
 
 When executing an ETL workflow with `Dask cuDF <https://docs.rapids.ai/api/dask-cudf/stable/>`_
 (i.e. Dask DataFrame), it is usually best to leverage `native spilling support in cuDF
-<https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory>`.
+<https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory>`_.
 
 Native cuDF spilling has an important advantage over the other methodologies mentioned
 above. When JIT-unspill or default spilling are used, the worker is only able to spill

From dbb50a5b964ae1caf945e026f186292f05316cac Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 17 Sep 2024 15:58:01 -0500
Subject: [PATCH 19/32] Update to flake8 7.1.1. (#1385)

We need to update flake8 to fix a false-positive that appears with older flake8 versions on Python 3.12.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Benjamin Zaitlen (https://github.com/quasiben)
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/dask-cuda/pull/1385
---
 .pre-commit-config.yaml                   |  2 +-
 dask_cuda/benchmarks/utils.py             |  2 +-
 dask_cuda/cli.py                          |  9 +++++----
 dask_cuda/tests/test_gds.py               |  2 +-
 dask_cuda/tests/test_proxify_host_file.py |  2 +-
 dask_cuda/tests/test_proxy.py             | 10 +++++-----
 6 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c1157be19..4707492ac 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
         hooks:
               - id: black
       - repo: https://github.com/PyCQA/flake8
-        rev: 3.8.3
+        rev: 7.1.1
         hooks:
               - id: flake8
       - repo: https://github.com/codespell-project/codespell
diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py
index 5b9448d48..de7e2ae1d 100644
--- a/dask_cuda/benchmarks/utils.py
+++ b/dask_cuda/benchmarks/utils.py
@@ -772,7 +772,7 @@ def print_throughput_bandwidth(
     )
     print_key_value(
         key="Wall clock",
-        value=f"{format_time(durations.mean())} +/- {format_time(durations.std()) }",
+        value=f"{format_time(durations.mean())} +/- {format_time(durations.std())}",
     )
     if not args.no_show_p2p_bandwidth:
         print_separator(separator="=")
diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py
index 6a3518e07..a8c6d972c 100644
--- a/dask_cuda/cli.py
+++ b/dask_cuda/cli.py
@@ -167,10 +167,11 @@ def cuda():
 @click.option(
     "--rmm-release-threshold",
     default=None,
-    help="""When ``rmm.async`` is ``True`` and the pool size grows beyond this value, unused
-    memory held by the pool will be released at the next synchronization point. Can be
-    an integer (bytes), float (fraction of total device memory), string (like ``"5GB"``
-    or ``"5000M"``) or ``None``. By default, this feature is disabled.
+    help="""When ``rmm.async`` is ``True`` and the pool size grows beyond this
+    value, unused memory held by the pool will be released at the next
+    synchronization point. Can be an integer (bytes), float (fraction of total
+    device memory), string (like ``"5GB"`` or ``"5000M"``) or ``None``. By
+    default, this feature is disabled.
 
     .. note::
         This size is a per-worker configuration, and not cluster-wide.""",
diff --git a/dask_cuda/tests/test_gds.py b/dask_cuda/tests/test_gds.py
index c86670252..262369e64 100644
--- a/dask_cuda/tests/test_gds.py
+++ b/dask_cuda/tests/test_gds.py
@@ -38,7 +38,7 @@ def test_gds(gds_enabled, cuda_lib):
         a = data_create()
         header, frames = serialize(a, serializers=("disk",))
         b = deserialize(header, frames)
-        assert type(a) == type(b)
+        assert type(a) is type(b)
         assert data_compare(a, b)
     finally:
         ProxifyHostFile.register_disk_spilling()  # Reset disk spilling options
diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py
index 2683ea36d..56fe7f8d6 100644
--- a/dask_cuda/tests/test_proxify_host_file.py
+++ b/dask_cuda/tests/test_proxify_host_file.py
@@ -252,7 +252,7 @@ def task(x):
             assert "ProxyObject" in str(type(x))
             assert x._pxy_get().serializer == "dask"
         else:
-            assert type(x) == cudf.DataFrame
+            assert type(x) is cudf.DataFrame
         assert len(x) == 10  # Trigger deserialization
         return x
 
diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py
index 31a9e9962..90b84e90d 100644
--- a/dask_cuda/tests/test_proxy.py
+++ b/dask_cuda/tests/test_proxy.py
@@ -114,7 +114,7 @@ def test_proxy_object_of_array(serializers, backend):
         pxy = proxy_object.asproxy(org.copy(), serializers=serializers)
         expect = op(org)
         got = op(pxy)
-        assert type(expect) == type(got)
+        assert type(expect) is type(got)
         assert expect == got
 
     # Check unary operators
@@ -124,7 +124,7 @@ def test_proxy_object_of_array(serializers, backend):
         pxy = proxy_object.asproxy(org.copy(), serializers=serializers)
         expect = op(org)
         got = op(pxy)
-        assert type(expect) == type(got)
+        assert type(expect) is type(got)
         assert all(expect == got)
 
     # Check binary operators that takes a scalar as second argument
@@ -134,7 +134,7 @@ def test_proxy_object_of_array(serializers, backend):
         pxy = proxy_object.asproxy(org.copy(), serializers=serializers)
         expect = op(org, 2)
         got = op(pxy, 2)
-        assert type(expect) == type(got)
+        assert type(expect) is type(got)
         assert all(expect == got)
 
     # Check binary operators
@@ -192,7 +192,7 @@ def test_proxy_object_of_array(serializers, backend):
         pxy = proxy_object.asproxy(org.copy(), serializers=serializers)
         expect = op(org)
         got = op(pxy)
-        assert type(expect) == type(got)
+        assert type(expect) is type(got)
         assert expect == got
 
     # Check reflected methods
@@ -297,7 +297,7 @@ def task(x):
             assert "ProxyObject" in str(type(x))
             assert x._pxy_get().serializer == "dask"
         else:
-            assert type(x) == cudf.DataFrame
+            assert type(x) is cudf.DataFrame
         assert len(x) == 10  # Trigger deserialization
         return x
 

From 637c50480e697d342de310d8e4fcf6bca56a9610 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 18 Sep 2024 14:43:12 -0500
Subject: [PATCH 20/32] Use CI workflow branch 'branch-24.10' again [skip ci]
 (#1386)

---
 .github/workflows/build.yaml | 10 +++++-----
 .github/workflows/pr.yaml    | 12 ++++++------
 .github/workflows/test.yaml  |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index fd7ebff54..67bbd027b 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -72,7 +72,7 @@ jobs:
   wheel-publish:
     needs: wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index b4b5ba021..76014652e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,26 +18,26 @@ jobs:
       - docs-build
       - wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
   conda-python-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -46,7 +46,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       # Package is pure Python and only ever requires one build.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 9c3da8490..1a0e7d876 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}

From 95f0a33e377eff36bf2b20f25748489dbdb5e5b2 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Thu, 19 Sep 2024 11:46:22 -0400
Subject: [PATCH 21/32] DOC v24.12 Updates [skip ci]

---
 .github/workflows/build.yaml                  | 10 +++----
 .github/workflows/pr.yaml                     | 12 ++++-----
 .github/workflows/test.yaml                   |  2 +-
 VERSION                                       |  2 +-
 ci/build_docs.sh                              |  2 +-
 .../all_cuda-114_arch-x86_64.yaml             | 14 +++++-----
 .../all_cuda-118_arch-x86_64.yaml             | 14 +++++-----
 .../all_cuda-125_arch-x86_64.yaml             | 14 +++++-----
 dependencies.yaml                             | 26 +++++++++----------
 docs/source/explicit_comms.rst                |  2 +-
 pyproject.toml                                | 10 +++----
 11 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 67bbd027b..3d097bcd7 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -72,7 +72,7 @@ jobs:
   wheel-publish:
     needs: wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 76014652e..0e20bdafa 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,26 +18,26 @@ jobs:
       - docs-build
       - wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
   conda-python-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -46,7 +46,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: pull-request
       # Package is pure Python and only ever requires one build.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 1a0e7d876..631a61739 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/VERSION b/VERSION
index 7c7ba0443..af28c42b5 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.10.00
+24.12.00
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 42103004b..7850211e3 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -23,7 +23,7 @@ rapids-mamba-retry install \
     --channel "${PYTHON_CHANNEL}" \
     dask-cuda
 
-export RAPIDS_VERSION_NUMBER="24.10"
+export RAPIDS_VERSION_NUMBER="24.12"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build Python docs"
diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml
index 3cfd9cb28..3c327ff08 100644
--- a/conda/environments/all_cuda-114_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-114_arch-x86_64.yaml
@@ -10,10 +10,10 @@ dependencies:
 - click >=8.1
 - cuda-version=11.4
 - cudatoolkit
-- cudf==24.10.*,>=0.0.0a0
-- dask-cudf==24.10.*,>=0.0.0a0
-- distributed-ucxx==0.40.*,>=0.0.0a0
-- kvikio==24.10.*,>=0.0.0a0
+- cudf==24.12.*,>=0.0.0a0
+- dask-cudf==24.12.*,>=0.0.0a0
+- distributed-ucxx==0.41.*,>=0.0.0a0
+- kvikio==24.12.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<3.0a0
@@ -25,13 +25,13 @@ dependencies:
 - pytest-cov
 - python>=3.10,<3.13
 - rapids-build-backend>=0.3.0,<0.4.0dev0
-- rapids-dask-dependency==24.10.*,>=0.0.0a0
+- rapids-dask-dependency==24.12.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.40.*,>=0.0.0a0
-- ucxx==0.40.*,>=0.0.0a0
+- ucx-py==0.41.*,>=0.0.0a0
+- ucxx==0.41.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-114_arch-x86_64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index b7b997513..3931f3bf2 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -10,10 +10,10 @@ dependencies:
 - click >=8.1
 - cuda-version=11.8
 - cudatoolkit
-- cudf==24.10.*,>=0.0.0a0
-- dask-cudf==24.10.*,>=0.0.0a0
-- distributed-ucxx==0.40.*,>=0.0.0a0
-- kvikio==24.10.*,>=0.0.0a0
+- cudf==24.12.*,>=0.0.0a0
+- dask-cudf==24.12.*,>=0.0.0a0
+- distributed-ucxx==0.41.*,>=0.0.0a0
+- kvikio==24.12.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<3.0a0
@@ -25,13 +25,13 @@ dependencies:
 - pytest-cov
 - python>=3.10,<3.13
 - rapids-build-backend>=0.3.0,<0.4.0dev0
-- rapids-dask-dependency==24.10.*,>=0.0.0a0
+- rapids-dask-dependency==24.12.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.40.*,>=0.0.0a0
-- ucxx==0.40.*,>=0.0.0a0
+- ucx-py==0.41.*,>=0.0.0a0
+- ucxx==0.41.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 652a8f0c5..760ae9719 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -11,10 +11,10 @@ dependencies:
 - cuda-nvcc-impl
 - cuda-nvrtc
 - cuda-version=12.5
-- cudf==24.10.*,>=0.0.0a0
-- dask-cudf==24.10.*,>=0.0.0a0
-- distributed-ucxx==0.40.*,>=0.0.0a0
-- kvikio==24.10.*,>=0.0.0a0
+- cudf==24.12.*,>=0.0.0a0
+- dask-cudf==24.12.*,>=0.0.0a0
+- distributed-ucxx==0.41.*,>=0.0.0a0
+- kvikio==24.12.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<3.0a0
@@ -26,13 +26,13 @@ dependencies:
 - pytest-cov
 - python>=3.10,<3.13
 - rapids-build-backend>=0.3.0,<0.4.0dev0
-- rapids-dask-dependency==24.10.*,>=0.0.0a0
+- rapids-dask-dependency==24.12.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.40.*,>=0.0.0a0
-- ucxx==0.40.*,>=0.0.0a0
+- ucx-py==0.41.*,>=0.0.0a0
+- ucxx==0.41.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index 9e6b3a108..59ac8c01a 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -158,7 +158,7 @@ dependencies:
           - numpy>=1.23,<3.0a0
           - pandas>=1.3
           - pynvml>=11.0.0,<11.5
-          - rapids-dask-dependency==24.10.*,>=0.0.0a0
+          - rapids-dask-dependency==24.12.*,>=0.0.0a0
           - zict>=2.0.0
   test_python:
     common:
@@ -168,13 +168,13 @@ dependencies:
           - pytest-cov
       - output_types: [conda]
         packages:
-          - &cudf_unsuffixed cudf==24.10.*,>=0.0.0a0
-          - &dask_cudf_unsuffixed dask-cudf==24.10.*,>=0.0.0a0
-          - distributed-ucxx==0.40.*,>=0.0.0a0
-          - &kvikio_unsuffixed kvikio==24.10.*,>=0.0.0a0
-          - &ucx_py_unsuffixed ucx-py==0.40.*,>=0.0.0a0
+          - &cudf_unsuffixed cudf==24.12.*,>=0.0.0a0
+          - &dask_cudf_unsuffixed dask-cudf==24.12.*,>=0.0.0a0
+          - distributed-ucxx==0.41.*,>=0.0.0a0
+          - &kvikio_unsuffixed kvikio==24.12.*,>=0.0.0a0
+          - &ucx_py_unsuffixed ucx-py==0.41.*,>=0.0.0a0
           - ucx-proc=*=gpu
-          - ucxx==0.40.*,>=0.0.0a0
+          - ucxx==0.41.*,>=0.0.0a0
     specific:
       - output_types: conda
         matrices:
@@ -194,16 +194,16 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - cudf-cu12==24.10.*,>=0.0.0a0
-              - dask-cudf-cu12==24.10.*,>=0.0.0a0
-              - ucx-py-cu12==0.40.*,>=0.0.0a0
+              - cudf-cu12==24.12.*,>=0.0.0a0
+              - dask-cudf-cu12==24.12.*,>=0.0.0a0
+              - ucx-py-cu12==0.41.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - cudf-cu11==24.10.*,>=0.0.0a0
-              - dask-cudf-cu11==24.10.*,>=0.0.0a0
-              - ucx-py-cu11==0.40.*,>=0.0.0a0
+              - cudf-cu11==24.12.*,>=0.0.0a0
+              - dask-cudf-cu11==24.12.*,>=0.0.0a0
+              - ucx-py-cu11==0.41.*,>=0.0.0a0
           - matrix:
             packages:
               - *cudf_unsuffixed
diff --git a/docs/source/explicit_comms.rst b/docs/source/explicit_comms.rst
index af3170565..db621977e 100644
--- a/docs/source/explicit_comms.rst
+++ b/docs/source/explicit_comms.rst
@@ -14,4 +14,4 @@ Usage
 In order to use explicit-comms in Dask/Distributed automatically, simply define the environment variable ``DASK_EXPLICIT_COMMS=True`` or setting the ``"explicit-comms"``
 key in the `Dask configuration <https://docs.dask.org/en/latest/configuration.html>`_.
 
-It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle <https://github.com/rapidsai/dask-cuda/blob/branch-24.10/dask_cuda/explicit_comms/dataframe/shuffle.py>`_ for guidance.
+It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle <https://github.com/rapidsai/dask-cuda/blob/branch-24.12/dask_cuda/explicit_comms/dataframe/shuffle.py>`_ for guidance.
diff --git a/pyproject.toml b/pyproject.toml
index 730225adc..fcf572764 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ dependencies = [
     "numpy>=1.23,<3.0a0",
     "pandas>=1.3",
     "pynvml>=11.0.0,<11.5",
-    "rapids-dask-dependency==24.10.*,>=0.0.0a0",
+    "rapids-dask-dependency==24.12.*,>=0.0.0a0",
     "zict>=2.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -50,12 +50,12 @@ docs = [
     "sphinx-rtd-theme>=0.5.1",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 test = [
-    "cudf==24.10.*,>=0.0.0a0",
-    "dask-cudf==24.10.*,>=0.0.0a0",
-    "kvikio==24.10.*,>=0.0.0a0",
+    "cudf==24.12.*,>=0.0.0a0",
+    "dask-cudf==24.12.*,>=0.0.0a0",
+    "kvikio==24.12.*,>=0.0.0a0",
     "pytest",
     "pytest-cov",
-    "ucx-py==0.40.*,>=0.0.0a0",
+    "ucx-py==0.41.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]

From 1c84a6a5031294276ef5aee2792cd195513179c2 Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Tue, 24 Sep 2024 14:10:25 -0500
Subject: [PATCH 22/32] update update-version.sh to use packaging lib (#1387)

---
 ci/release/update-version.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index a9fe1d02e..2dbe504ca 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -22,7 +22,7 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}
 NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
 NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
 NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
-NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
+NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
 NEXT_UCXPY_VERSION="$(curl -s https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})"
 
 echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"

From 45924dfc91cf9b2ec9ad7070a05161cb83d17777 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Wed, 9 Oct 2024 09:39:12 -0400
Subject: [PATCH 23/32] Update Changelog [skip ci]

---
 CHANGELOG.md | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 37c588511..f8c992fbd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,32 @@
+# dask-cuda 24.10.00 (9 Oct 2024)
+
+## 🚨 Breaking Changes
+
+- Replace cuDF (de)serializer with cuDF spill-aware (de)serializer ([#1369](https://github.com/rapidsai/dask-cuda/pull/1369)) [@pentschev](https://github.com/pentschev)
+
+## 📖 Documentation
+
+- Fix typo in spilling documentation ([#1384](https://github.com/rapidsai/dask-cuda/pull/1384)) [@rjzamora](https://github.com/rjzamora)
+- Add notes on cudf spilling to docs ([#1383](https://github.com/rapidsai/dask-cuda/pull/1383)) [@rjzamora](https://github.com/rjzamora)
+
+## 🚀 New Features
+
+- [Benchmark] Add  parquet read benchmark ([#1371](https://github.com/rapidsai/dask-cuda/pull/1371)) [@rjzamora](https://github.com/rjzamora)
+- Replace cuDF (de)serializer with cuDF spill-aware (de)serializer ([#1369](https://github.com/rapidsai/dask-cuda/pull/1369)) [@pentschev](https://github.com/pentschev)
+
+## 🛠️ Improvements
+
+- Update update-version.sh to use packaging lib ([#1387](https://github.com/rapidsai/dask-cuda/pull/1387)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Use CI workflow branch &#39;branch-24.10&#39; again ([#1386](https://github.com/rapidsai/dask-cuda/pull/1386)) [@jameslamb](https://github.com/jameslamb)
+- Update to flake8 7.1.1. ([#1385](https://github.com/rapidsai/dask-cuda/pull/1385)) [@bdice](https://github.com/bdice)
+- enable Python 3.12 tests on PRs ([#1382](https://github.com/rapidsai/dask-cuda/pull/1382)) [@jameslamb](https://github.com/jameslamb)
+- Add support for Python 3.12 ([#1380](https://github.com/rapidsai/dask-cuda/pull/1380)) [@jameslamb](https://github.com/jameslamb)
+- Update rapidsai/pre-commit-hooks ([#1379](https://github.com/rapidsai/dask-cuda/pull/1379)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Drop Python 3.9 support ([#1377](https://github.com/rapidsai/dask-cuda/pull/1377)) [@jameslamb](https://github.com/jameslamb)
+- Remove NumPy &lt;2 pin ([#1375](https://github.com/rapidsai/dask-cuda/pull/1375)) [@seberg](https://github.com/seberg)
+- Update pre-commit hooks ([#1373](https://github.com/rapidsai/dask-cuda/pull/1373)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Merge branch-24.08 into branch-24.10 ([#1368](https://github.com/rapidsai/dask-cuda/pull/1368)) [@jameslamb](https://github.com/jameslamb)
+
 # dask-cuda 24.08.00 (7 Aug 2024)
 
 ## 🐛 Bug Fixes

From 93a1ee23a43563f33fba8a5a8761c03ccef25a1c Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Wed, 9 Oct 2024 18:12:23 +0200
Subject: [PATCH 24/32] Limit output of pytest durations (#1393)

The pytest durations output was previously increased to show all tests so that we could debug timeouts. That is no longer as important, so limit the output to the 50 longest-running tests to reduce log length. We may soon remove the option entirely if durations are no longer needed.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1393
---
 ci/test_python.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/test_python.sh b/ci/test_python.sh
index 78330a403..32c0d940f 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -52,7 +52,7 @@ UCX_WARN_UNUSED_ENV_VARS=n \
 UCX_MEMTYPE_CACHE=n \
 timeout 60m pytest \
   -vv \
-  --durations=0 \
+  --durations=50 \
   --capture=no \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda.xml" \
@@ -73,7 +73,7 @@ UCX_WARN_UNUSED_ENV_VARS=n \
 UCX_MEMTYPE_CACHE=n \
 timeout 30m pytest \
   -vv \
-  --durations=0 \
+  --durations=50 \
   --capture=no \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda-legacy.xml" \

From f775d883c1149b00a462a041cf6589f9081aa4fb Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 10 Oct 2024 12:59:31 -0500
Subject: [PATCH 25/32] make conda installs in CI stricter (#1395)

Contributes to https://github.com/rapidsai/build-planning/issues/106

Proposes specifying the RAPIDS version in `conda install` calls that install CI artifacts, to reduce the risk of CI jobs picking up artifacts from other releases.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/dask-cuda/pull/1395
---
 ci/build_docs.sh             | 7 ++++---
 ci/release/update-version.sh | 1 -
 ci/test_python.sh            | 4 +++-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 7850211e3..58da36c7c 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-dependency-file-generator \
     --output conda \
     --file-key docs \
@@ -21,9 +23,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 
 rapids-mamba-retry install \
     --channel "${PYTHON_CHANNEL}" \
-    dask-cuda
+    "dask-cuda=${RAPIDS_VERSION}"
 
-export RAPIDS_VERSION_NUMBER="24.12"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build Python docs"
@@ -33,4 +34,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"html
 mv _html/* "${RAPIDS_DOCS_DIR}/dask-cuda/html"
 popd
 
-rapids-upload-docs
+RAPIDS_VERSION_NUMBER="$(rapids-version-major-minor)" rapids-upload-docs
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 2dbe504ca..b229d2808 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -68,7 +68,6 @@ done
 for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
-sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh
 
 # Docs referencing source code
 find docs/source/ -type f -name *.rst -print0 | while IFS= read -r -d '' filename; do
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 32c0d940f..339141728 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate Python testing dependencies"
 rapids-dependency-file-generator \
   --output conda \
@@ -29,7 +31,7 @@ rapids-print-env
 
 rapids-mamba-retry install \
   --channel "${PYTHON_CHANNEL}" \
-  dask-cuda
+  "dask-cuda=${RAPIDS_VERSION}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi

From 8d88006a6a064165e8408dcb9c288059c6f98a7f Mon Sep 17 00:00:00 2001
From: Vibhu Jawa <vibhujawa@gmail.com>
Date: Sat, 12 Oct 2024 13:51:38 -0600
Subject: [PATCH 26/32] Enable Pytorch to share same memory pool as RMM via cli
 (#1392)

This PR closes: https://github.com/rapidsai/dask-cuda/issues/1281

Usage example:
```
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

cluster = LocalCUDACluster(rmm_allocator_external_lib_list=["torch", "cupy"])
client = Client(cluster)
```

Verify that it is working:
```
def get_torch_allocator():
    import torch
    return torch.cuda.get_allocator_backend()

client.run(get_torch_allocator)
```

```
client.run(get_torch_allocator)
```

```
{'tcp://127.0.0.1:37167': 'pluggable',
 'tcp://127.0.0.1:38749': 'pluggable',
 'tcp://127.0.0.1:43109': 'pluggable',
 'tcp://127.0.0.1:44259': 'pluggable',
 'tcp://127.0.0.1:44953': 'pluggable',
 'tcp://127.0.0.1:45087': 'pluggable',
 'tcp://127.0.0.1:45623': 'pluggable',
 'tcp://127.0.0.1:45847': 'pluggable'}
```

Without it, it's `native`.


Context: This helps NeMo-Curator make more stable use of PyTorch + dask-cuda.

CC: @pentschev .

Authors:
  - Vibhu Jawa (https://github.com/VibhuJawa)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/dask-cuda/pull/1392
---
 dask_cuda/cli.py                | 14 ++++++-
 dask_cuda/cuda_worker.py        |  2 +
 dask_cuda/local_cuda_cluster.py | 22 +++++++++++
 dask_cuda/plugins.py            | 67 +++++++++++++++++++++++++++++++++
 dask_cuda/utils.py              | 11 ++++++
 5 files changed, 115 insertions(+), 1 deletion(-)

diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py
index a8c6d972c..8101f0209 100644
--- a/dask_cuda/cli.py
+++ b/dask_cuda/cli.py
@@ -13,7 +13,7 @@
 from distributed.utils import import_term
 
 from .cuda_worker import CUDAWorker
-from .utils import print_cluster_config
+from .utils import CommaSeparatedChoice, print_cluster_config
 
 logger = logging.getLogger(__name__)
 
@@ -164,6 +164,16 @@ def cuda():
         incompatible with RMM pools and managed memory, trying to enable both will
         result in failure.""",
 )
+@click.option(
+    "--set-rmm-allocator-for-libs",
+    "rmm_allocator_external_lib_list",
+    type=CommaSeparatedChoice(["cupy", "torch"]),
+    default=None,
+    show_default=True,
+    help="""
+    Set RMM as the allocator for external libraries. Provide a comma-separated
+    list of libraries to set, e.g., "torch,cupy".""",
+)
 @click.option(
     "--rmm-release-threshold",
     default=None,
@@ -351,6 +361,7 @@ def worker(
     rmm_maximum_pool_size,
     rmm_managed_memory,
     rmm_async,
+    rmm_allocator_external_lib_list,
     rmm_release_threshold,
     rmm_log_directory,
     rmm_track_allocations,
@@ -425,6 +436,7 @@ def worker(
             rmm_maximum_pool_size,
             rmm_managed_memory,
             rmm_async,
+            rmm_allocator_external_lib_list,
             rmm_release_threshold,
             rmm_log_directory,
             rmm_track_allocations,
diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py
index 3e03ed297..30c14450c 100644
--- a/dask_cuda/cuda_worker.py
+++ b/dask_cuda/cuda_worker.py
@@ -47,6 +47,7 @@ def __init__(
         rmm_maximum_pool_size=None,
         rmm_managed_memory=False,
         rmm_async=False,
+        rmm_allocator_external_lib_list=None,
         rmm_release_threshold=None,
         rmm_log_directory=None,
         rmm_track_allocations=False,
@@ -231,6 +232,7 @@ def del_pid_file():
                         release_threshold=rmm_release_threshold,
                         log_directory=rmm_log_directory,
                         track_allocations=rmm_track_allocations,
+                        external_lib_list=rmm_allocator_external_lib_list,
                     ),
                     PreImport(pre_import),
                     CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats),
diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py
index c037223b2..7a24df437 100644
--- a/dask_cuda/local_cuda_cluster.py
+++ b/dask_cuda/local_cuda_cluster.py
@@ -143,6 +143,11 @@ class LocalCUDACluster(LocalCluster):
             The asynchronous allocator requires CUDA Toolkit 11.2 or newer. It is also
             incompatible with RMM pools and managed memory. Trying to enable both will
             result in an exception.
+    rmm_allocator_external_lib_list: str, list or None, default None
+        List of external libraries for which to set RMM as the allocator.
+        Supported options are: ``["torch", "cupy"]``. Can be a comma-separated string
+        (like ``"torch,cupy"``) or a list of strings (like ``["torch", "cupy"]``).
+        If ``None``, no external libraries will use RMM as their allocator.
     rmm_release_threshold: int, str or None, default None
         When ``rmm.async is True`` and the pool size grows beyond this value, unused
         memory held by the pool will be released at the next synchronization point.
@@ -231,6 +236,7 @@ def __init__(
         rmm_maximum_pool_size=None,
         rmm_managed_memory=False,
         rmm_async=False,
+        rmm_allocator_external_lib_list=None,
         rmm_release_threshold=None,
         rmm_log_directory=None,
         rmm_track_allocations=False,
@@ -265,6 +271,19 @@ def __init__(
             n_workers = len(CUDA_VISIBLE_DEVICES)
         if n_workers < 1:
             raise ValueError("Number of workers cannot be less than 1.")
+
+        if rmm_allocator_external_lib_list is not None:
+            if isinstance(rmm_allocator_external_lib_list, str):
+                rmm_allocator_external_lib_list = [
+                    v.strip() for v in rmm_allocator_external_lib_list.split(",")
+                ]
+            elif not isinstance(rmm_allocator_external_lib_list, list):
+                raise ValueError(
+                    "rmm_allocator_external_lib_list must be either a comma-separated "
+                    "string or a list of strings. Examples: 'torch,cupy' "
+                    "or ['torch', 'cupy']"
+                )
+
         # Set nthreads=1 when parsing mem_limit since it only depends on n_workers
         logger = logging.getLogger(__name__)
         self.memory_limit = parse_memory_limit(
@@ -284,6 +303,8 @@ def __init__(
         self.rmm_managed_memory = rmm_managed_memory
         self.rmm_async = rmm_async
         self.rmm_release_threshold = rmm_release_threshold
+        self.rmm_allocator_external_lib_list = rmm_allocator_external_lib_list
+
         if rmm_pool_size is not None or rmm_managed_memory or rmm_async:
             try:
                 import rmm  # noqa F401
@@ -437,6 +458,7 @@ def new_worker_spec(self):
                         release_threshold=self.rmm_release_threshold,
                         log_directory=self.rmm_log_directory,
                         track_allocations=self.rmm_track_allocations,
+                        external_lib_list=self.rmm_allocator_external_lib_list,
                     ),
                     PreImport(self.pre_import),
                     CUDFSetup(self.enable_cudf_spill, self.cudf_spill_stats),
diff --git a/dask_cuda/plugins.py b/dask_cuda/plugins.py
index 122f93ffa..cd1928af9 100644
--- a/dask_cuda/plugins.py
+++ b/dask_cuda/plugins.py
@@ -1,5 +1,6 @@
 import importlib
 import os
+from typing import Callable, Dict
 
 from distributed import WorkerPlugin
 
@@ -39,6 +40,7 @@ def __init__(
         release_threshold,
         log_directory,
         track_allocations,
+        external_lib_list,
     ):
         if initial_pool_size is None and maximum_pool_size is not None:
             raise ValueError(
@@ -61,6 +63,7 @@ def __init__(
         self.logging = log_directory is not None
         self.log_directory = log_directory
         self.rmm_track_allocations = track_allocations
+        self.external_lib_list = external_lib_list
 
     def setup(self, worker=None):
         if self.initial_pool_size is not None:
@@ -123,6 +126,70 @@ def setup(self, worker=None):
             mr = rmm.mr.get_current_device_resource()
             rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr))
 
+        if self.external_lib_list is not None:
+            for lib in self.external_lib_list:
+                enable_rmm_memory_for_library(lib)
+
+
+def enable_rmm_memory_for_library(lib_name: str) -> None:
+    """Enable RMM memory pool support for a specified third-party library.
+
+    This function allows the given library to utilize RMM's memory pool if it supports
+    integration with RMM. The library name is passed as a string argument, and if the
+    library is compatible, its memory allocator will be configured to use RMM.
+
+    Parameters
+    ----------
+    lib_name : str
+        The name of the third-party library to enable RMM memory pool support for.
+        Supported libraries are "cupy" and "torch".
+
+    Raises
+    ------
+    ValueError
+        If the library name is not supported or does not have RMM integration.
+    ImportError
+        If the required library is not installed.
+    """
+
+    # Mapping of supported libraries to their respective setup functions
+    setup_functions: Dict[str, Callable[[], None]] = {
+        "torch": _setup_rmm_for_torch,
+        "cupy": _setup_rmm_for_cupy,
+    }
+
+    if lib_name not in setup_functions:
+        supported_libs = ", ".join(setup_functions.keys())
+        raise ValueError(
+            f"The library '{lib_name}' is not supported for RMM integration. "
+            f"Supported libraries are: {supported_libs}."
+        )
+
+    # Call the setup function for the specified library
+    setup_functions[lib_name]()
+
+
+def _setup_rmm_for_torch() -> None:
+    try:
+        import torch
+    except ImportError as e:
+        raise ImportError("PyTorch is not installed.") from e
+
+    from rmm.allocators.torch import rmm_torch_allocator
+
+    torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
+
+
+def _setup_rmm_for_cupy() -> None:
+    try:
+        import cupy
+    except ImportError as e:
+        raise ImportError("CuPy is not installed.") from e
+
+    from rmm.allocators.cupy import rmm_cupy_allocator
+
+    cupy.cuda.set_allocator(rmm_cupy_allocator)
+
 
 class PreImport(WorkerPlugin):
     def __init__(self, libraries):
diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py
index ff4dbbae3..74596fe26 100644
--- a/dask_cuda/utils.py
+++ b/dask_cuda/utils.py
@@ -9,6 +9,7 @@
 from multiprocessing import cpu_count
 from typing import Optional
 
+import click
 import numpy as np
 import pynvml
 import toolz
@@ -764,3 +765,13 @@ def get_rmm_memory_resource_stack(mr) -> list:
         if isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
             return mr.allocation_counts["current_bytes"]
     return None
+
+
+class CommaSeparatedChoice(click.Choice):
+    def convert(self, value, param, ctx):
+        values = [v.strip() for v in value.split(",")]
+        for v in values:
+            if v not in self.choices:
+                choices_str = ", ".join(f"'{c}'" for c in self.choices)
+                self.fail(f"invalid choice(s): {v}. (choices are: {choices_str})")
+        return values

From dfcd399171cdaca93155fe7a1f47812db63c780c Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Mon, 14 Oct 2024 19:06:16 +0200
Subject: [PATCH 27/32] Reenable UCXX in CI (#1396)

UCXX CI tests were previously disabled due to instabilities (see https://github.com/rapidsai/dask-cuda/pull/1270#issuecomment-1806295358). UCXX should now be much more resilient, so we should reenable the tests in preparation for the permanent migration to UCXX.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/dask-cuda/pull/1396
---
 ci/test_python.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/test_python.sh b/ci/test_python.sh
index 339141728..18dd88cf1 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -52,7 +52,7 @@ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \
 UCXPY_IFNAME=eth0 \
 UCX_WARN_UNUSED_ENV_VARS=n \
 UCX_MEMTYPE_CACHE=n \
-timeout 60m pytest \
+timeout 90m pytest \
   -vv \
   --durations=50 \
   --capture=no \
@@ -62,7 +62,7 @@ timeout 60m pytest \
   --cov=dask_cuda \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage.xml" \
   --cov-report=term \
-  tests -k "not ucxx"
+  tests
 popd
 
 rapids-logger "pytest explicit-comms (legacy dd)"
@@ -73,7 +73,7 @@ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \
 UCXPY_IFNAME=eth0 \
 UCX_WARN_UNUSED_ENV_VARS=n \
 UCX_MEMTYPE_CACHE=n \
-timeout 30m pytest \
+timeout 60m pytest \
   -vv \
   --durations=50 \
   --capture=no \
@@ -83,7 +83,7 @@ timeout 30m pytest \
   --cov=dask_cuda \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage-legacy.xml" \
   --cov-report=term \
-  tests/test_explicit_comms.py -k "not ucxx"
+  tests/test_explicit_comms.py
 popd
 
 rapids-logger "Run local benchmark (dask-expr)"

From 0f78f5d23029313ecb3647faca6c28933b52d130 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Tue, 22 Oct 2024 23:39:51 +0200
Subject: [PATCH 28/32] Ignore legacy Dask dataframe warnings (#1397)

Ignore the warnings stating that the legacy Dask DataFrame implementation will soon be removed,
introduced in https://github.com/dask/dask/pull/11437 .

The warning is only raised for `DASK_DATAFRAME__QUERY_PLANNING=False` cases.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1397
---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index fcf572764..2266fb5b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -128,6 +128,9 @@ filterwarnings = [
     # is enabled in both dask-cudf and dask-cuda.
     # See: https://github.com/rapidsai/dask-cuda/issues/1311
     "ignore:Dask DataFrame implementation is deprecated:DeprecationWarning",
+    # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437
+    # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False`
+    "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning",
 ]
 
 [tool.rapids-build-backend]

From 4639a968bcbf9837085be5c8df40ef27d00bf009 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 23 Oct 2024 14:12:46 -0500
Subject: [PATCH 29/32] remove unnecessary cmake and sccache configuration
 (#1400)

Contributes to https://github.com/rapidsai/build-planning/issues/108

This is a pure Python project, so it doesn't need configuration about CMake or `sccache`.

This proposes removing them to simplify build scripts a bit.

It also proposes updating the `rapids-dependency-file-generator` pre-commit hook to its latest version, something I'm trying to roll out across RAPIDS as part of https://github.com/rapidsai/build-planning/issues/108.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/dask-cuda/pull/1400
---
 .pre-commit-config.yaml | 2 +-
 ci/build_python.sh      | 4 ----
 ci/build_wheel.sh       | 3 +--
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4707492ac..a2202df3b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -37,7 +37,7 @@ repos:
         hooks:
             - id: verify-alpha-spec
       - repo: https://github.com/rapidsai/dependency-file-generator
-        rev: v1.13.11
+        rev: v1.16.0
         hooks:
             - id: rapids-dependency-file-generator
               args: ["--clean"]
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 48cece328..c12a0dde8 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -5,12 +5,8 @@ set -euo pipefail
 
 rapids-configure-conda-channels
 
-source rapids-configure-sccache
-
 source rapids-date-string
 
-export CMAKE_GENERATOR=Ninja
-
 rapids-print-env
 
 rapids-generate-version > ./VERSION
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 828972dc2..91c572318 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -3,11 +3,10 @@
 
 set -euo pipefail
 
-source rapids-configure-sccache
 source rapids-date-string
 
 rapids-generate-version > ./VERSION
 
-python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check
 
 RAPIDS_PY_WHEEL_NAME="dask-cuda" rapids-upload-wheels-to-s3 dist

From fc80d43bf22db405fe123be8324aaee7978d4956 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 1 Nov 2024 13:02:58 -0500
Subject: [PATCH 30/32] Switch pytest `traceback` to `native` (#1389)

In cudf & cuml we have observed speedups of ~10% and ~20%, respectively, in pytest suite execution by switching the pytest traceback style to `--tb=native`:

```
currently:

102474 passed, 2117 skipped, 902 xfailed in 892.16s (0:14:52)

--tb=short:

102474 passed, 2117 skipped, 902 xfailed in 898.99s (0:14:58)

--tb=no:

102474 passed, 2117 skipped, 902 xfailed in 815.98s (0:13:35)

--tb=native:

102474 passed, 2117 skipped, 902 xfailed in 820.92s (0:13:40)
```

This PR makes similar change to `dask-cuda` repo.

xref: https://github.com/rapidsai/cudf/pull/16851

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/dask-cuda/pull/1389
---
 dask_cuda/tests/pytest.ini | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 dask_cuda/tests/pytest.ini

diff --git a/dask_cuda/tests/pytest.ini b/dask_cuda/tests/pytest.ini
new file mode 100644
index 000000000..7b0a9f29f
--- /dev/null
+++ b/dask_cuda/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native

From 233376d8f111e2571f745e6f31729db9bc2183ac Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Tue, 5 Nov 2024 16:37:40 +0100
Subject: [PATCH 31/32] Add warmup runs and profile all iterations to
 benchmarks (#1402)

Add support for initial warmup runs in benchmarks and allow profiling either all iterations or just the last one.

This is technically a breaking change: `--profile` now profiles all iterations, while the new `--profile-last` option profiles only the last one, which is how `--profile` used to behave.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)

URL: https://github.com/rapidsai/dask-cuda/pull/1402
---
 dask_cuda/benchmarks/common.py                | 21 ++++++++++++-----
 dask_cuda/benchmarks/local_cudf_groupby.py    | 13 +++--------
 dask_cuda/benchmarks/local_cudf_merge.py      |  8 +------
 dask_cuda/benchmarks/local_cudf_shuffle.py    | 13 +++--------
 dask_cuda/benchmarks/local_cupy.py            | 15 ++++--------
 .../benchmarks/local_cupy_map_overlap.py      | 15 ++++--------
 dask_cuda/benchmarks/utils.py                 | 23 ++++++++++++++++++-
 7 files changed, 52 insertions(+), 56 deletions(-)

diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py
index 7f48d4fae..49676fee1 100644
--- a/dask_cuda/benchmarks/common.py
+++ b/dask_cuda/benchmarks/common.py
@@ -1,3 +1,4 @@
+import contextlib
 from argparse import Namespace
 from functools import partial
 from typing import Any, Callable, List, Mapping, NamedTuple, Optional, Tuple
@@ -7,7 +8,7 @@
 import pandas as pd
 
 import dask
-from distributed import Client
+from distributed import Client, performance_report
 
 from dask_cuda.benchmarks.utils import (
     address_to_index,
@@ -87,12 +88,20 @@ def run_benchmark(client: Client, args: Namespace, config: Config):
 
     If ``args.profile`` is set, the final run is profiled.
     """
+
     results = []
-    for _ in range(max(1, args.runs) - 1):
-        res = config.bench_once(client, args, write_profile=None)
-        results.append(res)
-    results.append(config.bench_once(client, args, write_profile=args.profile))
-    return results
+    for _ in range(max(0, args.warmup_runs)):
+        config.bench_once(client, args, write_profile=None)
+
+    ctx = contextlib.nullcontext()
+    if args.profile is not None:
+        ctx = performance_report(filename=args.profile)
+    with ctx:
+        for _ in range(max(1, args.runs) - 1):
+            res = config.bench_once(client, args, write_profile=None)
+            results.append(res)
+        results.append(config.bench_once(client, args, write_profile=args.profile_last))
+        return results
 
 
 def gather_bench_results(client: Client, args: Namespace, config: Config):
diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py
index f094ff185..a9e7d833e 100644
--- a/dask_cuda/benchmarks/local_cudf_groupby.py
+++ b/dask_cuda/benchmarks/local_cudf_groupby.py
@@ -98,10 +98,9 @@ def bench_once(client, args, write_profile=None):
         "False": False,
     }.get(args.shuffle, args.shuffle)
 
-    if write_profile is None:
-        ctx = contextlib.nullcontext()
-    else:
-        ctx = performance_report(filename=args.profile)
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
     with ctx:
         t1 = clock()
@@ -260,12 +259,6 @@ def parse_args():
             "type": str,
             "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')",
         },
-        {
-            "name": "--runs",
-            "default": 3,
-            "type": int,
-            "help": "Number of runs",
-        },
     ]
 
     return parse_benchmark_args(
diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py
index e2b035204..6ebe005a7 100644
--- a/dask_cuda/benchmarks/local_cudf_merge.py
+++ b/dask_cuda/benchmarks/local_cudf_merge.py
@@ -190,7 +190,7 @@ def bench_once(client, args, write_profile=None):
     if args.backend == "explicit-comms":
         ctx1 = dask.config.set(explicit_comms=True)
     if write_profile is not None:
-        ctx2 = performance_report(filename=args.profile)
+        ctx2 = performance_report(filename=write_profile)
 
     with ctx1:
         with ctx2:
@@ -346,12 +346,6 @@ def parse_args():
             "action": "store_true",
             "help": "Don't shuffle the keys of the left (base) dataframe.",
         },
-        {
-            "name": "--runs",
-            "default": 3,
-            "type": int,
-            "help": "Number of runs",
-        },
         {
             "name": [
                 "-s",
diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py
index 25f42e59d..3a0955c4f 100644
--- a/dask_cuda/benchmarks/local_cudf_shuffle.py
+++ b/dask_cuda/benchmarks/local_cudf_shuffle.py
@@ -121,10 +121,9 @@ def create_data(
 def bench_once(client, args, write_profile=None):
     data_processed, df = create_data(client, args)
 
-    if write_profile is None:
-        ctx = contextlib.nullcontext()
-    else:
-        ctx = performance_report(filename=args.profile)
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
     with ctx:
         if args.backend in {"dask", "dask-noop"}:
@@ -228,12 +227,6 @@ def parse_args():
             "type": str,
             "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')",
         },
-        {
-            "name": "--runs",
-            "default": 3,
-            "type": int,
-            "help": "Number of runs",
-        },
         {
             "name": "--ignore-index",
             "action": "store_true",
diff --git a/dask_cuda/benchmarks/local_cupy.py b/dask_cuda/benchmarks/local_cupy.py
index c9c8fe1c1..ba88db30d 100644
--- a/dask_cuda/benchmarks/local_cupy.py
+++ b/dask_cuda/benchmarks/local_cupy.py
@@ -141,12 +141,11 @@ def bench_once(client, args, write_profile=None):
     chunksize = x.chunksize
     data_processed = sum(arg.nbytes for arg in func_args)
 
-    # Execute the operations to benchmark
-    if args.profile is not None and write_profile is not None:
-        ctx = performance_report(filename=args.profile)
-    else:
-        ctx = contextlib.nullcontext()
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
+    # Execute the operations to benchmark
     with ctx:
         rng = start_range(message=args.operation, color="purple")
         result = func(*func_args)
@@ -297,12 +296,6 @@ def parse_args():
             "type": int,
             "help": "Chunk size (default 2500).",
         },
-        {
-            "name": "--runs",
-            "default": 3,
-            "type": int,
-            "help": "Number of runs (default 3).",
-        },
         {
             "name": [
                 "-b",
diff --git a/dask_cuda/benchmarks/local_cupy_map_overlap.py b/dask_cuda/benchmarks/local_cupy_map_overlap.py
index 8b975a24a..ecefa52a1 100644
--- a/dask_cuda/benchmarks/local_cupy_map_overlap.py
+++ b/dask_cuda/benchmarks/local_cupy_map_overlap.py
@@ -42,12 +42,11 @@ def bench_once(client, args, write_profile=None):
 
     data_processed = x.nbytes
 
-    # Execute the operations to benchmark
-    if args.profile is not None and write_profile is not None:
-        ctx = performance_report(filename=args.profile)
-    else:
-        ctx = contextlib.nullcontext()
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
+    # Execute the operations to benchmark
     with ctx:
         result = x.map_overlap(mean_filter, args.kernel_size, shape=ks)
         if args.backend == "dask-noop":
@@ -168,12 +167,6 @@ def parse_args():
             "type": int,
             "help": "Kernel size, 2*k+1, in each dimension (default 1)",
         },
-        {
-            "name": "--runs",
-            "default": 3,
-            "type": int,
-            "help": "Number of runs",
-        },
         {
             "name": [
                 "-b",
diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py
index de7e2ae1d..4f87a0256 100644
--- a/dask_cuda/benchmarks/utils.py
+++ b/dask_cuda/benchmarks/utils.py
@@ -323,7 +323,16 @@ def parse_benchmark_args(
         metavar="PATH",
         default=None,
         type=str,
-        help="Write dask profile report (E.g. dask-report.html)",
+        help="Write dask profile report (E.g. dask-report.html) on all "
+        "iterations (excluding warmup).",
+    )
+    parser.add_argument(
+        "--profile-last",
+        metavar="PATH",
+        default=None,
+        type=str,
+        help="Write dask profile report (E.g. dask-report.html) on last "
+        "iteration only.",
     )
     # See save_benchmark_data for more information
     parser.add_argument(
@@ -344,6 +353,18 @@ def parse_benchmark_args(
         type=parse_bytes,
         help="Bandwidth statistics: ignore messages smaller than this (default '1 MB')",
     )
+    parser.add_argument(
+        "--runs",
+        default=3,
+        type=int,
+        help="Number of runs",
+    )
+    parser.add_argument(
+        "--warmup-runs",
+        default=1,
+        type=int,
+        help="Number of warmup runs",
+    )
 
     for args in args_list:
         name = args.pop("name")

From 9e7a926bc305f32aa0463a44eb96d4494fe55fc0 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 13 Nov 2024 09:13:11 -0600
Subject: [PATCH 32/32] enforce wheel size limits, README formatting in CI
 (#1404)

Contributes to https://github.com/rapidsai/build-planning/issues/110

Proposes adding 2 types of validation on wheels in CI, to ensure we continue to produce wheels that are suitable for PyPI.

* checks on wheel size (compressed),
  - *to be sure they're under PyPI limits*
  - *and to prompt discussion on PRs that significantly increase wheel sizes*
* checks on README formatting
  - *to ensure they'll render properly as the PyPI project homepages*
  - *e.g. like how https://github.com/scikit-learn/scikit-learn/blob/main/README.rst becomes https://pypi.org/project/scikit-learn/*

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/dask-cuda/pull/1404
---
 ci/build_wheel.sh    |  1 +
 ci/validate_wheel.sh | 18 ++++++++++++++++++
 pyproject.toml       |  8 ++++++++
 3 files changed, 27 insertions(+)
 create mode 100755 ci/validate_wheel.sh

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 91c572318..760e46e38 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -8,5 +8,6 @@ source rapids-date-string
 rapids-generate-version > ./VERSION
 
 python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check
+./ci/validate_wheel.sh dist
 
 RAPIDS_PY_WHEEL_NAME="dask-cuda" rapids-upload-wheels-to-s3 dist
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
new file mode 100755
index 000000000..60a80fce6
--- /dev/null
+++ b/ci/validate_wheel.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Usage: ci/validate_wheel.sh WHEEL_DIR -- runs pydistcheck and twine on WHEEL_DIR/*.whl
+set -euo pipefail
+
+wheel_dir_relative_path=${1:?"usage: validate_wheel.sh WHEEL_DIR"}
+
+rapids-logger "validate packages with 'pydistcheck'"
+# Glob is left outside the quotes so every wheel is passed as its own argument.
+pydistcheck \
+    --inspect \
+    "${wheel_dir_relative_path}"/*.whl
+
+rapids-logger "validate packages with 'twine'"
+# Glob is left outside the quotes so every wheel is passed as its own argument.
+twine check \
+    --strict \
+    "${wheel_dir_relative_path}"/*.whl
diff --git a/pyproject.toml b/pyproject.toml
index 2266fb5b5..7025ca4ef 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -152,3 +152,11 @@ exclude = [
     "docs.*",
     "tests.*",
 ]
+
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'