diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index cf10aa23f..000000000
--- a/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-dask_cuda/_version.py export-subst
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 9bfa630e1..be9daacfb 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,10 +1,14 @@
 #python code owners
 dask_cuda/ @rapidsai/daskcuda-python-codeowners

-#build/ops code owners
-.github/ @rapidsai/ops-codeowners
-ci/ @rapidsai/ops-codeowners
-conda/ @rapidsai/ops-codeowners
-**/Dockerfile @rapidsai/ops-codeowners
-**/.dockerignore @rapidsai/ops-codeowners
-dependencies.yaml @rapidsai/ops-codeowners
+#CI code owners
+/.github/ @rapidsai/ci-codeowners
+/ci/ @rapidsai/ci-codeowners
+/.pre-commit-config.yaml @rapidsai/ci-codeowners
+
+#packaging code owners
+/.devcontainer/ @rapidsai/packaging-codeowners
+/conda/ @rapidsai/packaging-codeowners
+/dependencies.yaml @rapidsai/packaging-codeowners
+/build.sh @rapidsai/packaging-codeowners
+pyproject.toml @rapidsai/packaging-codeowners
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
new file mode 100644
index 000000000..895ba83ee
--- /dev/null
+++ b/.github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
+# Configuration file for `copy-pr-bot` GitHub App
+# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
+
+enabled: true
diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml
index 2d1444c59..2ed5231ae 100644
--- a/.github/ops-bot.yaml
+++ b/.github/ops-bot.yaml
@@ -5,5 +5,5 @@ auto_merger: true
 branch_checker: true
 label_checker: true
 release_drafter: true
-copy_prs: true
 recently_updated: true
+forward_merger: true
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 59e188881..69b0de5f5 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -22,53 +22,60 @@ on:
         default: nightly

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: true

 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
   docs-build:
-    if: github.ref_type == 'branch' && github.event_name == 'push'
+    if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
-      build_type: branch
-      node_type: "gpu-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci:latest"
+      branch: ${{ inputs.branch }}
+      build_type: ${{ inputs.build_type || 'branch' }}
+      container_image: "rapidsai/ci-conda:latest"
+      date: ${{ inputs.date }}
+      node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
+      sha: ${{ inputs.sha }}
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
   wheel-build:
-    runs-on: ubuntu-latest
-    container:
-      image: rapidsai/ci:latest
-    defaults:
-      run:
-        shell: bash
- steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Build wheel - run: ci/build_python_pypi.sh - - name: Publish distribution 📦 to PyPI - if: inputs.build_type == 'nightly' - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel.sh + # Package is pure Python and only ever requires one build. + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]) | [.] + wheel-publish: + needs: wheel-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: dask-cuda diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 55117f774..1ddd5b5cc 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -6,6 +6,6 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: actions/labeler@main + - uses: actions/labeler@v4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index abcd0c66c..4e56d24d2 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,43 +18,38 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request - node_type: "gpu-latest-1" + node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build: - needs: checks - runs-on: ubuntu-latest - container: - image: rapidsai/ci:latest - defaults: - run: - shell: bash - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Build wheel - run: ci/build_python_pypi.sh + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + with: + build_type: pull-request + # Package is pure Python and only ever requires one build. 
+ # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]) | [.] + script: "ci/build_wheel.sh" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3a6641d81..7a884c5c6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 030c454b6..b10be12af 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,6 @@ repos: (?x)^( .*test.*| ^CHANGELOG.md$| - ^.*versioneer.py$ ) - repo: https://github.com/pre-commit/mirrors-mypy rev: 'v0.991' @@ -33,6 +32,11 @@ repos: additional_dependencies: [types-cachetools] args: ["--module=dask_cuda", "--ignore-missing-imports"] pass_filenames: false + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.13.11 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] default_language_version: python: python3 diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index fd5ccf688..000000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,7 +0,0 @@ -version: 2 - -sphinx: - configuration: rtd/conf.py - -formats: - - htmlzip diff --git a/CHANGELOG.md b/CHANGELOG.md index f82b7e59d..3ea704c1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,234 @@ +# dask-cuda 24.06.00 (5 Jun 2024) + +## 🐛 Bug Fixes + +- Fix license name ([#1337](https://github.com/rapidsai/dask-cuda/pull/1337)) [@raydouglass](https://github.com/raydouglass) +- Skip TCP-only DGX tests with UCX 1.16 ([#1331](https://github.com/rapidsai/dask-cuda/pull/1331)) [@pentschev](https://github.com/pentschev) +- Update explicit-comms for dask-expr support ([#1323](https://github.com/rapidsai/dask-cuda/pull/1323)) [@rjzamora](https://github.com/rjzamora) +- Skip explicit-comms tests when dask-expr is active ([#1322](https://github.com/rapidsai/dask-cuda/pull/1322)) [@rjzamora](https://github.com/rjzamora) +- Relax type-check in ``test_proxy.py`` ([#1321](https://github.com/rapidsai/dask-cuda/pull/1321)) [@rjzamora](https://github.com/rjzamora) + +## 📖 Documentation + +- Fix broken links in docs ([#1329](https://github.com/rapidsai/dask-cuda/pull/1329)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- remove 'tomli' dependency ([#1338](https://github.com/rapidsai/dask-cuda/pull/1338)) [@jameslamb](https://github.com/jameslamb) +- Trap CI test errors with their original exit codes ([#1330](https://github.com/rapidsai/dask-cuda/pull/1330)) [@pentschev](https://github.com/pentschev) +- Prevent path conflict in builds ([#1325](https://github.com/rapidsai/dask-cuda/pull/1325)) [@AyodeAwe](https://github.com/AyodeAwe) + +# dask-cuda 24.04.00 (10 Apr 2024) + +## 🐛 Bug Fixes + +- handle more RAPIDS version formats in update-version.sh ([#1307](https://github.com/rapidsai/dask-cuda/pull/1307)) [@jameslamb](https://github.com/jameslamb) + +## 🚀 New Features + +- Allow using pandas 2 ([#1308](https://github.com/rapidsai/dask-cuda/pull/1308)) [@vyasr](https://github.com/vyasr) +- Support CUDA 12.2 ([#1302](https://github.com/rapidsai/dask-cuda/pull/1302)) 
[@jameslamb](https://github.com/jameslamb) + +## 🛠️ Improvements + +- Use `conda env create --yes` instead of `--force` ([#1326](https://github.com/rapidsai/dask-cuda/pull/1326)) [@bdice](https://github.com/bdice) +- Add upper bound to prevent usage of NumPy 2 ([#1320](https://github.com/rapidsai/dask-cuda/pull/1320)) [@bdice](https://github.com/bdice) +- Generalize GHA selectors for pure Python testing ([#1318](https://github.com/rapidsai/dask-cuda/pull/1318)) [@jakirkham](https://github.com/jakirkham) +- Requre NumPy 1.23+ ([#1316](https://github.com/rapidsai/dask-cuda/pull/1316)) [@jakirkham](https://github.com/jakirkham) +- Add support for Python 3.11 ([#1315](https://github.com/rapidsai/dask-cuda/pull/1315)) [@jameslamb](https://github.com/jameslamb) +- target branch-24.04 for GitHub Actions workflows ([#1314](https://github.com/rapidsai/dask-cuda/pull/1314)) [@jameslamb](https://github.com/jameslamb) +- Filter dd deprecation ([#1312](https://github.com/rapidsai/dask-cuda/pull/1312)) [@rjzamora](https://github.com/rjzamora) +- Update ops-bot.yaml ([#1310](https://github.com/rapidsai/dask-cuda/pull/1310)) [@AyodeAwe](https://github.com/AyodeAwe) + +# dask-cuda 24.02.00 (12 Feb 2024) + +## 🚨 Breaking Changes + +- Publish nightly wheels to NVIDIA index instead of PyPI ([#1294](https://github.com/rapidsai/dask-cuda/pull/1294)) [@pentschev](https://github.com/pentschev) + +## 🐛 Bug Fixes + +- Fix get_device_memory_ids ([#1305](https://github.com/rapidsai/dask-cuda/pull/1305)) [@wence-](https://github.com/wence-) +- Prevent double UCX initialization in `test_dgx` ([#1301](https://github.com/rapidsai/dask-cuda/pull/1301)) [@pentschev](https://github.com/pentschev) +- Update to Dask's `shuffle_method` kwarg ([#1300](https://github.com/rapidsai/dask-cuda/pull/1300)) [@pentschev](https://github.com/pentschev) +- Add timeout to `test_dask_use_explicit_comms` ([#1298](https://github.com/rapidsai/dask-cuda/pull/1298)) [@pentschev](https://github.com/pentschev) +- Publish nightly wheels to NVIDIA index instead of PyPI ([#1294](https://github.com/rapidsai/dask-cuda/pull/1294)) [@pentschev](https://github.com/pentschev) +- Make versions PEP440 compliant ([#1279](https://github.com/rapidsai/dask-cuda/pull/1279)) [@vyasr](https://github.com/vyasr) +- Generate pyproject.toml with dfg ([#1276](https://github.com/rapidsai/dask-cuda/pull/1276)) [@vyasr](https://github.com/vyasr) +- Fix rapids dask dependency version ([#1275](https://github.com/rapidsai/dask-cuda/pull/1275)) [@vyasr](https://github.com/vyasr) + +## 🛠️ Improvements + +- Remove usages of rapids-env-update ([#1304](https://github.com/rapidsai/dask-cuda/pull/1304)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- refactor CUDA versions in dependencies.yaml ([#1303](https://github.com/rapidsai/dask-cuda/pull/1303)) [@jameslamb](https://github.com/jameslamb) +- Start generating conda test environments ([#1291](https://github.com/rapidsai/dask-cuda/pull/1291)) [@charlesbluca](https://github.com/charlesbluca) +- Branch 24.02 merge branch 23.12 ([#1286](https://github.com/rapidsai/dask-cuda/pull/1286)) [@vyasr](https://github.com/vyasr) + +# dask-cuda 23.12.00 (6 Dec 2023) + +## 🐛 Bug Fixes + +- Update actions/labeler to v4 ([#1292](https://github.com/rapidsai/dask-cuda/pull/1292)) [@raydouglass](https://github.com/raydouglass) +- Increase Nanny close timeout for `test_spilling_local_cuda_cluster` ([#1289](https://github.com/rapidsai/dask-cuda/pull/1289)) [@pentschev](https://github.com/pentschev) +- Fix path 
([#1277](https://github.com/rapidsai/dask-cuda/pull/1277)) [@vyasr](https://github.com/vyasr) +- Add missing alpha spec ([#1273](https://github.com/rapidsai/dask-cuda/pull/1273)) [@vyasr](https://github.com/vyasr) +- Set minimum click to 8.1 ([#1272](https://github.com/rapidsai/dask-cuda/pull/1272)) [@jacobtomlinson](https://github.com/jacobtomlinson) +- Reenable tests that were segfaulting ([#1266](https://github.com/rapidsai/dask-cuda/pull/1266)) [@pentschev](https://github.com/pentschev) +- Increase close timeout of `Nanny` in `LocalCUDACluster` ([#1260](https://github.com/rapidsai/dask-cuda/pull/1260)) [@pentschev](https://github.com/pentschev) +- Small reorganization and fixes for `test_spill` ([#1255](https://github.com/rapidsai/dask-cuda/pull/1255)) [@pentschev](https://github.com/pentschev) +- Update plugins to inherit from ``WorkerPlugin`` ([#1230](https://github.com/rapidsai/dask-cuda/pull/1230)) [@jrbourbeau](https://github.com/jrbourbeau) + +## 🚀 New Features + +- Add support for UCXX ([#1268](https://github.com/rapidsai/dask-cuda/pull/1268)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- Fix license ([#1285](https://github.com/rapidsai/dask-cuda/pull/1285)) [@vyasr](https://github.com/vyasr) +- Build concurrency for nightly and merge triggers ([#1282](https://github.com/rapidsai/dask-cuda/pull/1282)) [@bdice](https://github.com/bdice) +- Use new `rapids-dask-dependency` metapackage for managing dask versions ([#1270](https://github.com/rapidsai/dask-cuda/pull/1270)) [@galipremsagar](https://github.com/galipremsagar) +- Remove `ucp.reset()` requirement from `test_dgx` ([#1269](https://github.com/rapidsai/dask-cuda/pull/1269)) [@pentschev](https://github.com/pentschev) +- Generate proper, consistent nightly versions for pip and conda packages ([#1267](https://github.com/rapidsai/dask-cuda/pull/1267)) [@galipremsagar](https://github.com/galipremsagar) +- Unpin `dask` and `distributed` for `23.12` development ([#1264](https://github.com/rapidsai/dask-cuda/pull/1264)) [@galipremsagar](https://github.com/galipremsagar) +- Move some `dask_cuda.utils` pieces to their own modules ([#1263](https://github.com/rapidsai/dask-cuda/pull/1263)) [@pentschev](https://github.com/pentschev) +- Update `shared-action-workflows` references ([#1261](https://github.com/rapidsai/dask-cuda/pull/1261)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use branch-23.12 workflows. ([#1259](https://github.com/rapidsai/dask-cuda/pull/1259)) [@bdice](https://github.com/bdice) +- dask-cuda: Build CUDA 12.0 ARM conda packages. 
([#1238](https://github.com/rapidsai/dask-cuda/pull/1238)) [@bdice](https://github.com/bdice) + +# dask-cuda 23.10.00 (11 Oct 2023) + +## 🐛 Bug Fixes + +- Monkeypatch protocol.loads ala dask/distributed#8216 ([#1247](https://github.com/rapidsai/dask-cuda/pull/1247)) [@wence-](https://github.com/wence-) +- Explicit-comms: preserve partition IDs ([#1240](https://github.com/rapidsai/dask-cuda/pull/1240)) [@madsbk](https://github.com/madsbk) +- Increase test timeouts further to reduce CI failures ([#1234](https://github.com/rapidsai/dask-cuda/pull/1234)) [@pentschev](https://github.com/pentschev) +- Use `conda mambabuild` not `mamba mambabuild` ([#1231](https://github.com/rapidsai/dask-cuda/pull/1231)) [@bdice](https://github.com/bdice) +- Increate timeouts of tests that frequently timeout in CI ([#1228](https://github.com/rapidsai/dask-cuda/pull/1228)) [@pentschev](https://github.com/pentschev) +- Adapt to non-string task keys in distributed ([#1225](https://github.com/rapidsai/dask-cuda/pull/1225)) [@wence-](https://github.com/wence-) +- Update `test_worker_timeout` ([#1223](https://github.com/rapidsai/dask-cuda/pull/1223)) [@pentschev](https://github.com/pentschev) +- Avoid importing `loads_function` from distributed ([#1220](https://github.com/rapidsai/dask-cuda/pull/1220)) [@rjzamora](https://github.com/rjzamora) + +## 🚀 New Features + +- Enable maximum pool size for RMM async allocator ([#1221](https://github.com/rapidsai/dask-cuda/pull/1221)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for `23.10` release ([#1251](https://github.com/rapidsai/dask-cuda/pull/1251)) [@galipremsagar](https://github.com/galipremsagar) +- Update `test_spill.py` to avoid `FutureWarning`s ([#1243](https://github.com/rapidsai/dask-cuda/pull/1243)) [@pentschev](https://github.com/pentschev) +- Remove obsolete pytest `filterwarnings` ([#1241](https://github.com/rapidsai/dask-cuda/pull/1241)) [@pentschev](https://github.com/pentschev) +- Update image names ([#1233](https://github.com/rapidsai/dask-cuda/pull/1233)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use `copy-pr-bot` ([#1227](https://github.com/rapidsai/dask-cuda/pull/1227)) [@ajschmidt8](https://github.com/ajschmidt8) +- Unpin `dask` and `distributed` for `23.10` development ([#1222](https://github.com/rapidsai/dask-cuda/pull/1222)) [@galipremsagar](https://github.com/galipremsagar) + +# dask-cuda 23.08.00 (9 Aug 2023) + +## 🐛 Bug Fixes + +- Ensure plugin config can be passed from worker to client ([#1212](https://github.com/rapidsai/dask-cuda/pull/1212)) [@wence-](https://github.com/wence-) +- Adjust to new `get_default_shuffle_method` name ([#1200](https://github.com/rapidsai/dask-cuda/pull/1200)) [@pentschev](https://github.com/pentschev) +- Increase minimum timeout to wait for workers in CI ([#1192](https://github.com/rapidsai/dask-cuda/pull/1192)) [@pentschev](https://github.com/pentschev) + +## 📖 Documentation + +- Remove RTD configuration and references to RTD page ([#1211](https://github.com/rapidsai/dask-cuda/pull/1211)) [@charlesbluca](https://github.com/charlesbluca) +- Clarify `memory_limit` docs ([#1207](https://github.com/rapidsai/dask-cuda/pull/1207)) [@pentschev](https://github.com/pentschev) + +## 🚀 New Features + +- Remove versioneer ([#1204](https://github.com/rapidsai/dask-cuda/pull/1204)) [@pentschev](https://github.com/pentschev) +- Remove code for Distributed<2023.5.1 compatibility ([#1191](https://github.com/rapidsai/dask-cuda/pull/1191)) 
[@pentschev](https://github.com/pentschev) +- Specify disk spill compression based on Dask config ([#1190](https://github.com/rapidsai/dask-cuda/pull/1190)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for `23.08` release ([#1214](https://github.com/rapidsai/dask-cuda/pull/1214)) [@galipremsagar](https://github.com/galipremsagar) +- Revert CUDA 12.0 CI workflows to branch-23.08. ([#1210](https://github.com/rapidsai/dask-cuda/pull/1210)) [@bdice](https://github.com/bdice) +- Use minimal Numba dependencies for CUDA 12 ([#1209](https://github.com/rapidsai/dask-cuda/pull/1209)) [@jakirkham](https://github.com/jakirkham) +- Aggregate reads & writes in `disk_io` ([#1205](https://github.com/rapidsai/dask-cuda/pull/1205)) [@jakirkham](https://github.com/jakirkham) +- CUDA 12 Support ([#1201](https://github.com/rapidsai/dask-cuda/pull/1201)) [@quasiben](https://github.com/quasiben) +- Remove explicit UCX config from tests ([#1199](https://github.com/rapidsai/dask-cuda/pull/1199)) [@pentschev](https://github.com/pentschev) +- use rapids-upload-docs script ([#1194](https://github.com/rapidsai/dask-cuda/pull/1194)) [@AyodeAwe](https://github.com/AyodeAwe) +- Unpin `dask` and `distributed` for development ([#1189](https://github.com/rapidsai/dask-cuda/pull/1189)) [@galipremsagar](https://github.com/galipremsagar) +- Remove documentation build scripts for Jenkins ([#1187](https://github.com/rapidsai/dask-cuda/pull/1187)) [@ajschmidt8](https://github.com/ajschmidt8) +- Use KvikIO in Dask-CUDA ([#925](https://github.com/rapidsai/dask-cuda/pull/925)) [@jakirkham](https://github.com/jakirkham) + +# dask-cuda 23.06.00 (7 Jun 2023) + +## 🚨 Breaking Changes + +- Update minimum Python version to Python 3.9 ([#1164](https://github.com/rapidsai/dask-cuda/pull/1164)) [@shwina](https://github.com/shwina) + +## 🐛 Bug Fixes + +- Increase pytest CI timeout ([#1196](https://github.com/rapidsai/dask-cuda/pull/1196)) [@pentschev](https://github.com/pentschev) +- Increase minimum timeout to wait for workers in CI ([#1193](https://github.com/rapidsai/dask-cuda/pull/1193)) [@pentschev](https://github.com/pentschev) +- Disable `np.bool` deprecation warning ([#1182](https://github.com/rapidsai/dask-cuda/pull/1182)) [@pentschev](https://github.com/pentschev) +- Always upload on branch/nightly builds ([#1177](https://github.com/rapidsai/dask-cuda/pull/1177)) [@raydouglass](https://github.com/raydouglass) +- Workaround for `DeviceHostFile` tests with CuPy>=12.0.0 ([#1175](https://github.com/rapidsai/dask-cuda/pull/1175)) [@pentschev](https://github.com/pentschev) +- Temporarily relax Python constraint ([#1166](https://github.com/rapidsai/dask-cuda/pull/1166)) [@vyasr](https://github.com/vyasr) + +## 📖 Documentation + +- [doc] Add document about main guard. 
([#1157](https://github.com/rapidsai/dask-cuda/pull/1157)) [@trivialfis](https://github.com/trivialfis) + +## 🚀 New Features + +- Require Numba 0.57.0+ ([#1185](https://github.com/rapidsai/dask-cuda/pull/1185)) [@jakirkham](https://github.com/jakirkham) +- Revert "Temporarily relax Python constraint" ([#1171](https://github.com/rapidsai/dask-cuda/pull/1171)) [@vyasr](https://github.com/vyasr) +- Update to zict 3.0 ([#1160](https://github.com/rapidsai/dask-cuda/pull/1160)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- Add `__main__` entrypoint to dask-cuda-worker CLI ([#1181](https://github.com/rapidsai/dask-cuda/pull/1181)) [@hmacdope](https://github.com/hmacdope) +- run docs nightly too ([#1176](https://github.com/rapidsai/dask-cuda/pull/1176)) [@AyodeAwe](https://github.com/AyodeAwe) +- Fix GHAs Workflows ([#1172](https://github.com/rapidsai/dask-cuda/pull/1172)) [@ajschmidt8](https://github.com/ajschmidt8) +- Remove `matrix_filter` from workflows ([#1168](https://github.com/rapidsai/dask-cuda/pull/1168)) [@charlesbluca](https://github.com/charlesbluca) +- Revert to branch-23.06 for shared-action-workflows ([#1167](https://github.com/rapidsai/dask-cuda/pull/1167)) [@shwina](https://github.com/shwina) +- Update minimum Python version to Python 3.9 ([#1164](https://github.com/rapidsai/dask-cuda/pull/1164)) [@shwina](https://github.com/shwina) +- Remove usage of rapids-get-rapids-version-from-git ([#1163](https://github.com/rapidsai/dask-cuda/pull/1163)) [@jjacobelli](https://github.com/jjacobelli) +- Use ARC V2 self-hosted runners for GPU jobs ([#1159](https://github.com/rapidsai/dask-cuda/pull/1159)) [@jjacobelli](https://github.com/jjacobelli) + +# dask-cuda 23.04.00 (6 Apr 2023) + +## 🚨 Breaking Changes + +- Pin `dask` and `distributed` for release ([#1153](https://github.com/rapidsai/dask-cuda/pull/1153)) [@galipremsagar](https://github.com/galipremsagar) +- Update minimum `pandas` and `numpy` pinnings ([#1139](https://github.com/rapidsai/dask-cuda/pull/1139)) [@galipremsagar](https://github.com/galipremsagar) + +## 🐛 Bug Fixes + +- Rectify `dask-core` pinning in pip requirements ([#1155](https://github.com/rapidsai/dask-cuda/pull/1155)) [@galipremsagar](https://github.com/galipremsagar) +- Monkey patching all locations of `get_default_shuffle_algorithm` ([#1142](https://github.com/rapidsai/dask-cuda/pull/1142)) [@madsbk](https://github.com/madsbk) +- Update usage of `get_worker()` in tests ([#1141](https://github.com/rapidsai/dask-cuda/pull/1141)) [@pentschev](https://github.com/pentschev) +- Update `rmm_cupy_allocator` usage ([#1138](https://github.com/rapidsai/dask-cuda/pull/1138)) [@jakirkham](https://github.com/jakirkham) +- Serialize of `ProxyObject` to pickle fixed attributes ([#1137](https://github.com/rapidsai/dask-cuda/pull/1137)) [@madsbk](https://github.com/madsbk) +- Explicit-comms: update monkey patching of Dask ([#1135](https://github.com/rapidsai/dask-cuda/pull/1135)) [@madsbk](https://github.com/madsbk) +- Fix for bytes/str discrepancy after PyNVML update ([#1118](https://github.com/rapidsai/dask-cuda/pull/1118)) [@pentschev](https://github.com/pentschev) + +## 🚀 New Features + +- Allow specifying dashboard address in benchmarks ([#1147](https://github.com/rapidsai/dask-cuda/pull/1147)) [@pentschev](https://github.com/pentschev) +- Add argument to enable RMM alloaction tracking in benchmarks ([#1145](https://github.com/rapidsai/dask-cuda/pull/1145)) [@pentschev](https://github.com/pentschev) +- Reinstate `--death-timeout` CLI option 
([#1140](https://github.com/rapidsai/dask-cuda/pull/1140)) [@charlesbluca](https://github.com/charlesbluca) +- Extend RMM async allocation support ([#1116](https://github.com/rapidsai/dask-cuda/pull/1116)) [@pentschev](https://github.com/pentschev) +- Allow using stream-ordered and managed RMM allocators in benchmarks ([#1012](https://github.com/rapidsai/dask-cuda/pull/1012)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for release ([#1153](https://github.com/rapidsai/dask-cuda/pull/1153)) [@galipremsagar](https://github.com/galipremsagar) +- Update minimum `pandas` and `numpy` pinnings ([#1139](https://github.com/rapidsai/dask-cuda/pull/1139)) [@galipremsagar](https://github.com/galipremsagar) +- Drop Python 3.7 handling for pickle protocol 4 ([#1132](https://github.com/rapidsai/dask-cuda/pull/1132)) [@jakirkham](https://github.com/jakirkham) +- Adapt to rapidsai/rmm#1221 which moves allocator callbacks ([#1129](https://github.com/rapidsai/dask-cuda/pull/1129)) [@wence-](https://github.com/wence-) +- Merge `branch-23.02` into `branch-23.04` ([#1128](https://github.com/rapidsai/dask-cuda/pull/1128)) [@ajschmidt8](https://github.com/ajschmidt8) +- Template Conda recipe's `about` metadata ([#1121](https://github.com/rapidsai/dask-cuda/pull/1121)) [@jakirkham](https://github.com/jakirkham) +- Fix GHA build workflow ([#1120](https://github.com/rapidsai/dask-cuda/pull/1120)) [@AjayThorve](https://github.com/AjayThorve) +- Reduce error handling verbosity in CI tests scripts ([#1113](https://github.com/rapidsai/dask-cuda/pull/1113)) [@AjayThorve](https://github.com/AjayThorve) +- Update shared workflow branches ([#1112](https://github.com/rapidsai/dask-cuda/pull/1112)) [@ajschmidt8](https://github.com/ajschmidt8) +- Remove gpuCI scripts. ([#1111](https://github.com/rapidsai/dask-cuda/pull/1111)) [@bdice](https://github.com/bdice) +- Unpin `dask` and `distributed` for development ([#1110](https://github.com/rapidsai/dask-cuda/pull/1110)) [@galipremsagar](https://github.com/galipremsagar) +- Move date to build string in `conda` recipe ([#1103](https://github.com/rapidsai/dask-cuda/pull/1103)) [@ajschmidt8](https://github.com/ajschmidt8) + # dask-cuda 23.02.00 (9 Feb 2023) ## 🚨 Breaking Changes diff --git a/MANIFEST.in b/MANIFEST.in index 344d51cc8..d97770d06 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1,2 @@ include dask_cuda/_version.py +include dask_cuda/VERSION diff --git a/README.md b/README.md index da343f7c2..7d42cef77 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -[![RTD](https://readthedocs.org/projects/dask-cuda/badge/?version=latest)](https://dask-cuda.readthedocs.io/en/latest/?badge=latest) - Dask CUDA ========= @@ -20,7 +18,7 @@ cluster = LocalCUDACluster() client = Client(cluster) ``` -Documentation is available [here](https://dask-cuda.readthedocs.io/). +Documentation is available [here](https://docs.rapids.ai/api/dask-cuda/nightly/). What this is not ---------------- @@ -32,4 +30,4 @@ systems. Parallelizing GPU libraries like [RAPIDS](https://rapids.ai) and [CuPy](https://cupy.chainer.org) with Dask is an ongoing effort. You may wish to read about this effort at [blog.dask.org](https://blog.dask.org) for more information. Additional information about Dask-CUDA can also be found in the -[docs]( https://dask-cuda.readthedocs.io ). +[docs](https://docs.rapids.ai/api/dask-cuda/nightly/). 
diff --git a/VERSION b/VERSION
new file mode 100644
index 000000000..ec8489fda
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+24.08.00
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 338ff974c..c2a65a414 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -7,10 +7,10 @@ rapids-logger "Create test conda environment"

 rapids-dependency-file-generator \
   --output conda \
-  --file_key docs \
+  --file-key docs \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml

-rapids-mamba-retry env create --force -f env.yaml -n docs
+rapids-mamba-retry env create --yes -f env.yaml -n docs
 conda activate docs

 rapids-print-env
@@ -18,21 +18,19 @@ rapids-print-env

 rapids-logger "Downloading artifacts from previous jobs"
 PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
-VERSION_NUMBER=$(rapids-get-rapids-version-from-git)

 rapids-mamba-retry install \
   --channel "${PYTHON_CHANNEL}" \
   dask-cuda

-# Build Python docs
+export RAPIDS_VERSION_NUMBER="24.08"
+export RAPIDS_DOCS_DIR="$(mktemp -d)"
+
 rapids-logger "Build Python docs"
 pushd docs
 sphinx-build -b dirhtml ./source _html
-sphinx-build -b text ./source _text
+mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"html
+mv _html/* "${RAPIDS_DOCS_DIR}/dask-cuda/html"
 popd

-if [[ "${RAPIDS_BUILD_TYPE}" == "branch" ]]; then
-  rapids-logger "Upload Docs to S3"
-  aws s3 sync --no-progress --delete docs/_html "s3://rapidsai-docs/dask-cuda/${VERSION_NUMBER}/html"
-  aws s3 sync --no-progress --delete docs/_text "s3://rapidsai-docs/dask-cuda/${VERSION_NUMBER}/txt"
-fi
+rapids-upload-docs
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 4124a4c5a..48cece328 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -3,15 +3,22 @@
 set -euo pipefail

-source rapids-env-update
+rapids-configure-conda-channels
+
+source rapids-configure-sccache
+
+source rapids-date-string

 export CMAKE_GENERATOR=Ninja

 rapids-print-env

+rapids-generate-version > ./VERSION
+
 rapids-logger "Begin py build"
+conda config --set path_conflict prevent

-rapids-mamba-retry mambabuild \
+RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   conda/recipes/dask-cuda

 rapids-upload-conda-to-s3 python
diff --git a/ci/build_python_pypi.sh b/ci/build_python_pypi.sh
deleted file mode 100755
index 5fea926cd..000000000
--- a/ci/build_python_pypi.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-
-python -m pip install build --user
-
-# While conda provides these during conda-build, they are also necessary during
-# the setup.py build for PyPI
-export GIT_DESCRIBE_TAG=$(git describe --abbrev=0 --tags)
-export GIT_DESCRIBE_NUMBER=$(git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count)
-
-# Compute/export VERSION_SUFFIX
-source rapids-env-update
-
-python -m build \
-    --sdist \
-    --wheel \
-    --outdir dist/ \
-    .
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
new file mode 100755
index 000000000..828972dc2
--- /dev/null
+++ b/ci/build_wheel.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+source rapids-configure-sccache
+source rapids-date-string
+
+rapids-generate-version > ./VERSION
+
+python -m pip wheel .
-w dist -vvv --no-deps --disable-pip-version-check + +RAPIDS_PY_WHEEL_NAME="dask-cuda" rapids-upload-wheels-to-s3 dist diff --git a/ci/check_style.sh b/ci/check_style.sh index be3ac3f4b..f8bc16525 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -8,10 +8,10 @@ rapids-logger "Create checks conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key checks \ + --file-key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml -rapids-mamba-retry env create --force -f env.yaml -n checks +rapids-mamba-retry env create --yes -f env.yaml -n checks conda activate checks # Run pre-commit checks diff --git a/ci/docs/build.sh b/ci/docs/build.sh deleted file mode 100644 index 55e8041ce..000000000 --- a/ci/docs/build.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. -###################################### -# Dask-CUDA Docs build script for CI # -###################################### - -if [ -z "$PROJECT_WORKSPACE" ]; then - echo ">>>> ERROR: Could not detect PROJECT_WORKSPACE in environment" - echo ">>>> WARNING: This script contains git commands meant for automated building, do not run locally" - exit 1 -fi - -export DOCS_WORKSPACE=$WORKSPACE/docs -export PATH=/conda/bin:/usr/local/cuda/bin:$PATH -export HOME=$WORKSPACE -export PROJECT_WORKSPACE=/rapids/dask-cuda -export PROJECTS=(dask-cuda) - -gpuci_logger "Check environment..." -env - -gpuci_logger "Check GPU usage..." -nvidia-smi - -gpuci_logger "Activate conda env..." -. /opt/conda/etc/profile.d/conda.sh -conda activate rapids - -gpuci_logger "Check versions..." -python --version -$CC --version -$CXX --version -conda info -conda config --show-sources -conda list --show-channel-urls - -# Dask-CUDA Sphinx build -gpuci_logger "Build Dask-CUDA docs..." -cd $PROJECT_WORKSPACE/docs -make html - -# commit to website -cd $DOCS_WORKSPACE - -if [ ! 
-d "api/dask-cuda/$BRANCH_VERSION" ]; then - mkdir -p api/dask-cuda/$BRANCH_VERSION -fi -rm -rf $DOCS_WORKSPACE/api/dask-cuda/$BRANCH_VERSION/* - -mv $PROJECT_WORKSPACE/docs/build/html/* $DOCS_WORKSPACE/api/dask-cuda/$BRANCH_VERSION diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index b73037951..ac834e5e8 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -22,6 +22,7 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} +NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") NEXT_UCXPY_VERSION="$(curl -s https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -31,12 +32,32 @@ function sed_runner() { sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak } -# Bump cudf and dask-cudf testing dependencies -sed_runner "s/cudf=.*/cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml -sed_runner "s/dask-cudf=.*/dask-cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml -sed_runner "s/cucim=.*/cucim=${NEXT_SHORT_TAG}/g" dependencies.yaml -sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml +# Centralized version file update +echo "${NEXT_FULL_TAG}" | tr -d '"' > VERSION + +# Bump testing dependencies +sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" dependencies.yaml +sed_runner "s/ucxx==.*/ucxx==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" dependencies.yaml + +DEPENDENCIES=( + cudf + dask-cudf + kvikio + rapids-dask-dependency +) +for FILE in dependencies.yaml conda/environments/*.yaml; do + for DEP in "${DEPENDENCIES[@]}"; do + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" + done +done +# CI files for FILE in .github/workflows/*.yaml; do - sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" + sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" +done +sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh + +# Docs referencing source code +find docs/source/ -type f -name *.rst -print0 | while IFS= read -r -d '' filename; do + sed_runner "s|/branch-[^/]*/|/branch-${NEXT_SHORT_TAG}/|g" "${filename}" done diff --git a/ci/test_python.sh b/ci/test_python.sh index b9610bcaf..78330a403 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -8,10 +8,10 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ --output conda \ - --file_key test_python \ + --file-key test_python \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml -rapids-mamba-retry env create --force -f env.yaml -n test +rapids-mamba-retry env create --yes -f env.yaml -n test # Temporarily allow unbound variables for conda activation. set +u @@ -35,17 +35,24 @@ rapids-logger "Check GPU usage" nvidia-smi EXITCODE=0 -trap "EXITCODE=1" ERR +set_exit_code() { + EXITCODE=$? 
+ rapids-logger "Test failed with error ${EXITCODE}" +} +trap set_exit_code ERR set +e -rapids-logger "pytest dask-cuda" +rapids-logger "pytest dask-cuda (dask-expr)" pushd dask_cuda +DASK_DATAFRAME__QUERY_PLANNING=True \ DASK_CUDA_TEST_SINGLE_GPU=1 \ +DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ UCXPY_IFNAME=eth0 \ UCX_WARN_UNUSED_ENV_VARS=n \ UCX_MEMTYPE_CACHE=n \ -timeout 30m pytest \ +timeout 60m pytest \ -vv \ + --durations=0 \ --capture=no \ --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda.xml" \ @@ -53,21 +60,112 @@ timeout 30m pytest \ --cov=dask_cuda \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage.xml" \ --cov-report=term \ - tests + tests -k "not ucxx" +popd + +rapids-logger "pytest explicit-comms (legacy dd)" +pushd dask_cuda +DASK_DATAFRAME__QUERY_PLANNING=False \ +DASK_CUDA_TEST_SINGLE_GPU=1 \ +DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ +UCXPY_IFNAME=eth0 \ +UCX_WARN_UNUSED_ENV_VARS=n \ +UCX_MEMTYPE_CACHE=n \ +timeout 30m pytest \ + -vv \ + --durations=0 \ + --capture=no \ + --cache-clear \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda-legacy.xml" \ + --cov-config=../pyproject.toml \ + --cov=dask_cuda \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage-legacy.xml" \ + --cov-report=term \ + tests/test_explicit_comms.py -k "not ucxx" popd -rapids-logger "Run local benchmark" +rapids-logger "Run local benchmark (dask-expr)" +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend dask + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --disable-rmm \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --disable-rmm-pool \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --rmm-maximum-pool-size 4GiB \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --rmm-maximum-pool-size 4GiB \ + --enable-rmm-async \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --rmm-maximum-pool-size 4GiB \ + --enable-rmm-managed \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +rapids-logger "Run local benchmark (legacy dd)" +DASK_DATAFRAME__QUERY_PLANNING=False \ python dask_cuda/benchmarks/local_cudf_shuffle.py \ --partition-size="1 KiB" \ -d 0 \ --runs 1 \ --backend dask +DASK_DATAFRAME__QUERY_PLANNING=False \ python dask_cuda/benchmarks/local_cudf_shuffle.py \ --partition-size="1 KiB" \ -d 0 \ --runs 1 \ --backend explicit-comms -rapids-logger "Test script exiting with value: $EXITCODE" +rapids-logger "Test script exiting with latest error code: $EXITCODE" exit 
${EXITCODE} diff --git a/codecov.yml b/codecov.yml index aec6b2854..80d06e720 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,4 +1,3 @@ #Configuration File for CodeCov ignore: - - "dask_cuda/_version.py" - "dask_cuda/benchmarks/*" # benchmarks aren't covered diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml new file mode 100644 index 000000000..c0fed8e57 --- /dev/null +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -0,0 +1,37 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- click >=8.1 +- cuda-version=11.4 +- cudatoolkit +- cudf==24.8.*,>=0.0.0a0 +- dask-cudf==24.8.*,>=0.0.0a0 +- distributed-ucxx==0.39.*,>=0.0.0a0 +- kvikio==24.8.*,>=0.0.0a0 +- numactl-devel-cos7-x86_64 +- numba>=0.57 +- numpy>=1.23,<2.0a0 +- numpydoc>=1.1.0 +- pandas>=1.3 +- pre-commit +- pynvml>=11.0.0,<11.5 +- pytest +- pytest-cov +- python>=3.9,<3.12 +- rapids-build-backend>=0.3.0,<0.4.0dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- setuptools>=64.0.0 +- sphinx +- sphinx-click>=2.7.1 +- sphinx-rtd-theme>=0.5.1 +- ucx-proc=*=gpu +- ucx-py==0.39.*,>=0.0.0a0 +- ucxx==0.39.*,>=0.0.0a0 +- zict>=2.0.0 +name: all_cuda-114_arch-x86_64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml new file mode 100644 index 000000000..d1f6933cd --- /dev/null +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -0,0 +1,37 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- click >=8.1 +- cuda-version=11.8 +- cudatoolkit +- cudf==24.8.*,>=0.0.0a0 +- dask-cudf==24.8.*,>=0.0.0a0 +- distributed-ucxx==0.39.*,>=0.0.0a0 +- kvikio==24.8.*,>=0.0.0a0 +- numactl-devel-cos7-x86_64 +- numba>=0.57 +- numpy>=1.23,<2.0a0 +- numpydoc>=1.1.0 +- pandas>=1.3 +- pre-commit +- pynvml>=11.0.0,<11.5 +- pytest +- pytest-cov +- python>=3.9,<3.12 +- rapids-build-backend>=0.3.0,<0.4.0dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- setuptools>=64.0.0 +- sphinx +- sphinx-click>=2.7.1 +- sphinx-rtd-theme>=0.5.1 +- ucx-proc=*=gpu +- ucx-py==0.39.*,>=0.0.0a0 +- ucxx==0.39.*,>=0.0.0a0 +- zict>=2.0.0 +name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml new file mode 100644 index 000000000..4db52a6d6 --- /dev/null +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -0,0 +1,38 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- click >=8.1 +- cuda-nvcc-impl +- cuda-nvrtc +- cuda-version=12.2 +- cudf==24.8.*,>=0.0.0a0 +- dask-cudf==24.8.*,>=0.0.0a0 +- distributed-ucxx==0.39.*,>=0.0.0a0 +- kvikio==24.8.*,>=0.0.0a0 +- numactl-devel-cos7-x86_64 +- numba>=0.57 +- numpy>=1.23,<2.0a0 +- numpydoc>=1.1.0 +- pandas>=1.3 +- pre-commit +- pynvml>=11.0.0,<11.5 +- pytest +- pytest-cov +- python>=3.9,<3.12 +- rapids-build-backend>=0.3.0,<0.4.0dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- setuptools>=64.0.0 +- sphinx +- sphinx-click>=2.7.1 +- sphinx-rtd-theme>=0.5.1 +- ucx-proc=*=gpu +- ucx-py==0.39.*,>=0.0.0a0 +- ucxx==0.39.*,>=0.0.0a0 +- zict>=2.0.0 +name: all_cuda-122_arch-x86_64 diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 8d233d4e2..eba1a4fc0 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -4,7 +4,7 @@ # conda build -c conda-forge . {% set data = load_file_data("pyproject.toml") %} -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].strip('""').lstrip('v') %} {% set py_version = environ['CONDA_PY'] %} {% set date_string = environ['RAPIDS_DATE_STRING'] %} @@ -13,7 +13,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -21,19 +21,18 @@ build: script: - {{ PYTHON }} -m pip install . -vv entry_points: - {% for e in data.get("project", {}).get("scripts", {}).items() %} - - {{ e|join(" = ") }} + {% for entrypoint in data["project"]["scripts"] %} + - {{ entrypoint ~ ' = ' ~ data["project"]["scripts"][entrypoint] }} {% endfor %} requirements: host: - python - pip - - tomli - - versioneer >=0.24 + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - python - {% for r in data.get("project", {}).get("dependencies", []) %} + {% for r in data["project"]["dependencies"] %} - {{ r }} {% endfor %} @@ -42,18 +41,18 @@ test: - dask_cuda commands: - dask cuda --help - {% for e in data.get("project", {}).get("scripts", {}).keys() %} - - {{ e }} --help - - {{ e|replace("-", " ") }} --help + {% for entrypoint in data["project"]["scripts"] %} + - {{ entrypoint }} --help + - {{ entrypoint|replace("-", " ") }} --help {% endfor %} about: - home: {{ data.get("project", {}).get("urls", {}).get("Homepage", "") }} - license: {{ data.get("project", {}).get("license", {}).get("text", "") }} + home: {{ data["project"]["urls"]["Homepage"] }} + license: {{ data["project"]["license"]["text"] }} license_file: - {% for e in data.get("tool", {}).get("setuptools", {}).get("license-files", []) %} + {% for e in data["tool"]["setuptools"]["license-files"] %} - ../../../{{ e }} {% endfor %} - summary: {{ data.get("project", {}).get("description", "") }} - dev_url: {{ data.get("project", {}).get("urls", {}).get("Source", "") }} - doc_url: {{ data.get("project", {}).get("urls", {}).get("Documentation", "") }} + summary: {{ data["project"]["description"] }} + dev_url: {{ data["project"]["urls"]["Source"] }} + doc_url: {{ data["project"]["urls"]["Documentation"] }} diff --git a/dask_cuda/VERSION b/dask_cuda/VERSION new file mode 120000 index 000000000..6ff19de4b --- /dev/null +++ b/dask_cuda/VERSION @@ -0,0 +1 @@ +../VERSION \ No newline at end of file diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index dc971797f..516599da3 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -3,30 +3,45 @@ if sys.platform 
!= "linux": raise ImportError("Only Linux is supported by Dask-CUDA at this time") - import dask +import dask.utils import dask.dataframe.core import dask.dataframe.shuffle import dask.dataframe.multi +import dask.bag.core -from ._version import get_versions +from ._version import __git_commit__, __version__ from .cuda_worker import CUDAWorker from .explicit_comms.dataframe.shuffle import ( get_rearrange_by_column_wrapper, - get_default_shuffle_algorithm, + get_default_shuffle_method, ) from .local_cuda_cluster import LocalCUDACluster from .proxify_device_objects import proxify_decorator, unproxify_decorator -__version__ = get_versions()["version"] -del get_versions + +if dask.config.get("dataframe.query-planning", None) is not False and dask.config.get( + "explicit-comms", False +): + raise NotImplementedError( + "The 'explicit-comms' config is not yet supported when " + "query-planning is enabled in dask. Please use the shuffle " + "API directly, or use the legacy dask-dataframe API " + "(set the 'dataframe.query-planning' config to `False`" + "before importing `dask.dataframe`).", + ) # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True` dask.dataframe.shuffle.rearrange_by_column = get_rearrange_by_column_wrapper( dask.dataframe.shuffle.rearrange_by_column ) -dask.dataframe.multi.get_default_shuffle_algorithm = get_default_shuffle_algorithm +# We have to replace all modules that imports Dask's `get_default_shuffle_method()` +# TODO: introduce a shuffle-algorithm dispatcher in Dask so we don't need this hack +dask.dataframe.shuffle.get_default_shuffle_method = get_default_shuffle_method +dask.dataframe.multi.get_default_shuffle_method = get_default_shuffle_method +dask.bag.core.get_default_shuffle_method = get_default_shuffle_method + # Monkey patching Dask to make use of proxify and unproxify in compatibility mode dask.dataframe.shuffle.shuffle_group = proxify_decorator( diff --git a/dask_cuda/_version.py b/dask_cuda/_version.py index 6310ff96f..820bf10ba 100644 --- a/dask_cuda/_version.py +++ b/dask_cuda/_version.py @@ -1,693 +1,30 @@ -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. -# Generated by versioneer-0.28 -# https://github.com/python-versioneer/python-versioneer - -"""Git implementation of _version.py.""" - -import errno -import functools -import os -import re -import subprocess -import sys -from typing import Callable, Dict - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). 
- git_refnames = "$Format:%d$" - git_full = "$Format:%H$" - git_date = "$Format:%ci$" - keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "pep440" - cfg.tag_prefix = "v" - cfg.parentdir_prefix = "dask_cuda-" - cfg.versionfile_source = "dask_cuda/_version.py" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY: Dict[str, str] = {} -HANDLERS: Dict[str, Dict[str, Callable]] = {} - - -def register_vcs_handler(vcs, method): # decorator - """Create decorator to mark a method as the handler of a VCS.""" - - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - process = None - - popen_kwargs = {} - if sys.platform == "win32": - # This hides the console window if pythonw.exe is used - startupinfo = subprocess.STARTUPINFO() - startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW - popen_kwargs["startupinfo"] = startupinfo - - for command in commands: - try: - dispcmd = str([command] + args) - # remember shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen( - [command] + args, - cwd=cwd, - env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr else None), - **popen_kwargs, - ) - break - except OSError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None, None - stdout = process.communicate()[0].strip().decode() - if process.returncode != 0: - if verbose: - print("unable to run %s (error)" % dispcmd) - print("stdout was %s" % stdout) - return None, process.returncode - return stdout, process.returncode - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for _ in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return { - "version": dirname[len(parentdir_prefix) :], - "full-revisionid": None, - "dirty": False, - "error": None, - "date": None, - } - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print( - "Tried directories %s but none started with prefix %s" - % (str(rootdirs), parentdir_prefix) - ) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. 
When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. - keywords = {} - try: - with open(versionfile_abs, "r") as fobj: - for line in fobj: - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - except OSError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if "refnames" not in keywords: - raise NotThisMethod("Short version file found") - date = keywords.get("date") - if date is not None: - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - - # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = {r.strip() for r in refnames.strip("()").split(",")} - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r"\d", r)} - if verbose: - print("discarding '%s', no digits" % ",".join(refs - tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. 
"2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix) :] - # Filter out refs that exactly match prefix or that don't start - # with a number once the prefix is stripped (mostly a concern - # when prefix is '') - if not re.match(r"\d", r): - continue - if verbose: - print("picking %s" % r) - return { - "version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": None, - "date": date, - } - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return { - "version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": "no suitable tags", - "date": None, - } - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. - """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - # GIT_DIR can interfere with correct operation of Versioneer. - # It may be intended to be passed to the Versioneer-versioned project, - # but that should not change where we get our version from. - env = os.environ.copy() - env.pop("GIT_DIR", None) - runner = functools.partial(runner, env=env) - - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) - if rc != 0: - if verbose: - print("Directory %s not under git control" % root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner( - GITS, - [ - "describe", - "--tags", - "--dirty", - "--always", - "--long", - "--match", - f"{tag_prefix}[[:digit:]]*", - ], - cwd=root, +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.resources + +__version__ = ( + importlib.resources.files(__package__).joinpath("VERSION").read_text().strip() +) +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() ) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) - # --abbrev-ref was added in git-1.6.3 - if rc != 0 or branch_name is None: - raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") - branch_name = branch_name.strip() - - if branch_name == "HEAD": - # If we aren't exactly on a branch, pick a branch which represents - # the current commit. If all else fails, we are on a branchless - # commit. - branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) - # --contains was added in git-1.5.4 - if rc != 0 or branches is None: - raise NotThisMethod("'git branch --contains' returned error") - branches = branches.split("\n") - - # Remove the first line if we're running detached - if "(" in branches[0]: - branches.pop(0) - - # Strip off the leading "* " from the list of branches. - branches = [branch[2:] for branch in branches] - if "master" in branches: - branch_name = "master" - elif not branches: - branch_name = None - else: - # Pick the first branch that is returned. Good or bad. - branch_name = branches[0] - - pieces["branch"] = branch_name - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[: git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) - if not mo: - # unparsable. Maybe git-describe is misbehaving? - pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( - full_tag, - tag_prefix, - ) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix) :] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) - pieces["distance"] = len(out.split()) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() - # Use only the last line. Previous lines may contain GPG signature - # information. 
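The replacement scheme introduced above is much simpler: the built package ships plain `VERSION` and `GIT_COMMIT` files and reads them at import time with `importlib.resources`. A minimal sketch of that lookup for a hypothetical package named `mypkg` (Python 3.9+):

```python
import importlib.resources

def read_version(package="mypkg"):
    # Both files live inside the installed package directory; GIT_COMMIT is
    # optional and may be absent, e.g. in plain source builds.
    files = importlib.resources.files(package)
    version = files.joinpath("VERSION").read_text().strip()
    try:
        commit = files.joinpath("GIT_COMMIT").read_text().strip()
    except FileNotFoundError:
        commit = ""
    return version, commit
```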
- date = date.splitlines()[-1] - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_branch(pieces): - """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . - - The ".dev0" means not master branch. Note that .dev0 sorts backwards - (a feature branch will appear "older" than the master branch). - - Exceptions: - 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0" - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def pep440_split_post(ver): - """Split pep440 version string at the post-release segment. - - Returns the release segments before the post-release and the - post-release version number (or -1 if no post-release segment is present). - """ - vc = str.split(ver, ".post") - return vc[0], int(vc[1] or 0) if len(vc) == 2 else None - - -def render_pep440_pre(pieces): - """TAG[.postN.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post0.devDISTANCE - """ - if pieces["closest-tag"]: - if pieces["distance"]: - # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) - rendered = tag_version - if post_version is not None: - rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) - else: - rendered += ".post0.dev%d" % (pieces["distance"]) - else: - # no commits, use the tag as the version - rendered = pieces["closest-tag"] - else: - # exception #1 - rendered = "0.post0.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 
0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_post_branch(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . - - The ".dev0" means not master branch. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["branch"] != "master": - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return { - "version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None, - } - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-branch": - rendered = render_pep440_branch(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-post-branch": - rendered = render_pep440_post_branch(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return { - "version": rendered, - "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], - "error": None, - "date": pieces.get("date"), - } - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. - - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. 
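For reference, the default "pep440" style deleted above turns the pieces parsed from `git describe` into `TAG[+DISTANCE.gHEX[.dirty]]`. A condensed version of that renderer with a made-up input:

```python
def render_pep440(pieces):
    # TAG[+DISTANCE.gHEX[.dirty]]; falls back to "0+untagged..." when no tag exists.
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            sep = "." if "+" in rendered else "+"
            rendered += "%s%d.g%s" % (sep, pieces["distance"], pieces["short"])
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered

print(render_pep440(
    {"closest-tag": "24.08.00", "distance": 3, "short": "abc1234", "dirty": True}
))
```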
- for _ in cfg.versionfile_source.split("/"): - root = os.path.dirname(root) - except NameError: - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None, - } - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass +except FileNotFoundError: + __git_commit__ = "" - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", - "date": None, - } +__all__ = ["__git_commit__", "__version__"] diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py index 0b417e7b3..7f48d4fae 100644 --- a/dask_cuda/benchmarks/common.py +++ b/dask_cuda/benchmarks/common.py @@ -117,14 +117,18 @@ def run(client: Client, args: Namespace, config: Config): wait_for_cluster(client, shutdown_on_failure=True) assert len(client.scheduler_info()["workers"]) > 0 setup_memory_pools( - client, - args.type == "gpu", - args.rmm_pool_size, - args.disable_rmm_pool, - args.enable_rmm_async, - args.enable_rmm_managed, - args.rmm_log_directory, - args.enable_rmm_statistics, + client=client, + is_gpu=args.type == "gpu", + disable_rmm=args.disable_rmm, + disable_rmm_pool=args.disable_rmm_pool, + pool_size=args.rmm_pool_size, + maximum_pool_size=args.rmm_maximum_pool_size, + rmm_async=args.enable_rmm_async, + rmm_managed=args.enable_rmm_managed, + release_threshold=args.rmm_release_threshold, + log_directory=args.rmm_log_directory, + statistics=args.enable_rmm_statistics, + rmm_track_allocations=args.enable_rmm_track_allocations, ) address_to_index, results, message_data = gather_bench_results(client, args, config) p2p_bw = peer_to_peer_bandwidths(message_data, address_to_index) diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py index 4e9dea94e..2f07e3df7 100644 --- a/dask_cuda/benchmarks/local_cudf_groupby.py +++ b/dask_cuda/benchmarks/local_cudf_groupby.py @@ -139,7 +139,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}" ) print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") - if args.protocol == "ucx": + if args.protocol in ["ucx", "ucxx"]: print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") print_key_value(key="NVLink", value=f"{args.enable_nvlink}") diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index f26a26ae9..6a68ad788 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -7,8 +7,7 @@ import pandas as pd import dask -from dask.base import tokenize -from dask.dataframe.core import new_dd_object +import dask.dataframe as dd from dask.distributed import performance_report, wait from dask.utils import format_bytes, parse_bytes @@ -25,12 +24,20 @@ # -def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match, gpu): +# Set default shuffle method to "tasks" +if dask.config.get("dataframe.shuffle.method", None) is None: + dask.config.set({"dataframe.shuffle.method": "tasks"}) + + +def generate_chunk(input): + i_chunk, local_size, num_chunks, chunk_type, frac_match, gpu = input + # Setting a seed that 
triggers max amount of comm in the two-GPU case. if gpu: import cupy as xp import cudf as xdf + import dask_cudf # noqa: F401 else: import numpy as xp import pandas as xdf @@ -105,25 +112,25 @@ def get_random_ddf(chunk_size, num_chunks, frac_match, chunk_type, args): parts = [chunk_size for _ in range(num_chunks)] device_type = True if args.type == "gpu" else False - meta = generate_chunk(0, 4, 1, chunk_type, None, device_type) + meta = generate_chunk((0, 4, 1, chunk_type, None, device_type)) divisions = [None] * (len(parts) + 1) - name = "generate-data-" + tokenize(chunk_size, num_chunks, frac_match, chunk_type) - - graph = { - (name, i): ( - generate_chunk, - i, - part, - len(parts), - chunk_type, - frac_match, - device_type, - ) - for i, part in enumerate(parts) - } - - ddf = new_dd_object(graph, name, meta, divisions) + ddf = dd.from_map( + generate_chunk, + [ + ( + i, + part, + len(parts), + chunk_type, + frac_match, + device_type, + ) + for i, part in enumerate(parts) + ], + meta=meta, + divisions=divisions, + ) if chunk_type == "build": if not args.no_shuffle: @@ -217,7 +224,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): ) print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") print_key_value(key="Frac-match", value=f"{args.frac_match}") - if args.protocol == "ucx": + if args.protocol in ["ucx", "ucxx"]: print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") print_key_value(key="NVLink", value=f"{args.enable_nvlink}") diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py index 51ba48f93..a1129dd37 100644 --- a/dask_cuda/benchmarks/local_cudf_shuffle.py +++ b/dask_cuda/benchmarks/local_cudf_shuffle.py @@ -8,8 +8,6 @@ import dask import dask.dataframe -from dask.dataframe.core import new_dd_object -from dask.dataframe.shuffle import shuffle from dask.distributed import Client, performance_report, wait from dask.utils import format_bytes, parse_bytes @@ -33,7 +31,7 @@ def shuffle_dask(df, args): - result = shuffle(df, index="data", shuffle="tasks", ignore_index=args.ignore_index) + result = df.shuffle("data", shuffle_method="tasks", ignore_index=args.ignore_index) if args.backend == "dask-noop": result = as_noop(result) t1 = perf_counter() @@ -94,18 +92,24 @@ def create_data( ) # Create partition based to the specified partition distribution - dsk = {} + futures = [] for i, part_size in enumerate(dist): for _ in range(part_size): # We use `client.submit` to control placement of the partition. 
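The merge benchmark above now builds its random partitions with `dask.dataframe.from_map` instead of hand-writing a task graph and calling `new_dd_object`. A pandas-only sketch of the same pattern (the chunk generator and sizes are illustrative):

```python
import pandas as pd
import dask.dataframe as dd

def make_chunk(arg):
    # from_map passes one element of the iterable to each call.
    i, size = arg
    return pd.DataFrame({"id": range(i * size, (i + 1) * size)})

parts = [(i, 4) for i in range(3)]
meta = make_chunk((0, 1)).iloc[:0]  # empty frame describing the schema
ddf = dd.from_map(
    make_chunk,
    parts,
    meta=meta,
    divisions=[None] * (len(parts) + 1),  # unknown divisions
)
print(ddf.compute())
```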
- dsk[(name, len(dsk))] = client.submit( - create_df, chunksize, args.type, workers=[workers[i]], pure=False + futures.append( + client.submit( + create_df, chunksize, args.type, workers=[workers[i]], pure=False + ) ) - wait(dsk.values()) + wait(futures) df_meta = create_df(0, args.type) - divs = [None] * (len(dsk) + 1) - ret = new_dd_object(dsk, name, df_meta, divs).persist() + divs = [None] * (len(futures) + 1) + ret = dask.dataframe.from_delayed( + futures, + meta=df_meta, + divisions=divs, + ).persist() wait(ret) data_processed = args.in_parts * args.partition_size @@ -146,7 +150,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}" ) print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") - if args.protocol == "ucx": + if args.protocol in ["ucx", "ucxx"]: print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") print_key_value(key="NVLink", value=f"{args.enable_nvlink}") @@ -254,7 +258,9 @@ def parse_args(): ] return parse_benchmark_args( - description="Distributed shuffle (dask/cudf) benchmark", args_list=special_args + description="Distributed shuffle (dask/cudf) benchmark", + args_list=special_args, + check_explicit_comms=False, ) diff --git a/dask_cuda/benchmarks/local_cupy.py b/dask_cuda/benchmarks/local_cupy.py index 1c1d12d30..22c51556f 100644 --- a/dask_cuda/benchmarks/local_cupy.py +++ b/dask_cuda/benchmarks/local_cupy.py @@ -193,7 +193,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): ) print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") print_key_value(key="Protocol", value=f"{args.protocol}") - if args.protocol == "ucx": + if args.protocol in ["ucx", "ucxx"]: print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") print_key_value(key="NVLink", value=f"{args.enable_nvlink}") diff --git a/dask_cuda/benchmarks/local_cupy_map_overlap.py b/dask_cuda/benchmarks/local_cupy_map_overlap.py index f40318559..8250c9f9f 100644 --- a/dask_cuda/benchmarks/local_cupy_map_overlap.py +++ b/dask_cuda/benchmarks/local_cupy_map_overlap.py @@ -78,7 +78,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): ) print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") print_key_value(key="Protocol", value=f"{args.protocol}") - if args.protocol == "ucx": + if args.protocol in ["ucx", "ucxx"]: print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") print_key_value(key="NVLink", value=f"{args.enable_nvlink}") diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index a3d51066a..48e4755fb 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -11,11 +11,13 @@ import numpy as np import pandas as pd +from dask import config from dask.distributed import Client, SSHCluster from dask.utils import format_bytes, format_time, parse_bytes from distributed.comm.addressing import get_address_host from dask_cuda.local_cuda_cluster import LocalCUDACluster +from dask_cuda.utils import parse_device_memory_limit def as_noop(dsk): @@ -47,7 +49,11 @@ def as_noop(dsk): raise RuntimeError("Requested noop computation but dask-noop not installed.") -def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[]): +def parse_benchmark_args( + 
description="Generic dask-cuda Benchmark", + args_list=[], + check_explicit_comms=True, +): parser = argparse.ArgumentParser(description=description) worker_args = parser.add_argument_group(description="Worker configuration") worker_args.add_argument( @@ -73,7 +79,7 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] cluster_args.add_argument( "-p", "--protocol", - choices=["tcp", "ucx"], + choices=["tcp", "ucx", "ucxx"], default="tcp", type=str, help="The communication protocol to use.", @@ -88,15 +94,41 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] "'forkserver' can be used to avoid issues with fork not being allowed " "after the networking stack has been initialised.", ) + cluster_args.add_argument( + "--disable-rmm", + action="store_true", + help="Disable RMM.", + ) + cluster_args.add_argument( + "--disable-rmm-pool", + action="store_true", + help="Uses RMM for allocations but without a memory pool.", + ) cluster_args.add_argument( "--rmm-pool-size", default=None, type=parse_bytes, help="The size of the RMM memory pool. Can be an integer (bytes) or a string " - "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used.", + "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used." + "" + ".. note::" + " This size is a per-worker configuration, and not cluster-wide.", ) cluster_args.add_argument( - "--disable-rmm-pool", action="store_true", help="Disable the RMM memory pool" + "--rmm-maximum-pool-size", + default=None, + help="When ``--rmm-pool-size`` is specified, this argument indicates the " + "maximum pool size. Can be an integer (bytes), or a string (like '4GB' or " + "'5000M'). By default, the total available memory on the GPU is used. " + "``rmm_pool_size`` must be specified to use RMM pool and to set the maximum " + "pool size." + "" + ".. note::" + " When paired with `--enable-rmm-async` the maximum size cannot be " + " guaranteed due to fragmentation." + "" + ".. note::" + " This size is a per-worker configuration, and not cluster-wide.", ) cluster_args.add_argument( "--enable-rmm-managed", @@ -108,6 +140,15 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] action="store_true", help="Enable RMM async memory allocator (implies --disable-rmm-pool)", ) + cluster_args.add_argument( + "--rmm-release-threshold", + default=None, + type=parse_bytes, + help="When --enable-rmm-async is set and the pool size grows beyond this " + "value, unused memory held by the pool will be released at the next " + "synchronization point. Can be an integer (bytes), or a string string (like " + "'4GB' or '5000M'). By default, this feature is disabled.", + ) cluster_args.add_argument( "--rmm-log-directory", default=None, @@ -122,6 +163,17 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] "This enables spilling implementations such as JIT-Unspill to provides more " "information on out-of-memory errors", ) + cluster_args.add_argument( + "--enable-rmm-track-allocations", + action="store_true", + help="When enabled, wraps the memory resource used by each worker with a " + "``rmm.mr.TrackingResourceAdaptor``, which tracks the amount of memory " + "allocated." + "NOTE: This option enables additional diagnostics to be collected and " + "reported by the Dask dashboard. 
However, there is significant overhead " + "associated with this and it should only be used for debugging and memory " + "profiling.", + ) cluster_args.add_argument( "--enable-tcp-over-ucx", default=None, @@ -201,6 +253,13 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] "since the workers are assumed to be started separately. Similarly the other " "cluster configuration options have no effect.", ) + group.add_argument( + "--dashboard-address", + default=None, + type=str, + help="Address on which to listen for diagnostics dashboard, ignored if " + "either ``--scheduler-address`` or ``--scheduler-file`` is specified.", + ) cluster_args.add_argument( "--shutdown-external-cluster-on-exit", default=False, @@ -290,6 +349,24 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] if args.multi_node and len(args.hosts.split(",")) < 2: raise ValueError("--multi-node requires at least 2 hosts") + # Raise error early if "explicit-comms" is not allowed + if ( + check_explicit_comms + and args.backend == "explicit-comms" + and config.get( + "dataframe.query-planning", + None, + ) + is not False + ): + raise NotImplementedError( + "The 'explicit-comms' config is not yet supported when " + "query-planning is enabled in dask. Please use the legacy " + "dask-dataframe API by setting the following environment " + "variable before executing:", + " DASK_DATAFRAME__QUERY_PLANNING=False", + ) + return args @@ -308,7 +385,11 @@ def get_cluster_options(args): cluster_kwargs = { "connect_options": {"known_hosts": None}, - "scheduler_options": {"protocol": args.protocol, "port": 8786}, + "scheduler_options": { + "protocol": args.protocol, + "port": 8786, + "dashboard_address": args.dashboard_address, + }, "worker_class": "dask_cuda.CUDAWorker", "worker_options": { "protocol": args.protocol, @@ -325,6 +406,7 @@ def get_cluster_options(args): cluster_args = [] cluster_kwargs = { "protocol": args.protocol, + "dashboard_address": args.dashboard_address, "n_workers": len(args.devs.split(",")), "threads_per_worker": args.threads_per_worker, "CUDA_VISIBLE_DEVICES": args.devs, @@ -352,72 +434,132 @@ def get_worker_device(): return -1 +def setup_rmm_resources(statistics=False, rmm_track_allocations=False): + import cupy + + import rmm + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + if statistics: + rmm.mr.set_current_device_resource( + rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource()) + ) + if rmm_track_allocations: + rmm.mr.set_current_device_resource( + rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource()) + ) + + def setup_memory_pool( dask_worker=None, + disable_rmm=None, + disable_rmm_pool=None, pool_size=None, - disable_pool=False, + maximum_pool_size=None, rmm_async=False, rmm_managed=False, + release_threshold=None, log_directory=None, statistics=False, + rmm_track_allocations=False, ): - import cupy - import rmm from dask_cuda.utils import get_rmm_log_file_name logging = log_directory is not None - if rmm_async: - rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) - cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) - else: - rmm.reinitialize( - pool_allocator=not disable_pool, - managed_memory=rmm_managed, - initial_pool_size=pool_size, - logging=logging, - log_file_name=get_rmm_log_file_name(dask_worker, logging, log_directory), + if pool_size is not None: + pool_size = parse_device_memory_limit(pool_size, alignment_size=256) + + if 
maximum_pool_size is not None: + maximum_pool_size = parse_device_memory_limit( + maximum_pool_size, alignment_size=256 ) - cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) - if statistics: - rmm.mr.set_current_device_resource( - rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource()) + + if release_threshold is not None: + release_threshold = parse_device_memory_limit( + release_threshold, alignment_size=256 ) + if not disable_rmm: + if rmm_async: + mr = rmm.mr.CudaAsyncMemoryResource( + initial_pool_size=pool_size, + release_threshold=release_threshold, + ) + + if maximum_pool_size is not None: + mr = rmm.mr.LimitingResourceAdaptor( + mr, allocation_limit=maximum_pool_size + ) + + rmm.mr.set_current_device_resource(mr) + + setup_rmm_resources( + statistics=statistics, rmm_track_allocations=rmm_track_allocations + ) + else: + rmm.reinitialize( + pool_allocator=not disable_rmm_pool, + managed_memory=rmm_managed, + initial_pool_size=pool_size, + maximum_pool_size=maximum_pool_size, + logging=logging, + log_file_name=get_rmm_log_file_name( + dask_worker, logging, log_directory + ), + ) + + setup_rmm_resources( + statistics=statistics, rmm_track_allocations=rmm_track_allocations + ) + def setup_memory_pools( client, is_gpu, + disable_rmm, + disable_rmm_pool, pool_size, - disable_pool, + maximum_pool_size, rmm_async, rmm_managed, + release_threshold, log_directory, statistics, + rmm_track_allocations, ): if not is_gpu: return client.run( setup_memory_pool, + disable_rmm=disable_rmm, + disable_rmm_pool=disable_rmm_pool, pool_size=pool_size, - disable_pool=disable_pool, + maximum_pool_size=maximum_pool_size, rmm_async=rmm_async, rmm_managed=rmm_managed, + release_threshold=release_threshold, log_directory=log_directory, statistics=statistics, + rmm_track_allocations=rmm_track_allocations, ) # Create an RMM pool on the scheduler due to occasional deserialization # of CUDA objects. May cause issues with InfiniBand otherwise. client.run_on_scheduler( setup_memory_pool, pool_size=1e9, - disable_pool=disable_pool, + disable_rmm=disable_rmm, + disable_rmm_pool=disable_rmm_pool, + maximum_pool_size=maximum_pool_size, rmm_async=rmm_async, rmm_managed=rmm_managed, + release_threshold=release_threshold, log_directory=log_directory, statistics=statistics, + rmm_track_allocations=rmm_track_allocations, ) diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py index b7069d632..ba58fe3e5 100644 --- a/dask_cuda/cli.py +++ b/dask_cuda/cli.py @@ -87,9 +87,10 @@ def cuda(): "--memory-limit", default="auto", show_default=True, - help="""Bytes of memory per process that the worker can use. Can be an integer - (bytes), float (fraction of total system memory), string (like ``"5GB"`` or - ``"5000M"``), or ``"auto"`` or 0 for no memory management.""", + help="""Size of the host LRU cache, which is used to determine when the worker + starts spilling to disk (not available if JIT-Unspill is enabled). Can be an + integer (bytes), float (fraction of total system memory), string (like ``"5GB"`` + or ``"5000M"``), or ``"auto"``, 0, or ``None`` for no memory management.""", ) @click.option( "--device-memory-limit", @@ -119,6 +120,10 @@ def cuda(): memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool and to set the maximum pool size. + .. note:: + When paired with `--enable-rmm-async` the maximum size cannot be guaranteed due + to fragmentation. + .. 
note:: This size is a per-worker configuration, and not cluster-wide.""", ) @@ -145,6 +150,17 @@ def cuda(): incompatible with RMM pools and managed memory, trying to enable both will result in failure.""", ) +@click.option( + "--rmm-release-threshold", + default=None, + help="""When ``rmm.async`` is ``True`` and the pool size grows beyond this value, unused + memory held by the pool will be released at the next synchronization point. Can be + an integer (bytes), float (fraction of total device memory), string (like ``"5GB"`` + or ``"5000M"``) or ``None``. By default, this feature is disabled. + + .. note:: + This size is a per-worker configuration, and not cluster-wide.""", +) @click.option( "--rmm-log-directory", default=None, @@ -232,6 +248,12 @@ def cuda(): help="""Module that should be loaded by each worker process like ``"foo.bar"`` or ``"/path/to/foo.py"``.""", ) +@click.option( + "--death-timeout", + type=str, + default=None, + help="Seconds to wait for a scheduler before closing", +) @click.option( "--dashboard-prefix", type=str, @@ -312,6 +334,7 @@ def worker( rmm_maximum_pool_size, rmm_managed_memory, rmm_async, + rmm_release_threshold, rmm_log_directory, rmm_track_allocations, pid_file, @@ -383,6 +406,7 @@ def worker( rmm_maximum_pool_size, rmm_managed_memory, rmm_async, + rmm_release_threshold, rmm_log_directory, rmm_track_allocations, pid_file, @@ -480,3 +504,7 @@ def config( else: client = Client(scheduler, security=security) print_cluster_config(client) + + +if __name__ == "__main__": + worker() diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 03b16b529..e25a7c142 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -20,11 +20,9 @@ from .device_host_file import DeviceHostFile from .initialize import initialize +from .plugins import CPUAffinity, PreImport, RMMSetup from .proxify_host_file import ProxifyHostFile from .utils import ( - CPUAffinity, - PreImport, - RMMSetup, cuda_visible_devices, get_cpu_affinity, get_n_gpus, @@ -47,6 +45,7 @@ def __init__( rmm_maximum_pool_size=None, rmm_managed_memory=False, rmm_async=False, + rmm_release_threshold=None, rmm_log_directory=None, rmm_track_allocations=False, pid_file=None, @@ -138,19 +137,13 @@ def del_pid_file(): "For installation instructions, please see " "https://github.com/rapidsai/rmm" ) # pragma: no cover - if rmm_async: - raise ValueError( - "RMM pool and managed memory are incompatible with asynchronous " - "allocator" - ) - else: if enable_nvlink: warnings.warn( "When using NVLink we recommend setting a " "`rmm_pool_size`. 
Please see: " - "https://dask-cuda.readthedocs.io/en/latest/ucx.html" - "#important-notes for more details" + "https://docs.rapids.ai/api/dask-cuda/nightly/ucx/ " + "for more details" ) if enable_nvlink and rmm_managed_memory: @@ -215,12 +208,13 @@ def del_pid_file(): get_cpu_affinity(nvml_device_index(i, cuda_visible_devices(i))) ), RMMSetup( - rmm_pool_size, - rmm_maximum_pool_size, - rmm_managed_memory, - rmm_async, - rmm_log_directory, - rmm_track_allocations, + initial_pool_size=rmm_pool_size, + maximum_pool_size=rmm_maximum_pool_size, + managed_memory=rmm_managed_memory, + async_alloc=rmm_async, + release_threshold=rmm_release_threshold, + log_directory=rmm_log_directory, + track_allocations=rmm_track_allocations, ), PreImport(pre_import), }, diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index fb31c3dd2..b646a9294 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -1,14 +1,13 @@ -import functools import itertools import logging import os -import sys import time import numpy -from zict import Buffer, File, Func +from zict import Buffer, Func from zict.common import ZictBase +import dask from distributed.protocol import ( dask_deserialize, dask_serialize, @@ -18,6 +17,7 @@ serialize_bytelist, ) from distributed.sizeof import safe_sizeof +from distributed.spill import AnyKeyFile as KeyAsStringFile from distributed.utils import nbytes from .is_device_object import is_device_object @@ -25,6 +25,13 @@ from .utils import nvtx_annotate +def _serialize_bytelist(x, **kwargs): + kwargs["on_error"] = "raise" + + compression = dask.config.get("distributed.worker.memory.spill-compression") + return serialize_bytelist(x, compression=compression, **kwargs) + + class LoggedBuffer(Buffer): """Extends zict.Buffer with logging capabilities @@ -193,9 +200,12 @@ def __init__( self.host_func = dict() self.disk_func = Func( - functools.partial(serialize_bytelist, on_error="raise"), + _serialize_bytelist, deserialize_bytes, - File(self.disk_func_path), + # Task keys are not strings, so this takes care of + # converting arbitrary tuple keys into a string before + # handing off to zict.File + KeyAsStringFile(self.disk_func_path), ) host_buffer_kwargs = {} @@ -240,34 +250,6 @@ def __init__( # Dict of objects that will not be spilled by DeviceHostFile. self.others = {} - if sys.version_info < (3, 9): - - def __new__( - cls, - # So named such that dask will pass in the worker's local - # directory when constructing this through the "data" callback. - worker_local_directory, - *, - device_memory_limit=None, - memory_limit=None, - log_spilling=False, - ): - """ - This is here to support Python 3.8. Right now (to support - 3.8), ZictBase inherits from typing.MutableMapping through - which inspect.signature determines that the signature of - __init__ is just (*args, **kwargs). We need to advertise the - correct signature so that distributed will correctly figure - out that it needs to pass the worker's local directory. In - Python 3.9 and later, typing.MutableMapping is just an alias - for collections.abc.MutableMapping and we don't need to do - anything. - - With this pass-through definition of __new__, the - signature of the constructor is correctly determined. 
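The spilling path above now reads the worker's configured spill compression from the Dask config instead of relying on whatever `serialize_bytelist` would pick by default. A rough sketch of that round trip, outside the actual zict buffer:

```python
import dask
from distributed.protocol import deserialize_bytes, serialize_bytelist

def spill_dumps(obj):
    # Honour the compression configured for worker spilling, e.g. "auto" or None.
    compression = dask.config.get("distributed.worker.memory.spill-compression")
    return serialize_bytelist(obj, compression=compression, on_error="raise")

frames = spill_dumps({"x": list(range(10))})
restored = deserialize_bytes(b"".join(frames))
print(restored)
```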
- """ - return super().__new__(cls) - def __setitem__(self, key, value): if key in self.device_buffer: # Make sure we register the removal of an existing key diff --git a/dask_cuda/disk_io.py b/dask_cuda/disk_io.py index 0427b77f0..3885e9997 100644 --- a/dask_cuda/disk_io.py +++ b/dask_cuda/disk_io.py @@ -1,3 +1,4 @@ +import itertools import os import os.path import pathlib @@ -125,11 +126,13 @@ def __init__( if self.gds_enabled: try: - import cucim.clara.filesystem as cucim_fs # noqa F401 + import kvikio # noqa F401 except ImportError: - raise ImportError("GPUDirect Storage requires the cucim Python package") + raise ImportError( + "GPUDirect Storage requires the kvikio Python package" + ) else: - self.gds_enabled = bool(cucim_fs.is_gds_available()) + self.gds_enabled = kvikio.DriverProperties().is_gds_available def gen_file_path(self) -> str: """Generate an unique file path""" @@ -162,18 +165,19 @@ def disk_write(path: str, frames: Iterable, shared_filesystem: bool, gds=False) A dict of metadata """ cuda_frames = tuple(hasattr(f, "__cuda_array_interface__") for f in frames) - frame_lengths = tuple(map(nbytes, frames)) - if gds and any(cuda_frames): - import cucim.clara.filesystem as cucim_fs - - with cucim_fs.open(path, "w") as f: - for frame, length in zip(frames, frame_lengths): - f.pwrite(buf=frame, count=length, file_offset=0, buf_offset=0) + if gds and any(cuda_frames): + import kvikio + + # Write each frame consecutively into `path` in parallel + with kvikio.CuFile(path, "w") as f: + file_offsets = itertools.accumulate(map(nbytes, frames), initial=0) + futures = [f.pwrite(b, file_offset=o) for b, o in zip(frames, file_offsets)] + for each_fut in futures: + each_fut.get() else: with open(path, "wb") as f: - for frame in frames: - f.write(frame) + os.writev(f.fileno(), frames) # type: ignore return { "method": "stdio", "path": SpillToDiskFile(path), @@ -199,22 +203,22 @@ def disk_read(header: Mapping, gds=False) -> list: frames: list List of read frames """ - ret = [] + ret: list = [ + get_new_cuda_buffer()(length) + if gds and is_cuda + else np.empty((length,), dtype="u1") + for length, is_cuda in zip(header["frame-lengths"], header["cuda-frames"]) + ] if gds: - import cucim.clara.filesystem as cucim_fs # isort:skip - - with cucim_fs.open(header["path"], "rb") as f: - file_offset = 0 - for length, is_cuda in zip(header["frame-lengths"], header["cuda-frames"]): - if is_cuda: - buf = get_new_cuda_buffer()(length) - else: - buf = np.empty((length,), dtype="u1") - f.pread(buf=buf, count=length, file_offset=file_offset, buf_offset=0) - file_offset += length - ret.append(buf) + import kvikio # isort:skip + + with kvikio.CuFile(str(header["path"]), "r") as f: + # Read each frame consecutively from `path` in parallel + file_offsets = itertools.accumulate((b.nbytes for b in ret), initial=0) + futures = [f.pread(b, file_offset=o) for b, o in zip(ret, file_offsets)] + for each_fut in futures: + each_fut.get() else: with open(str(header["path"]), "rb") as f: - for length in header["frame-lengths"]: - ret.append(f.read(length)) + os.readv(f.fileno(), ret) # type: ignore return ret diff --git a/dask_cuda/explicit_comms/comms.py b/dask_cuda/explicit_comms/comms.py index 05dbc9619..0fe5422d8 100644 --- a/dask_cuda/explicit_comms/comms.py +++ b/dask_cuda/explicit_comms/comms.py @@ -6,7 +6,6 @@ from typing import Any, Dict, Hashable, Iterable, List, Optional import distributed.comm -from dask.utils import stringify from distributed import Client, Worker, default_client, get_worker from 
distributed.comm.addressing import parse_address, parse_host_port, unparse_address @@ -305,8 +304,7 @@ def stage_keys(self, name: str, keys: Iterable[Hashable]) -> Dict[int, set]: dict dict that maps each worker-rank to the workers set of staged keys """ - key_set = {stringify(k) for k in keys} - return dict(self.run(_stage_keys, name, key_set)) + return dict(self.run(_stage_keys, name, set(keys))) def pop_staging_area(session_state: dict, name: str) -> Dict[str, Any]: diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index a444fce0b..70f123354 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -8,13 +8,18 @@ from operator import getitem from typing import Any, Callable, Dict, List, Optional, Set, TypeVar +import numpy as np +import pandas as pd + import dask import dask.config import dask.dataframe +import dask.dataframe as dd import dask.utils import distributed.worker from dask.base import tokenize -from dask.dataframe.core import DataFrame, Series, _concat as dd_concat, new_dd_object +from dask.dataframe import DataFrame, Series +from dask.dataframe.core import _concat as dd_concat from dask.dataframe.shuffle import group_split_dispatch, hash_object_dispatch from distributed import wait from distributed.protocol import nested_deserialize, to_serialize @@ -153,9 +158,16 @@ def compute_map_index( if column_names[0] == "_partitions": ind = df[column_names[0]] else: - ind = hash_object_dispatch( - df[column_names] if column_names else df, index=False - ) + # Need to cast numerical dtypes to be consistent + # with `dask.dataframe.shuffle.partitioning_index` + dtypes = {} + index = df[column_names] if column_names else df + for col, dtype in index.dtypes.items(): + if pd.api.types.is_numeric_dtype(dtype): + dtypes[col] = np.float64 + if dtypes: + index = index.astype(dtypes, errors="ignore") + ind = hash_object_dispatch(index, index=False) return ind % npartitions @@ -185,15 +197,8 @@ def partition_dataframe( partitions Dict of dataframe-partitions, mapping partition-ID to dataframe """ - if column_names[0] != "_partitions" and hasattr(df, "partition_by_hash"): - return dict( - zip( - range(npartitions), - df.partition_by_hash( - column_names, npartitions, keep_index=not ignore_index - ), - ) - ) + # TODO: Use `partition_by_hash` if/when dtype-casting is added + # (See: https://github.com/rapidsai/cudf/issues/16221) map_index = compute_map_index(df, column_names, npartitions) return group_split_dispatch(df, map_index, npartitions, ignore_index=ignore_index) @@ -328,7 +333,7 @@ async def shuffle_task( ignore_index: bool, num_rounds: int, batchsize: int, -) -> List[DataFrame]: +) -> Dict[int, DataFrame]: """Explicit-comms shuffle task This function is running on each worker participating in the shuffle. 
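The dtype-casting added to `compute_map_index` above keeps the explicit-comms shuffle consistent with `dask.dataframe.shuffle.partitioning_index`, which hashes numeric keys as float64. A pandas-only sketch of the rule:

```python
import numpy as np
import pandas as pd
from dask.dataframe.shuffle import hash_object_dispatch

def map_index(df, column_names, npartitions):
    index = df[column_names] if column_names else df
    # Cast numeric key columns to float64 so e.g. int32 and int64 keys
    # hash to the same output partition.
    dtypes = {
        col: np.float64
        for col, dtype in index.dtypes.items()
        if pd.api.types.is_numeric_dtype(dtype)
    }
    if dtypes:
        index = index.astype(dtypes, errors="ignore")
    return hash_object_dispatch(index, index=False) % npartitions

df = pd.DataFrame({"key": np.arange(8, dtype="int32"), "payload": range(8)})
print(map_index(df, ["key"], npartitions=4))
```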
@@ -360,8 +365,8 @@ async def shuffle_task( Returns ------- - partitions: list of DataFrames - List of dataframe-partitions + partitions: dict + dict that maps each Partition ID to a dataframe-partition """ proxify = get_proxify(s["worker"]) @@ -387,14 +392,13 @@ async def shuffle_task( ) # Finally, we concatenate the output dataframes into the final output partitions - ret = [] + ret = {} while out_part_id_to_dataframe_list: - ret.append( - proxify( - dd_concat( - out_part_id_to_dataframe_list.popitem()[1], - ignore_index=ignore_index, - ) + part_id, dataframe_list = out_part_id_to_dataframe_list.popitem() + ret[part_id] = proxify( + dd_concat( + dataframe_list, + ignore_index=ignore_index, ) ) # For robustness, we yield this task to give Dask a chance to do bookkeeping @@ -469,18 +473,19 @@ def shuffle( npartitions = df.npartitions # Step (a): - df = df.persist() # Make sure optimizations are apply on the existing graph + df = df.persist() # Make sure optimizations are applied on the existing graph wait([df]) # Make sure all keys has been materialized on workers + persisted_keys = [f.key for f in c.client.futures_of(df)] name = ( "explicit-comms-shuffle-" - f"{tokenize(df, column_names, npartitions, ignore_index)}" + f"{tokenize(df, column_names, npartitions, ignore_index, batchsize)}" ) df_meta: DataFrame = df._meta # Stage all keys of `df` on the workers and cancel them, which makes it possible # for the shuffle to free memory as the partitions of `df` are consumed. # See CommsContext.stage_keys() for a description of staging. - rank_to_inkeys = c.stage_keys(name=name, keys=df.__dask_keys__()) + rank_to_inkeys = c.stage_keys(name=name, keys=persisted_keys) c.client.cancel(df) # Get batchsize @@ -527,20 +532,27 @@ def shuffle( # TODO: can we do this without using `submit()` to avoid the overhead # of creating a Future for each dataframe partition? - dsk = {} + _futures = {} for rank in ranks: - for i, part_id in enumerate(rank_to_out_part_ids[rank]): - dsk[(name, part_id)] = c.client.submit( - getitem, shuffle_result[rank], i, workers=[c.worker_addresses[rank]] + for part_id in rank_to_out_part_ids[rank]: + _futures[part_id] = c.client.submit( + getitem, + shuffle_result[rank], + part_id, + workers=[c.worker_addresses[rank]], ) + # Make sure partitions are properly ordered + futures = [_futures.pop(i) for i in range(npartitions)] + # Create a distributed Dataframe from all the pieces - divs = [None] * (len(dsk) + 1) - ret = new_dd_object(dsk, name, df_meta, divs).persist() + divs = [None] * (len(futures) + 1) + kwargs = {"meta": df_meta, "divisions": divs, "prefix": "explicit-comms-shuffle"} + ret = dd.from_delayed(futures, **kwargs).persist() wait([ret]) # Release all temporary dataframes - for fut in [*shuffle_result.values(), *dsk.values()]: + for fut in [*shuffle_result.values(), *futures]: fut.release() return ret @@ -575,7 +587,7 @@ def wrapper(*args, **kwargs): kw = kw.arguments # Notice, we only overwrite the default and the "tasks" shuffle # algorithm. The "disk" and "p2p" algorithm, we don't touch. 
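The reassembly step above now keeps one future per output partition, orders them by partition ID, and wraps them with `dd.from_delayed` and unknown divisions rather than building a raw graph. A minimal sketch of that pattern on a local client (partition contents are illustrative):

```python
import pandas as pd
import dask.dataframe as dd
from distributed import Client

client = Client(processes=False)

def make_partition(i):
    return pd.DataFrame({"x": range(i * 3, (i + 1) * 3)})

# One future per output partition, kept in partition-ID order.
futures = [client.submit(make_partition, i, pure=False) for i in range(4)]
meta = make_partition(0).iloc[:0]
ddf = dd.from_delayed(
    futures, meta=meta, divisions=[None] * (len(futures) + 1)
).persist()
print(ddf.npartitions, len(ddf.compute()))
```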
- if kw["shuffle"] in ("tasks", None): + if kw["shuffle_method"] in ("tasks", None): col = kw["col"] if isinstance(col, str): col = [col] @@ -585,7 +597,7 @@ def wrapper(*args, **kwargs): return wrapper -def get_default_shuffle_algorithm() -> str: +def get_default_shuffle_method() -> str: """Return the default shuffle algorithm used by Dask This changes the default shuffle algorithm from "p2p" to "tasks" @@ -594,4 +606,4 @@ def get_default_shuffle_algorithm() -> str: ret = dask.config.get("dataframe.shuffle.algorithm", None) if ret is None and _use_explicit_comms(): return "tasks" - return dask.utils.get_default_shuffle_algorithm() + return dask.utils.get_default_shuffle_method() diff --git a/dask_cuda/get_device_memory_objects.py b/dask_cuda/get_device_memory_objects.py index c5746c862..cd079f4ed 100644 --- a/dask_cuda/get_device_memory_objects.py +++ b/dask_cuda/get_device_memory_objects.py @@ -124,6 +124,10 @@ def get_device_memory_objects_cudf_index(obj): def get_device_memory_objects_cudf_multiindex(obj): return dispatch(obj._columns) + @dispatch.register(cudf.core.column.ColumnBase) + def get_device_memory_objects_cudf_column(obj): + return dispatch(obj.data) + dispatch(obj.children) + dispatch(obj.mask) + @sizeof.register_lazy("cupy") def register_cupy(): # NB: this overwrites dask.sizeof.register_cupy() diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index 0b9c92a59..571a46a55 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -5,7 +5,6 @@ import numba.cuda import dask -import distributed.comm.ucx from distributed.diagnostics.nvml import get_device_index_and_uuid, has_cuda_context from .utils import get_ucx_config @@ -23,12 +22,21 @@ def _create_cuda_context_handler(): numba.cuda.current_context() -def _create_cuda_context(): +def _create_cuda_context(protocol="ucx"): + if protocol not in ["ucx", "ucxx"]: + return try: # Added here to ensure the parent `LocalCUDACluster` process creates the CUDA # context directly from the UCX module, thus avoiding a similar warning there. try: - distributed.comm.ucx.init_once() + if protocol == "ucx": + import distributed.comm.ucx + + distributed.comm.ucx.init_once() + elif protocol == "ucxx": + import distributed_ucxx.ucxx + + distributed_ucxx.ucxx.init_once() except ModuleNotFoundError: # UCX initialization has to be delegated to Distributed, it will take care # of setting correct environment variables and importing `ucp` after that. 
@@ -39,20 +47,35 @@ def _create_cuda_context(): os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0] ) ctx = has_cuda_context() - if ( - ctx.has_context - and not distributed.comm.ucx.cuda_context_created.has_context - ): - distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid()) + if protocol == "ucx": + if ( + ctx.has_context + and not distributed.comm.ucx.cuda_context_created.has_context + ): + distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid()) + elif protocol == "ucxx": + if ( + ctx.has_context + and not distributed_ucxx.ucxx.cuda_context_created.has_context + ): + distributed_ucxx.ucxx._warn_existing_cuda_context(ctx, os.getpid()) _create_cuda_context_handler() - if not distributed.comm.ucx.cuda_context_created.has_context: - ctx = has_cuda_context() - if ctx.has_context and ctx.device_info != cuda_visible_device: - distributed.comm.ucx._warn_cuda_context_wrong_device( - cuda_visible_device, ctx.device_info, os.getpid() - ) + if protocol == "ucx": + if not distributed.comm.ucx.cuda_context_created.has_context: + ctx = has_cuda_context() + if ctx.has_context and ctx.device_info != cuda_visible_device: + distributed.comm.ucx._warn_cuda_context_wrong_device( + cuda_visible_device, ctx.device_info, os.getpid() + ) + elif protocol == "ucxx": + if not distributed_ucxx.ucxx.cuda_context_created.has_context: + ctx = has_cuda_context() + if ctx.has_context and ctx.device_info != cuda_visible_device: + distributed_ucxx.ucxx._warn_cuda_context_wrong_device( + cuda_visible_device, ctx.device_info, os.getpid() + ) except Exception: logger.error("Unable to start CUDA Context", exc_info=True) @@ -64,6 +87,7 @@ def initialize( enable_infiniband=None, enable_nvlink=None, enable_rdmacm=None, + protocol="ucx", ): """Create CUDA context and initialize UCX-Py, depending on user parameters. @@ -118,7 +142,7 @@ def initialize( dask.config.set({"distributed.comm.ucx": ucx_config}) if create_cuda_context: - _create_cuda_context() + _create_cuda_context(protocol=protocol) @click.command() @@ -127,6 +151,12 @@ def initialize( default=False, help="Create CUDA context", ) +@click.option( + "--protocol", + default=None, + type=str, + help="Communication protocol, such as: 'tcp', 'tls', 'ucx' or 'ucxx'.", +) @click.option( "--enable-tcp-over-ucx/--disable-tcp-over-ucx", default=False, @@ -150,10 +180,11 @@ def initialize( def dask_setup( service, create_cuda_context, + protocol, enable_tcp_over_ucx, enable_infiniband, enable_nvlink, enable_rdmacm, ): if create_cuda_context: - _create_cuda_context() + _create_cuda_context(protocol=protocol) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index fa532b5f0..1b81c7703 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -2,6 +2,7 @@ import logging import os import warnings +from functools import partial import dask from distributed import LocalCluster, Nanny, Worker @@ -9,11 +10,9 @@ from .device_host_file import DeviceHostFile from .initialize import initialize +from .plugins import CPUAffinity, PreImport, RMMSetup from .proxify_host_file import ProxifyHostFile from .utils import ( - CPUAffinity, - PreImport, - RMMSetup, cuda_visible_devices, get_cpu_affinity, get_ucx_config, @@ -65,9 +64,10 @@ class LocalCUDACluster(LocalCluster): threads_per_worker : int, default 1 Number of threads to be used for each Dask worker process. memory_limit : int, float, str, or None, default "auto" - Bytes of memory per process that the worker can use. 
Can be an integer (bytes), - float (fraction of total system memory), string (like ``"5GB"`` or ``"5000M"``), - or ``"auto"``, 0, or ``None`` for no memory management. + Size of the host LRU cache, which is used to determine when the worker + starts spilling to disk (not available if JIT-Unspill is enabled). Can be an + integer (bytes), float (fraction of total system memory), string (like ``"5GB"`` + or ``"5000M"``), or ``"auto"``, 0, or ``None`` for no memory management. device_memory_limit : int, float, str, or None, default 0.8 Size of the CUDA device LRU cache, which is used to determine when the worker starts spilling to host memory. Can be an integer (bytes), float (fraction of @@ -114,6 +114,10 @@ class LocalCUDACluster(LocalCluster): memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool and to set the maximum pool size. + .. note:: + When paired with `--enable-rmm-async` the maximum size cannot be guaranteed + due to fragmentation. + .. note:: This size is a per-worker configuration, and not cluster-wide. rmm_managed_memory : bool, default False @@ -131,6 +135,14 @@ class LocalCUDACluster(LocalCluster): The asynchronous allocator requires CUDA Toolkit 11.2 or newer. It is also incompatible with RMM pools and managed memory. Trying to enable both will result in an exception. + rmm_release_threshold: int, str or None, default None + When ``rmm.async is True`` and the pool size grows beyond this value, unused + memory held by the pool will be released at the next synchronization point. + Can be an integer (bytes), float (fraction of total device memory), string (like + ``"5GB"`` or ``"5000M"``) or ``None``. By default, this feature is disabled. + + .. note:: + This size is a per-worker configuration, and not cluster-wide. rmm_log_directory : str or None, default None Directory to write per-worker RMM log files to. The client and scheduler are not logged here. Can be a string (like ``"/path/to/logs/"``) or ``None`` to @@ -178,8 +190,12 @@ class LocalCUDACluster(LocalCluster): TypeError If InfiniBand or NVLink are enabled and ``protocol!="ucx"``. ValueError - If NVLink and RMM managed memory are both enabled, or if RMM pools / managed - memory and asynchronous allocator are both enabled. + If RMM pool, RMM managed memory or RMM async allocator are requested but RMM + cannot be imported. + If RMM managed memory and asynchronous allocator are both enabled. + If RMM maximum pool size is set but RMM pool size is not. + If RMM maximum pool size is set but RMM async allocator is used. + If RMM release threshold is set but the RMM async allocator is not being used. See Also -------- @@ -205,6 +221,7 @@ def __init__( rmm_maximum_pool_size=None, rmm_managed_memory=False, rmm_async=False, + rmm_release_threshold=None, rmm_log_directory=None, rmm_track_allocations=False, jit_unspill=None, @@ -247,7 +264,8 @@ def __init__( self.rmm_maximum_pool_size = rmm_maximum_pool_size self.rmm_managed_memory = rmm_managed_memory self.rmm_async = rmm_async - if rmm_pool_size is not None or rmm_managed_memory: + self.rmm_release_threshold = rmm_release_threshold + if rmm_pool_size is not None or rmm_managed_memory or rmm_async: try: import rmm # noqa F401 except ImportError: @@ -256,18 +274,13 @@ def __init__( "is not available. 
For installation instructions, please " "see https://github.com/rapidsai/rmm" ) # pragma: no cover - if rmm_async: - raise ValueError( - "RMM pool and managed memory are incompatible with asynchronous " - "allocator" - ) else: if enable_nvlink: warnings.warn( "When using NVLink we recommend setting a " "`rmm_pool_size`. Please see: " - "https://dask-cuda.readthedocs.io/en/latest/ucx.html" - "#important-notes for more details" + "https://docs.rapids.ai/api/dask-cuda/nightly/ucx/ " + "for more details" ) self.rmm_log_directory = rmm_log_directory @@ -310,8 +323,11 @@ def __init__( if enable_tcp_over_ucx or enable_infiniband or enable_nvlink: if protocol is None: protocol = "ucx" - elif protocol != "ucx": - raise TypeError("Enabling InfiniBand or NVLink requires protocol='ucx'") + elif protocol not in ["ucx", "ucxx"]: + raise TypeError( + "Enabling InfiniBand or NVLink requires protocol='ucx' or " + "protocol='ucxx'" + ) self.host = kwargs.get("host", None) @@ -324,12 +340,16 @@ def __init__( ) if worker_class is not None: - from functools import partial - - worker_class = partial( - LoggedNanny if log_spilling is True else Nanny, - worker_class=worker_class, - ) + if log_spilling is True: + raise ValueError( + "Cannot enable `log_spilling` when `worker_class` is specified. If " + "logging is needed, ensure `worker_class` is a subclass of " + "`distributed.local_cuda_cluster.LoggedNanny` or a subclass of " + "`distributed.local_cuda_cluster.LoggedWorker`, and specify " + "`log_spilling=False`." + ) + if not issubclass(worker_class, Nanny): + worker_class = partial(Nanny, worker_class=worker_class) self.pre_import = pre_import @@ -358,7 +378,7 @@ def __init__( ) + ["dask_cuda.initialize"] self.new_spec["options"]["preload_argv"] = self.new_spec["options"].get( "preload_argv", [] - ) + ["--create-cuda-context"] + ) + ["--create-cuda-context", "--protocol", protocol] self.cuda_visible_devices = CUDA_VISIBLE_DEVICES self.scale(n_workers) @@ -385,12 +405,13 @@ def new_worker_spec(self): get_cpu_affinity(nvml_device_index(0, visible_devices)) ), RMMSetup( - self.rmm_pool_size, - self.rmm_maximum_pool_size, - self.rmm_managed_memory, - self.rmm_async, - self.rmm_log_directory, - self.rmm_track_allocations, + initial_pool_size=self.rmm_pool_size, + maximum_pool_size=self.rmm_maximum_pool_size, + managed_memory=self.rmm_managed_memory, + async_alloc=self.rmm_async, + release_threshold=self.rmm_release_threshold, + log_directory=self.rmm_log_directory, + track_allocations=self.rmm_track_allocations, ), PreImport(self.pre_import), }, diff --git a/dask_cuda/plugins.py b/dask_cuda/plugins.py new file mode 100644 index 000000000..4eba97f2b --- /dev/null +++ b/dask_cuda/plugins.py @@ -0,0 +1,122 @@ +import importlib +import os + +from distributed import WorkerPlugin + +from .utils import get_rmm_log_file_name, parse_device_memory_limit + + +class CPUAffinity(WorkerPlugin): + def __init__(self, cores): + self.cores = cores + + def setup(self, worker=None): + os.sched_setaffinity(0, self.cores) + + +class RMMSetup(WorkerPlugin): + def __init__( + self, + initial_pool_size, + maximum_pool_size, + managed_memory, + async_alloc, + release_threshold, + log_directory, + track_allocations, + ): + if initial_pool_size is None and maximum_pool_size is not None: + raise ValueError( + "`rmm_maximum_pool_size` was specified without specifying " + "`rmm_pool_size`.`rmm_pool_size` must be specified to use RMM pool." 
+ ) + if async_alloc is True: + if managed_memory is True: + raise ValueError( + "`rmm_managed_memory` is incompatible with the `rmm_async`." + ) + if async_alloc is False and release_threshold is not None: + raise ValueError("`rmm_release_threshold` requires `rmm_async`.") + + self.initial_pool_size = initial_pool_size + self.maximum_pool_size = maximum_pool_size + self.managed_memory = managed_memory + self.async_alloc = async_alloc + self.release_threshold = release_threshold + self.logging = log_directory is not None + self.log_directory = log_directory + self.rmm_track_allocations = track_allocations + + def setup(self, worker=None): + if self.initial_pool_size is not None: + self.initial_pool_size = parse_device_memory_limit( + self.initial_pool_size, alignment_size=256 + ) + + if self.async_alloc: + import rmm + + if self.release_threshold is not None: + self.release_threshold = parse_device_memory_limit( + self.release_threshold, alignment_size=256 + ) + + mr = rmm.mr.CudaAsyncMemoryResource( + initial_pool_size=self.initial_pool_size, + release_threshold=self.release_threshold, + ) + + if self.maximum_pool_size is not None: + self.maximum_pool_size = parse_device_memory_limit( + self.maximum_pool_size, alignment_size=256 + ) + mr = rmm.mr.LimitingResourceAdaptor( + mr, allocation_limit=self.maximum_pool_size + ) + + rmm.mr.set_current_device_resource(mr) + if self.logging: + rmm.enable_logging( + log_file_name=get_rmm_log_file_name( + worker, self.logging, self.log_directory + ) + ) + elif self.initial_pool_size is not None or self.managed_memory: + import rmm + + pool_allocator = False if self.initial_pool_size is None else True + + if self.initial_pool_size is not None: + if self.maximum_pool_size is not None: + self.maximum_pool_size = parse_device_memory_limit( + self.maximum_pool_size, alignment_size=256 + ) + + rmm.reinitialize( + pool_allocator=pool_allocator, + managed_memory=self.managed_memory, + initial_pool_size=self.initial_pool_size, + maximum_pool_size=self.maximum_pool_size, + logging=self.logging, + log_file_name=get_rmm_log_file_name( + worker, self.logging, self.log_directory + ), + ) + if self.rmm_track_allocations: + import rmm + + mr = rmm.mr.get_current_device_resource() + rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr)) + + +class PreImport(WorkerPlugin): + def __init__(self, libraries): + if libraries is None: + libraries = [] + elif isinstance(libraries, str): + libraries = libraries.split(",") + self.libraries = libraries + + def setup(self, worker=None): + for l in self.libraries: + importlib.import_module(l) diff --git a/dask_cuda/proxy_object.py b/dask_cuda/proxy_object.py index 21dc15ea1..ddb7f3292 100644 --- a/dask_cuda/proxy_object.py +++ b/dask_cuda/proxy_object.py @@ -19,7 +19,6 @@ import distributed.utils from dask.sizeof import sizeof from distributed.protocol.compression import decompress -from distributed.worker import dumps_function, loads_function from dask_cuda.disk_io import disk_read @@ -85,7 +84,7 @@ def asproxy( subclass = ProxyObject subclass_serialized = None else: - subclass_serialized = dumps_function(subclass) + subclass_serialized = pickle.dumps(subclass) ret = subclass( ProxyDetail( @@ -440,7 +439,7 @@ def __reduce__(self): pxy = self._pxy_get(copy=True) pxy.serialize(serializers=("pickle",)) if pxy.subclass: - subclass = loads_function(pxy.subclass) + subclass = pickle.loads(pxy.subclass) else: subclass = ProxyObject @@ -837,7 +836,10 @@ def obj_pxy_dask_serialize(obj: ProxyObject): header, frames = 
pxy.serialize(serializers=("dask", "pickle")) obj._pxy_set(pxy) - return {"proxied-header": header, "obj-pxy-detail": pxy.get_init_args()}, frames + return { + "proxied-header": header, + "obj-pxy-detail": pickle.dumps(pxy.get_init_args()), + }, frames @distributed.protocol.cuda.cuda_serialize.register(ProxyObject) @@ -860,7 +862,10 @@ def obj_pxy_cuda_serialize(obj: ProxyObject): # the worker's data store. header, frames = pxy.serialize(serializers=("cuda",)) - return {"proxied-header": header, "obj-pxy-detail": pxy.get_init_args()}, frames + return { + "proxied-header": header, + "obj-pxy-detail": pickle.dumps(pxy.get_init_args()), + }, frames @distributed.protocol.dask_deserialize.register(ProxyObject) @@ -872,11 +877,11 @@ def obj_pxy_dask_deserialize(header, frames): deserialized using the same serializers that were used when the object was serialized. """ - args = header["obj-pxy-detail"] + args = pickle.loads(header["obj-pxy-detail"]) if args["subclass"] is None: subclass = ProxyObject else: - subclass = loads_function(args["subclass"]) + subclass = pickle.loads(args["subclass"]) pxy = ProxyDetail(obj=(header["proxied-header"], frames), **args) if pxy.serializer == "disk": header, _ = pxy.obj diff --git a/dask_cuda/tests/test_cudf_builtin_spilling.py b/dask_cuda/tests/test_cudf_builtin_spilling.py index d4c28ba06..80b1d482d 100644 --- a/dask_cuda/tests/test_cudf_builtin_spilling.py +++ b/dask_cuda/tests/test_cudf_builtin_spilling.py @@ -20,7 +20,7 @@ get_global_manager, set_global_manager, ) -from cudf.testing._utils import assert_eq # noqa: E402 +from cudf.testing import assert_eq # noqa: E402 if get_global_manager() is not None: pytest.skip( diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 64950e2b6..974ad1319 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -40,7 +40,7 @@ def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop): # noqa: F811 str(nthreads), "--no-dashboard", "--worker-class", - "dask_cuda.utils.MockWorker", + "dask_cuda.utils_test.MockWorker", ] ): with Client("127.0.0.1:9359", loop=loop) as client: @@ -131,6 +131,10 @@ def test_rmm_async(loop): # noqa: F811 "--host", "127.0.0.1", "--rmm-async", + "--rmm-pool-size", + "2 GB", + "--rmm-release-threshold", + "3 GB", "--no-dashboard", ] ): @@ -143,6 +147,61 @@ def test_rmm_async(loop): # noqa: F811 for v in memory_resource_type.values(): assert v is rmm.mr.CudaAsyncMemoryResource + ret = get_cluster_configuration(client) + wait(ret) + assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["[plugin] RMMSetup"]["release_threshold"] == 3000000000 + + +def test_rmm_async_with_maximum_pool_size(loop): # noqa: F811 + rmm = pytest.importorskip("rmm") + + driver_version = rmm._cuda.gpu.driverGetVersion() + runtime_version = rmm._cuda.gpu.runtimeGetVersion() + if driver_version < 11020 or runtime_version < 11020: + pytest.skip("cudaMallocAsync not supported") + + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): + with popen( + [ + "dask", + "cuda", + "worker", + "127.0.0.1:9369", + "--host", + "127.0.0.1", + "--rmm-async", + "--rmm-pool-size", + "2 GB", + "--rmm-release-threshold", + "3 GB", + "--rmm-maximum-pool-size", + "4 GB", + "--no-dashboard", + ] + ): + with Client("127.0.0.1:9369", loop=loop) as client: + assert wait_workers(client, n_gpus=get_n_gpus()) + + memory_resource_types = client.run( + lambda: ( + rmm.mr.get_current_device_resource_type(), + 
type(rmm.mr.get_current_device_resource().get_upstream()), + ) + ) + for v in memory_resource_types.values(): + memory_resource_type, upstream_memory_resource_type = v + assert memory_resource_type is rmm.mr.LimitingResourceAdaptor + assert ( + upstream_memory_resource_type is rmm.mr.CudaAsyncMemoryResource + ) + + ret = get_cluster_configuration(client) + wait(ret) + assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["[plugin] RMMSetup"]["release_threshold"] == 3000000000 + assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 4000000000 + def test_rmm_logging(loop): # noqa: F811 rmm = pytest.importorskip("rmm") @@ -270,7 +329,7 @@ def test_cuda_mig_visible_devices_and_memory_limit_and_nthreads(loop): # noqa: str(nthreads), "--no-dashboard", "--worker-class", - "dask_cuda.utils.MockWorker", + "dask_cuda.utils_test.MockWorker", ] ): with Client("127.0.0.1:9359", loop=loop) as client: @@ -305,7 +364,7 @@ def test_cuda_visible_devices_uuid(loop): # noqa: F811 "127.0.0.1", "--no-dashboard", "--worker-class", - "dask_cuda.utils.MockWorker", + "dask_cuda.utils_test.MockWorker", ] ): with Client("127.0.0.1:9359", loop=loop) as client: @@ -422,3 +481,31 @@ def test_worker_fraction_limits(loop): # noqa: F811 ret["[plugin] RMMSetup"]["maximum_pool_size"] == (device_total_memory * 0.3) // 256 * 256 ) + + +@patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) +def test_worker_timeout(): + ret = subprocess.run( + [ + "dask", + "cuda", + "worker", + "192.168.1.100:7777", + "--death-timeout", + "1", + ], + text=True, + encoding="utf8", + capture_output=True, + ) + + assert "closing nanny at" in ret.stderr.lower() + + # Depending on the environment, the error raised may be different + try: + assert "reason: failure-to-start-" in ret.stderr.lower() + assert "timeouterror" in ret.stderr.lower() + except AssertionError: + assert "reason: nanny-close" in ret.stderr.lower() + + assert ret.returncode == 0 diff --git a/dask_cuda/tests/test_device_host_file.py b/dask_cuda/tests/test_device_host_file.py index 59e066470..4a4807941 100644 --- a/dask_cuda/tests/test_device_host_file.py +++ b/dask_cuda/tests/test_device_host_file.py @@ -10,7 +10,6 @@ serialize, serialize_bytelist, ) -from distributed.protocol.pickle import HIGHEST_PROTOCOL from dask_cuda.device_host_file import DeviceHostFile, device_to_host, host_to_device @@ -189,10 +188,7 @@ def test_serialize_cupy_collection(collection, length, value): header, frames = serialize(obj, serializers=["pickle"], on_error="raise") - if HIGHEST_PROTOCOL >= 5: - assert len(frames) == (1 + len(obj.frames)) - else: - assert len(frames) == 1 + assert len(frames) == (1 + len(obj.frames)) obj2 = deserialize(header, frames) res = host_to_device(obj2) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index ece399d45..41bfa6cb1 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -15,6 +15,10 @@ psutil = pytest.importorskip("psutil") +def _is_ucx_116(ucp): + return ucp.get_ucx_version()[:2] == (1, 16) + + class DGXVersion(Enum): DGX_1 = auto() DGX_2 = auto() @@ -73,10 +77,13 @@ def test_default(): assert not p.exitcode -def _test_tcp_over_ucx(): - ucp = pytest.importorskip("ucp") +def _test_tcp_over_ucx(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") - with LocalCUDACluster(enable_tcp_over_ucx=True) as cluster: + with LocalCUDACluster(protocol=protocol, enable_tcp_over_ucx=True) as cluster: with Client(cluster) as 
client: res = da.from_array(numpy.arange(10000), chunks=(1000,)) res = res.sum().compute() @@ -93,10 +100,19 @@ def check_ucx_options(): assert all(client.run(check_ucx_options).values()) -def test_tcp_over_ucx(): - ucp = pytest.importorskip("ucp") # NOQA: F841 - - p = mp.Process(target=_test_tcp_over_ucx) +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) +def test_tcp_over_ucx(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + if _is_ucx_116(ucp): + pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037") + + p = mp.Process(target=_test_tcp_over_ucx, args=(protocol,)) p.start() p.join() assert not p.exitcode @@ -117,9 +133,26 @@ def test_tcp_only(): assert not p.exitcode -def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm): +def _test_ucx_infiniband_nvlink( + skip_queue, protocol, enable_infiniband, enable_nvlink, enable_rdmacm +): cupy = pytest.importorskip("cupy") - ucp = pytest.importorskip("ucp") + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + + if enable_infiniband and not any( + [at.startswith("rc") for at in ucp.get_active_transports()] + ): + skip_queue.put("No support available for 'rc' transport in UCX") + return + else: + skip_queue.put("ok") + + # `ucp.get_active_transports()` call above initializes UCX, we must reset it + # so that Dask doesn't try to initialize it again and raise an exception. + ucp.reset() if enable_infiniband is None and enable_nvlink is None and enable_rdmacm is None: enable_tcp_over_ucx = None @@ -135,6 +168,7 @@ def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm) cm_tls_priority = ["tcp"] initialize( + protocol=protocol, enable_tcp_over_ucx=enable_tcp_over_ucx, enable_infiniband=enable_infiniband, enable_nvlink=enable_nvlink, @@ -142,6 +176,7 @@ def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm) ) with LocalCUDACluster( + protocol=protocol, interface="ib0", enable_tcp_over_ucx=enable_tcp_over_ucx, enable_infiniband=enable_infiniband, @@ -171,6 +206,7 @@ def check_ucx_options(): assert all(client.run(check_ucx_options).values()) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) @pytest.mark.parametrize( "params", [ @@ -185,16 +221,21 @@ def check_ucx_options(): _get_dgx_version() == DGXVersion.DGX_A100, reason="Automatic InfiniBand device detection Unsupported for %s" % _get_dgx_name(), ) -def test_ucx_infiniband_nvlink(params): - ucp = pytest.importorskip("ucp") # NOQA: F841 +def test_ucx_infiniband_nvlink(protocol, params): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + if _is_ucx_116(ucp) and params["enable_infiniband"] is False: + pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037") - if params["enable_infiniband"]: - if not any([at.startswith("rc") for at in ucp.get_active_transports()]): - pytest.skip("No support available for 'rc' transport in UCX") + skip_queue = mp.Queue() p = mp.Process( target=_test_ucx_infiniband_nvlink, args=( + skip_queue, + protocol, params["enable_infiniband"], params["enable_nvlink"], params["enable_rdmacm"], @@ -203,9 +244,8 @@ def test_ucx_infiniband_nvlink(params): p.start() p.join() - # Starting a new cluster on the same pytest process after an rdmacm cluster - # has been used may cause UCX-Py to complain about being already initialized. 
- if params["enable_rdmacm"] is True: - ucp.reset() + skip_msg = skip_queue.get() + if skip_msg != "ok": + pytest.skip(skip_msg) assert not p.exitcode diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index 624815e75..2806dc1cd 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -1,6 +1,9 @@ import asyncio import multiprocessing as mp import os +import signal +import time +from functools import partial from unittest.mock import patch import numpy as np @@ -11,18 +14,33 @@ from dask import dataframe as dd from dask.dataframe.shuffle import partitioning_index from dask.dataframe.utils import assert_eq -from distributed import Client, get_worker +from distributed import Client from distributed.deploy.local import LocalCluster import dask_cuda from dask_cuda.explicit_comms import comms from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle -from dask_cuda.initialize import initialize -from dask_cuda.utils import get_ucx_config +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny mp = mp.get_context("spawn") # type: ignore ucp = pytest.importorskip("ucp") +QUERY_PLANNING_ON = dask.config.get("dataframe.query-planning", None) is not False + +# Skip these tests when dask-expr is active (for now) +query_planning_skip = pytest.mark.skipif( + QUERY_PLANNING_ON, + reason=( + "The 'explicit-comms' config is not supported " + "when query planning is enabled." + ), +) + +# Set default shuffle method to "tasks" +if dask.config.get("dataframe.shuffle.method", None) is None: + dask.config.set({"dataframe.shuffle.method": "tasks"}) + + # Notice, all of the following tests is executed in a new process such # that UCX options of the different tests doesn't conflict. 
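The comment above describes the test-isolation strategy used throughout these modules: each UCX-sensitive case runs in a freshly spawned interpreter so its global UCX configuration cannot leak into other tests. A minimal sketch of that pattern (the `_test_body` helper and its body are illustrative, not part of this diff):

```python
import multiprocessing as mp

import pytest

# "spawn" (rather than "fork") gives the child a clean interpreter,
# so any UCX options it sets die with the process.
mp = mp.get_context("spawn")


def _test_body(protocol):
    # Illustrative body: import the transport layer matching `protocol`.
    if protocol == "ucx":
        pytest.importorskip("ucp")
    elif protocol == "ucxx":
        pytest.importorskip("ucxx")


@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
def test_runs_in_subprocess(protocol):
    p = mp.Process(target=_test_body, args=(protocol,))
    p.start()
    p.join()
    # A non-zero exit code means the child raised or was killed.
    assert not p.exitcode
```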
@@ -32,19 +50,12 @@ async def my_rank(state, arg): def _test_local_cluster(protocol): - dask.config.update( - dask.config.global_config, - { - "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True), - }, - priority="new", - ) - with LocalCluster( protocol=protocol, dashboard_address=None, n_workers=4, threads_per_worker=1, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: with Client(cluster) as client: @@ -52,7 +63,7 @@ def _test_local_cluster(protocol): assert sum(c.run(my_rank, 0)) == sum(range(4)) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) def test_local_cluster(protocol): p = mp.Process(target=_test_local_cluster, args=(protocol,)) p.start() @@ -66,6 +77,7 @@ def _test_dataframe_merge_empty_partitions(nrows, npartitions): dashboard_address=None, n_workers=npartitions, threads_per_worker=1, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: with Client(cluster): @@ -86,6 +98,7 @@ def _test_dataframe_merge_empty_partitions(nrows, npartitions): pd.testing.assert_frame_equal(got, expected) +@query_planning_skip def test_dataframe_merge_empty_partitions(): # Notice, we use more partitions than rows p = mp.Process(target=_test_dataframe_merge_empty_partitions, args=(2, 4)) @@ -96,86 +109,123 @@ def test_dataframe_merge_empty_partitions(): def check_partitions(df, npartitions): """Check that all values in `df` hashes to the same""" - hashes = partitioning_index(df, npartitions) + dtypes = {} + for col, dtype in df.dtypes.items(): + if pd.api.types.is_numeric_dtype(dtype): + dtypes[col] = np.float64 + if not dtypes: + dtypes = None + + hashes = partitioning_index(df, npartitions, cast_dtype=dtypes) if len(hashes) > 0: return len(hashes.unique()) == 1 else: return True -def _test_dataframe_shuffle(backend, protocol, n_workers): +def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): if backend == "cudf": cudf = pytest.importorskip("cudf") - initialize(enable_tcp_over_ucx=True) - else: - dask.config.update( - dask.config.global_config, - { - "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True), - }, - priority="new", - ) with LocalCluster( protocol=protocol, dashboard_address=None, n_workers=n_workers, threads_per_worker=1, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: - with Client(cluster) as client: - all_workers = list(client.get_worker_logs().keys()) + with Client(cluster): comms.default_comms() np.random.seed(42) - df = pd.DataFrame({"key": np.random.random(100)}) + df = pd.DataFrame({"key": np.random.randint(0, high=100, size=100)}) if backend == "cudf": df = cudf.DataFrame.from_pandas(df) + if _partitions: + df["_partitions"] = 0 + for input_nparts in range(1, 5): for output_nparts in range(1, 5): - ddf = dd.from_pandas(df.copy(), npartitions=input_nparts).persist( - workers=all_workers - ) + ddf1 = dd.from_pandas(df.copy(), npartitions=input_nparts) # To reduce test runtime, we change the batchsizes here instead # of using a test parameter. 
for batchsize in (-1, 1, 2): with dask.config.set(explicit_comms_batchsize=batchsize): ddf = explicit_comms_shuffle( - ddf, - ["key"], + ddf1, + ["_partitions"] if _partitions else ["key"], npartitions=output_nparts, batchsize=batchsize, ).persist() assert ddf.npartitions == output_nparts - # Check that each partition hashes to the same value - result = ddf.map_partitions( - check_partitions, output_nparts - ).compute() - assert all(result.to_list()) - - # Check the values (ignoring the row order) - expected = df.sort_values("key") - got = ddf.compute().sort_values("key") - assert_eq(got, expected) + if _partitions: + # If "_partitions" is the hash key, we expect all but + # the first partition to be empty + assert_eq(ddf.partitions[0].compute(), df) + assert all( + len(ddf.partitions[i].compute()) == 0 + for i in range(1, ddf.npartitions) + ) + else: + # Check that each partition hashes to the same value + result = ddf.map_partitions( + check_partitions, output_nparts + ).compute() + assert all(result.to_list()) + + # Check the values (ignoring the row order) + expected = df.sort_values("key") + got = ddf.compute().sort_values("key") + assert_eq(got, expected) + + # Check that partitioning is consistent with "tasks" + ddf_tasks = ddf1.shuffle( + ["key"], + npartitions=output_nparts, + shuffle_method="tasks", + ) + for i in range(output_nparts): + expected_partition = ddf_tasks.partitions[ + i + ].compute()["key"] + actual_partition = ddf.partitions[i].compute()[ + "key" + ] + if backend == "cudf": + expected_partition = ( + expected_partition.values_host + ) + actual_partition = actual_partition.values_host + else: + expected_partition = expected_partition.values + actual_partition = actual_partition.values + assert all( + np.sort(expected_partition) + == np.sort(actual_partition) + ) @pytest.mark.parametrize("nworkers", [1, 2, 3]) @pytest.mark.parametrize("backend", ["pandas", "cudf"]) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) -def test_dataframe_shuffle(backend, protocol, nworkers): +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) +@pytest.mark.parametrize("_partitions", [True, False]) +def test_dataframe_shuffle(backend, protocol, nworkers, _partitions): if backend == "cudf": pytest.importorskip("cudf") - p = mp.Process(target=_test_dataframe_shuffle, args=(backend, protocol, nworkers)) + p = mp.Process( + target=_test_dataframe_shuffle, args=(backend, protocol, nworkers, _partitions) + ) p.start() p.join() assert not p.exitcode @pytest.mark.parametrize("in_cluster", [True, False]) -def test_dask_use_explicit_comms(in_cluster): +def _test_dask_use_explicit_comms(in_cluster): def check_shuffle(): """Check if shuffle use explicit-comms by search for keys named 'explicit-comms-shuffle' @@ -208,6 +258,7 @@ def check_shuffle(): dashboard_address=None, n_workers=2, threads_per_worker=1, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: with Client(cluster): @@ -216,26 +267,42 @@ def check_shuffle(): check_shuffle() +@query_planning_skip +@pytest.mark.parametrize("in_cluster", [True, False]) +def test_dask_use_explicit_comms(in_cluster): + def _timeout(process, function, timeout): + if process.is_alive(): + function() + timeout = time.time() + timeout + while process.is_alive() and time.time() < timeout: + time.sleep(0.1) + + p = mp.Process(target=_test_dask_use_explicit_comms, args=(in_cluster,)) + p.start() + + # Timeout before killing process + _timeout(p, lambda: None, 60.0) + + # Send SIGINT (i.e., KeyboardInterrupt) hoping we get a stack 
trace. + _timeout(p, partial(p._popen._send_signal, signal.SIGINT), 3.0) + + # SIGINT didn't work, kill process. + _timeout(p, p.kill, 3.0) + + assert not p.is_alive() + assert p.exitcode == 0 + + def _test_dataframe_shuffle_merge(backend, protocol, n_workers): if backend == "cudf": cudf = pytest.importorskip("cudf") - initialize(enable_tcp_over_ucx=True) - else: - - dask.config.update( - dask.config.global_config, - { - "distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True), - }, - priority="new", - ) - with LocalCluster( protocol=protocol, dashboard_address=None, n_workers=n_workers, threads_per_worker=1, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: with Client(cluster): @@ -263,9 +330,10 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers): assert_eq(got, expected) +@query_planning_skip @pytest.mark.parametrize("nworkers", [1, 2, 4]) @pytest.mark.parametrize("backend", ["pandas", "cudf"]) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) def test_dataframe_shuffle_merge(backend, protocol, nworkers): if backend == "cudf": pytest.importorskip("cudf") @@ -287,7 +355,6 @@ def _test_jit_unspill(protocol): threads_per_worker=1, jit_unspill=True, device_memory_limit="1B", - enable_tcp_over_ucx=True if protocol == "ucx" else False, ) as cluster: with Client(cluster): np.random.seed(42) @@ -303,7 +370,7 @@ def _test_jit_unspill(protocol): assert_eq(got, expected) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) def test_jit_unspill(protocol): pytest.importorskip("cudf") @@ -314,8 +381,8 @@ def test_jit_unspill(protocol): def _test_lock_workers(scheduler_address, ranks): - async def f(_): - worker = get_worker() + async def f(info): + worker = info["worker"] if hasattr(worker, "running"): assert not worker.running worker.running = True @@ -343,6 +410,7 @@ def test_lock_workers(): dashboard_address=None, n_workers=4, threads_per_worker=5, + worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: ps = [] diff --git a/dask_cuda/tests/test_from_array.py b/dask_cuda/tests/test_from_array.py new file mode 100644 index 000000000..e20afcf3e --- /dev/null +++ b/dask_cuda/tests/test_from_array.py @@ -0,0 +1,22 @@ +import pytest + +import dask.array as da +from distributed import Client + +from dask_cuda import LocalCUDACluster + +cupy = pytest.importorskip("cupy") + + +@pytest.mark.parametrize("protocol", ["ucx", "ucxx", "tcp"]) +def test_ucx_from_array(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + + N = 10_000 + with LocalCUDACluster(protocol=protocol) as cluster: + with Client(cluster): + val = da.from_array(cupy.arange(N), chunks=(N // 10,)).sum().compute() + assert val == (N * (N - 1)) // 2 diff --git a/dask_cuda/tests/test_initialize.py b/dask_cuda/tests/test_initialize.py index 60c7a798f..a953a10c1 100644 --- a/dask_cuda/tests/test_initialize.py +++ b/dask_cuda/tests/test_initialize.py @@ -10,9 +10,9 @@ from dask_cuda.initialize import initialize from dask_cuda.utils import get_ucx_config +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny mp = mp.get_context("spawn") # type: ignore -ucp = pytest.importorskip("ucp") # Notice, all of the following tests is executed in a new process such # that UCX options of the different tests doesn't conflict. @@ -20,15 +20,21 @@ # of UCX before retrieving the current config. 
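The hunks that follow exercise the new `protocol` argument of `initialize()`. Outside the test suite, client-side setup for either UCX implementation could look roughly like the sketch below (the scheduler address is a placeholder, and `protocol="ucxx"` assumes the `distributed-ucxx` backend is installed):

```python
from dask.distributed import Client

from dask_cuda.initialize import initialize

# Create a CUDA context and apply the UCX configuration on the client
# before connecting; pass protocol="ucx" for UCX-Py or "ucxx" for UCXX.
initialize(protocol="ucxx", enable_tcp_over_ucx=True)

client = Client("ucxx://scheduler-host:8786")  # placeholder address
```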
-def _test_initialize_ucx_tcp(): +def _test_initialize_ucx_tcp(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + kwargs = {"enable_tcp_over_ucx": True} - initialize(**kwargs) + initialize(protocol=protocol, **kwargs) with LocalCluster( - protocol="ucx", + protocol=protocol, dashboard_address=None, n_workers=1, threads_per_worker=1, processes=True, + worker_class=IncreasedCloseTimeoutNanny, config={"distributed.comm.ucx": get_ucx_config(**kwargs)}, ) as cluster: with Client(cluster) as client: @@ -48,22 +54,34 @@ def check_ucx_options(): assert all(client.run(check_ucx_options).values()) -def test_initialize_ucx_tcp(): - p = mp.Process(target=_test_initialize_ucx_tcp) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) +def test_initialize_ucx_tcp(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + + p = mp.Process(target=_test_initialize_ucx_tcp, args=(protocol,)) p.start() p.join() assert not p.exitcode -def _test_initialize_ucx_nvlink(): +def _test_initialize_ucx_nvlink(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + kwargs = {"enable_nvlink": True} - initialize(**kwargs) + initialize(protocol=protocol, **kwargs) with LocalCluster( - protocol="ucx", + protocol=protocol, dashboard_address=None, n_workers=1, threads_per_worker=1, processes=True, + worker_class=IncreasedCloseTimeoutNanny, config={"distributed.comm.ucx": get_ucx_config(**kwargs)}, ) as cluster: with Client(cluster) as client: @@ -84,22 +102,34 @@ def check_ucx_options(): assert all(client.run(check_ucx_options).values()) -def test_initialize_ucx_nvlink(): - p = mp.Process(target=_test_initialize_ucx_nvlink) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) +def test_initialize_ucx_nvlink(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + + p = mp.Process(target=_test_initialize_ucx_nvlink, args=(protocol,)) p.start() p.join() assert not p.exitcode -def _test_initialize_ucx_infiniband(): +def _test_initialize_ucx_infiniband(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + kwargs = {"enable_infiniband": True} - initialize(**kwargs) + initialize(protocol=protocol, **kwargs) with LocalCluster( - protocol="ucx", + protocol=protocol, dashboard_address=None, n_workers=1, threads_per_worker=1, processes=True, + worker_class=IncreasedCloseTimeoutNanny, config={"distributed.comm.ucx": get_ucx_config(**kwargs)}, ) as cluster: with Client(cluster) as client: @@ -123,21 +153,33 @@ def check_ucx_options(): @pytest.mark.skipif( "ib0" not in psutil.net_if_addrs(), reason="Infiniband interface ib0 not found" ) -def test_initialize_ucx_infiniband(): - p = mp.Process(target=_test_initialize_ucx_infiniband) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) +def test_initialize_ucx_infiniband(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + + p = mp.Process(target=_test_initialize_ucx_infiniband, args=(protocol,)) p.start() p.join() assert not p.exitcode -def _test_initialize_ucx_all(): - initialize() +def _test_initialize_ucx_all(protocol): + if protocol == "ucx": + ucp = pytest.importorskip("ucp") + elif protocol == "ucxx": + ucp = pytest.importorskip("ucxx") + + 
initialize(protocol=protocol) with LocalCluster( - protocol="ucx", + protocol=protocol, dashboard_address=None, n_workers=1, threads_per_worker=1, processes=True, + worker_class=IncreasedCloseTimeoutNanny, config={"distributed.comm.ucx": get_ucx_config()}, ) as cluster: with Client(cluster) as client: @@ -161,8 +203,14 @@ def check_ucx_options(): assert all(client.run(check_ucx_options).values()) -def test_initialize_ucx_all(): - p = mp.Process(target=_test_initialize_ucx_all) +@pytest.mark.parametrize("protocol", ["ucx", "ucxx"]) +def test_initialize_ucx_all(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + + p = mp.Process(target=_test_initialize_ucx_all, args=(protocol,)) p.start() p.join() assert not p.exitcode diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index b0ac88234..b05389e4c 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -9,18 +9,17 @@ from dask.distributed import Client from distributed.system import MEMORY_LIMIT from distributed.utils_test import gen_test, raises_with_cause -from distributed.worker import get_worker from dask_cuda import CUDAWorker, LocalCUDACluster, utils from dask_cuda.initialize import initialize from dask_cuda.utils import ( - MockWorker, get_cluster_configuration, get_device_total_memory, get_gpu_count_mig, get_gpu_uuid_from_index, print_cluster_config, ) +from dask_cuda.utils_test import MockWorker @gen_test(timeout=20) @@ -88,14 +87,40 @@ def get_visible_devices(): } -@pytest.mark.parametrize("protocol", ["ucx", None]) +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) @gen_test(timeout=20) async def test_ucx_protocol(protocol): - pytest.importorskip("ucp") + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") - initialize(enable_tcp_over_ucx=True) async with LocalCUDACluster( - protocol=protocol, enable_tcp_over_ucx=True, asynchronous=True, data=dict + protocol=protocol, asynchronous=True, data=dict + ) as cluster: + assert all( + ws.address.startswith(f"{protocol}://") + for ws in cluster.scheduler.workers.values() + ) + + +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) +@gen_test(timeout=20) +async def test_explicit_ucx_with_protocol_none(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + + initialize(protocol=protocol, enable_tcp_over_ucx=True) + async with LocalCUDACluster( + protocol=None, enable_tcp_over_ucx=True, asynchronous=True, data=dict ) as cluster: assert all( ws.address.startswith("ucx://") for ws in cluster.scheduler.workers.values() @@ -103,11 +128,18 @@ async def test_ucx_protocol(protocol): @pytest.mark.filterwarnings("ignore:Exception ignored in") +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) @gen_test(timeout=20) -async def test_ucx_protocol_type_error(): - pytest.importorskip("ucp") +async def test_ucx_protocol_type_error(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") - initialize(enable_tcp_over_ucx=True) + initialize(protocol=protocol, enable_tcp_over_ucx=True) with pytest.raises(TypeError): async with LocalCUDACluster( protocol="tcp", enable_tcp_over_ucx=True, asynchronous=True, data=dict @@ -140,7 +172,9 @@ async def test_no_memory_limits_cluster(): ) as cluster: async with Client(cluster, 
asynchronous=True) as client: # Check that all workers use a regular dict as their "data store". - res = await client.run(lambda: isinstance(get_worker().data, dict)) + res = await client.run( + lambda dask_worker: isinstance(dask_worker.data, dict) + ) assert all(res.values()) @@ -161,7 +195,9 @@ async def test_no_memory_limits_cudaworker(): await new_worker await client.wait_for_workers(2) # Check that all workers use a regular dict as their "data store". - res = await client.run(lambda: isinstance(get_worker().data, dict)) + res = await client.run( + lambda dask_worker: isinstance(dask_worker.data, dict) + ) assert all(res.values()) await new_worker.close() @@ -231,6 +267,8 @@ async def test_rmm_async(): async with LocalCUDACluster( rmm_async=True, + rmm_pool_size="2GB", + rmm_release_threshold="3GB", asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: @@ -240,6 +278,44 @@ async def test_rmm_async(): for v in memory_resource_type.values(): assert v is rmm.mr.CudaAsyncMemoryResource + ret = await get_cluster_configuration(client) + assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["[plugin] RMMSetup"]["release_threshold"] == 3000000000 + + +@gen_test(timeout=20) +async def test_rmm_async_with_maximum_pool_size(): + rmm = pytest.importorskip("rmm") + + driver_version = rmm._cuda.gpu.driverGetVersion() + runtime_version = rmm._cuda.gpu.runtimeGetVersion() + if driver_version < 11020 or runtime_version < 11020: + pytest.skip("cudaMallocAsync not supported") + + async with LocalCUDACluster( + rmm_async=True, + rmm_pool_size="2GB", + rmm_release_threshold="3GB", + rmm_maximum_pool_size="4GB", + asynchronous=True, + ) as cluster: + async with Client(cluster, asynchronous=True) as client: + memory_resource_types = await client.run( + lambda: ( + rmm.mr.get_current_device_resource_type(), + type(rmm.mr.get_current_device_resource().get_upstream()), + ) + ) + for v in memory_resource_types.values(): + memory_resource_type, upstream_memory_resource_type = v + assert memory_resource_type is rmm.mr.LimitingResourceAdaptor + assert upstream_memory_resource_type is rmm.mr.CudaAsyncMemoryResource + + ret = await get_cluster_configuration(client) + assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["[plugin] RMMSetup"]["release_threshold"] == 3000000000 + assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 4000000000 + @gen_test(timeout=20) async def test_rmm_logging(): @@ -283,6 +359,7 @@ async def test_pre_import(): # Intentionally not using @gen_test to skip cleanup checks +@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265") def test_pre_import_not_found(): async def _test_pre_import_not_found(): with raises_with_cause(RuntimeError, None, ImportError, None): @@ -400,6 +477,7 @@ async def test_get_cluster_configuration(): @gen_test(timeout=20) async def test_worker_fraction_limits(): async with LocalCUDACluster( + dashboard_address=None, device_memory_limit=0.1, rmm_pool_size=0.2, rmm_maximum_pool_size=0.3, @@ -422,15 +500,35 @@ async def test_worker_fraction_limits(): ) -def test_print_cluster_config(capsys): +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) +def test_print_cluster_config(capsys, protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") + pytest.importorskip("rich") with LocalCUDACluster( - n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol="ucx" + n_workers=1, 
device_memory_limit="1B", jit_unspill=True, protocol=protocol ) as cluster: with Client(cluster) as client: print_cluster_config(client) captured = capsys.readouterr() assert "Dask Cluster Configuration" in captured.out - assert "ucx" in captured.out + assert protocol in captured.out assert "1 B" in captured.out assert "[plugin]" in captured.out + + +@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265") +def test_death_timeout_raises(): + with pytest.raises(asyncio.exceptions.TimeoutError): + with LocalCUDACluster( + silence_logs=False, + death_timeout=1e-10, + dashboard_address=":0", + ): + pass diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 41399d673..2683ea36d 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -12,7 +12,6 @@ from dask.utils import format_bytes from distributed import Client from distributed.utils_test import gen_test -from distributed.worker import get_worker import dask_cuda import dask_cuda.proxify_device_objects @@ -20,6 +19,7 @@ from dask_cuda.proxify_host_file import ProxifyHostFile from dask_cuda.proxy_object import ProxyObject, asproxy, unproxy from dask_cuda.utils import get_device_total_memory +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny cupy = pytest.importorskip("cupy") cupy.cuda.set_allocator(None) @@ -302,13 +302,24 @@ def test_dataframes_share_dev_mem(root_dir): def test_cudf_get_device_memory_objects(): cudf = pytest.importorskip("cudf") objects = [ - cudf.DataFrame({"a": range(10), "b": range(10)}, index=reversed(range(10))), + cudf.DataFrame( + {"a": [0, 1, 2, 3, None, 5, 6, 7, 8, 9], "b": range(10)}, + index=reversed(range(10)), + ), cudf.MultiIndex( levels=[[1, 2], ["blue", "red"]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]] ), ] res = get_device_memory_ids(objects) - assert len(res) == 4, "We expect four buffer objects" + # Buffers are: + # 1. int data for objects[0].a + # 2. mask data for objects[0].a + # 3. int data for objects[0].b + # 4. int data for objects[0].index + # 5. int data for objects[1].levels[0] + # 6. char data for objects[1].levels[1] + # 7. 
offset data for objects[1].levels[1] + assert len(res) == 7, "We expect seven buffer objects" def test_externals(root_dir): @@ -385,7 +396,7 @@ def test_incompatible_types(root_dir): @pytest.mark.parametrize("npartitions", [1, 2, 3]) @pytest.mark.parametrize("compatibility_mode", [True, False]) -@gen_test(timeout=20) +@gen_test(timeout=30) async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartitions): cudf = pytest.importorskip("cudf") @@ -394,13 +405,16 @@ def is_proxy_object(x): with dask.config.set(jit_unspill_compatibility_mode=compatibility_mode): async with dask_cuda.LocalCUDACluster( - n_workers=1, jit_unspill=True, asynchronous=True + n_workers=1, + jit_unspill=True, + worker_class=IncreasedCloseTimeoutNanny, + asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: ddf = dask.dataframe.from_pandas( cudf.DataFrame({"key": np.arange(10)}), npartitions=npartitions ) - res = ddf.shuffle(on="key", shuffle="tasks").persist() + res = ddf.shuffle(on="key", shuffle_method="tasks").persist() # With compatibility mode on, we shouldn't encounter any proxy objects if compatibility_mode: @@ -429,9 +443,9 @@ async def test_worker_force_spill_to_disk(): ddf = dask.dataframe.from_pandas(df, npartitions=1).persist() await ddf - async def f(): + async def f(dask_worker): """Trigger a memory_monitor() and reset memory_limit""" - w = get_worker() + w = dask_worker # Set a host memory limit that triggers spilling to disk w.memory_manager.memory_pause_fraction = False memory = w.monitor.proc.memory_info().rss @@ -443,7 +457,7 @@ async def f(): assert w.monitor.proc.memory_info().rss < memory - 10**7 w.memory_manager.memory_limit = memory * 10 # Un-limit - await client.submit(f) + client.run(f) log = str(await client.get_worker_logs()) # Check that the worker doesn't complain about unmanaged memory assert "Unmanaged memory use is high" not in log diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index 1a4abafe9..31a9e9962 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ -23,6 +23,7 @@ from dask_cuda.disk_io import SpillToDiskFile from dask_cuda.proxify_device_objects import proxify_device_objects from dask_cuda.proxify_host_file import ProxifyHostFile +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny # Make the "disk" serializer available and use a directory that are # remove on exit. 
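Several hunks above replace `get_worker()` lambdas with functions taking a `dask_worker` argument; `Client.run` injects the local worker object into a parameter of that name, which is what the rewritten assertions rely on. A small sketch of the pattern (the function name here is illustrative):

```python
from dask.distributed import Client

from dask_cuda import LocalCUDACluster


def data_store_type(dask_worker):
    # Client.run passes the local Worker instance via the `dask_worker` kwarg.
    return type(dask_worker.data).__name__


if __name__ == "__main__":
    with LocalCUDACluster(n_workers=1) as cluster:
        with Client(cluster) as client:
            # Typically {"tcp://...": "DeviceHostFile"} with default settings.
            print(client.run(data_store_type))
```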
@@ -305,6 +306,7 @@ def task(x): n_workers=1, device_memory_limit="1B", jit_unspill=jit_unspill, + worker_class=IncreasedCloseTimeoutNanny, asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: @@ -399,10 +401,14 @@ def _pxy_deserialize(self): @pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)]) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) -@gen_test(timeout=20) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) +@gen_test(timeout=120) async def test_communicating_proxy_objects(protocol, send_serializers): """Testing serialization of cuDF dataframe when communicating""" + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") cudf = pytest.importorskip("cudf") def task(x): @@ -411,7 +417,7 @@ def task(x): serializers_used = x._pxy_get().serializer # Check that `x` is serialized with the expected serializers - if protocol == "ucx": + if protocol in ["ucx", "ucxx"]: if send_serializers is None: assert serializers_used == "cuda" else: @@ -422,7 +428,7 @@ def task(x): async with dask_cuda.LocalCUDACluster( n_workers=1, protocol=protocol, - enable_tcp_over_ucx=protocol == "ucx", + worker_class=IncreasedCloseTimeoutNanny, asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: @@ -442,11 +448,15 @@ def task(x): await client.submit(task, df) -@pytest.mark.parametrize("protocol", ["tcp", "ucx"]) +@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"]) @pytest.mark.parametrize("shared_fs", [True, False]) @gen_test(timeout=20) async def test_communicating_disk_objects(protocol, shared_fs): """Testing disk serialization of cuDF dataframe when communicating""" + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") cudf = pytest.importorskip("cudf") ProxifyHostFile._spill_to_disk.shared_filesystem = shared_fs @@ -462,7 +472,6 @@ def task(x): async with dask_cuda.LocalCUDACluster( n_workers=1, protocol=protocol, - enable_tcp_over_ucx=protocol == "ucx", asynchronous=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: @@ -528,10 +537,10 @@ def test_from_cudf_of_proxy_object(): assert has_parallel_type(df) ddf = dask_cudf.from_cudf(df, npartitions=1) - assert has_parallel_type(ddf) + assert has_parallel_type(ddf._meta) # Notice, the output is a dask-cudf dataframe and not a proxy object - assert type(ddf) is dask_cudf.core.DataFrame + assert type(ddf._meta) is cudf.DataFrame def test_proxy_object_parquet(tmp_path): diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index f93b83ec7..f8df7e04f 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -1,17 +1,18 @@ +import gc import os from time import sleep import pytest -from zict.file import _safe_key as safe_key import dask from dask import array as da -from distributed import Client, get_worker, wait +from distributed import Client, wait from distributed.metrics import time from distributed.sizeof import sizeof from distributed.utils_test import gen_cluster, gen_test, loop # noqa: F401 from dask_cuda import LocalCUDACluster, utils +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny if utils.get_device_total_memory() < 1e10: pytest.skip("Not enough GPU memory", allow_module_level=True) @@ -31,7 +32,8 @@ def device_host_file_size_matches( # `dhf.disk` is only available when Worker's `memory_limit != 0` if dhf.disk is not None: file_path = [ - 
os.path.join(dhf.disk.directory, safe_key(k)) for k in dhf.disk.keys() + os.path.join(dhf.disk.directory, fname) + for fname in dhf.disk.filenames.values() ] file_size = [os.path.getsize(f) for f in file_path] byte_sum += sum(file_size) @@ -57,27 +59,49 @@ def assert_device_host_file_size( ) -def worker_assert(total_size, device_chunk_overhead, serialized_chunk_overhead): +def worker_assert( + total_size, + device_chunk_overhead, + serialized_chunk_overhead, + dask_worker=None, +): assert_device_host_file_size( - get_worker().data, total_size, device_chunk_overhead, serialized_chunk_overhead + dask_worker.data, total_size, device_chunk_overhead, serialized_chunk_overhead ) -def delayed_worker_assert(total_size, device_chunk_overhead, serialized_chunk_overhead): +def delayed_worker_assert( + total_size, + device_chunk_overhead, + serialized_chunk_overhead, + dask_worker=None, +): start = time() while not device_host_file_size_matches( - get_worker().data, total_size, device_chunk_overhead, serialized_chunk_overhead + dask_worker.data, total_size, device_chunk_overhead, serialized_chunk_overhead ): sleep(0.01) if time() < start + 3: assert_device_host_file_size( - get_worker().data, + dask_worker.data, total_size, device_chunk_overhead, serialized_chunk_overhead, ) +def assert_host_chunks(spills_to_disk, dask_worker=None): + if spills_to_disk is False: + assert len(dask_worker.data.host) + + +def assert_disk_chunks(spills_to_disk, dask_worker=None): + if spills_to_disk is True: + assert len(dask_worker.data.disk or list()) > 0 + else: + assert len(dask_worker.data.disk or list()) == 0 + + @pytest.mark.parametrize( "params", [ @@ -99,11 +123,12 @@ def delayed_worker_assert(total_size, device_chunk_overhead, serialized_chunk_ov }, { # This test setup differs from the one above as Distributed worker - # pausing is enabled and thus triggers `DeviceHostFile.evict()` + # spilling fraction is very low and thus forcefully triggers + # `DeviceHostFile.evict()` "device_memory_limit": int(200e6), "memory_limit": int(200e6), - "host_target": None, - "host_spill": None, + "host_target": False, + "host_spill": 0.01, "host_pause": False, "spills_to_disk": True, }, @@ -120,7 +145,14 @@ def delayed_worker_assert(total_size, device_chunk_overhead, serialized_chunk_ov @gen_test(timeout=30) async def test_cupy_cluster_device_spill(params): cupy = pytest.importorskip("cupy") - with dask.config.set({"distributed.worker.memory.terminate": False}): + with dask.config.set( + { + "distributed.worker.memory.terminate": False, + "distributed.worker.memory.pause": params["host_pause"], + "distributed.worker.memory.spill": params["host_spill"], + "distributed.worker.memory.target": params["host_target"], + } + ): async with LocalCUDACluster( n_workers=1, scheduler_port=0, @@ -129,12 +161,12 @@ async def test_cupy_cluster_device_spill(params): asynchronous=True, device_memory_limit=params["device_memory_limit"], memory_limit=params["memory_limit"], - memory_target_fraction=params["host_target"], - memory_spill_fraction=params["host_spill"], - memory_pause_fraction=params["host_pause"], + worker_class=IncreasedCloseTimeoutNanny, ) as cluster: async with Client(cluster, asynchronous=True) as client: + await client.wait_for_workers(1) + rs = da.random.RandomState(RandomState=cupy.random.RandomState) x = rs.random(int(50e6), chunks=2e6) await wait(x) @@ -143,24 +175,32 @@ async def test_cupy_cluster_device_spill(params): await wait(xx) # Allow up to 1024 bytes overhead per chunk serialized - await client.run(worker_assert, 
x.nbytes, 1024, 1024) + await client.run( + worker_assert, + x.nbytes, + 1024, + 1024, + ) y = client.compute(x.sum()) res = await y assert (abs(res / x.size) - 0.5) < 1e-3 - await client.run(worker_assert, x.nbytes, 1024, 1024) - host_chunks = await client.run(lambda: len(get_worker().data.host)) - disk_chunks = await client.run( - lambda: len(get_worker().data.disk or list()) + await client.run( + worker_assert, + x.nbytes, + 1024, + 1024, + ) + await client.run( + assert_host_chunks, + params["spills_to_disk"], + ) + await client.run( + assert_disk_chunks, + params["spills_to_disk"], ) - for hc, dc in zip(host_chunks.values(), disk_chunks.values()): - if params["spills_to_disk"]: - assert dc > 0 - else: - assert hc > 0 - assert dc == 0 @pytest.mark.parametrize( @@ -184,11 +224,12 @@ async def test_cupy_cluster_device_spill(params): }, { # This test setup differs from the one above as Distributed worker - # pausing is enabled and thus triggers `DeviceHostFile.evict()` + # spilling fraction is very low and thus forcefully triggers + # `DeviceHostFile.evict()` "device_memory_limit": int(50e6), "memory_limit": int(50e6), - "host_target": None, - "host_spill": None, + "host_target": False, + "host_spill": 0.01, "host_pause": False, "spills_to_disk": True, }, @@ -210,19 +251,26 @@ async def test_cudf_cluster_device_spill(params): { "distributed.comm.compression": False, "distributed.worker.memory.terminate": False, + "distributed.worker.memory.spill-compression": False, + "distributed.worker.memory.pause": params["host_pause"], + "distributed.worker.memory.spill": params["host_spill"], + "distributed.worker.memory.target": params["host_target"], } ): async with LocalCUDACluster( n_workers=1, + scheduler_port=0, + silence_logs=False, + dashboard_address=None, + asynchronous=True, device_memory_limit=params["device_memory_limit"], memory_limit=params["memory_limit"], - memory_target_fraction=params["host_target"], - memory_spill_fraction=params["host_spill"], - memory_pause_fraction=params["host_pause"], - asynchronous=True, + worker_class=IncreasedCloseTimeoutNanny, ) as cluster: async with Client(cluster, asynchronous=True) as client: + await client.wait_for_workers(1) + # There's a known issue with datetime64: # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940 # The same error above happens when spilling datetime64 to disk @@ -244,20 +292,35 @@ async def test_cudf_cluster_device_spill(params): await wait(cdf2) del cdf + gc.collect() - host_chunks = await client.run(lambda: len(get_worker().data.host)) - disk_chunks = await client.run( - lambda: len(get_worker().data.disk or list()) + await client.run( + assert_host_chunks, + params["spills_to_disk"], + ) + await client.run( + assert_disk_chunks, + params["spills_to_disk"], ) - for hc, dc in zip(host_chunks.values(), disk_chunks.values()): - if params["spills_to_disk"]: - assert dc > 0 - else: - assert hc > 0 - assert dc == 0 - await client.run(worker_assert, nbytes, 32, 2048) + await client.run( + worker_assert, + nbytes, + 32, + 2048, + ) del cdf2 - await client.run(delayed_worker_assert, 0, 0, 0) + while True: + try: + await client.run( + delayed_worker_assert, + 0, + 0, + 0, + ) + except AssertionError: + gc.collect() + else: + break diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 34e63f1b4..a0a77677d 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -79,11 +79,18 @@ def test_get_device_total_memory(): assert total_mem > 0 -def 
test_get_preload_options_default(): - pytest.importorskip("ucp") +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) +def test_get_preload_options_default(protocol): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") opts = get_preload_options( - protocol="ucx", + protocol=protocol, create_cuda_context=True, ) @@ -93,14 +100,21 @@ def test_get_preload_options_default(): assert opts["preload_argv"] == ["--create-cuda-context"] +@pytest.mark.parametrize( + "protocol", + ["ucx", "ucxx"], +) @pytest.mark.parametrize("enable_tcp", [True, False]) @pytest.mark.parametrize("enable_infiniband", [True, False]) @pytest.mark.parametrize("enable_nvlink", [True, False]) -def test_get_preload_options(enable_tcp, enable_infiniband, enable_nvlink): - pytest.importorskip("ucp") +def test_get_preload_options(protocol, enable_tcp, enable_infiniband, enable_nvlink): + if protocol == "ucx": + pytest.importorskip("ucp") + elif protocol == "ucxx": + pytest.importorskip("ucxx") opts = get_preload_options( - protocol="ucx", + protocol=protocol, create_cuda_context=True, enable_tcp_over_ucx=enable_tcp, enable_infiniband=enable_infiniband, diff --git a/dask_cuda/tests/test_version.py b/dask_cuda/tests/test_version.py new file mode 100644 index 000000000..f30b2847d --- /dev/null +++ b/dask_cuda/tests/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import dask_cuda + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(dask_cuda.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(dask_cuda.__version__, str) + assert len(dask_cuda.__version__) > 0 diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 5e558fbc5..ff4dbbae3 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -1,7 +1,7 @@ -import importlib import math import operator import os +import pickle import time import warnings from contextlib import suppress @@ -17,7 +17,7 @@ import distributed # noqa: required for dask.config.get("distributed.comm.ucx") from dask.config import canonical_name from dask.utils import format_bytes, parse_bytes -from distributed import Worker, wait +from distributed import wait from distributed.comm import parse_address try: @@ -31,93 +31,6 @@ def nvtx_annotate(message=None, color="blue", domain=None): yield -class CPUAffinity: - def __init__(self, cores): - self.cores = cores - - def setup(self, worker=None): - os.sched_setaffinity(0, self.cores) - - -class RMMSetup: - def __init__( - self, - initial_pool_size, - maximum_pool_size, - managed_memory, - async_alloc, - log_directory, - track_allocations, - ): - if initial_pool_size is None and maximum_pool_size is not None: - raise ValueError( - "`rmm_maximum_pool_size` was specified without specifying " - "`rmm_pool_size`.`rmm_pool_size` must be specified to use RMM pool." 
- ) - - self.initial_pool_size = initial_pool_size - self.maximum_pool_size = maximum_pool_size - self.managed_memory = managed_memory - self.async_alloc = async_alloc - self.logging = log_directory is not None - self.log_directory = log_directory - self.rmm_track_allocations = track_allocations - - def setup(self, worker=None): - if self.async_alloc: - import rmm - - rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) - if self.logging: - rmm.enable_logging( - log_file_name=get_rmm_log_file_name( - worker, self.logging, self.log_directory - ) - ) - elif self.initial_pool_size is not None or self.managed_memory: - import rmm - - pool_allocator = False if self.initial_pool_size is None else True - - if self.initial_pool_size is not None: - self.initial_pool_size = parse_device_memory_limit( - self.initial_pool_size, alignment_size=256 - ) - if self.maximum_pool_size is not None: - self.maximum_pool_size = parse_device_memory_limit( - self.maximum_pool_size, alignment_size=256 - ) - - rmm.reinitialize( - pool_allocator=pool_allocator, - managed_memory=self.managed_memory, - initial_pool_size=self.initial_pool_size, - maximum_pool_size=self.maximum_pool_size, - logging=self.logging, - log_file_name=get_rmm_log_file_name( - worker, self.logging, self.log_directory - ), - ) - if self.rmm_track_allocations: - import rmm - - mr = rmm.mr.get_current_device_resource() - rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr)) - - -class PreImport: - def __init__(self, libraries): - if libraries is None: - libraries = [] - elif isinstance(libraries, str): - libraries = libraries.split(",") - self.libraries = libraries - - def setup(self, worker=None): - for l in self.libraries: - importlib.import_module(l) - - def unpack_bitmask(x, mask_bits=64): """Unpack a list of integers containing bitmasks. @@ -374,7 +287,7 @@ def get_preload_options( if create_cuda_context: preload_options["preload_argv"].append("--create-cuda-context") - if protocol == "ucx": + if protocol in ["ucx", "ucxx"]: initialize_ucx_argv = [] if enable_tcp_over_ucx: initialize_ucx_argv.append("--enable-tcp-over-ucx") @@ -422,7 +335,9 @@ def wait_workers( client: distributed.Client Instance of client, used to query for number of workers connected. min_timeout: float - Minimum number of seconds to wait before timeout. + Minimum number of seconds to wait before timeout. This value may be + overridden by setting the `DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT` with + a positive integer. seconds_per_gpu: float Seconds to wait for each GPU on the system. For example, if its value is 2 and there is a total of 8 GPUs (workers) being started, @@ -439,6 +354,8 @@ def wait_workers( ------- True if all workers were started, False if a timeout occurs. """ + min_timeout_env = os.environ.get("DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT", None) + min_timeout = min_timeout if min_timeout_env is None else int(min_timeout_env) n_gpus = n_gpus or get_n_gpus() timeout = max(min_timeout, seconds_per_gpu * n_gpus) @@ -635,27 +552,6 @@ def _align(size, alignment_size): return _align(int(device_memory_limit), alignment_size) -class MockWorker(Worker): - """Mock Worker class preventing NVML from getting used by SystemMonitor. - - By preventing the Worker from initializing NVML in the SystemMonitor, we can - mock test multiple devices in `CUDA_VISIBLE_DEVICES` behavior with single-GPU - machines. 
- """ - - def __init__(self, *args, **kwargs): - distributed.diagnostics.nvml.device_get_count = MockWorker.device_get_count - self._device_get_count = distributed.diagnostics.nvml.device_get_count - super().__init__(*args, **kwargs) - - def __del__(self): - distributed.diagnostics.nvml.device_get_count = self._device_get_count - - @staticmethod - def device_get_count(): - return 0 - - def get_gpu_uuid_from_index(device_index=0): """Get GPU UUID from CUDA device index. @@ -688,14 +584,20 @@ def get_worker_config(dask_worker): # assume homogeneous cluster plugin_vals = dask_worker.plugins.values() ret = {} - # device and host memory configuration for p in plugin_vals: - ret[f"[plugin] {type(p).__name__}"] = { + config = { v: getattr(p, v) for v in dir(p) if not (v.startswith("_") or v in {"setup", "cores"}) } + # To send this back to the client the data will be serialised + # which might fail, so pre-emptively check + try: + pickle.dumps(config) + except TypeError: + config = "UNKNOWN CONFIG" + ret[f"[plugin] {type(p).__name__}"] = config for mem in [ "memory_limit", @@ -723,6 +625,10 @@ def get_worker_config(dask_worker): import ucp ret["ucx-transports"] = ucp.get_active_transports() + elif scheme == "ucxx": + import ucxx + + ret["ucx-transports"] = ucxx.get_active_transports() # comm timeouts ret["distributed.comm.timeouts"] = dask.config.get("distributed.comm.timeouts") diff --git a/dask_cuda/utils_test.py b/dask_cuda/utils_test.py new file mode 100644 index 000000000..aba77ee79 --- /dev/null +++ b/dask_cuda/utils_test.py @@ -0,0 +1,45 @@ +from typing import Literal + +import distributed +from distributed import Nanny, Worker + + +class MockWorker(Worker): + """Mock Worker class preventing NVML from getting used by SystemMonitor. + + By preventing the Worker from initializing NVML in the SystemMonitor, we can + mock test multiple devices in `CUDA_VISIBLE_DEVICES` behavior with single-GPU + machines. + """ + + def __init__(self, *args, **kwargs): + distributed.diagnostics.nvml.device_get_count = MockWorker.device_get_count + self._device_get_count = distributed.diagnostics.nvml.device_get_count + super().__init__(*args, **kwargs) + + def __del__(self): + distributed.diagnostics.nvml.device_get_count = self._device_get_count + + @staticmethod + def device_get_count(): + return 0 + + +class IncreasedCloseTimeoutNanny(Nanny): + """Increase `Nanny`'s close timeout. + + The internal close timeout mechanism of `Nanny` recomputes the time left to kill + the `Worker` process based on elapsed time of the close task, which may leave + very little time for the subprocess to shutdown cleanly, which may cause tests + to fail when the system is under higher load. This class increases the default + close timeout of 5.0 seconds that `Nanny` sets by default, which can be overriden + via Distributed's public API. + + This class can be used with the `worker_class` argument of `LocalCluster` or + `LocalCUDACluster` to provide a much higher default of 30.0 seconds. 
+ """ + + async def close( # type:ignore[override] + self, timeout: float = 30.0, reason: str = "nanny-close" + ) -> Literal["OK"]: + return await super().close(timeout=timeout, reason=reason) diff --git a/dask_cuda/worker_spec.py b/dask_cuda/worker_spec.py index 6a61fa8f8..84ce51725 100644 --- a/dask_cuda/worker_spec.py +++ b/dask_cuda/worker_spec.py @@ -5,7 +5,8 @@ from .initialize import initialize from .local_cuda_cluster import cuda_visible_devices -from .utils import CPUAffinity, get_cpu_affinity, get_gpu_count +from .plugins import CPUAffinity +from .utils import get_cpu_affinity, get_gpu_count def worker_spec( diff --git a/dependencies.yaml b/dependencies.yaml index bb99ea21a..a9183cc2f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -1,10 +1,14 @@ # Dependency list for https://github.com/rapidsai/dependency-file-generator files: all: - output: none + output: conda + matrix: + cuda: ["11.4", "11.8", "12.2"] + arch: [x86_64] includes: - build_python - - cudatoolkit + - cuda + - cuda_version - develop - docs - py_version @@ -13,7 +17,8 @@ files: test_python: output: none includes: - - cudatoolkit + - cuda + - cuda_version - py_version - test_python checks: @@ -24,9 +29,40 @@ files: docs: output: none includes: - - cudatoolkit + - cuda + - cuda_version - docs - py_version + py_build: + output: pyproject + pyproject_dir: . + extras: + table: build-system + includes: + - build_python + py_run: + output: pyproject + pyproject_dir: . + extras: + table: project + includes: + - run_python + py_test: + output: pyproject + pyproject_dir: . + extras: + table: project.optional-dependencies + key: test + includes: + - test_python + py_docs: + output: pyproject + pyproject_dir: . + extras: + table: project.optional-dependencies + key: docs + includes: + - docs channels: - rapidsai - rapidsai-nightly @@ -36,29 +72,47 @@ channels: dependencies: build_python: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] packages: + - rapids-build-backend>=0.3.0,<0.4.0dev0 - setuptools>=64.0.0 - cudatoolkit: + cuda_version: specific: - output_types: conda matrices: - - matrix: - cuda: "11.2" - packages: - - cudatoolkit=11.2 - matrix: cuda: "11.4" packages: - - cudatoolkit=11.4 + - cuda-version=11.4 - matrix: cuda: "11.5" packages: - - cudatoolkit=11.5 + - cuda-version=11.5 - matrix: cuda: "11.8" packages: - - cudatoolkit=11.8 + - cuda-version=11.8 + - matrix: + cuda: "12.0" + packages: + - cuda-version=12.0 + - matrix: + cuda: "12.2" + packages: + - cuda-version=12.2 + cuda: + specific: + - output_types: conda + matrices: + - matrix: + cuda: "11.*" + packages: + - cudatoolkit + - matrix: + cuda: "12.*" + packages: + - cuda-nvcc-impl + - cuda-nvrtc develop: common: - output_types: [conda, requirements] @@ -66,20 +120,16 @@ dependencies: - pre-commit docs: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] packages: - - numpydoc + - numpydoc>=1.1.0 - sphinx - - sphinx-click - - sphinx_rtd_theme + - sphinx-click>=2.7.1 + - sphinx-rtd-theme>=0.5.1 py_version: specific: - output_types: conda matrices: - - matrix: - py: "3.8" - packages: - - python=3.8 - matrix: py: "3.9" packages: @@ -88,31 +138,39 @@ dependencies: py: "3.10" packages: - python=3.10 + - matrix: + py: "3.11" + packages: + - python=3.11 - matrix: packages: - - python>=3.8,<3.11 + - python>=3.9,<3.12 run_python: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] packages: - - dask>=2023.1.1 - - 
distributed>=2023.1.1 - - numba>=0.54 - - numpy>=1.18.0 - - pandas>=1.0 + - click >=8.1 + - numba>=0.57 + - numpy>=1.23,<2.0a0 + - pandas>=1.3 - pynvml>=11.0.0 - - zict>=0.1.3 + - rapids-dask-dependency==24.8.*,>=0.0.0a0 + - zict>=2.0.0 test_python: common: - - output_types: [conda] + - output_types: [conda, requirements, pyproject] packages: - - cucim=23.04 - - cudf=23.04 - - dask-cudf=23.04 - pytest - pytest-cov + - output_types: [conda] + packages: + - &cudf_conda cudf==24.8.*,>=0.0.0a0 + - &dask_cudf_conda dask-cudf==24.8.*,>=0.0.0a0 + - distributed-ucxx==0.39.*,>=0.0.0a0 + - &kvikio_conda kvikio==24.8.*,>=0.0.0a0 + - &ucx_py_conda ucx-py==0.39.*,>=0.0.0a0 - ucx-proc=*=gpu - - ucx-py=0.31 + - ucxx==0.39.*,>=0.0.0a0 specific: - output_types: conda matrices: @@ -124,3 +182,23 @@ dependencies: arch: aarch64 packages: - numactl-devel-cos7-aarch64 + - output_types: [requirements, pyproject] + matrices: + # kvikio should be added to the CUDA-version-specific matrices once there are wheels available + # ref: https://github.com/rapidsai/kvikio/pull/369 + - matrix: {cuda: "12.*"} + packages: + - cudf-cu12==24.8.*,>=0.0.0a0 + - dask-cudf-cu12==24.8.*,>=0.0.0a0 + - ucx-py-cu12==0.39.*,>=0.0.0a0 + - matrix: {cuda: "11.*"} + packages: + - cudf-cu11==24.8.*,>=0.0.0a0 + - dask-cudf-cu11==24.8.*,>=0.0.0a0 + - ucx-py-cu11==0.39.*,>=0.0.0a0 + - matrix: + packages: + - *cudf_conda + - *dask_cudf_conda + - *kvikio_conda + - *ucx_py_conda diff --git a/docs/source/api.rst b/docs/source/api.rst index b9d9d6dfa..1594594cc 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -33,3 +33,6 @@ Explicit-comms .. currentmodule:: dask_cuda.explicit_comms.comms .. autoclass:: CommsContext :members: + +.. currentmodule:: dask_cuda.explicit_comms.dataframe.shuffle +.. autofunction:: shuffle diff --git a/docs/source/examples/best-practices.rst b/docs/source/examples/best-practices.rst index 84cc78b88..2de3809c8 100644 --- a/docs/source/examples/best-practices.rst +++ b/docs/source/examples/best-practices.rst @@ -9,9 +9,7 @@ When choosing between two multi-GPU setups, it is best to pick the one where mos `DGX `_, a cloud instance with `multi-gpu options `_ , a high-density GPU HPC instance, etc. This is done for two reasons: - Moving data between GPUs is costly and performance decreases when computation stops due to communication overheads, Host-to-Device/Device-to-Host transfers, etc -- Multi-GPU instances often come with accelerated networking like `NVLink `_. These accelerated -networking paths usually have much higher throughput/bandwidth compared with traditional networking *and* don't force and Host-to-Device/Device-to-Host transfers. See -`Accelerated Networking`_ for more discussion +- Multi-GPU instances often come with accelerated networking like `NVLink `_. These accelerated networking paths usually have much higher throughput/bandwidth compared with traditional networking *and* don't force and Host-to-Device/Device-to-Host transfers. See `Accelerated Networking`_ for more discussion. .. code-block:: python diff --git a/docs/source/examples/ucx.rst b/docs/source/examples/ucx.rst index 6230caf67..7a0651173 100644 --- a/docs/source/examples/ucx.rst +++ b/docs/source/examples/ucx.rst @@ -2,7 +2,7 @@ Enabling UCX communication ========================== A CUDA cluster using UCX communication can be started automatically with LocalCUDACluster or manually with the ``dask cuda worker`` CLI tool. 
-In either case, a ``dask.distributed.Client`` must be made for the worker cluster using the same Dask UCX configuration; see `UCX Integration -- Configuration <../ucx.html#configuration>`_ for details on all available options. +In either case, a ``dask.distributed.Client`` must be made for the worker cluster using the same Dask UCX configuration; see `UCX Integration -- Configuration <../../ucx/#configuration>`_ for details on all available options. LocalCUDACluster with Automatic Configuration --------------------------------------------- @@ -29,7 +29,7 @@ To connect a client to a cluster with automatically-configured UCX and an RMM po LocalCUDACluster with Manual Configuration ------------------------------------------ -When using LocalCUDACluster with UCX communication and manual configuration, all required UCX configuration is handled through arguments supplied at construction; see `API -- Cluster <../api.html#cluster>`_ for a complete list of these arguments. +When using LocalCUDACluster with UCX communication and manual configuration, all required UCX configuration is handled through arguments supplied at construction; see `API -- Cluster <../../api/#cluster>`_ for a complete list of these arguments. To connect a client to a cluster with all supported transports and an RMM pool: .. code-block:: python @@ -69,7 +69,7 @@ To start a Dask scheduler using UCX with automatic configuration and one GB of R .. note:: The ``interface="ib0"`` is intentionally specified above to ensure RDMACM is used in systems that support InfiniBand. On systems that don't support InfiniBand or where RDMACM isn't required, the ``interface`` argument may be omitted or specified to listen on a different interface. - We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `_. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. + We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `__. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. Workers ^^^^^^^ @@ -86,7 +86,7 @@ To start workers with automatic UCX configuration and an RMM pool of 14GB per GP .. note:: Analogous to the scheduler setup, the ``interface="ib0"`` is intentionally specified above to ensure RDMACM is used in systems that support InfiniBand. On systems that don't support InfiniBand or where RDMACM isn't required, the ``interface`` argument may be omitted or specified to listen on a different interface. - We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `_. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. + We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `__. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. Client ^^^^^^ @@ -122,7 +122,7 @@ Alternatively, the ``with dask.config.set`` statement from the example above may We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `_. 
If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. ``dask cuda worker`` with Manual Configuration ------------------------------------------- +---------------------------------------------- When using ``dask cuda worker`` with UCX communication and manual configuration, the scheduler, workers, and client must all be started manually, each using the same UCX configuration. @@ -148,7 +148,7 @@ We communicate to the scheduler that we will be using UCX with the ``--protocol` Workers ^^^^^^^ -All UCX configuration options have analogous options in ``dask cuda worker``; see `API -- Worker <../api.html#worker>`_ for a complete list of these options. +All UCX configuration options have analogous options in ``dask cuda worker``; see `API -- Worker <../../api/#worker>`_ for a complete list of these options. To start a cluster with all supported transports and an RMM pool: .. code-block:: bash @@ -163,7 +163,7 @@ To start a cluster with all supported transports and an RMM pool: Client ^^^^^^ -A client can be configured to use UCX by using ``dask_cuda.initialize``, a utility which takes the same UCX configuring arguments as LocalCUDACluster and adds them to the current Dask configuration used when creating it; see `API -- Client initialization <../api.html#client-initialization>`_ for a complete list of arguments. +A client can be configured to use UCX by using ``dask_cuda.initialize``, a utility which takes the same UCX configuring arguments as LocalCUDACluster and adds them to the current Dask configuration used when creating it; see `API -- Client initialization <../../api/#client-initialization>`_ for a complete list of arguments. To connect a client to the cluster we have made: .. code-block:: python diff --git a/docs/source/explicit_comms.rst b/docs/source/explicit_comms.rst index 56ad97758..9fde8756a 100644 --- a/docs/source/explicit_comms.rst +++ b/docs/source/explicit_comms.rst @@ -5,7 +5,7 @@ Communication and scheduling overhead can be a major bottleneck in Dask/Distribu The idea is that Dask/Distributed spawns workers and distribute data as usually while the user can submit tasks on the workers that communicate explicitly. This makes it possible to bypass Distributed's scheduler and write hand-tuned computation and communication patterns. Currently, Dask-CUDA includes an explicit-comms -implementation of the Dataframe `shuffle `_ operation used for merging and sorting. +implementation of the Dataframe `shuffle <../api/#dask_cuda.explicit_comms.dataframe.shuffle.shuffle>`_ operation used for merging and sorting. Usage @@ -14,4 +14,4 @@ Usage In order to use explicit-comms in Dask/Distributed automatically, simply define the environment variable ``DASK_EXPLICIT_COMMS=True`` or setting the ``"explicit-comms"`` key in the `Dask configuration `_. -It is also possible to use explicit-comms in tasks manually, see the `API `_ and our `implementation of shuffle `_ for guidance. +It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle `_ for guidance. 
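For illustration, a minimal sketch of the explicit-comms opt-in described above, assuming a GPU-capable environment; ``df_left`` and ``df_right`` are hypothetical placeholder dataframes and are not part of this changeset:

.. code-block:: python

    # Sketch only: enable explicit-comms via the "explicit-comms" Dask
    # configuration key (the DASK_EXPLICIT_COMMS=True environment variable
    # is the equivalent opt-in).
    import dask
    from dask_cuda import LocalCUDACluster
    from distributed import Client

    if __name__ == "__main__":
        with dask.config.set({"explicit-comms": True}):
            cluster = LocalCUDACluster()
            client = Client(cluster)
            # Merge/sort operations on Dask-cuDF dataframes now go through the
            # explicit-comms shuffle implementation, e.g.:
            # result = df_left.merge(df_right, on="key")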
diff --git a/docs/source/index.rst b/docs/source/index.rst index 37ba12139..0d415cb0d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,7 +11,7 @@ While Distributed can be used to leverage GPU workloads through libraries such a - **Automatic instantiation of per-GPU workers** -- Using Dask-CUDA's LocalCUDACluster or ``dask cuda worker`` CLI will automatically launch one worker for each GPU available on the executing node, avoiding the need to explicitly select GPUs. - **Automatic setting of CPU affinity** -- The setting of CPU affinity for each GPU is done automatically, preventing memory transfers from taking suboptimal paths. -- **Automatic selection of InfiniBand devices** -- When UCX communication is enabled over InfiniBand, Dask-CUDA automatically selects the optimal InfiniBand device for each GPU (see `UCX Integration `_ for instructions on configuring UCX communication). +- **Automatic selection of InfiniBand devices** -- When UCX communication is enabled over InfiniBand, Dask-CUDA automatically selects the optimal InfiniBand device for each GPU (see `UCX Integration `_ for instructions on configuring UCX communication). - **Memory spilling from GPU** -- For memory-intensive workloads, Dask-CUDA supports spilling from GPU to host memory when a GPU reaches the default or user-specified memory utilization limit. - **Allocation of GPU memory** -- when using UCX communication, per-GPU memory pools can be allocated using `RAPIDS Memory Manager `_ to circumvent the costly memory buffer mappings that would be required otherwise. diff --git a/docs/source/install.rst b/docs/source/install.rst index b8442b4ff..e522ae3c1 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -12,11 +12,11 @@ To use Dask-CUDA on your system, you will need: - A version of NVIDIA CUDA Toolkit compatible with the installed driver version; see Table 1 of `CUDA Compatibility -- Binary Compatibility `_ for an overview of CUDA Toolkit driver requirements Once the proper CUDA Toolkit version has been determined, it can be installed using along with Dask-CUDA using ``conda``. -To install the latest version of Dask-CUDA along with CUDA Toolkit 11.5: +To install the latest version of Dask-CUDA along with CUDA Toolkit 12.0: .. code-block:: bash - conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cudatoolkit=11.5 + conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cuda-version=12.0 Pip --- diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index c5592b439..c42bd4837 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -16,6 +16,10 @@ To create a Dask-CUDA cluster using all available GPUs and connect a Dask.distri cluster = LocalCUDACluster() client = Client(cluster) +.. tip:: + + Be sure to include an ``if __name__ == "__main__":`` block when using :py:class:`dask_cuda.LocalCUDACluster` in a standalone Python script. See `standalone Python scripts `_ for more details. + ``dask cuda worker`` -------------------- diff --git a/docs/source/spilling.rst b/docs/source/spilling.rst index 28f3562b9..a237adf74 100644 --- a/docs/source/spilling.rst +++ b/docs/source/spilling.rst @@ -37,7 +37,7 @@ JIT-Unspill The regular spilling in Dask and Dask-CUDA has some significate issues. Instead of tracking individual objects, it tracks task outputs. This means that a task returning a collection of CUDA objects will either spill all of the CUDA objects or none of them. 
Other issues includes *object duplication*, *wrong spilling order*, and *non-tracking of sharing device buffers* -(see: https://github.com/dask/distributed/issues/4568#issuecomment-805049321). +(`see discussion `_). In order to address all of these issues, Dask-CUDA introduces JIT-Unspilling, which can improve performance and memory usage significantly. For workloads that require significant spilling diff --git a/docs/source/ucx.rst b/docs/source/ucx.rst index d9cacdc77..cf798e5dc 100644 --- a/docs/source/ucx.rst +++ b/docs/source/ucx.rst @@ -37,7 +37,7 @@ Automatic Beginning with Dask-CUDA 22.02 and assuming UCX >= 1.11.1, specifying UCX transports is now optional. -A local cluster can now be started with ``LocalCUDACluster(protocol="ucx")``, implying automatic UCX transport selection (``UCX_TLS=all``). Starting a cluster separately -- scheduler, workers and client as different processes -- is also possible, as long as Dask scheduler is created with ``dask scheduler --protocol="ucx"`` and connecting a ``dask cuda worker`` to the scheduler will imply automatic UCX transport selection, but that requires the Dask scheduler and client to be started with ``DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True``. See `Enabling UCX communication `_ for more details examples of UCX usage with automatic configuration. +A local cluster can now be started with ``LocalCUDACluster(protocol="ucx")``, implying automatic UCX transport selection (``UCX_TLS=all``). Starting a cluster separately -- scheduler, workers and client as different processes -- is also possible, as long as Dask scheduler is created with ``dask scheduler --protocol="ucx"`` and connecting a ``dask cuda worker`` to the scheduler will imply automatic UCX transport selection, but that requires the Dask scheduler and client to be started with ``DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True``. See `Enabling UCX communication <../examples/ucx/>`_ for more details examples of UCX usage with automatic configuration. Configuring transports manually is still possible, please refer to the subsection below. @@ -79,12 +79,12 @@ However, some will affect related libraries, such as RMM: .. note:: These options can be used with mainline Dask.distributed. However, some features are exclusive to Dask-CUDA, such as the automatic detection of InfiniBand interfaces. - See `Dask-CUDA -- Motivation `_ for more details on the benefits of using Dask-CUDA. + See `Dask-CUDA -- Motivation <../#motivation>`_ for more details on the benefits of using Dask-CUDA. Usage ----- -See `Enabling UCX communication `_ for examples of UCX usage with different supported transports. +See `Enabling UCX communication <../examples/ucx/>`_ for examples of UCX usage with different supported transports. Running in a fork-starved environment ------------------------------------- @@ -97,7 +97,7 @@ this when using Dask-CUDA's UCX integration, processes launched via multiprocessing should use the start processes using the `"forkserver" `_ -method. When launching workers using `dask cuda worker `_, this can be +method. When launching workers using `dask cuda worker <../quickstart/#dask-cuda-worker>`_, this can be achieved by passing ``--multiprocessing-method forkserver`` as an argument. 
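As a hedged sketch of the same idea from user code (assuming ucx-py is installed), the start method can also be selected through the ``distributed.worker.multiprocessing-method`` configuration key described in the following sentence, before the cluster is created:

.. code-block:: python

    # Sketch only: pick the "forkserver" start method programmatically, then
    # start a UCX-enabled cluster; this mirrors the CLI flag shown above.
    import dask
    from dask_cuda import LocalCUDACluster
    from distributed import Client

    if __name__ == "__main__":
        dask.config.set({"distributed.worker.multiprocessing-method": "forkserver"})
        cluster = LocalCUDACluster(protocol="ucx")
        client = Client(cluster)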
In user code, the method can be controlled with the ``distributed.worker.multiprocessing-method`` configuration key in diff --git a/pyproject.toml b/pyproject.toml index 1d5c59e3b..8daac618d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,41 +1,38 @@ [build-system] -build-backend = "setuptools.build_meta" +build-backend = "rapids_build_backend.build" requires = [ + "rapids-build-backend>=0.3.0,<0.4.0dev0", "setuptools>=64.0.0", - "tomli ; python_version < '3.11'", - "versioneer>=0.24", -] +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. [project] name = "dask-cuda" -dynamic = [ - "version", -] +dynamic = ["version"] description = "Utilities for Dask and CUDA interactions" readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache-2.0" } -requires-python = ">=3.8" +license = { text = "Apache 2.0" } +requires-python = ">=3.9" dependencies = [ - "dask >=2023.1.1", - "distributed >=2023.1.1", - "pynvml >=11.0.0", - "numpy >=1.18.0", - "numba >=0.54", - "pandas >=1.0", - "zict >=0.1.3", -] + "click >=8.1", + "numba>=0.57", + "numpy>=1.23,<2.0a0", + "pandas>=1.3", + "pynvml>=11.0.0", + "rapids-dask-dependency==24.8.*,>=0.0.0a0", + "zict>=2.0.0", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", "Topic :: Database", "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ] [project.scripts] @@ -51,10 +48,15 @@ docs = [ "sphinx", "sphinx-click>=2.7.1", "sphinx-rtd-theme>=0.5.1", -] +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. test = [ + "cudf==24.8.*,>=0.0.0a0", + "dask-cudf==24.8.*,>=0.0.0a0", + "kvikio==24.8.*,>=0.0.0a0", "pytest", -] + "pytest-cov", + "ucx-py==0.39.*,>=0.0.0a0", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/dask-cuda" @@ -114,25 +116,31 @@ skip = [ "build", "dist", "__init__.py", - "versioneer.py", ] [tool.pytest.ini_options] filterwarnings = [ "error::DeprecationWarning", "error::FutureWarning", - "ignore::DeprecationWarning:pkg_resources", - "ignore:distutils Version classes are deprecated.*:DeprecationWarning:", - # tornado 6.2, remove when dask/distributed#6669 is fixed - "ignore:clear_current is deprecated:DeprecationWarning:", - "ignore:make_current is deprecated:DeprecationWarning:", # remove after https://github.com/rapidsai/dask-cuda/issues/1087 is closed "ignore:There is no current event loop:DeprecationWarning:tornado", + # This warning must be filtered until dask-expr support + # is enabled in both dask-cudf and dask-cuda. 
+ # See: https://github.com/rapidsai/dask-cuda/issues/1311 + "ignore:Dask DataFrame implementation is deprecated:DeprecationWarning", ] +[tool.rapids-build-backend] +build-backend = "setuptools.build_meta" +dependencies-file = "dependencies.yaml" +disable-cuda = true + [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "dask_cuda/VERSION"} + [tool.setuptools.packages.find] exclude = [ "docs", @@ -140,11 +148,3 @@ exclude = [ "docs.*", "tests.*", ] - -[tool.versioneer] -VCS = "git" -style = "pep440" -versionfile_source = "dask_cuda/_version.py" -versionfile_build = "dask_cuda/_version.py" -tag_prefix = "v" -parentdir_prefix = "dask_cuda-" diff --git a/rtd/Makefile b/rtd/Makefile deleted file mode 100644 index ba501f6f5..000000000 --- a/rtd/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SOURCEDIR = source -BUILDDIR = build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/rtd/conf.py b/rtd/conf.py deleted file mode 100644 index fe71b19e3..000000000 --- a/rtd/conf.py +++ /dev/null @@ -1,211 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import datetime -import os -import shutil - -# -- Project information ----------------------------------------------------- - -project = "dask-cuda" -copyright = "2020-%s, NVIDIA" % datetime.datetime.now().year -author = "NVIDIA" - -# The full version, including alpha/beta/rc tags. -release = "21.06" - -# The short X.Y version. -version = "21.06" - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -# extensions = [ -# "sphinx.ext.autodoc", -# "sphinx.ext.mathjax", -# "sphinx.ext.viewcode", -# "sphinx.ext.githubpages", -# "sphinx.ext.autosummary", -# "sphinx.ext.intersphinx", -# "sphinx.ext.extlinks", -# "numpydoc", -# "sphinx_click", -# "sphinx_rtd_theme", -# ] - -# numpydoc_show_class_members = False - -# Add any paths that contain templates here, relative to this directory. -# templates_path = ["_templates"] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# The language for content autogenerated by Sphinx. 
Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = None - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -# html_theme = "sphinx_rtd_theme" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ["_static"] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = "dask-cudadoc" - - -# -- Options for LaTeX output ------------------------------------------------ - -# latex_elements = { -# # The paper size ('letterpaper' or 'a4paper'). -# # -# # 'papersize': 'letterpaper', -# # The font size ('10pt', '11pt' or '12pt'). -# # -# # 'pointsize': '10pt', -# # Additional stuff for the LaTeX preamble. -# # -# # 'preamble': '', -# # Latex figure (float) alignment -# # -# # 'figure_align': 'htbp', -# } - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -# latex_documents = [ -# (master_doc, "dask-cuda.tex", "dask-cuda Documentation", "NVIDIA", "manual") -# ] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -# man_pages = [(master_doc, "dask-cuda", "dask-cuda Documentation", [author], 1)] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -# texinfo_documents = [ -# ( -# master_doc, -# "dask-cuda", -# "dask-cuda Documentation", -# author, -# "dask-cuda", -# "One line description of project.", -# "Miscellaneous", -# ) -# ] - - -# -- Options for Epub output ------------------------------------------------- - -# Bibliographic Dublin Core info. -# epub_title = project - -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -# -# epub_identifier = '' - -# A unique identification for the text. 
-# -# epub_uid = '' - -# A list of files that should not be packed into the epub file. -# epub_exclude_files = ["search.html"] - - -# -- Extension configuration ------------------------------------------------- - -# lifted from dask-ml -templates_path = ["templates"] -pages = [ - "index", -] -html_additional_pages = {page: "redirect.html" for page in pages} -html_context = { - "redirects": { - page: f"https://docs.rapids.ai/api/dask-cuda/nightly/{page}" for page in pages - } -} - - -def add_404(app, docname): - if app.builder.format == "html": - pth_index = os.path.join(app.outdir, "index.html") - pth_404 = os.path.join(app.outdir, "404.html") - if os.path.exists(pth_index): - shutil.copyfile(pth_index, pth_404) - - -def setup(app): - app.connect("build-finished", add_404) diff --git a/rtd/index.rst b/rtd/index.rst deleted file mode 100644 index e4d447108..000000000 --- a/rtd/index.rst +++ /dev/null @@ -1 +0,0 @@ -This page has moved! diff --git a/rtd/templates/redirect.html b/rtd/templates/redirect.html deleted file mode 100644 index 6c59fe2d5..000000000 --- a/rtd/templates/redirect.html +++ /dev/null @@ -1,11 +0,0 @@ -{% set redirect = redirects[pagename.split("/")[-1]] %} - - - - dask-cuda docs - - -

- The dask-cuda documentation has moved!
- You will now be redirected to our new page.
- - diff --git a/setup.py b/setup.py deleted file mode 100644 index 3b72644b6..000000000 --- a/setup.py +++ /dev/null @@ -1,31 +0,0 @@ -import os - -import versioneer -from setuptools import setup - -if "GIT_DESCRIBE_TAG" in os.environ: - # Disgusting hack. For pypi uploads we cannot use the - # versioneer-provided version for non-release builds, since they - # strictly follow PEP440 - # https://peps.python.org/pep-0440/#local-version-identifiers - # which disallows local version identifiers (as produced by - # versioneer) in public index servers. - # We still want to use versioneer infrastructure, so patch - # in our pypi-compatible version to the output of - # versioneer.get_versions. - - orig_get_versions = versioneer.get_versions - version = os.environ["GIT_DESCRIBE_TAG"] + os.environ.get("VERSION_SUFFIX", "") - - def get_versions(): - data = orig_get_versions() - data["version"] = version - return data - - versioneer.get_versions = get_versions - - -setup( - version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), -)
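With versioneer and ``setup.py`` removed, the package version now comes from ``dask_cuda/VERSION`` via ``[tool.setuptools.dynamic]`` in ``pyproject.toml``. A minimal sketch of what downstream code can rely on, mirroring the new ``dask_cuda/tests/test_version.py`` added in this changeset:

.. code-block:: python

    # Sketch only: the version constants exercised by test_version.py.
    import dask_cuda

    print(dask_cuda.__version__)     # always a non-empty string
    print(dask_cuda.__git_commit__)  # non-empty only in built distributions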