diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index fb7182f4133..65aebfb7f8c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9d79733703c..e955b8f1f80 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -186,7 +186,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" static-configure: needs: checks @@ -207,7 +207,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -217,7 +217,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 858352f515d..dc82c17022a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit @@ -94,7 +94,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -106,7 +106,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index af49942c8cd..d80e4fef0d0 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -1,11 +1,13 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
set -euo pipefail package_name="libcudf" package_dir="python/libcudf" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + rapids-logger "Generating build requirements" rapids-dependency-file-generator \ @@ -28,8 +30,6 @@ export PIP_NO_BUILD_ISOLATION=0 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - mkdir -p ${package_dir}/final_dist python -m auditwheel repair \ --exclude libnvcomp.so.4 \ diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index db86721755d..3c6dba72164 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Support invoking test_python_cudf.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ @@ -24,8 +24,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -rapids-logger "pytest dask_cudf (dask-expr)" -DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ +rapids-logger "pytest dask_cudf" +./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ @@ -34,13 +34,6 @@ DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ --cov-report=term -rapids-logger "pytest dask_cudf (legacy)" -DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . - rapids-logger "pytest cudf_kafka" ./ci/run_cudf_kafka_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index e15949f4bdb..44f430ce98d 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -eou pipefail @@ -30,21 +30,11 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" # Run tests in dask_cudf/tests and dask_cudf/io/tests -rapids-logger "pytest dask_cudf (dask-expr)" +rapids-logger "pytest dask_cudf" pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ +python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ . popd - -# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy) -rapids-logger "pytest dask_cudf (legacy)" -pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . 
-popd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a4b3f4fe174..a8e5018b283 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -55,7 +55,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13,<0.0.18 +- numba-cuda>=0.2.0,<0.3.0 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.15 +- polars>=1.11,<1.18 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<19.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 7173c955116..6dc99b14f5d 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -54,7 +54,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13,<0.0.18 +- numba-cuda>=0.2.0,<0.3.0 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.1.0.6 @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.15 +- polars>=1.11,<1.18 - pre-commit - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index b6c03dc1bc2..7a0005497df 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.15 + - polars >=1.11,<1.18 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 2c16deeed82..b34496cc256 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -80,7 +80,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.0.13,<0.0.18 + - numba-cuda >=0.2.0,<0.3.0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cb814aa8c0f..9dabe4e8800 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -276,7 +276,7 @@ rapids_cpm_init() include(${rapids-cmake-dir}/cpm/rapids_logger.cmake) rapids_cpm_rapids_logger() -rapids_make_logger(cudf EXPORT_SET cudf-exports) +rapids_make_logger(cudf EXPORT_SET cudf-exports LOGGER_DEFAULT_LEVEL WARN) # find jitify include(cmake/thirdparty/get_jitify.cmake) @@ -461,6 +461,7 @@ add_library( src/hash/sha256_hash.cu src/hash/sha384_hash.cu src/hash/sha512_hash.cu + src/hash/xxhash_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/arrow_utilities.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 749e1b628ee..0ff712c1c77 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -425,6 +425,11 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
 # ---------------------------------------------------------------------------------
 ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
 
+# ##################################################################################################
+# * rolling benchmark
+# ---------------------------------------------------------------------------------
+ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp)
+
 add_custom_target(
   run_benchmarks
   DEPENDS CUDF_BENCHMARKS
diff --git a/cpp/benchmarks/rolling/grouped_rolling_sum.cpp b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp
new file mode 100644
index 00000000000..04afe5ac661
--- /dev/null
+++ b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void bench_row_grouped_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const cardinality    = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+  auto const min_periods    = static_cast<cudf::size_type>(state.get_int64("min_periods"));
+
+  auto const keys = [&] {
+    data_profile const profile =
+      data_profile_builder()
+        .cardinality(cardinality)
+        .no_validity()
+        .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
+    auto keys = create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
+    return cudf::sort(cudf::table_view{{keys->view()}});
+  }();
+  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result = cudf::grouped_rolling_window(
+      keys->view(), vals->view(), preceding_size, following_size, min_periods, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_grouped_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 28})
+  .add_int64_axis("preceding_size", {1, 10})
+  .add_int64_axis("following_size", {2})
+  .add_int64_axis("min_periods", {1})
+  .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000});
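A minimal host-side sketch of the call this new benchmark times; the tiny literal columns and the example() wrapper are illustrative, not part of the benchmark:

#include <cudf/aggregation.hpp>
#include <cudf/rolling.hpp>
#include <cudf_test/column_wrapper.hpp>

void example()
{
  // Group keys must be pre-sorted, as grouped_rolling_window expects.
  auto keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 0, 0, 1, 1}};
  auto vals = cudf::test::fixed_width_column_wrapper<int32_t>{{1, 2, 3, 4, 5}};
  auto agg  = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
  // preceding = 1 covers only the current row, following = 2 adds up to two
  // rows after it; windows never cross a group boundary; min_periods = 1.
  auto result =
    cudf::grouped_rolling_window(cudf::table_view{{keys}}, vals, 1, 2, 1, *agg);
}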
diff --git a/cpp/benchmarks/rolling/rolling_sum.cpp b/cpp/benchmarks/rolling/rolling_sum.cpp
new file mode 100644
index 00000000000..af9ecd6a26f
--- /dev/null
+++ b/cpp/benchmarks/rolling/rolling_sum.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+#include <algorithm>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void bench_row_fixed_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+  auto const min_periods    = static_cast<cudf::size_type>(state.get_int64("min_periods"));
+
+  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::rolling_window(vals->view(), preceding_size, following_size, min_periods, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+template <typename Type>
+void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+
+  auto vals = [&]() {
+    data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+      cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+    return create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+  }();
+
+  auto preceding = [&]() {
+    auto data = std::vector<cudf::size_type>(num_rows);
+    auto it   = thrust::make_counting_iterator<cudf::size_type>(0);
+    std::transform(it, it + num_rows, data.begin(), [num_rows, preceding_size](auto i) {
+      return std::min(i + 1, std::max(preceding_size, i + 1 - num_rows));
+    });
+    auto buf = rmm::device_buffer(
+      data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
+    cudf::get_default_stream().synchronize();
+    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
+                                          num_rows,
+                                          std::move(buf),
+                                          rmm::device_buffer{},
+                                          0);
+  }();
+
+  auto following = [&]() {
+    auto data = std::vector<cudf::size_type>(num_rows);
+    auto it   = thrust::make_counting_iterator<cudf::size_type>(0);
+    std::transform(it, it + num_rows, data.begin(), [num_rows, following_size](auto i) {
+      return std::max(-i - 1, std::min(following_size, num_rows - i - 1));
+    });
+    auto buf = rmm::device_buffer(
+      data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
+    cudf::get_default_stream().synchronize();
+    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
+                                          num_rows,
+                                          std::move(buf),
+                                          rmm::device_buffer{},
+                                          0);
+  }();
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::rolling_window(vals->view(), preceding->view(), following->view(), 1, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_fixed_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+  .add_int64_axis("preceding_size", {1, 10, 100})
+  .add_int64_axis("following_size", {2})
+  .add_int64_axis("min_periods", {1, 20});
+
+NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_variable_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+  .add_int64_axis("preceding_size", {10, 100})
+  .add_int64_axis("following_size", {2});
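For the variable-window case exercised above, a short sketch with hand-written window columns (values illustrative): row i aggregates rows [i - preceding[i] + 1, i + following[i]], with preceding counting the current row.

#include <cudf/aggregation.hpp>
#include <cudf/rolling.hpp>
#include <cudf_test/column_wrapper.hpp>

void example()
{
  auto vals      = cudf::test::fixed_width_column_wrapper<int32_t>{{1, 2, 3, 4}};
  // Per-row window extents; each window must stay within the column bounds.
  auto preceding = cudf::test::fixed_width_column_wrapper<cudf::size_type>{{1, 2, 2, 2}};
  auto following = cudf::test::fixed_width_column_wrapper<cudf::size_type>{{1, 1, 1, 0}};
  auto agg       = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
  auto result    = cudf::rolling_window(vals, preceding, following, 1, *agg);
}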
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index 307a52cd242..88034b4f804 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -166,6 +166,26 @@ std::unique_ptr<column> sha512(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Computes the XXHash_32 hash value of each row in the given table
+ *
+ * This function computes the hash of each column using the `seed` for the first column
+ * and the resulting hash as a seed for the next column and so on.
+ * The result is a uint32 value for each row.
+ *
+ * @param input The table of columns to hash
+ * @param seed Optional seed value to use for the hash function
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ *
+ * @returns A column where each row is the hash of a row from the input
+ */
+std::unique_ptr<column> xxhash_32(
+  table_view const& input,
+  uint32_t seed                     = DEFAULT_HASH_SEED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Computes the XXHash_64 hash value of each row in the given table
  *
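A short usage sketch of the new public API (input values illustrative); per the doc comment above, per-column hashes are chained through the seed and the result is one UINT32 per row:

#include <cudf/hashing.hpp>
#include <cudf_test/column_wrapper.hpp>

void example()
{
  auto col    = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 42, 825}};
  auto hashes = cudf::hashing::xxhash_32(cudf::table_view({col}), 0u);
  // hashes is a UINT32 column with one hash per input row
}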
diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp
index 7cb80081a95..f796ff4526e 100644
--- a/cpp/include/cudf/hashing/detail/hashing.hpp
+++ b/cpp/include/cudf/hashing/detail/hashing.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -61,6 +61,11 @@ std::unique_ptr<column> sha512(table_view const& input,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr);
 
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view,
+                                  rmm::device_async_resource_ref mr);
+
 std::unique_ptr<column> xxhash_64(table_view const& input,
                                   uint64_t seed,
                                   rmm::cuda_stream_view,
                                   rmm::device_async_resource_ref mr);
diff --git a/cpp/include/cudf/hashing/detail/xxhash_32.cuh b/cpp/include/cudf/hashing/detail/xxhash_32.cuh
new file mode 100644
index 00000000000..bb6e7f18fbc
--- /dev/null
+++ b/cpp/include/cudf/hashing/detail/xxhash_32.cuh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/hashing.hpp>
+#include <cudf/hashing/detail/hash_functions.cuh>
+#include <cudf/lists/list_view.hpp>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/structs/struct_view.hpp>
+#include <cudf/types.hpp>
+
+#include <cuco/hash_functions.cuh>
+#include <cuda/std/cstddef>
+
+namespace cudf::hashing::detail {
+
+template <typename Key>
+struct XXHash_32 {
+  using result_type = std::uint32_t;
+
+  CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}
+
+  __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }
+
+  __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
+                                                 std::uint64_t size) const
+  {
+    return this->_impl.compute_hash(bytes, size);
+  }
+
+ private:
+  template <typename T>
+  __device__ constexpr result_type compute(T const& key) const
+  {
+    return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
+  }
+
+  cuco::xxhash_32<Key> _impl;
+};
+
+template <>
+XXHash_32<bool>::result_type __device__ inline XXHash_32<bool>::operator()(bool const& key) const
+{
+  return this->compute(static_cast<uint8_t>(key));
+}
+
+template <>
+XXHash_32<float>::result_type __device__ inline XXHash_32<float>::operator()(float const& key) const
+{
+  return this->compute(normalize_nans_and_zeros(key));
+}
+
+template <>
+XXHash_32<double>::result_type __device__ inline XXHash_32<double>::operator()(
+  double const& key) const
+{
+  return this->compute(normalize_nans_and_zeros(key));
+}
+
+template <>
+XXHash_32<cudf::string_view>::result_type
+  __device__ inline XXHash_32<cudf::string_view>::operator()(cudf::string_view const& key) const
+{
+  return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
+                             key.size_bytes());
+}
+
+template <>
+XXHash_32<numeric::decimal32>::result_type
+  __device__ inline XXHash_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<numeric::decimal64>::result_type
+  __device__ inline XXHash_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<numeric::decimal128>::result_type
+  __device__ inline XXHash_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<cudf::list_view>::result_type __device__ inline XXHash_32<cudf::list_view>::operator()(
+  cudf::list_view const& key) const
+{
+  CUDF_UNREACHABLE("List column hashing is not supported");
+}
+
+template <>
+XXHash_32<cudf::struct_view>::result_type
+  __device__ inline XXHash_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
+}
+
+}  // namespace cudf::hashing::detail
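Since XXHash_32 lives in a detail header, it is an internal, device-side building block rather than public API. A minimal sketch of applying it directly with Thrust (names and values illustrative, and the header path assumes this diff is applied):

#include <cudf/hashing/detail/xxhash_32.cuh>

#include <thrust/device_vector.h>
#include <thrust/transform.h>

void example()
{
  thrust::device_vector<int32_t> keys(3, 42);
  thrust::device_vector<std::uint32_t> hashes(keys.size());
  // The functor is constructed on the host and invoked per element on device.
  thrust::transform(keys.begin(), keys.end(), hashes.begin(),
                    cudf::hashing::detail::XXHash_32<int32_t>{0});  // seed = 0
}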
diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py
index 42f84e4d0c7..e111367d191 100755
--- a/cpp/scripts/sort_ninja_log.py
+++ b/cpp/scripts/sort_ninja_log.py
@@ -1,8 +1,9 @@
 #
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 #
 import argparse
 import os
+import re
 import sys
 import xml.etree.ElementTree as ET
 from pathlib import Path
@@ -144,6 +145,16 @@ def format_file_size(input_size):
     return file_size_str
 
 
+def replace_placeholder_patterns(input_string: str) -> str:
+    pattern = r'(_h_env_placehold)[_placehold]+'
+    return re.sub(pattern, r'\1...', input_string)
+
+
+# adjust name for display
+def format_file_name(name: str) -> str:
+    return replace_placeholder_patterns(name)
+
+
 # Output chart results in HTML format
 # Builds a standalone html file with no javascript or styles
 def output_html(entries, sorted_list, cmp_entries, args):
@@ -223,7 +234,8 @@ def output_html(entries, sorted_list, cmp_entries, args):
     print("", end="")
 
     # use a slightly smaller, fixed-width font
@@ -265,7 +277,8 @@ def output_html(entries, sorted_list, cmp_entries, args):
         file_size_str = format_file_size(file_size)
 
         # output entry row
-        print("", name, "", sep="", end="")
+        display_name = format_file_name(name)
+        print("", display_name, "", sep="", end="")
         print("", build_time_str, "", sep="", end="")
         print("", file_size_str, "", sep="", end="")
         # output diff column
diff --git a/cpp/src/hash/xxhash_32.cu b/cpp/src/hash/xxhash_32.cu
new file mode 100644
index 00000000000..40503f7f911
--- /dev/null
+++ b/cpp/src/hash/xxhash_32.cu
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/algorithm.cuh>
+#include <cudf/hashing/detail/hashing.hpp>
+#include <cudf/hashing/detail/xxhash_32.cuh>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/std/limits>
+#include <thrust/tabulate.h>
+
+namespace cudf {
+namespace hashing {
+namespace detail {
+
+namespace {
+
+/**
+ * @brief Computes the hash value of a row in the given table.
+ *
+ * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
+ */
+template <typename Nullate>
+class device_row_hasher {
+ public:
+  device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
+    : _check_nulls(nulls), _table(t), _seed(seed)
+  {
+  }
+
+  __device__ auto operator()(size_type row_index) const noexcept
+  {
+    return cudf::detail::accumulate(
+      _table.begin(),
+      _table.end(),
+      _seed,
+      [row_index, nulls = _check_nulls] __device__(auto hash, auto column) {
+        return cudf::type_dispatcher(
+          column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
+      });
+  }
+
+  /**
+   * @brief Computes the hash value of an element in the given column.
+   */
+  class element_hasher_adapter {
+   public:
+    template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const& col,
+                                          size_type const row_index,
+                                          Nullate const _check_nulls,
+                                          hash_value_type const _seed) const noexcept
+    {
+      if (_check_nulls && col.is_null(row_index)) {
+        return cuda::std::numeric_limits<hash_value_type>::max();
+      }
+      auto const hasher = XXHash_32<T>{_seed};
+      return hasher(col.element<T>(row_index));
+    }
+
+    template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const&,
+                                          size_type const,
+                                          Nullate const,
+                                          hash_value_type const) const noexcept
+    {
+      CUDF_UNREACHABLE("Unsupported type for XXHash_32");
+    }
+  };
+
+  Nullate const _check_nulls;
+  table_device_view const _table;
+  hash_value_type const _seed;
+};
+
+}  // namespace
+
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr)
+{
+  auto output = make_numeric_column(data_type(type_to_id<hash_value_type>()),
+                                    input.num_rows(),
+                                    mask_state::UNALLOCATED,
+                                    stream,
+                                    mr);
+
+  // Return early if there's nothing to hash
+  if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }
+
+  bool const nullable   = has_nulls(input);
+  auto const input_view = table_device_view::create(input, stream);
+  auto output_view      = output->mutable_view();
+
+  // Compute the hash value for each row
+  thrust::tabulate(rmm::exec_policy(stream),
+                   output_view.begin<hash_value_type>(),
+                   output_view.end<hash_value_type>(),
+                   device_row_hasher(nullable, *input_view, seed));
+
+  return output;
+}
+
+}  // namespace detail
+
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::xxhash_32(input, seed, stream, mr);
+}
+
+}  // namespace hashing
+}  // namespace cudf
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 82d8152ca1c..113342e9cbf 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -37,12 +38,25 @@ #include #include +#include +#include + #include namespace cudf::io::json::detail { namespace { +namespace pools { + +BS::thread_pool& tpool() +{ + static BS::thread_pool _tpool(std::thread::hardware_concurrency()); + return _tpool; +} + +} // namespace pools + class compressed_host_buffer_source final : public datasource { public: explicit compressed_host_buffer_source(std::unique_ptr const& src, @@ -51,8 +65,8 @@ class compressed_host_buffer_source final : public datasource { { auto ch_buffer = host_span(reinterpret_cast(_dbuf_ptr->data()), _dbuf_ptr->size()); - if (comptype == compression_type::GZIP || comptype == compression_type::ZIP || - comptype == compression_type::SNAPPY) { + if (_comptype == compression_type::GZIP || _comptype == compression_type::ZIP || + _comptype == compression_type::SNAPPY) { _decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer); } else { _decompressed_buffer = cudf::io::detail::decompress(_comptype, ch_buffer); @@ -96,7 +110,22 @@ class compressed_host_buffer_source final : public datasource { return std::make_unique(_decompressed_buffer.data() + offset, count); } - [[nodiscard]] bool supports_device_read() const override { return false; } + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + auto& thread_pool = pools::tpool(); + return thread_pool.submit_task([this, offset, size, dst, stream] { + auto hbuf = host_read(offset, size); + CUDF_CUDA_TRY( + cudaMemcpyAsync(dst, hbuf->data(), hbuf->size(), cudaMemcpyHostToDevice, stream.value())); + stream.synchronize(); + return hbuf->size(); + }); + } + + [[nodiscard]] bool supports_device_read() const override { return true; } [[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; } @@ -431,6 +460,8 @@ device_span ingest_raw_input(device_span buffer, // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line // delimiter. auto constexpr num_delimiter_chars = 1; + std::vector> thread_tasks; + auto stream_pool = cudf::detail::fork_streams(stream, pools::tpool().get_thread_count()); auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); @@ -447,13 +478,17 @@ device_span ingest_raw_input(device_span buffer, auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset); range_offset -= start_source ? 
prefsum_source_sizes[start_source - 1] : 0; - for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) { + for (std::size_t i = start_source, cur_stream = 0; + i < sources.size() && bytes_read < total_bytes_to_read; + i++) { if (sources[i]->is_empty()) continue; auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); auto destination = reinterpret_cast(buffer.data()) + bytes_read + (num_delimiter_chars * delimiter_map.size()); - if (sources[i]->is_device_read_preferred(data_size)) { - bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); + if (sources[i]->supports_device_read()) { + thread_tasks.emplace_back(sources[i]->device_read_async( + range_offset, data_size, destination, stream_pool[cur_stream++ % stream_pool.size()])); + bytes_read += data_size; } else { h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); auto const& h_buffer = h_buffers.back(); @@ -481,6 +516,15 @@ device_span ingest_raw_input(device_span buffer, buffer.data()); } stream.synchronize(); + + if (thread_tasks.size()) { + auto const bytes_read = std::accumulate( + thread_tasks.begin(), thread_tasks.end(), std::size_t{0}, [](std::size_t sum, auto& task) { + return sum + task.get(); + }); + CUDF_EXPECTS(bytes_read == total_bytes_to_read, "something's fishy"); + } + return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); } @@ -505,10 +549,17 @@ table_with_metadata read_json(host_span> sources, return read_json_impl(sources, reader_opts, stream, mr); std::vector> compressed_sources; - for (size_t i = 0; i < sources.size(); i++) { - compressed_sources.emplace_back( - std::make_unique(sources[i], reader_opts.get_compression())); + std::vector>> thread_tasks; + auto& thread_pool = pools::tpool(); + for (auto& src : sources) { + thread_tasks.emplace_back(thread_pool.submit_task([&reader_opts, &src] { + return std::make_unique(src, reader_opts.get_compression()); + })); } + std::transform(thread_tasks.begin(), + thread_tasks.end(), + std::back_inserter(compressed_sources), + [](auto& task) { return task.get(); }); // in read_json_impl, we need the compressed source size to actually be the // uncompressed source size for correct batching return read_json_impl(compressed_sources, reader_opts, stream, mr); diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 7facc6497ed..469f933f918 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 1572b7246c0..1f84d1f81dc 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,6 +132,177 @@ struct orcdec_state_s { } vals; }; +/** + * @brief Manage caching of the first run of TIMESTAMP's DATA stream for a row group. 
+ * + * This class is used to address a special case, where the first run of the DATA stream spans two + * adjacent row groups and its length is greater than the maximum length allowed to be consumed. + * This limit is imposed by the decoder when processing the SECONDARY stream. This class shall be + * instantiated in the shared memory, and be used to cache the DATA stream with a decoded data type + * of `int64_t`. As an optimization, the actual cache is implemented in the cache_helper class as a + * local variable and does not reside in the shared memory. + */ +class run_cache_manager { + private: + enum class status : uint8_t { + DISABLED, ///< Run cache manager is disabled. No caching will be performed. If the special case + ///< happens, the run cache manager will be set to this status after the cache read + ///< is completed. This status also applies when the special case does not happen. + CAN_WRITE_TO_CACHE, ///< Run cache manager is ready for write. If the special case happens, the + ///< run cache manager will be set to this status. + CAN_READ_FROM_CACHE, ///< Run cache manager is ready for read. If the special case happens, the + ///< run cache manager will be set to this status after the cache write is + ///< completed. + }; + + public: + /** + * @brief Initialize the run cache manager. + * + * @param[in] s ORC decoder state. + */ + __device__ void initialize(orcdec_state_s* s) + { + _status = (s->top.data.index.run_pos[CI_DATA2] > 0 and s->chunk.type_kind == TIMESTAMP) + ? status::CAN_WRITE_TO_CACHE + : status::DISABLED; + _reusable_length = 0; + _run_length = 0; + } + + private: + status _status; ///< The status of the run cache manager. + uint32_t + _reusable_length; ///< The number of data to be cached and reused later. For example, if a run + ///< has a length of 512 but the maximum length allowed to be consumed is + ///< capped at 162, then 350 (512-162) data will be cached. + uint32_t _run_length; ///< The length of the run, 512 in the above example. + friend class cache_helper; +}; + +/** + * @brief Helper class to help run_cache_manager cache the first run of TIMESTAMP's DATA stream for + * a row group. + * + * The run_cache_manager is intended to be stored in the shared memory, whereas the actual cache is + * in the local storage (as an optimization). If a function is to use run_cache_manager, both the + * manager and the cache objects need to be passed. This class is introduced to simplify the + * function call, so that only a single cache_helper object needs to be passed. To that end, public + * methods originally belonging to run_cache_manager have been moved to this class. + */ +class cache_helper { + public: + /** + * @brief Constructor. + * + * @param[in] run_cache_manager_inst An instance of run_cache_manager. + */ + __device__ explicit cache_helper(run_cache_manager& run_cache_manager_inst) + : _manager(run_cache_manager_inst) + { + } + + /** + * @brief Set the reusable length object. + * + * @param[in] run_length The length of the first run (spanning two adjacent row groups) of the + * DATA stream. + * @param[in] max_length The maximum length allowed to be consumed. This limit is imposed + * by the decoder when processing the SECONDARY stream. + */ + __device__ void set_reusable_length(uint32_t run_length, uint32_t max_length) + { + if (_manager._status == run_cache_manager::status::CAN_WRITE_TO_CACHE) { + _manager._run_length = run_length; + _manager._reusable_length = + (_manager._run_length > max_length) ? 
(_manager._run_length - max_length) : 0; + } + } + + /** + * @brief Adjust the maximum length allowed to be consumed when the length of the first run is + * greater than it. + * + * @param[in] max_length The maximum length allowed to be consumed for the DATA stream. + * @return A new maximum length. + */ + [[nodiscard]] __device__ uint32_t adjust_max_length(uint32_t max_length) + { + auto new_max_length{max_length}; + if (_manager._status == run_cache_manager::status::CAN_READ_FROM_CACHE) { + new_max_length -= _manager._reusable_length; + } + return new_max_length; + } + + /** + * @brief Copy the excess data from the intermediate buffer for the DATA stream to the cache. + * + * @param[in] src Intermediate buffer for the DATA stream. + */ + __device__ void write_to_cache(int64_t* src) + { + if (_manager._status != run_cache_manager::status::CAN_WRITE_TO_CACHE) { return; } + + auto const tid = threadIdx.x; + + __syncthreads(); + + // All threads in the block always take a uniform code path for the following branches. + // _reusable_length ranges between [0, 512]. + if (_manager._reusable_length > 0) { + auto const length_to_skip = _manager._run_length - _manager._reusable_length; + if (tid < _manager._reusable_length) { + auto const src_idx = tid + length_to_skip; + _storage = src[src_idx]; + } + if (tid == 0) { _manager._status = run_cache_manager::status::CAN_READ_FROM_CACHE; } + } else { + if (tid == 0) { _manager._status = run_cache_manager::status::DISABLED; } + } + + __syncthreads(); + } + + /** + * @brief Copy the cached data to the intermediate buffer for the DATA stream. + * + * @param[in,out] dst Intermediate buffer for the DATA stream. + * @param[in,out] rle Run length decoder state object. + */ + __device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle) + { + if (_manager._status != run_cache_manager::status::CAN_READ_FROM_CACHE) { return; } + + auto const tid = threadIdx.x; + + // First, shift the data up + auto const dst_idx = tid + _manager._reusable_length; + auto const v = (dst_idx < rle->num_vals + _manager._reusable_length) ? dst[tid] : 0; + __syncthreads(); + + if (dst_idx < rle->num_vals + _manager._reusable_length) { dst[dst_idx] = v; } + __syncthreads(); + + // Second, insert the cached data + if (tid < _manager._reusable_length) { dst[tid] = _storage; } + __syncthreads(); + + if (tid == 0) { + // Disable the run cache manager, since cache write-and-read happens at most once per row + // group. + _manager._status = run_cache_manager::status::DISABLED; + rle->num_vals += _manager._reusable_length; + } + + __syncthreads(); + } + + private: + run_cache_manager& _manager; ///< An instance of run_cache_manager. + int64_t _storage; ///< Per-thread cache storage. +}; + /** * @brief Initializes byte stream, modifying length and start position to keep the read pointer * 8-byte aligned. @@ -631,6 +802,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { * @param[in] maxvals maximum number of values to decode * @param[in] t thread id * @param[in] has_buffered_values If true, means there are already buffered values + * @param[in] cache_helper_inst If non-null, the run cache manager will be used to manage + * caching of the first run of the DATA stream. 
* * @return number of values decoded */ @@ -640,9 +813,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, T* vals, uint32_t maxvals, int t, - bool has_buffered_values = false) + bool has_buffered_values = false, + cache_helper* cache_helper_inst = nullptr) { if (t == 0) { + if (cache_helper_inst != nullptr) { maxvals = cache_helper_inst->adjust_max_length(maxvals); } uint32_t maxpos = min(bs->len, bs->pos + (bytestream_buffer_size - 8u)); uint32_t lastpos = bs->pos; auto numvals = 0; @@ -685,6 +860,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, l += deltapos; } } + + if (cache_helper_inst != nullptr) { cache_helper_inst->set_reusable_length(n, maxvals); } + if ((numvals != 0) and (numvals + n > maxvals)) break; // case where there are buffered values and can't consume a whole chunk // from decoded values, so skip adding any more to buffer, work on buffered values and then @@ -866,6 +1044,17 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, __syncwarp(); } __syncthreads(); + // Currently run_cache_manager is only designed to fix the TIMESTAMP's DATA stream bug where the + // data type is int64_t. + if constexpr (cuda::std::is_same_v) { + if (cache_helper_inst != nullptr) { + // Run cache is read from during the 2nd iteration of the top-level while loop in + // gpuDecodeOrcColumnData(). + cache_helper_inst->read_from_cache(vals, rle); + // Run cache is written to during the 1st iteration of the loop. + cache_helper_inst->write_to_cache(vals); + } + } return rle->num_vals; } @@ -1401,6 +1590,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) // Struct doesn't have any data in itself, so skip bool const is_valid = s->chunk.type_kind != STRUCT; size_t const max_num_rows = s->chunk.column_num_rows; + __shared__ run_cache_manager run_cache_manager_inst; + cache_helper cache_helper_inst(run_cache_manager_inst); if (t == 0 and is_valid) { // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { @@ -1443,6 +1634,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); + + run_cache_manager_inst.initialize(s); } __syncthreads(); @@ -1602,7 +1795,13 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (is_rlev1(s->chunk.encoding_kind)) { numvals = Integer_RLEv1(bs, &s->u.rlev1, s->vals.i64, numvals, t); } else { - numvals = Integer_RLEv2(bs, &s->u.rlev2, s->vals.i64, numvals, t); + numvals = Integer_RLEv2(bs, + &s->u.rlev2, + s->vals.i64, + numvals, + t, + false /**has_buffered_values */, + &cache_helper_inst); } if (s->chunk.type_kind == DECIMAL) { // If we're using an index, we may have to drop values from the initial run diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index b5f9b894c46..0d40a1f7b1b 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 9acbe026bb2..32bb3349666 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -961,9 +961,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) return; } - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - if (s->num_rows == 0) { return; } - using value_decoder_type = std::conditional_t< split_decode_t, decode_fixed_width_split_values_func, diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c48ff896e33..f9fcca6bb4f 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num _stream); } - // Compute column string sizes (using page string offsets) for this subpass + // Compute column string sizes (using page string offsets) for this output table chunk col_string_sizes = calculate_page_string_offsets(); - // ensure cumulative column string sizes have been initialized - if (pass.cumulative_col_string_sizes.empty()) { - pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); - } - - // Add to the cumulative column string sizes of this pass - std::transform(pass.cumulative_col_string_sizes.begin(), - pass.cumulative_col_string_sizes.end(), - col_string_sizes.begin(), - pass.cumulative_col_string_sizes.begin(), - std::plus<>{}); - // Check for overflow in cumulative column string sizes of this pass so that the page string // offsets of overflowing (large) string columns are treated as 64-bit. auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), - pass.cumulative_col_string_sizes.cend(), + auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), + col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; }); if (has_large_strings and not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } - // Mark any chunks for which the cumulative column string size has exceeded the - // large strings threshold - if (has_large_strings) { - for (auto& chunk : pass.chunks) { - auto const idx = chunk.src_col_index; - if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } - } + // Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output + // column chunks and the large strings threshold. 
+ for (auto& chunk : pass.chunks) { + auto const idx = chunk.src_col_index; + chunk.is_large_string_col = (col_string_sizes[idx] > threshold); } } @@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and col_string_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data( - col_string_sizes[pass.chunks[c].src_col_index], - pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > - static_cast(strings::detail::get_offset64_threshold()), - _stream); + out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], + pass.chunks[c].is_large_string_col, + _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -416,11 +400,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { - // need to cap off the string offsets column - auto const sz = static_cast(col_string_sizes[idx]); - if (sz <= strings::detail::get_offset64_threshold()) { + // only if it is not a large strings column + if (col_string_sizes[idx] <= + static_cast(strings::detail::get_offset64_threshold())) { out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); - final_offsets.emplace_back(sz); + final_offsets.emplace_back(static_cast(col_string_sizes[idx])); } } } diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index ca46f198bb8..4a773fbced1 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -130,9 +130,6 @@ struct pass_intermediate_data { rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; - // cumulative strings column sizes. - std::vector cumulative_col_string_sizes{}; - int level_type_size{0}; // skip_rows / num_rows for this pass. diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 4f75908fe72..37c5698f654 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 368b1fba870..4565626edad 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,8 +37,6 @@ namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) mixed_join(table_device_view left_table, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index a4ec97af235..4c063b6202e 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,8 +30,6 @@ namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) mixed_join_semi(table_device_view left_table, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 98170ed719a..869d05ce4d3 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,8 +34,6 @@ namespace cudf { namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) compute_mixed_join_output_size(table_device_view left_table, diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 73c4567d3a4..94d27d976c3 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "io/utilities/getenv_or.hpp" + #include #include #include @@ -277,7 +279,7 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts) CUDF_EXPORT auto& kernel_pinned_copy_threshold() { // use cudaMemcpyAsync for all pinned copies - static std::atomic threshold = 0; + static std::atomic threshold = getenv_or("LIBCUDF_KERNEL_PINNED_COPY_THRESHOLD", 0); return threshold; } @@ -291,7 +293,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 0; + static std::atomic threshold = getenv_or("LIBCUDF_ALLOCATE_HOST_AS_PINNED_THRESHOLD", 0); return threshold; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 46ae5273853..35877ac34b9 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -192,6 +192,7 @@ ConfigureTest( hashing/sha256_test.cpp hashing/sha384_test.cpp hashing/sha512_test.cpp + hashing/xxhash_32_test.cpp hashing/xxhash_64_test.cpp ) diff --git a/cpp/tests/hashing/xxhash_32_test.cpp b/cpp/tests/hashing/xxhash_32_test.cpp new file mode 100644 index 00000000000..9e3c66b0d0b --- /dev/null +++ b/cpp/tests/hashing/xxhash_32_test.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+
+#include <cudf/hashing.hpp>
+
+class XXHash_32_Test : public cudf::test::BaseFixture {};
+
+TEST_F(XXHash_32_Test, TestInteger)
+{
+  auto col1           = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 42, 825}};
+  auto constexpr seed = 0u;
+  auto const output   = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed);
+
+  // Expected results were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({148298089u, 1161967057u, 1066694813u});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
+}
+
+TEST_F(XXHash_32_Test, TestDouble)
+{
+  auto col1           = cudf::test::fixed_width_column_wrapper<double>{{-8., 25., 90.}};
+  auto constexpr seed = 42u;
+
+  auto const output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed);
+
+  // Expected results were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({2276435783u, 3120212431u, 3454197470u});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
+}
+
+TEST_F(XXHash_32_Test, StringType)
+{
+  auto col1           = cudf::test::strings_column_wrapper({"I", "am", "AI"});
+  auto constexpr seed = 825u;
+
+  auto output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed);
+
+  // Expected results were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({320624298u, 1612654309u, 1409499009u});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
+}
diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp
index 58396115a54..b5d20325b75 100644
--- a/cpp/tests/utilities_tests/logger_tests.cpp
+++ b/cpp/tests/utilities_tests/logger_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -55,7 +55,7 @@ TEST_F(LoggerTest, DefaultLevel) cudf::default_logger().warn("warn"); cudf::default_logger().error("error"); cudf::default_logger().critical("critical"); - ASSERT_EQ(this->sink_content(), "info\nwarn\nerror\ncritical\n"); + ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) diff --git a/dependencies.yaml b/dependencies.yaml index b0f217a6770..4672a355c72 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -688,7 +688,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.0.13,<0.0.18 + - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0 - nvtx>=0.2.1 - packaging - rich @@ -747,7 +747,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.15 + - polars>=1.11,<1.18 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] @@ -810,11 +810,11 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - *numba-cuda-dep + - numba-cuda==0.2.0 - pandas==2.0.* - matrix: {dependencies: "latest"} packages: - - numba-cuda==0.0.15 + - *numba-cuda-dep - pandas==2.2.3 - matrix: packages: diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index 53af52eff07..5e544e92a77 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,12 +62,13 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f * @param filePath Full path of the input Parquet file to read. 
*/ public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, File filePath) { - handle = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - + long[] handles = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + filePath.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -100,12 +101,41 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, HostMemoryBuffer buffer, long offset, long len) { - handle = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, - buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; + if (handle == 0) { + throw new IllegalStateException("Cannot create native chunked Parquet reader object."); + } + multiHostBufferSourceHandle = handles[1]; + } + /** + * Construct the reader instance from a read limit and data in host memory buffers. + * + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param passReadLimit Limit on the amount of memory used for reading and decompressing data or + * 0 if there is no limit + * @param opts The options for Parquet reading. + * @param buffers Array of buffers containing the file data. The buffers are logically + * concatenated to construct the file being read. + */ + public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, + ParquetOptions opts, HostMemoryBuffer... buffers) { + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -181,6 +211,10 @@ public void close() { DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); dataSourceHandle = 0; } + if (multiHostBufferSourceHandle != 0) { + destroyMultiHostBufferSource(multiHostBufferSourceHandle); + multiHostBufferSourceHandle = 0; + } } @@ -196,6 +230,8 @@ public void close() { private long dataSourceHandle = 0; + private long multiHostBufferSourceHandle = 0; + /** * Create a native chunked Parquet reader object on heap and return its memory address. * @@ -206,13 +242,12 @@ public void close() { * @param filterColumnNames Name of the columns to read, or an empty array if we want to read all. * @param binaryToString Whether to convert the corresponding column to String if it is binary. 
* @param filePath Full path of the file to read, or given as null if reading from a buffer. - * @param bufferAddrs The address of a buffer to read from, or 0 if we are not using that buffer. - * @param length The length of the buffer to read from. + * @param bufferAddrsSizes The address and size pairs of buffers to read from, or null if we are not using buffers. * @param timeUnit Return type of time unit for timestamps. */ - private static native long create(long chunkSizeByteLimit, long passReadLimit, - String[] filterColumnNames, boolean[] binaryToString, - String filePath, long bufferAddrs, long length, int timeUnit); + private static native long[] create(long chunkSizeByteLimit, long passReadLimit, + String[] filterColumnNames, boolean[] binaryToString, + String filePath, long[] bufferAddrsSizes, int timeUnit); private static native long createWithDataSource(long chunkedSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); @@ -222,4 +257,6 @@ private static native long createWithDataSource(long chunkedSizeByteLimit, private static native long[] readChunk(long handle); private static native void close(long handle); + + private static native void destroyMultiHostBufferSource(long handle); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b01ce31b1f3..298f2cff6f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -313,12 +313,11 @@ private static native long readAndInferJSON(long address, long length, * all of them * @param binaryToString whether to convert this column to String if binary * @param filePath the path of the file to read, or null if no path should be read. - * @param address the address of the buffer to read from or 0 if we should not. - * @param length the length of the buffer to read from. + * @param addrsAndSizes the address and size pairs for every buffer or null for no buffers. * @param timeUnit return type of TimeStamp in units */ private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, - long address, long length, int timeUnit) throws CudfException; + long[] addrsAndSizes, int timeUnit) throws CudfException; private static native long[] readParquetFromDataSource(String[] filterColumnNames, boolean[] binaryToString, int timeUnit, @@ -1357,7 +1356,7 @@ public static Table readParquet(File path) { */ public static Table readParquet(ParquetOptions opts, File path) { return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); + path.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId())); } /** @@ -1402,6 +1401,14 @@ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, } } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffer raw parquet formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU. 
+ */ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, long len) { return readParquet(opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } @@ -1422,10 +1429,35 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); + } + + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffers Buffers containing the Parquet data. The buffers are logically concatenated + * in order to construct the file being read. + * @return the data parsed as a table on the GPU. + */ + public static Table readParquet(ParquetOptions opts, HostMemoryBuffer... buffers) { + assert buffers.length > 0; + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param ds custom datasource to provide the Parquet file data + * @return the data parsed as a table on the GPU. + */ public static Table readParquet(ParquetOptions opts, DataSource ds) { long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); try { diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9ff43feeac6..bd1714aa476 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -156,8 +156,9 @@ add_library( src/ScalarJni.cpp src/TableJni.cpp src/aggregation128_utils.cu - src/maps_column_view.cu src/check_nvcomp_output_sizes.cu + src/maps_column_view.cu + src/multi_host_buffer_source.cpp ) # Disable NVTX if necessary diff --git a/java/src/main/native/include/multi_host_buffer_source.hpp b/java/src/main/native/include/multi_host_buffer_source.hpp new file mode 100644 index 00000000000..2aedb2321e4 --- /dev/null +++ b/java/src/main/native/include/multi_host_buffer_source.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "jni_utils.hpp"
+
+#include <cudf/io/datasource.hpp>
+
+#include <vector>
+
+namespace cudf {
+namespace jni {
+
+/**
+ * @brief A custom datasource providing data from an array of host memory buffers.
+ */
+class multi_host_buffer_source : public cudf::io::datasource {
+  std::vector<uint8_t const*> addrs_;
+  std::vector<size_t> offsets_;
+
+  size_t locate_offset_index(size_t offset);
+
+ public:
+  explicit multi_host_buffer_source(native_jlongArray const& addrs_sizes);
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override;
+  size_t host_read(size_t offset, size_t size, uint8_t* dst) override;
+  bool supports_device_read() const override { return true; }
+  bool is_device_read_preferred(size_t size) const override { return true; }
+  std::unique_ptr<buffer> device_read(size_t offset,
+                                      size_t size,
+                                      rmm::cuda_stream_view stream) override;
+  size_t device_read(size_t offset,
+                     size_t size,
+                     uint8_t* dst,
+                     rmm::cuda_stream_view stream) override;
+  std::future<size_t> device_read_async(size_t offset,
+                                        size_t size,
+                                        uint8_t* dst,
+                                        rmm::cuda_stream_view stream) override;
+  size_t size() const override { return offsets_.back(); }
+};
+
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp
index cf04a87262f..4967e0b2b04 100644
--- a/java/src/main/native/src/ChunkedReaderJni.cpp
+++ b/java/src/main/native/src/ChunkedReaderJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@

 #include "cudf_jni_apis.hpp"
 #include "jni_utils.hpp"
+#include "multi_host_buffer_source.hpp"

 #include
 #include
@@ -36,7 +37,7 @@ extern "C" {

 // This function should take all the parameters that `Table.readParquet` takes,
 // plus one more parameter `long chunkSizeByteLimit`.
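For orientation before the JNI plumbing that follows: once the glue code has unpacked the Java long[] of {address, size} pairs, the new class above plugs into libcudf's I/O layer as an ordinary datasource. This is a condensed sketch of that path, not additional PR code; `env` and `addrs_sizes` stand in for the JNI arguments:

// Condensed sketch of the create()/readParquet paths below.
cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_sizes);
auto buffers = std::make_unique<cudf::jni::multi_host_buffer_source>(n_addrs_sizes);

// source_info holds a non-owning pointer; the Java side keeps the returned
// handle alive until close() destroys the buffer source.
auto source = cudf::io::source_info(buffers.get());
auto opts   = cudf::io::parquet_reader_options::builder(source).build();
auto table  = cudf::io::read_parquet(opts).tbl;  // spans all input buffers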
-JNIEXPORT jlong JNICALL +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jclass, jlong chunk_read_limit, @@ -44,27 +45,26 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inp_file_path, - jlong buffer, - jlong buffer_length, + jlongArray addrs_sizes, jint unit) { - JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0); + JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", nullptr); bool read_buffer = true; - if (buffer == 0) { - JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", 0); + if (addrs_sizes == nullptr) { + JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", nullptr); read_buffer = false; } else if (inp_file_path != nullptr) { - JNI_THROW_NEW( - env, cudf::jni::ILLEGAL_ARG_CLASS, "Cannot pass in both a buffer and an inp_file_path", 0); - } else if (buffer_length <= 0) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0); + JNI_THROW_NEW(env, + cudf::jni::ILLEGAL_ARG_CLASS, + "Cannot pass in both buffers and an inp_file_path", + nullptr); } try { cudf::jni::auto_set_device(env); cudf::jni::native_jstring filename(env, inp_file_path); if (!read_buffer && filename.is_empty()) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", 0); + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", nullptr); } cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); @@ -75,9 +75,15 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); (void)n_col_binary_read; - auto const source = read_buffer ? 
cudf::io::source_info(reinterpret_cast<char const*>(buffer),
-                                              static_cast<std::size_t>(buffer_length))
-                      : cudf::io::source_info(filename.get());
+    cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_sizes);
+    std::unique_ptr<cudf::jni::multi_host_buffer_source> multi_buffer_source;
+    cudf::io::source_info source;
+    if (read_buffer) {
+      multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes));
+      source = cudf::io::source_info(multi_buffer_source.get());
+    } else {
+      source = cudf::io::source_info(filename.get());
+    }

     auto opts_builder = cudf::io::parquet_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
@@ -86,13 +92,18 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env,
     auto const read_opts = opts_builder.convert_strings_to_categories(false)
                              .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
                              .build();
-
-    return reinterpret_cast<jlong>(
+    n_addrs_sizes.cancel();
+    n_col_binary_read.cancel();
+    auto reader_handle = reinterpret_cast<jlong>(
       new cudf::io::chunked_parquet_reader(static_cast<std::size_t>(chunk_read_limit),
                                            static_cast<std::size_t>(pass_read_limit),
                                            read_opts));
+    cudf::jni::native_jlongArray result(env, 2);
+    result[0] = reader_handle;
+    result[1] = cudf::jni::release_as_jlong(multi_buffer_source);
+    return result.get_jArray();
   }
-  CATCH_STD(env, 0);
+  CATCH_STD(env, nullptr);
 }

 JNIEXPORT jlong JNICALL
@@ -177,6 +188,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* en
   CATCH_STD(env, );
 }

+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_destroyMultiHostBufferSource(
+  JNIEnv* env, jclass, jlong handle)
+{
+  JNI_NULL_CHECK(env, handle, "handle is null", );
+
+  try {
+    delete reinterpret_cast<cudf::jni::multi_host_buffer_source*>(handle);
+  }
+  CATCH_STD(env, );
+}
+
 //
 // Chunked ORC reader JNI
 //
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index ed35f35794d..a6c7ae9ba18 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -19,6 +19,7 @@
 #include "jni_compiled_expr.hpp"
 #include "jni_utils.hpp"
 #include "jni_writer_data_sink.hpp"
+#include "multi_host_buffer_source.hpp"

 #include
 #include
@@ -2071,20 +2072,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
                                                                    jobjectArray filter_col_names,
                                                                    jbooleanArray j_col_binary_read,
                                                                    jstring inputfilepath,
-                                                                   jlong buffer,
-                                                                   jlong buffer_length,
+                                                                   jlongArray addrs_and_sizes,
                                                                    jint unit)
 {
   JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0);
   bool read_buffer = true;
-  if (buffer == 0) {
+  if (addrs_and_sizes == nullptr) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
     read_buffer = false;
   } else if (inputfilepath != NULL) {
     JNI_THROW_NEW(
       env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL);
-  } else if (buffer_length <= 0) {
-    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL);
   }

   try {
@@ -2096,10 +2094,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
     cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
-
-    auto source = read_buffer
-                    ? cudf::io::source_info(reinterpret_cast<char const*>(buffer),
-                                            static_cast<std::size_t>(buffer_length))
-                    : cudf::io::source_info(filename.get());
+    cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_and_sizes);
+    std::unique_ptr<cudf::jni::multi_host_buffer_source> multi_buffer_source;
+    cudf::io::source_info source;
+    if (read_buffer) {
+      multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes));
+      source = cudf::io::source_info(multi_buffer_source.get());
+    } else {
+      source = cudf::io::source_info(filename.get());
+    }

     auto builder = cudf::io::parquet_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
@@ -2110,7 +2113,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
       builder.convert_strings_to_categories(false)
         .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
         .build();
-    return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl);
+    auto tbl = cudf::io::read_parquet(opts).tbl;
+    n_col_binary_read.cancel();
+    n_addrs_sizes.cancel();
+    return convert_table_for_return(env, tbl);
   }
   CATCH_STD(env, NULL);
 }
diff --git a/java/src/main/native/src/multi_host_buffer_source.cpp b/java/src/main/native/src/multi_host_buffer_source.cpp
new file mode 100644
index 00000000000..c577fc680ba
--- /dev/null
+++ b/java/src/main/native/src/multi_host_buffer_source.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "multi_host_buffer_source.hpp"
+
+#include <cudf/utilities/error.hpp>
+#include <algorithm>
+#include <cstring>
+#include <sstream>
+
+namespace cudf {
+namespace jni {
+
+multi_host_buffer_source::multi_host_buffer_source(native_jlongArray const& addrs_sizes)
+{
+  if (addrs_sizes.size() % 2 != 0) {
+    throw std::logic_error("addrs_sizes length not a multiple of 2");
+  }
+  auto count = addrs_sizes.size() / 2;
+  addrs_.reserve(count);
+  offsets_.reserve(count + 1);
+  size_t total_size = 0;
+  for (int i = 0; i < addrs_sizes.size(); i += 2) {
+    addrs_.push_back(reinterpret_cast<uint8_t const*>(addrs_sizes[i]));
+    offsets_.push_back(total_size);
+    total_size += addrs_sizes[i + 1];
+  }
+  offsets_.push_back(total_size);
+}
+
+size_t multi_host_buffer_source::locate_offset_index(size_t offset)
+{
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  auto start = offsets_.begin();
+  auto it    = std::upper_bound(start, offsets_.end(), offset);
+  return (it - start) - 1;
+}
+
+std::unique_ptr<cudf::io::datasource::buffer> multi_host_buffer_source::host_read(size_t offset,
+                                                                                  size_t size)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  auto const end_offset = offset + size;
+  if (end_offset > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+  auto buffer_index = locate_offset_index(offset);
+  auto next_offset  = offsets_[buffer_index + 1];
+  if (end_offset <= next_offset) {
+    // read range hits only a single buffer, so return a zero-copy view of the data
+    auto src = addrs_[buffer_index] + offset - offsets_[buffer_index];
+    return std::make_unique<non_owning_buffer>(src, size);
+  }
+  auto buf        = std::vector<uint8_t>(size);
+  auto bytes_read = host_read(offset, size, buf.data());
+  if (bytes_read != size) {
+    std::stringstream ss;
+    ss << "Expected host read of " << size << " found " << bytes_read;
+    throw std::logic_error(ss.str());
+  }
+  return std::make_unique<owning_buffer<std::vector<uint8_t>>>(std::move(buf));
+}
+
+size_t multi_host_buffer_source::host_read(size_t offset, size_t size, uint8_t* dst)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+  auto buffer_index = locate_offset_index(offset);
+  auto bytes_left   = size;
+  while (bytes_left > 0) {
+    auto next_offset   = offsets_[buffer_index + 1];
+    auto buffer_left   = next_offset - offset;
+    auto buffer_offset = offset - offsets_[buffer_index];
+    auto src           = addrs_[buffer_index] + buffer_offset;
+    auto copy_size     = std::min(buffer_left, bytes_left);
+    std::memcpy(dst, src, copy_size);
+    offset += copy_size;
+    dst += copy_size;
+    bytes_left -= copy_size;
+    ++buffer_index;
+  }
+  return size;
+}
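The offsets_ vector built by the constructor is a prefix sum of the buffer sizes, and locate_offset_index binary-searches it with std::upper_bound. A standalone sketch of that math, with illustrative sizes that are not from the PR:

// Three buffers of 10, 20, and 5 bytes give offsets_ = {0, 10, 30, 35}.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<size_t> offsets{0, 10, 30, 35};
  size_t offset = 12;  // logical file offset
  auto it       = std::upper_bound(offsets.begin(), offsets.end(), offset);
  size_t index  = (it - offsets.begin()) - 1;
  assert(index == 1);                    // offset 12 falls in buffer #1
  assert(offset - offsets[index] == 2);  // ...two bytes into that buffer
  // A 25-byte read from offset 12 then copies 18 bytes from buffer #1 and
  // 7 bytes from buffer #2, exactly as the host_read loop above does.
  return 0;
}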
+std::unique_ptr<cudf::io::datasource::buffer> multi_host_buffer_source::device_read(
+  size_t offset, size_t size, rmm::cuda_stream_view stream)
+{
+  rmm::device_buffer buf(size, stream);
+  auto dst        = static_cast<uint8_t*>(buf.data());
+  auto bytes_read = device_read(offset, size, dst, stream);
+  if (bytes_read != size) {
+    std::stringstream ss;
+    ss << "Expected device read of " << size << " found " << bytes_read;
+    throw std::logic_error(ss.str());
+  }
+  return std::make_unique<owning_buffer<rmm::device_buffer>>(std::move(buf));
+}
+
+size_t multi_host_buffer_source::device_read(size_t offset,
+                                             size_t size,
+                                             uint8_t* dst,
+                                             rmm::cuda_stream_view stream)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+
+  auto buffer_index = locate_offset_index(offset);
+  auto bytes_left   = size;
+  while (bytes_left > 0) {
+    auto next_offset   = offsets_[buffer_index + 1];
+    auto buffer_left   = next_offset - offset;
+    auto buffer_offset = offset - offsets_[buffer_index];
+    auto src           = addrs_[buffer_index] + buffer_offset;
+    auto copy_size     = std::min(buffer_left, bytes_left);
+    CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, copy_size, cudaMemcpyHostToDevice, stream.value()));
+    offset += copy_size;
+    dst += copy_size;
+    bytes_left -= copy_size;
+    ++buffer_index;
+  }
+  return size;
+}
+
+std::future<size_t> multi_host_buffer_source::device_read_async(size_t offset,
+                                                                size_t size,
+                                                                uint8_t* dst,
+                                                                rmm::cuda_stream_view stream)
+{
+  std::promise<size_t> p;
+  p.set_value(device_read(offset, size, dst, stream));
+  return p.get_future();
+}
+
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index c7fcb1756b6..7eb32892bad 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1,6 +1,6 @@
 /*
 *
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -47,8 +47,11 @@
 import java.math.BigInteger;
 import java.math.RoundingMode;
 import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.nio.file.StandardOpenOption;
 import java.util.*;
 import java.util.function.Function;
 import java.util.stream.Collectors;
@@ -1714,6 +1717,42 @@ void testChunkedReadParquet() {
     }
   }

+  @Test
+  void testChunkedReadParquetHostBuffers() throws Exception {
+    long size = TEST_PARQUET_FILE_CHUNKED_READ.length();
+    java.nio.file.Path path = TEST_PARQUET_FILE_CHUNKED_READ.toPath();
+    try (HostMemoryBuffer buf1 = HostMemoryBuffer.allocate(size / 2);
+         HostMemoryBuffer buf2 = HostMemoryBuffer.allocate(size - buf1.getLength())) {
+      try (SeekableByteChannel channel = Files.newByteChannel(path, StandardOpenOption.READ)) {
+        ByteBuffer bb1 = buf1.asByteBuffer();
+        while (bb1.hasRemaining()) {
+          if (channel.read(bb1) == -1) {
+            throw new EOFException("error reading first buffer");
+          }
+        }
+        ByteBuffer bb2 = buf2.asByteBuffer();
+        while (bb2.hasRemaining()) {
+          if (channel.read(bb2) == -1) {
+            throw new EOFException("error reading second buffer");
+          }
+        }
+      }
+      ParquetOptions opts = ParquetOptions.DEFAULT;
+      try (ParquetChunkedReader reader = new ParquetChunkedReader(240000, 0, opts, buf1, buf2)) {
+        int numChunks = 0;
+        long totalRows = 0;
+        while (reader.hasNext()) {
+          ++numChunks;
+          try (Table chunk = reader.readChunk()) {
+            totalRows += chunk.getRowCount();
+          }
+        }
+        assertEquals(2, numChunks);
+        assertEquals(40000, totalRows);
+      }
+    }
+  }
+
   @Test
   void testChunkedReadParquetFromDataSource() throws IOException {
     try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ);
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index ff6fba1c3e8..ec44a6aa8c5 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
# # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx) +set(cython_sources column.pyx scalar.pyx strings_udf.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8b1d16f0d85..026c12895e8 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from typing import Literal @@ -13,6 +13,8 @@ from pylibcudf.libcudf.column.column_view cimport ( from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer +cdef dtype_from_lists_column_view(column_view cv) +cdef dtype_from_column_view(column_view cv) cdef class Column: cdef public: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f7dcd89ea48..c59bbc0f40c 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from typing import Literal @@ -19,24 +19,21 @@ from cudf.core.buffer import ( as_buffer, cuda_array_interface_wrapper, ) -from cudf.utils.dtypes import _get_base_dtype +from cudf.utils.dtypes import ( + _get_base_dtype, + dtype_to_pylibcudf_type, + PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES, +) from cpython.buffer cimport PyObject_CheckBuffer -from libc.stdint cimport uintptr_t -from libcpp.memory cimport make_unique, unique_ptr +from libc.stdint cimport uintptr_t, int32_t +from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector from rmm.pylibrmm.device_buffer cimport DeviceBuffer -from cudf._lib.types cimport ( - dtype_from_column_view, - dtype_to_pylibcudf_type, -) - -from cudf._lib.types import dtype_from_pylibcudf_column - -from pylibcudf cimport DataType as plc_DataType +from pylibcudf cimport DataType as plc_DataType, Column as plc_Column cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary @@ -45,6 +42,7 @@ from pylibcudf.libcudf.column.column_factories cimport ( make_numeric_column ) from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count from pylibcudf.libcudf.scalar.scalar cimport scalar @@ -64,6 +62,80 @@ cdef get_element(column_view col_view, size_type index): ) +def dtype_from_pylibcudf_column(plc_Column col not None): + type_ = col.type() + tid = type_.id() + + if tid == pylibcudf.TypeId.LIST: + child = col.list_view().child() + return cudf.ListDtype(dtype_from_pylibcudf_column(child)) + elif tid == pylibcudf.TypeId.STRUCT: + fields = { + str(i): dtype_from_pylibcudf_column(col.child(i)) + for i in range(col.num_children()) + } + return cudf.StructDtype(fields) + elif tid == pylibcudf.TypeId.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL32: + return 
cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] + + +cdef dtype_from_lists_column_view(column_view cv): + # lists_column_view have no default constructor, so we heap + # allocate it to get around Cython's limitation of requiring + # default constructors for stack allocated objects + cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) + cdef column_view child = lv.get()[0].child() + + if child.type().id() == libcudf_types.type_id.LIST: + return cudf.ListDtype(dtype_from_lists_column_view(child)) + else: + return cudf.ListDtype(dtype_from_column_view(child)) + + +cdef dtype_from_column_view(column_view cv): + cdef libcudf_types.type_id tid = cv.type().id() + if tid == libcudf_types.type_id.LIST: + return dtype_from_lists_column_view(cv) + elif tid == libcudf_types.type_id.STRUCT: + fields = { + str(i): dtype_from_column_view(cv.child(i)) + for i in range(cv.num_children()) + } + return cudf.StructDtype(fields) + elif tid == libcudf_types.type_id.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL32: + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[(tid)] + + cdef class Column: """ A Column stores columnar data in device memory. @@ -361,7 +433,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[mutable_column_view] children cdef void* data @@ -424,7 +496,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 40bd50acf16..65607c91302 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import copy @@ -14,17 +14,16 @@ import pylibcudf as plc import cudf from cudf.core.dtypes import ListDtype, StructDtype -from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf.core.missing import NA, NaT +from cudf.utils.dtypes import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES # We currently need this cimport because some of the implementations here # access the c_obj of the scalar, and because we need to be able to call # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
-from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID -from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar +from pylibcudf cimport Scalar as plc_Scalar +from pylibcudf.libcudf.scalar.scalar cimport scalar def _replace_nested(obj, check, replacement): @@ -223,63 +222,22 @@ cdef class DeviceScalar: return s cdef void _set_dtype(self, dtype=None): - cdef plc_TypeID cdtype_id = self.c_value.type().id() + cdtype_id = self.c_value.type().id() if dtype is not None: self._dtype = dtype elif cdtype_id in { - plc_TypeID.DECIMAL32, - plc_TypeID.DECIMAL64, - plc_TypeID.DECIMAL128, + plc.TypeID.DECIMAL32, + plc.TypeID.DECIMAL64, + plc.TypeID.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - elif cdtype_id == plc_TypeID.STRUCT: - struct_table_view = (self.get_raw_ptr())[0].view() - self._dtype = StructDtype({ - str(i): dtype_from_column_view(struct_table_view.column(i)) - for i in range(struct_table_view.num_columns()) - }) - elif cdtype_id == plc_TypeID.LIST: - if ( - self.get_raw_ptr() - )[0].view().type().id() == plc_TypeID.LIST: - self._dtype = dtype_from_column_view( - (self.get_raw_ptr())[0].view() - ) - else: - self._dtype = ListDtype( - PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - ( - (self.get_raw_ptr())[0] - .view().type().id() - ) - ] - ) - else: - self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (cdtype_id) - ] - - -def as_device_scalar(val, dtype=None): - if isinstance(val, (cudf.Scalar, DeviceScalar)): - if dtype == val.dtype or dtype is None: - if isinstance(val, DeviceScalar): - return val - else: - return val.device_value + elif cdtype_id == plc.TypeID.STRUCT: + self._dtype = StructDtype.from_arrow( + plc.interop.to_arrow(self.c_value).type + ) + elif cdtype_id == plc.TypeID.LIST: + self._dtype = ListDtype.from_arrow(plc.interop.to_arrow(self.c_value).type) else: - raise TypeError("Can't update dtype of existing GPU scalar") - else: - return cudf.Scalar(val, dtype=dtype).device_value - - -def _is_null_host_scalar(slr): - if cudf.utils.utils.is_na_like(slr): - return True - elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \ - slr is pd.NaT: - return True - else: - return False + self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[cdtype_id] diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd deleted file mode 100644 index 18b1d26e4db..00000000000 --- a/python/cudf/cudf/_lib/types.pxd +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -from pylibcudf.libcudf.column.column_view cimport column_view - -ctypedef int32_t underlying_type_t_type_id - -cdef dtype_from_column_view(column_view cv) - -cpdef dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx deleted file mode 100644 index 777bd070b32..00000000000 --- a/python/cudf/cudf/_lib/types.pyx +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd - -from libcpp.memory cimport make_shared, shared_ptr - -cimport pylibcudf.libcudf.types as libcudf_types -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view - -import pylibcudf as plc - -import cudf - - -SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { - np.dtype("int8"): plc.types.TypeId.INT8, - np.dtype("int16"): plc.types.TypeId.INT16, - np.dtype("int32"): plc.types.TypeId.INT32, - np.dtype("int64"): plc.types.TypeId.INT64, - np.dtype("uint8"): plc.types.TypeId.UINT8, - np.dtype("uint16"): plc.types.TypeId.UINT16, - np.dtype("uint32"): plc.types.TypeId.UINT32, - np.dtype("uint64"): plc.types.TypeId.UINT64, - np.dtype("float32"): plc.types.TypeId.FLOAT32, - np.dtype("float64"): plc.types.TypeId.FLOAT64, - np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): plc.types.TypeId.STRING, - np.dtype("bool"): plc.types.TypeId.BOOL8, - np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, -} -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - plc_type: np_type - for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() -} -# There's no equivalent to EMPTY in cudf. We translate EMPTY -# columns from libcudf to ``int8`` columns of all nulls in Python. -# ``int8`` is chosen because it uses the least amount of memory. 
-PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") - - -size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] - - -cdef dtype_from_lists_column_view(column_view cv): - # lists_column_view have no default constructor, so we heap - # allocate it to get around Cython's limitation of requiring - # default constructors for stack allocated objects - cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) - cdef column_view child = lv.get()[0].child() - - if child.type().id() == libcudf_types.type_id.LIST: - return cudf.ListDtype(dtype_from_lists_column_view(child)) - elif child.type().id() == libcudf_types.type_id.EMPTY: - return cudf.ListDtype("int8") - else: - return cudf.ListDtype( - dtype_from_column_view(child) - ) - -cdef dtype_from_structs_column_view(column_view cv): - fields = { - str(i): dtype_from_column_view(cv.child(i)) - for i in range(cv.num_children()) - } - return cudf.StructDtype(fields) - -cdef dtype_from_column_view(column_view cv): - cdef libcudf_types.type_id tid = cv.type().id() - if tid == libcudf_types.type_id.LIST: - return dtype_from_lists_column_view(cv) - elif tid == libcudf_types.type_id.STRUCT: - return dtype_from_structs_column_view(cv) - elif tid == libcudf_types.type_id.DECIMAL64: - return cudf.Decimal64Dtype( - precision=cudf.Decimal64Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL32: - return cudf.Decimal32Dtype( - precision=cudf.Decimal32Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL128: - return cudf.Decimal128Dtype( - precision=cudf.Decimal128Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - else: - return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (tid) - ] - - -cpdef dtype_to_pylibcudf_type(dtype): - if isinstance(dtype, cudf.ListDtype): - return plc.DataType(plc.TypeId.LIST) - elif isinstance(dtype, cudf.StructDtype): - return plc.DataType(plc.TypeId.STRUCT) - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = plc.TypeId.DECIMAL128 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = plc.TypeId.DECIMAL64 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = plc.TypeId.DECIMAL32 - return plc.DataType(tid, -dtype.scale) - # libcudf types don't support timezones so convert to the base type - elif isinstance(dtype, pd.DatetimeTZDtype): - dtype = np.dtype(f"` as a preprocessing step to `__repr__` methods. @@ -2047,7 +2050,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. if gather_map.dtype.kind not in "iu": - gather_map = gather_map.astype(size_type_dtype) + gather_map = gather_map.astype(SIZE_TYPE_DTYPE) GatherMap(gather_map, len(self), nullify=not check_bounds or nullify) return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/_internals/aggregation.py b/python/cudf/cudf/core/_internals/aggregation.py index 1d21d34b1bf..e6e6c3bcedf 100644 --- a/python/cudf/cudf/core/_internals/aggregation.py +++ b/python/cudf/cudf/core/_internals/aggregation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations from typing import TYPE_CHECKING, Literal @@ -8,9 +8,9 @@ import pylibcudf as plc import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES from cudf.api.types import is_scalar from cudf.utils import cudautils +from cudf.utils.dtypes import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES if TYPE_CHECKING: from collections.abc import Callable diff --git a/python/cudf/cudf/core/_internals/binaryop.py b/python/cudf/cudf/core/_internals/binaryop.py index 212150f505e..a9023f8fd59 100644 --- a/python/cudf/cudf/core/_internals/binaryop.py +++ b/python/cudf/cudf/core/_internals/binaryop.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import TYPE_CHECKING @@ -6,8 +6,8 @@ import pylibcudf as plc from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.core.buffer import acquire_spill_lock +from cudf.utils.dtypes import dtype_to_pylibcudf_type if TYPE_CHECKING: from cudf._typing import Dtype diff --git a/python/cudf/cudf/core/_internals/unary.py b/python/cudf/cudf/core/_internals/unary.py index 3b8e3db60a7..c45c4a1b5cf 100644 --- a/python/cudf/cudf/core/_internals/unary.py +++ b/python/cudf/cudf/core/_internals/unary.py @@ -1,13 +1,13 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import TYPE_CHECKING import pylibcudf as plc -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_decimal_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.utils.dtypes import dtype_to_pylibcudf_type if TYPE_CHECKING: from cudf._typing import Dtype diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index b49f5154697..0fe47255368 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -53,7 +53,6 @@ def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: 1 this is it dtype: object """ - sep = cudf.Scalar(separator, dtype="str") return cudf.Series._from_column( - text._column.byte_pair_encoding(self.merge_pairs, sep) + text._column.byte_pair_encoding(self.merge_pairs, separator) ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index b10b8dfe207..b9d6c0e7f08 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -12,12 +12,12 @@ from typing_extensions import Self import cudf -from cudf import _lib as libcudf from cudf.core._internals import unary from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, find_common_type, is_mixed_with_object_dtype, min_signed_type, @@ -621,7 +621,7 @@ def ordered(self) -> bool: def __setitem__(self, key, value): if cudf.api.types.is_scalar( value - ) and cudf._lib.scalar._is_null_host_scalar(value): + ) and cudf.utils.utils._is_null_host_scalar(value): to_add_categories = 0 else: if cudf.api.types.is_scalar(value): @@ -1140,7 +1140,7 @@ def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes return self.codes - gather_map = self.codes.astype(libcudf.types.size_type_dtype).fillna(0) + gather_map = self.codes.astype(SIZE_TYPE_DTYPE).fillna(0) out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1192,10 +1192,10 @@ def _concat( codes = [o.codes for o in objs] newsize = sum(map(len, codes)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 31efe267c96..30da8727366 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -25,8 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.scalar import as_device_scalar -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -61,9 +59,11 @@ from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, + dtype_to_pylibcudf_type, find_common_type, get_time_unit, is_column_like, @@ -71,13 +71,14 @@ min_signed_type, min_unsigned_type, ) -from cudf.utils.utils import _array_ufunc, mask_dtype +from cudf.utils.utils import _array_ufunc, _is_null_host_scalar, mask_dtype if TYPE_CHECKING: import builtins from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.strings import StringColumn if PANDAS_GE_210: NumpyExtensionArray = pd.arrays.NumpyExtensionArray @@ -93,6 +94,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } + _PANDAS_NA_REPR = str(pd.NA) + def data_array_view( self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": @@ -177,6 +180,17 @@ def __repr__(self): f"dtype: {self.dtype}" ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. 
+ + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + if self.has_nulls(): + return self.astype("str").fillna(self._PANDAS_NA_REPR) + return self + def to_pandas( self, *, @@ -240,8 +254,12 @@ def find_and_replace( def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self: plc_column = plc.replace.clamp( self.to_pylibcudf(mode="read"), - cudf.Scalar(lo, self.dtype).device_value.c_value, - cudf.Scalar(hi, self.dtype).device_value.c_value, + plc.interop.from_arrow( + pa.scalar(lo, type=cudf_dtype_to_pa_type(self.dtype)) + ), + plc.interop.from_arrow( + pa.scalar(hi, type=cudf_dtype_to_pa_type(self.dtype)) + ), ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] @@ -777,9 +795,7 @@ def fillna( if not self.has_nulls(include_nan=True): return self.copy() elif method is None: - if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar( - fill_value - ): + if is_scalar(fill_value) and _is_null_host_scalar(fill_value): return self.copy() else: fill_value = self._validate_fillna_value(fill_value) @@ -859,7 +875,7 @@ def indices_of( value = as_column(value, dtype=self.dtype, length=1) mask = value.contains(self) return apply_boolean_mask( # type: ignore[return-value] - [as_column(range(0, len(self)), dtype=size_type_dtype)], mask + [as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask )[0] def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: @@ -939,7 +955,7 @@ def take( # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. if indices.dtype.kind not in {"u", "i"}: - indices = indices.astype(libcudf.types.size_type_dtype) + indices = indices.astype(SIZE_TYPE_DTYPE) GatherMap(indices, len(self), nullify=not check_bounds or nullify) gathered = copying.gather([self], indices, nullify=nullify) # type: ignore[arg-type] return gathered[0]._with_type_metadata(self.dtype) # type: ignore[return-value] @@ -1018,7 +1034,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: # https://github.com/rapidsai/cudf/issues/14515 by # providing a mode in which cudf::contains does not mask # the result. 
- result = result.fillna(cudf.Scalar(rhs.null_count > 0)) + result = result.fillna(rhs.null_count > 0) return result def as_mask(self) -> Buffer: @@ -1728,9 +1744,7 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), + as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1739,21 +1753,16 @@ def column_empty( cudf.core.column.NumericalColumn( data=as_buffer( rmm.DeviceBuffer( - size=row_count - * cudf.dtype(libcudf.types.size_type_dtype).itemsize + size=row_count * cudf.dtype(SIZE_TYPE_DTYPE).itemsize ) ), size=None, - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ) elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = as_buffer(rmm.DeviceBuffer(size=0)) - children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), - ) + children = (as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE),) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -1984,12 +1993,12 @@ def as_column( column = Column.from_pylibcudf( plc.filling.sequence( len(arbitrary), - as_device_scalar( - arbitrary.start, dtype=np.dtype(np.int64) - ).c_value, - as_device_scalar( - arbitrary.step, dtype=np.dtype(np.int64) - ).c_value, + plc.interop.from_arrow( + pa.scalar(arbitrary.start, type=pa.int64()) + ), + plc.interop.from_arrow( + pa.scalar(arbitrary.step, type=pa.int64()) + ), ) ) if cudf.get_option("default_integer_bitwidth") and dtype is None: @@ -2537,10 +2546,9 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: ) newsize = sum(map(len, objs)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"Result of concat cannot have " f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: return column_empty(0, head.dtype) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b6a4122ebb9..1bde7d27700 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -212,6 +212,8 @@ class DatetimeColumn(column.ColumnBase): "__rsub__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, @@ -351,8 +353,8 @@ def is_year_end(self) -> ColumnBase: day_of_year = self.day_of_year leap_dates = self.is_leap_year - leap = day_of_year == cudf.Scalar(366) - non_leap = day_of_year == cudf.Scalar(365) + leap = day_of_year == 366 + non_leap = day_of_year == 365 return leap.copy_if_else(non_leap, leap_dates).fillna(False) @property diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 3d9440cdf21..04b4003c510 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column @@ -22,12 +21,14 @@ from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype from cudf.core.missing import NA +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from collections.abc import Sequence from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class ListColumn(ColumnBase): @@ -67,6 +68,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + # TODO: handle if self.has_nulls(): case + return self + @cached_property def memory_usage(self): n = super().memory_usage @@ -236,7 +247,7 @@ def from_sequences( # Build Data, Mask & Offsets for data in arbitrary: - if cudf._lib.scalar._is_null_host_scalar(data): + if cudf.utils.utils._is_null_host_scalar(data): mask_col.append(False) offset_vals.append(offset) else: @@ -247,7 +258,7 @@ def from_sequences( offset_col = cast( NumericalColumn, - column.as_column(offset_vals, dtype=size_type_dtype), + column.as_column(offset_vals, dtype=SIZE_TYPE_DTYPE), ) # Build ListColumn @@ -274,7 +285,7 @@ def as_string_column(self) -> cudf.core.column.StringColumn: with acquire_spill_lock(): plc_column = plc.strings.convert.convert_lists.format_list_column( lc.to_pylibcudf(mode="read"), - cudf.Scalar("None").device_value.c_value, + plc.interop.from_arrow(pa.scalar("None")), separators.to_pylibcudf(mode="read"), ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] @@ -380,20 +391,20 @@ def extract_element_column(self, index: ColumnBase) -> ColumnBase: ) @acquire_spill_lock() - def contains_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + def contains_scalar(self, search_key: pa.Scalar) -> ColumnBase: return type(self).from_pylibcudf( plc.lists.contains( self.to_pylibcudf(mode="read"), - search_key.device_value.c_value, + plc.interop.from_arrow(search_key), ) ) @acquire_spill_lock() - def index_of_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + def index_of_scalar(self, search_key: pa.Scalar) -> ColumnBase: return type(self).from_pylibcudf( plc.lists.index_of( self.to_pylibcudf(mode="read"), - search_key.device_value.c_value, + plc.interop.from_arrow(search_key), plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -558,7 +569,7 @@ def contains(self, search_key: ScalarLike) -> ParentType: dtype: bool """ return self._return_or_inplace( - self._column.contains_scalar(cudf.Scalar(search_key)) + self._column.contains_scalar(pa.scalar(search_key)) ) def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: @@ -607,7 +618,7 @@ def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: """ if is_scalar(search_key): - result = self._column.index_of_scalar(cudf.Scalar(search_key)) + result = self._column.index_of_scalar(pa.scalar(search_key)) else: result = self._column.index_of_column(as_column(search_key)) return self._return_or_inplace(result) diff --git a/python/cudf/cudf/core/column/numerical.py 
b/python/cudf/cudf/core/column/numerical.py index 4405e153b0c..70103745926 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from numba.np import numpy_support from typing_extensions import Self @@ -151,7 +152,7 @@ def __setitem__(self, key: Any, value: Any): cudf.Scalar( value, dtype=self.dtype - if cudf._lib.scalar._is_null_host_scalar(value) + if cudf.utils.utils._is_null_host_scalar(value) else None, ) if is_scalar(value) @@ -382,12 +383,8 @@ def as_string_column(self) -> cudf.core.column.StringColumn: elif self.dtype.kind == "b": conv_func = functools.partial( plc.strings.convert.convert_booleans.from_booleans, - true_string=cudf.Scalar( - "True", dtype="str" - ).device_value.c_value, - false_string=cudf.Scalar( - "False", dtype="str" - ).device_value.c_value, + true_string=plc.interop.from_arrow(pa.scalar("True")), + false_string=plc.interop.from_arrow(pa.scalar("False")), ) elif self.dtype.kind in {"i", "u"}: conv_func = plc.strings.convert.convert_integers.from_integers @@ -789,7 +786,7 @@ def _normalize_find_and_replace_input( ) # Scalar case if len(col_to_normalize) == 1: - if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]): + if cudf.utils.utils._is_null_host_scalar(col_to_normalize[0]): return normalized_column.astype(input_column_dtype) if np.isinf(col_to_normalize[0]): return normalized_column diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fcdcb789f23..2bee85cb387 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
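numerical.py applies the same recipe to keyword arguments: the boolean-to-string converter now binds its true/false strings as pylibcudf scalars built from pyarrow. A self-contained sketch of the rebound converter as it appears in the hunk above:

import functools

import pyarrow as pa
import pylibcudf as plc

conv_func = functools.partial(
    plc.strings.convert.convert_booleans.from_booleans,
    true_string=plc.interop.from_arrow(pa.scalar("True")),
    false_string=plc.interop.from_arrow(pa.scalar("False")),
)
# Applying conv_func to a boolean pylibcudf column then yields a string
# column of "True"/"False" values.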
from __future__ import annotations @@ -19,16 +19,18 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + can_convert_to_column, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -302,8 +304,10 @@ def cat(self, others=None, sep=None, na_rep=None): with acquire_spill_lock(): plc_column = plc.strings.combine.join_strings( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(na_rep, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow( + pa.scalar(na_rep, type=pa.string()) + ), ) data = Column.from_pylibcudf(plc_column) else: @@ -359,8 +363,10 @@ def cat(self, others=None, sep=None, na_rep=None): ) ] ), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(na_rep, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow( + pa.scalar(na_rep, type=pa.string()) + ), ) data = Column.from_pylibcudf(plc_column) @@ -522,11 +528,9 @@ def join( with acquire_spill_lock(): plc_column = plc.strings.combine.join_list_elements( strings_column.to_pylibcudf(mode="read"), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(string_na_rep).device_value.c_value, - cudf._lib.scalar.DeviceScalar( - "", cudf.dtype("object") - ).c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow(pa.scalar(string_na_rep)), + plc.interop.from_arrow(pa.scalar("")), plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -547,8 +551,8 @@ def join( plc_column = plc.strings.combine.join_list_elements( strings_column.to_pylibcudf(mode="read"), sep_column.to_pylibcudf(mode="read"), - cudf.Scalar(sep_na_rep).device_value.c_value, - cudf.Scalar(string_na_rep).device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep_na_rep)), + plc.interop.from_arrow(pa.scalar(string_na_rep)), plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -800,14 +804,14 @@ def contains( else: if case is False: input_column = self.lower()._column # type: ignore[union-attr] - plc_pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore[union-attr] + pat_normed = pat.lower() # type: ignore[union-attr] else: input_column = self._column - plc_pat = cudf.Scalar(pat, dtype="str") + pat_normed = pat with acquire_spill_lock(): plc_result = plc.strings.find.contains( input_column.to_pylibcudf(mode="read"), - plc_pat.device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat_normed)), ) result_col = Column.from_pylibcudf(plc_result) else: @@ -892,8 +896,8 @@ def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: with acquire_spill_lock(): plc_result = plc.strings.contains.like( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(pat, "str").device_value.c_value, - cudf.Scalar(esc, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat)), + 
plc.interop.from_arrow(pa.scalar(esc)), ) result = Column.from_pylibcudf(plc_result) @@ -1071,14 +1075,14 @@ def replace( plc.strings.regex_program.RegexProgram.create( pat, plc.strings.regex_flags.RegexFlags.DEFAULT ), - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl)), n, ) else: plc_result = plc.strings.replace.replace( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(pat).device_value.c_value, - cudf.Scalar(repl).device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat)), + plc.interop.from_arrow(pa.scalar(repl)), n, ) result = Column.from_pylibcudf(plc_result) @@ -1194,13 +1198,13 @@ def slice( 2 cm dtype: object """ - param_dtype = np.dtype(np.int32) + param_dtype = pa.int32() with acquire_spill_lock(): plc_result = plc.strings.slice.slice_strings( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(start, param_dtype).device_value.c_value, - cudf.Scalar(stop, param_dtype).device_value.c_value, - cudf.Scalar(step, param_dtype).device_value.c_value, + plc.interop.from_arrow(pa.scalar(start, param_dtype)), + plc.interop.from_arrow(pa.scalar(stop, param_dtype)), + plc.interop.from_arrow(pa.scalar(step, param_dtype)), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -2174,7 +2178,7 @@ def filter_alphanum( plc.strings.char_types.StringCharacterTypes.ALL_TYPES if keep else plc.strings.char_types.StringCharacterTypes.ALPHANUM, - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), plc.strings.char_types.StringCharacterTypes.ALPHANUM if keep else plc.strings.char_types.StringCharacterTypes.ALL_TYPES, @@ -2318,7 +2322,7 @@ def slice_replace( with acquire_spill_lock(): plc_result = plc.strings.replace.replace_slice( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), start, stop, ) @@ -2499,7 +2503,7 @@ def get_json_object( with acquire_spill_lock(): plc_result = plc.json.get_json_object( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(json_path, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(json_path)), options, ) result = Column.from_pylibcudf(plc_result) @@ -2657,7 +2661,12 @@ def split( if regex is True: data = self._column.split_re(pat, n) else: - data = self._column.split(cudf.Scalar(pat, "str"), n) + data = self._column.split( + plc.interop.from_arrow( + pa.scalar(pat, type=pa.string()) + ), + n, + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ -2667,7 +2676,7 @@ def split( result_table = self._column.split_record_re(pat, n) else: result_table = self._column.split_record( - cudf.Scalar(pat, "str"), n + plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2829,7 +2838,12 @@ def rsplit( if regex is True: data = self._column.rsplit_re(pat, n) else: - data = self._column.rsplit(cudf.Scalar(pat, "str"), n) + data = self._column.rsplit( + plc.interop.from_arrow( + pa.scalar(pat, type=pa.string()) + ), + n, + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ -2839,7 +2853,7 @@ def rsplit( result_table = self._column.rsplit_record_re(pat, n) else: result_table = self._column.rsplit_record( - cudf.Scalar(pat, "str"), n + plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2924,7 +2938,9 @@ def 
partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - self._column.partition(cudf.Scalar(sep, "str")), + self._column.partition( + plc.interop.from_arrow(pa.scalar(sep, type=pa.string())) + ), expand=expand, ) @@ -2989,7 +3005,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - self._column.rpartition(cudf.Scalar(sep, "str")), + self._column.rpartition( + plc.interop.from_arrow(pa.scalar(sep, type=pa.string())) + ), expand=expand, ) @@ -3303,7 +3321,7 @@ def _strip( plc_result = plc.strings.strip.strip( self._column.to_pylibcudf(mode="read"), side, - cudf.Scalar(to_strip, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(to_strip, type=pa.string())), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -3920,7 +3938,7 @@ def _starts_ends_with( f"{type(pat).__name__}" ) elif is_scalar(pat): - plc_pat = cudf.Scalar(pat, "str").device_value.c_value + plc_pat = plc.interop.from_arrow(pa.scalar(pat, type=pa.string())) else: plc_pat = column.as_column(pat, dtype="str").to_pylibcudf( mode="read" @@ -4120,7 +4138,7 @@ def _find( with acquire_spill_lock(): plc_result = method( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(sub, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sub, type=pa.string())), start, end, ) @@ -4603,7 +4621,7 @@ def filter_characters( plc.strings.translate.FilterType.KEEP if keep else plc.strings.translate.FilterType.REMOVE, - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -4710,10 +4728,10 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: if isinstance(delim, Column): result = self._return_or_inplace( - self._column.tokenize_column(delim), + self._column.tokenize_column(delim), # type: ignore[arg-type] retain_index=False, ) - elif isinstance(delim, cudf.Scalar): + elif isinstance(delim, plc.Scalar): result = self._return_or_inplace( self._column.tokenize_scalar(delim), retain_index=False, @@ -4851,10 +4869,10 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) if isinstance(delim, Column): return self._return_or_inplace( - self._column.count_tokens_column(delim) + self._column.count_tokens_column(delim) # type: ignore[arg-type] ) - elif isinstance(delim, cudf.Scalar): + elif isinstance(delim, plc.Scalar): return self._return_or_inplace( self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) @@ -5112,7 +5130,7 @@ def replace_tokens( self._column.replace_tokens( targets_column, # type: ignore[arg-type] replacements_column, # type: ignore[arg-type] - cudf.Scalar(delimiter, dtype="str"), + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), ), ) @@ -5181,8 +5199,10 @@ def filter_tokens( return self._return_or_inplace( self._column.filter_tokens( min_token_length, - cudf.Scalar(replacement, dtype="str"), - cudf.Scalar(delimiter, dtype="str"), + plc.interop.from_arrow( + pa.scalar(replacement, type=pa.string()) + ), + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), ), ) @@ -5501,12 +5521,12 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: def _massage_string_arg( value, name, allow_col: bool = False -) -> StringColumn | cudf.Scalar: +) -> StringColumn | plc.Scalar: if 
isinstance(value, cudf.Scalar): return value if isinstance(value, str): - return cudf.Scalar(value, dtype="str") + return plc.interop.from_arrow(pa.scalar(value, type=pa.string())) allowed_types = ["Scalar"] @@ -5593,7 +5613,7 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: offsets = column.as_column( - 0, length=size + 1, dtype=size_type_dtype + 0, length=size + 1, dtype=SIZE_TYPE_DTYPE ) children = (offsets,) @@ -5747,8 +5767,8 @@ def sum( with acquire_spill_lock(): plc_column = plc.strings.combine.join_strings( result_col.to_pylibcudf(mode="read"), - cudf.Scalar("").device_value.c_value, - cudf.Scalar(None, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow(pa.scalar(None, type=pa.string())), ) return Column.from_pylibcudf(plc_column).element_indexing(0) else: @@ -5766,7 +5786,7 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: self.to_pylibcudf(mode="read") ) result = Column.from_pylibcudf(plc_column) - return (result > cudf.Scalar(0, dtype="int8")).fillna(False) + return (result > np.int8(0)).fillna(False) elif out_dtype.kind in {"i", "u"}: if not self.is_integer().all(): raise ValueError( @@ -5870,7 +5890,7 @@ def as_decimal_column( ) -> cudf.core.column.DecimalBaseColumn: plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( self.to_pylibcudf(mode="read"), - libcudf.types.dtype_to_pylibcudf_type(dtype), + dtype_to_pylibcudf_type(dtype), ) result = Column.from_pylibcudf(plc_column) result.dtype.precision = dtype.precision # type: ignore[union-attr] @@ -6033,8 +6053,10 @@ def _binaryop( rhs.to_pylibcudf(mode="read"), ] ), - cudf.Scalar("").device_value.c_value, - cudf.Scalar(None, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow( + pa.scalar(None, type=pa.string()) + ), ) return Column.from_pylibcudf(plc_column) elif op in { @@ -6120,11 +6142,11 @@ def jaccard_index(self, other: Self, width: int) -> NumericalColumn: return type(self).from_pylibcudf(result) # type: ignore[return-value] @acquire_spill_lock() - def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + def generate_ngrams(self, ngrams: int, separator: plc.Scalar) -> Self: result = plc.nvtext.generate_ngrams.generate_ngrams( self.to_pylibcudf(mode="read"), ngrams, - separator.device_value.c_value, + separator, ) return type(self).from_pylibcudf(result) # type: ignore[return-value] @@ -6160,13 +6182,13 @@ def edit_distance_matrix(self) -> ListColumn: def byte_pair_encoding( self, merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, - separator: cudf.Scalar, + separator: str, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.byte_pair_encode.byte_pair_encoding( self.to_pylibcudf(mode="read"), merge_pairs, - separator.device_value.c_value, + plc.interop.from_arrow(pa.scalar(separator)), ) ) @@ -6174,15 +6196,15 @@ def byte_pair_encoding( def ngrams_tokenize( self, ngrams: int, - delimiter: cudf.Scalar, - separator: cudf.Scalar, + delimiter: plc.Scalar, + separator: plc.Scalar, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.ngrams_tokenize.ngrams_tokenize( self.to_pylibcudf(mode="read"), ngrams, - delimiter.device_value.c_value, - separator.device_value.c_value, + delimiter, + separator, ) ) @@ -6205,14 +6227,14 @@ def normalize_characters(self, do_lower: bool = True) -> Self: @acquire_spill_lock() def replace_tokens( - self, targets: Self, replacements: Self, delimiter: cudf.Scalar + self, 
targets: Self, replacements: Self, delimiter: plc.Scalar ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.replace.replace_tokens( self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read"), replacements.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, ) ) @@ -6220,15 +6242,15 @@ def replace_tokens( def filter_tokens( self, min_token_length: int, - replacement: cudf.Scalar, - delimiter: cudf.Scalar, + replacement: plc.Scalar, + delimiter: plc.Scalar, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.replace.filter_tokens( self.to_pylibcudf(mode="read"), min_token_length, - replacement.device_value.c_value, - delimiter.device_value.c_value, + replacement, + delimiter, ) ) @@ -6279,10 +6301,10 @@ def subword_tokenize( return tokens, masks, metadata @acquire_spill_lock() - def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + def tokenize_scalar(self, delimiter: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.tokenize_scalar( - self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + self.to_pylibcudf(mode="read"), delimiter ) ) @@ -6296,10 +6318,10 @@ def tokenize_column(self, delimiters: Self) -> Self: ) @acquire_spill_lock() - def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + def count_tokens_scalar(self, delimiter: plc.Scalar) -> NumericalColumn: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.count_tokens_scalar( - self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + self.to_pylibcudf(mode="read"), delimiter ) ) @@ -6324,25 +6346,25 @@ def character_tokenize(self) -> Self: def tokenize_with_vocabulary( self, vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, - delimiter: cudf.Scalar, + delimiter: str, default_id: int, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.tokenize_with_vocabulary( self.to_pylibcudf(mode="read"), vocabulary, - delimiter.device_value.c_value, + plc.interop.from_arrow(pa.scalar(delimiter)), default_id, ) ) @acquire_spill_lock() - def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: + def detokenize(self, indices: ColumnBase, separator: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.detokenize( self.to_pylibcudf(mode="read"), indices.to_pylibcudf(mode="read"), - separator.device_value.c_value, + separator, ) ) @@ -6491,23 +6513,23 @@ def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: @acquire_spill_lock() def _split_record( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, maxsplit: int, method: Callable[[plc.Column, plc.Scalar, int], plc.Column], ) -> Self: plc_column = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, maxsplit, ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] - def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + def split_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self: return self._split_record( delimiter, maxsplit, plc.strings.split.split.split_record ) - def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + def rsplit_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self: return self._split_record( delimiter, maxsplit, plc.strings.split.split.rsplit_record ) @@ -6515,13 +6537,13 @@ def rsplit_record(self, delimiter: cudf.Scalar, 
maxsplit: int) -> Self: @acquire_spill_lock() def _split( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, maxsplit: int, method: Callable[[plc.Column, plc.Scalar, int], plc.Column], ) -> dict[int, Self]: plc_table = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, maxsplit, ) return dict( @@ -6531,21 +6553,21 @@ ) ) - def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + def split(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]: return self._split(delimiter, maxsplit, plc.strings.split.split.split) - def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + def rsplit(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]: return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit) @acquire_spill_lock() def _partition( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, method: Callable[[plc.Column, plc.Scalar], plc.Column], ) -> dict[int, Self]: plc_table = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, ) return dict( enumerate( @@ -6554,12 +6576,12 @@ ) ) - def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + def partition(self, delimiter: plc.Scalar) -> dict[int, Self]: return self._partition( delimiter, plc.strings.split.partition.partition ) - def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + def rpartition(self, delimiter: plc.Scalar) -> dict[int, Self]: return self._partition( delimiter, plc.strings.split.partition.rpartition ) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index ba765b50729..052a68cec98 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from functools import cached_property @@ -18,6 +18,7 @@ from cudf._typing import Dtype from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class StructColumn(ColumnBase): @@ -51,6 +52,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types) = str(pd.NA) + """ + # TODO: handle the self.has_nulls() case + return self + @staticmethod def _validate_dtype_instance(dtype: StructDtype) -> StructDtype: # IntervalDtype is a subclass of StructDtype, so compare types exactly diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 749ab8e837a..302178ea277 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -81,6 +81,8 @@ class TimeDeltaColumn(ColumnBase): "__rfloordiv__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 4b6ad59c8e1..aaaf6c7ee4f 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,11 +1,11 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION.
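copy_types.py is the first of several files below that swap cudf._lib.types.size_type_dtype for cudf.utils.dtypes.SIZE_TYPE_DTYPE. The constant names libcudf's signed 32-bit size_type; a sketch of what the new symbol denotes (the definition shown is an assumption inferred from how the hunks use it, not copied from the PR):

import numpy as np

# libcudf counts rows and offsets with a signed 32-bit size_type, so the
# Python-level constant is simply the matching NumPy dtype (assumed):
SIZE_TYPE_DTYPE = np.dtype("int32")

# This is why concat_columns and _one_hot_encode_column can bound result
# sizes with np.iinfo(SIZE_TYPE_DTYPE).max == 2**31 - 1.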
from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast from typing_extensions import Self import cudf -from cudf._lib.types import size_type_dtype +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from cudf.core.column import NumericalColumn @@ -63,7 +63,7 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): # Alternately we can have an Optional[Column] and handle None # specially in _gather. self.column = cast( - "NumericalColumn", self.column.astype(size_type_dtype) + "NumericalColumn", self.column.astype(SIZE_TYPE_DTYPE) ) else: if self.column.dtype.kind not in {"i", "u"}: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3334b57ce1b..5cea35ac0d6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -92,7 +92,11 @@ min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api +from cudf.utils.utils import ( + GetAttrGetItemMixin, + _external_only_api, + _is_null_host_scalar, +) if TYPE_CHECKING: from cudf._typing import ColumnLike, Dtype, NotImplementedType @@ -1890,7 +1894,7 @@ def astype( dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) - def _clean_renderable_dataframe(self, output): + def _clean_renderable_dataframe(self, output: Self) -> str: """ This method takes in partial/preprocessed dataframe and returns correct representation of it with correct @@ -1925,41 +1929,7 @@ def _clean_renderable_dataframe(self, output): ) return "\n".join(lines) - def _clean_nulls_from_dataframe(self, df): - """ - This function converts all ``null`` values to ```` for - representation as a string in `__repr__`. - - Since we utilize Pandas `__repr__` at all places in our code - for formatting purposes, we convert columns to `str` dtype for - filling with `` values. - """ - for col in df._data: - if isinstance( - df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype) - ): - # TODO we need to handle this - pass - elif df._data[col].has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance( - df._data[col], - ( - cudf.core.column.DatetimeColumn, - cudf.core.column.TimeDeltaColumn, - ), - ) - else str(cudf.NA) - ) - - df[col] = df._data[col].astype("str").fillna(fill_value) - else: - df[col] = df._data[col] - - return df - - def _get_renderable_dataframe(self): + def _get_renderable_dataframe(self) -> Self: """ Takes rows and columns from pandas settings or estimation from size. pulls quadrants based off of some known parameters then style for @@ -1967,9 +1937,9 @@ def _get_renderable_dataframe(self): for printing with the dataframe. 
""" max_rows = pd.options.display.max_rows - nrows = np.max([len(self) if max_rows is None else max_rows, 1]) - if pd.options.display.max_rows == 0: - nrows = len(self) + if max_rows in {0, None}: + max_rows = len(self) + nrows = max(max_rows, 1) ncols = ( pd.options.display.max_columns if pd.options.display.max_columns @@ -1977,7 +1947,7 @@ def _get_renderable_dataframe(self): ) if len(self) <= nrows and self._num_columns <= ncols: - output = self.copy(deep=False) + output = self elif self.empty and len(self.index) > 0: max_seq_items = pd.options.display.max_seq_items # In case of Empty DataFrame with index, Pandas prints @@ -2037,10 +2007,7 @@ def _get_renderable_dataframe(self): lower = cudf.concat([lower_left, lower_right], axis=1) output = cudf.concat([upper, lower]) - output = self._clean_nulls_from_dataframe(output) - output.index = output.index._clean_nulls_from_index() - - return output + return output._pandas_repr_compatible() @_performance_tracking def __repr__(self): @@ -3371,7 +3338,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if isinstance(value, (np.ndarray, cupy.ndarray)): dtype = value.dtype value = value.item() - if libcudf.scalar._is_null_host_scalar(value): + if _is_null_host_scalar(value): dtype = "str" value = as_column( value, @@ -6262,10 +6229,8 @@ def isin(self, values): # TODO: propagate nulls through isin # https://github.com/rapidsai/cudf/issues/7556 - fill_value = cudf.Scalar(False) - def make_false_column_like_self(): - return column.as_column(fill_value, length=len(self), dtype="bool") + return column.as_column(False, length=len(self), dtype="bool") # Preprocess different input types into a mapping from column names to # a list of values to check. diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 8ed233ba737..ce7fb968069 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import decimal @@ -57,7 +57,8 @@ def dtype(arbitrary): if np_dtype.kind in set("OU"): return np.dtype("object") elif ( - np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES + np_dtype + not in cudf.utils.dtypes.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES ): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8f45c6f0115..abf9f7b3686 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -820,6 +820,13 @@ def fillna( inplace=inplace, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + columns = (col._prep_pandas_compat_repr() for col in self._columns) + return self._from_data_like_self( + self._data._from_columns_like_self(columns, verify=False) + ) + @_performance_tracking def _drop_column( self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6ae524d6346..7bc4b08fc49 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -14,13 +14,13 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa import pylibcudf as plc import cudf import cudf.core._internals from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( is_list_like, @@ -45,6 +45,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, cudf_dtype_to_pa_type from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -586,7 +587,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: offsets, group_keys, (indices,) = self._groups( [ cudf.core.column.as_column( - range(len(self.obj)), dtype=size_type_dtype + range(len(self.obj)), dtype=SIZE_TYPE_DTYPE ) ] ) @@ -852,7 +853,9 @@ def _shift( plc.table.Table([col.to_pylibcudf(mode="read") for col in values]), [periods] * len(values), [ - cudf.Scalar(val, dtype=col.dtype).device_value.c_value + plc.interop.from_arrow( + pa.scalar(val, type=cudf_dtype_to_pa_type(col.dtype)) + ) for val, col in zip(fill_values, values) ], ) @@ -1181,7 +1184,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # aggregation scheme in libcudf. This is probably "fast # enough" for most reasonable input sizes. _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) @@ -1195,7 +1198,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): group_offsets = group_offsets[:-1] else: group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype) + to_take = np.arange(size_per_group.sum(), dtype=SIZE_TYPE_DTYPE) fixup = np.empty_like(size_per_group) fixup[0] = 0 np.cumsum(size_per_group[:-1], out=fixup[1:]) @@ -1496,11 +1499,11 @@ def sample( # into a numpy array directly, rather than a list. # TODO: this uses the sort-based groupby, could one use hash-based? 
_, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( - size_type_dtype.type(n), size_per_group.shape + SIZE_TYPE_DTYPE.type(n), size_per_group.shape ) if not replace and (minsize := size_per_group.min()) < n: raise ValueError( @@ -1513,7 +1516,7 @@ def sample( # which is round-to-nearest, ties to sgn(x) * inf). samples_per_group = np.round( size_per_group * frac, decimals=0 - ).astype(size_type_dtype) + ).astype(SIZE_TYPE_DTYPE) if replace: # We would prefer to use cupy here, but their rng.integers # interface doesn't take array-based low and high @@ -1521,7 +1524,7 @@ def sample( low = 0 high = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) - indices = rng.integers(low, high, dtype=size_type_dtype) + indices = rng.integers(low, high, dtype=SIZE_TYPE_DTYPE) indices += np.repeat(group_offsets[:-1], samples_per_group) else: # Approach: do a segmented argsort of the index array and take @@ -1529,7 +1532,7 @@ def sample( # We will shuffle the group indices and then pick them out # from the grouped dataframe index. nrows = len(group_values) - indices = cp.arange(nrows, dtype=size_type_dtype) + indices = cp.arange(nrows, dtype=SIZE_TYPE_DTYPE) if len(size_per_group) < 500: # Empirically shuffling with cupy is faster at this scale rs = cp.random.get_random_state() @@ -1553,7 +1556,7 @@ def sample( indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? - want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) + want = np.arange(samples_per_group.sum(), dtype=SIZE_TYPE_DTYPE) scan = np.empty_like(samples_per_group) scan[0] = 0 np.cumsum(samples_per_group[:-1], out=scan[1:]) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 85be8d21d27..0d1bf552982 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -53,6 +52,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, @@ -339,7 +339,7 @@ def _values(self) -> ColumnBase: else: return column.column_empty(0, dtype=self.dtype) - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self def _is_numeric(self) -> bool: @@ -1002,7 +1002,7 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [self._range.index(value)] except ValueError: i = [] - return as_column(i, dtype=size_type_dtype) + return as_column(i, dtype=SIZE_TYPE_DTYPE) def isin(self, values, level=None): if level is not None and level > 0: @@ -1127,15 +1127,9 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: out.name = name return out - @classmethod @_performance_tracking - def _from_data_like_self( - cls, data: MutableMapping, name: Any = no_default - ) -> Self: - out = _index_from_data(data, name) - if name is not no_default: - out.name = name - return out + def _from_data_like_self(self, data: 
MutableMapping) -> Self: + return _index_from_data(data, self.name) @classmethod @_performance_tracking @@ -1354,7 +1348,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = as_column( -1, length=len(needle), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): @@ -1494,7 +1488,7 @@ def __repr__(self) -> str: if isinstance(self._values, StringColumn): output = repr(self.to_pandas(nullable=True)) else: - output = repr(self._clean_nulls_from_index().to_pandas()) + output = repr(self._pandas_repr_compatible().to_pandas()) # We should remove all the single quotes # from the output due to the type-cast to # object dtype happening above. @@ -1650,20 +1644,6 @@ def __contains__(self, item) -> bool: hash(item) return item in self._column - def _clean_nulls_from_index(self) -> Index: - if self._values.has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance(self, (DatetimeIndex, TimedeltaIndex)) - else str(cudf.NA) - ) - return Index._from_column( - self._column.astype("str").fillna(fill_value), - name=self.name, - ) - - return self - def any(self) -> bool: return self._column.any() @@ -2347,8 +2327,7 @@ def microsecond(self) -> Index: # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this # __mul__ returns an int16 column. - self._column.millisecond.astype("int32") - * cudf.Scalar(1000, dtype="int32") + self._column.millisecond.astype("int32") * np.int32(1000) ) + self._column.microsecond, name=self.name, @@ -3615,7 +3594,7 @@ def _is_interval(self) -> bool: def _is_boolean(self) -> bool: return False - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self @property diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6854cb02aa5..4c6f8a9c152 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. """Base class for Frame types that have an index.""" from __future__ import annotations @@ -60,6 +60,7 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring +from cudf.utils.dtypes import SIZE_TYPE_DTYPE from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf @@ -2836,16 +2837,22 @@ def hash_values( Parameters ---------- - method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3' + method : {'murmur3', 'xxhash32', 'xxhash64', 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'}, default 'murmur3' Hash function to use: * murmur3: MurmurHash3 hash function - * md5: MD5 hash function + * xxhash32: xxHash32 hash function * xxhash64: xxHash64 hash function + * md5: MD5 hash function + * sha1: SHA-1 hash function + * sha224: SHA-224 hash function + * sha256: SHA-256 hash function + * sha384: SHA-384 hash function + * sha512: SHA-512 hash function seed : int, optional Seed value to use for the hash function. This parameter is only - supported for 'murmur3' and 'xxhash64'. + supported for 'murmur3', 'xxhash32', and 'xxhash64'. 
Returns @@ -2900,7 +2907,7 @@ def hash_values( 2 fe061786ea286a515b772d91b0dfcd70 dtype: object """ - seed_hash_methods = {"murmur3", "xxhash64"} + seed_hash_methods = {"murmur3", "xxhash32", "xxhash64"} if seed is None: seed = 0 elif method not in seed_hash_methods: @@ -2914,6 +2921,8 @@ def hash_values( ) if method == "murmur3": plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed) + elif method == "xxhash32": + plc_column = plc.hashing.xxhash_32(plc_table, seed) elif method == "xxhash64": plc_column = plc.hashing.xxhash_64(plc_table, seed) elif method == "md5": @@ -3026,7 +3035,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: NumericalColumn, as_column( range(start, stop, stride), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ), len(self), @@ -3247,7 +3256,7 @@ def duplicated( ) distinct = libcudf.column.Column.from_pylibcudf(plc_column) result = copying.scatter( - [cudf.Scalar(False, dtype=bool)], + [cudf.Scalar(False)], distinct, [as_column(True, length=len(self), dtype=bool)], bounds_check=False, @@ -4402,6 +4411,12 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): index_names=self.index.names if keep_index else None, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + result = super()._pandas_repr_compatible() + result.index = self.index._pandas_repr_compatible() + return result + def take(self, indices, axis=0): """Return a new frame containing the rows specified by *indices*. diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6e965ceca66..ce7edc8fdbe 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import Any @@ -7,7 +7,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap @@ -17,6 +16,7 @@ _IndexIndexer, _match_join_keys, ) +from cudf.utils.dtypes import SIZE_TYPE_DTYPE class Merge: @@ -243,7 +243,7 @@ def _gather_maps(self, left_cols, right_cols): # tables, we gather from iota on both right and left, and then # sort the gather maps with those two columns as key. key_order = [ - cudf.core.column.as_column(range(n), dtype=size_type_dtype).take( + cudf.core.column.as_column(range(n), dtype=SIZE_TYPE_DTYPE).take( map_, nullify=null, check_bounds=False ) for map_, n, null in zip(maps, lengths, nullify) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1e613e49ffc..64ec099cb39 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
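A short usage sketch of the widened hash_values surface documented above (dtype expectations come from the updated tests later in this diff):

import cudf

df = cudf.DataFrame({"a": [1, 2, 3]})
h32 = df.hash_values(method="xxhash32", seed=42)  # uint32 row hashes
h64 = df.hash_values(method="xxhash64", seed=42)  # uint64 row hashes
digests = df.hash_values(method="sha256")         # hex-digest strings
# Only murmur3, xxhash32, and xxhash64 consume a seed; per the tests,
# passing one with md5/sha* methods warns and the seed is ignored.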
from __future__ import annotations @@ -17,7 +17,6 @@ import cudf import cudf._lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column @@ -34,7 +33,7 @@ ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import is_column_like +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_column_like from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -199,7 +198,7 @@ def __init__( ) if lo == -1: # Now we can gather and insert null automatically - code[code == -1] = np.iinfo(size_type_dtype).min + code[code == -1] = np.iinfo(SIZE_TYPE_DTYPE).min result_col = level._column.take(code, nullify=True) source_data[i] = result_col._with_type_metadata(level.dtype) @@ -361,6 +360,13 @@ def _from_data( name=name, ) + @_performance_tracking + def _from_data_like_self(self, data: MutableMapping) -> Self: + mi = type(self)._from_data(data, name=self.name) + if mi.nlevels == self.nlevels: + mi.names = self.names + return mi + @classmethod def _simple_new( cls, @@ -1571,11 +1577,11 @@ def droplevel(self, level=-1) -> Self | cudf.Index: def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.MultiIndex: - # cudf uses np.iinfo(size_type_dtype).min as missing code + # cudf uses np.iinfo(SIZE_TYPE_DTYPE).min as missing code # pandas uses -1 as missing code pd_codes = ( code.find_and_replace( - column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(np.iinfo(SIZE_TYPE_DTYPE).min, length=1), column.as_column(-1, length=1), ) for code in self._codes @@ -1753,16 +1759,6 @@ def nunique(self, dropna: bool = True) -> int: mi = self.dropna(how="all") if dropna else self return len(mi.unique()) - def _clean_nulls_from_index(self) -> Self: - """ - Convert all na values(if any) in MultiIndex object - to `` as a preprocessing step to `__repr__` methods. - """ - index_df = self.to_frame(index=False, name=list(range(self.nlevels))) - return MultiIndex.from_frame( - index_df._clean_nulls_from_dataframe(index_df), names=self.names - ) - @_performance_tracking def memory_usage(self, deep: bool = False) -> int: usage = sum(col.memory_usage for col in self._columns) @@ -1906,7 +1902,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = column.as_column( -1, length=len(target), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): return _return_get_indexer_result(result.values) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 0abd42d4d4e..eedd777aafe 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
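The multiindex.py hunks above keep the missing-code sentinel logic intact under the new constant name, and the sentinel translation performed by to_pandas() is small enough to sketch directly (SIZE_TYPE_DTYPE assumed to be int32, as in the sketch earlier):

import numpy as np

SIZE_TYPE_DTYPE = np.dtype("int32")            # assumed definition
cudf_missing = np.iinfo(SIZE_TYPE_DTYPE).min   # -2147483648
pandas_missing = -1
# to_pandas() find-and-replaces cudf_missing with pandas_missing in every
# codes column before handing the codes to pd.MultiIndex.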
from __future__ import annotations import itertools @@ -12,13 +12,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor -from cudf.utils.dtypes import min_unsigned_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, min_unsigned_type if TYPE_CHECKING: from cudf._typing import Dtype @@ -1333,10 +1332,10 @@ def _one_hot_encode_column( else: column = column._get_decategorized_column() # type: ignore[attr-defined] - if column.size * categories.size >= np.iinfo(size_type_dtype).max: + if column.size * categories.size >= np.iinfo(SIZE_TYPE_DTYPE).max: raise ValueError( "Size limitation exceeded: column.size * category.size < " - f"np.iinfo({size_type_dtype}).max. Consider reducing " + f"np.iinfo({SIZE_TYPE_DTYPE}).max. Consider reducing " "size of category" ) result_labels = ( diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 80dd0921f9c..7d246960cc9 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -178,13 +178,13 @@ def dtype(self): def is_valid(self): if not self._is_host_value_current: self._device_value_to_host() - return not cudf._lib.scalar._is_null_host_scalar(self._host_value) + return not cudf.utils.utils._is_null_host_scalar(self._host_value) def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): - valid = not cudf._lib.scalar._is_null_host_scalar(value) + valid = not cudf.utils.utils._is_null_host_scalar(value) if isinstance(value, list): if dtype is not None: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 49c2c8cf387..805f9f9a9f9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1449,35 +1449,16 @@ def __repr__(self): warnings.simplefilter("ignore", FutureWarning) preprocess = cudf.concat([top, bottom]) else: - preprocess = self.copy() - preprocess.index = preprocess.index._clean_nulls_from_index() - if ( - preprocess.nullable - and not isinstance( - preprocess.dtype, - ( - cudf.CategoricalDtype, - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype, - ), - ) - ) or preprocess.dtype.kind == "m": - fill_value = ( - str(cudf.NaT) - if preprocess.dtype.kind in "mM" - else str(cudf.NA) - ) - output = repr( - preprocess.astype("str").fillna(fill_value).to_pandas() - ) - elif isinstance(preprocess.dtype, cudf.CategoricalDtype): + preprocess = self + if isinstance(preprocess.dtype, cudf.CategoricalDtype): min_rows = ( height if pd.get_option("display.min_rows") == 0 else pd.get_option("display.min_rows") ) show_dimensions = pd.get_option("display.show_dimensions") + preprocess = preprocess.copy(deep=False) + preprocess.index = preprocess.index._pandas_repr_compatible() if preprocess.dtype.categories.dtype.kind == "f": pd_series = ( preprocess.astype("str") @@ -1502,7 +1483,7 @@ def __repr__(self): na_rep=str(cudf.NA), ) else: - output = repr(preprocess.to_pandas()) + output = repr(preprocess._pandas_repr_compatible().to_pandas()) lines = output.split("\n") if isinstance(preprocess.dtype, cudf.CategoricalDtype): @@ -4125,8 +4106,8 @@ def microsecond(self) -> Series: # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this 
# __mul__ returns an int16 column. - extra = self.series._column.millisecond.astype("int32") * cudf.Scalar( - 1000, dtype="int32" + extra = self.series._column.millisecond.astype("int32") * np.int32( + 1000 ) return self._return_result_like_self(micro + extra) diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index fb8b9b3131c..58dabc85491 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -42,9 +42,8 @@ def tokenize( """ if delimiter is None: delimiter = "" - delim = cudf.Scalar(delimiter, dtype="str") result = text._column.tokenize_with_vocabulary( - self.vocabulary, delim, default_id + self.vocabulary, delimiter, default_id ) return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2f8a6d9e5e7..e2c332f34f5 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,10 +1,11 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION +# Copyright (c) 2020-2025, NVIDIA CORPORATION from __future__ import annotations import warnings from typing import TYPE_CHECKING import numba +import numpy as np import pandas as pd from pandas.api.indexers import BaseIndexer @@ -273,12 +274,8 @@ def _apply_agg_column(self, source_column, agg_name): end = as_column(end, dtype="int32") idx = as_column(range(len(start))) - preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( - "int32" - ) - following_window = (end - idx - cudf.Scalar(1, "int32")).astype( - "int32" - ) + preceding_window = (idx - start + np.int32(1)).astype("int32") + following_window = (end - idx - np.int32(1)).astype("int32") window = None else: preceding_window = as_column(self.window) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 6d617cbf38e..7e8468c8e8a 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations import errno @@ -16,11 +16,13 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_hashable, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) from cudf.utils.performance_tracking import _performance_tracking _CSV_HEX_TYPE_MAP = { diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ff326e09315..16c7d189dfd 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
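Several hunks above (Index.microsecond, Series.dt.microsecond, the rolling-window bounds) replace cudf.Scalar(x, dtype="int32") operands with plain np.int32(x). The point is that a NumPy scalar keeps the int32 result dtype without materialising a device scalar; a sketch of the behaviour being relied on:

import numpy as np

import cudf

col = cudf.Series([1, 2, 3], dtype="int32")
out = col * np.int32(1000)
# The NumPy scalar operand does not widen the column dtype, matching the
# old cudf.Scalar(1000, dtype="int32") arithmetic.
assert out.dtype == np.dtype("int32")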
from __future__ import annotations import os @@ -14,10 +14,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from cudf.core.column import ColumnBase diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index f3124552fd1..0ac2950a22b 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import itertools @@ -11,11 +11,11 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock from cudf.core.index import _index_from_data from cudf.utils import ioutils +from cudf.utils.dtypes import dtype_to_pylibcudf_type try: import ujson as json # type: ignore[import-untyped] diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc new file mode 100644 index 00000000000..a0ea4fbbfc2 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc new file mode 100644 index 00000000000..8a7969cdbbb Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc differ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 11a9b398b50..f3cf8e36a5b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import array as arr import contextlib @@ -1440,6 +1440,7 @@ def test_assign_callable(mapping): "sha256", "sha384", "sha512", + "xxhash32", "xxhash64", ], ) @@ -1447,6 +1448,7 @@ def test_assign_callable(mapping): def test_dataframe_hash_values(nrows, method, seed): warning_expected = seed is not None and method not in { "murmur3", + "xxhash32", "xxhash64", } potential_warning = ( @@ -1472,6 +1474,7 @@ def test_dataframe_hash_values(nrows, method, seed): "sha256": object, "sha384": object, "sha512": object, + "xxhash32": np.uint32, "xxhash64": np.uint64, } assert out.dtype == expected_dtypes[method] @@ -1486,7 +1489,7 @@ def test_dataframe_hash_values(nrows, method, seed): assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) -@pytest.mark.parametrize("method", ["murmur3", "xxhash64"]) +@pytest.mark.parametrize("method", ["murmur3", "xxhash32", "xxhash64"]) def test_dataframe_hash_values_seed(method): gdf = cudf.DataFrame() data = np.arange(10) @@ -1500,6 +1503,34 @@ def test_dataframe_hash_values_seed(method): assert_neq(out_one, out_two) +def test_dataframe_hash_values_xxhash32(): + # xxhash32 has no built-in implementation in Python and we don't want to + # add a testing dependency, so we use regression tests against known good + # values. 
+ gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) + gdf["b"] = -gdf["a"] + out_a = gdf["a"].hash_values(method="xxhash32", seed=0) + expected_a = cudf.Series( + [3736311059, 2307980487, 2906647130, 746578903, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_a, expected_a) + + out_b = gdf["b"].hash_values(method="xxhash32", seed=42) + expected_b = cudf.Series( + [1076387279, 2261349915, 531498073, 650869264, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_b, expected_b) + + out_df = gdf.hash_values(method="xxhash32", seed=0) + expected_df = cudf.Series( + [1223721700, 2885793241, 1920811472, 1146715602, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_df, expected_df) + + def test_dataframe_hash_values_xxhash64(): # xxhash64 has no built-in implementation in Python and we don't want to # add a testing dependency, so we use regression tests against known good diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index da0aa5be6f5..b1f81edfc54 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import functools import operator @@ -14,6 +14,7 @@ from cudf.core.column.column import column_empty from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES +from cudf.utils.dtypes import cudf_dtype_to_pa_type @pytest.mark.parametrize( @@ -423,7 +424,9 @@ def test_get_ind_sequence(): def test_contains_scalar(data, scalar, expect): sr = cudf.Series(data) expect = cudf.Series(expect) - got = sr.list.contains(cudf.Scalar(scalar, sr.dtype.element_type)) + got = sr.list.contains( + pa.scalar(scalar, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) assert_eq(expect, got) @@ -455,7 +458,9 @@ def test_contains_scalar(data, scalar, expect): def test_contains_null_search_key(data, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="bool") - got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type)) + got = sr.list.contains( + pa.scalar(None, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) assert_eq(expect, got) @@ -518,12 +523,12 @@ def test_contains_invalid(data, scalar): ), ( [["d", None, "e"], [None, "f"], []], - cudf.Scalar(cudf.NA, "O"), + pa.scalar(None, type=pa.string()), [None, None, None], ), ( [None, [10, 9, 8], [5, 8, None]], - cudf.Scalar(cudf.NA, "int64"), + pa.scalar(None, type=pa.int64()), [None, None, None], ), ], @@ -532,7 +537,11 @@ def test_index(data, search_key, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="int32") if is_scalar(search_key): - got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type)) + got = sr.list.index( + pa.scalar( + search_key, type=cudf_dtype_to_pa_type(sr.dtype.element_type) + ) + ) else: got = sr.list.index( cudf.Series(search_key, dtype=sr.dtype.element_type) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index c4b4ef60184..fe143e66407 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
import datetime import decimal @@ -1970,3 +1970,25 @@ def test_row_group_alignment(datadir): got = cudf.read_orc(buffer) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "inputfile", + [ + "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc", + "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc", + ], +) +def test_orc_reader_desynced_timestamp(datadir, inputfile): + # Test a special case where the DATA stream (second) in a TIMESTAMP column + # is progressed faster than the SECONDARY stream (nanosecond) at the start of a row + # group. In this case, the "run cache manager" in the decoder kernel is used to + # orchestrate the dual-stream processing. + # For more information, see https://github.com/rapidsai/cudf/issues/17155. + + path = datadir / inputfile + + expect = pd.read_orc(path) + got = cudf.read_orc(path) + + assert_frame_equal(cudf.from_pandas(expect), got) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index bf0c97adb00..2cb742727cc 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import textwrap @@ -618,9 +618,9 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), textwrap.dedent( """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 0 days 00:00:00.003000000 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:00.000200 + 2 0 days 00:00:00.003000 dtype: timedelta64[ns] """ ), @@ -710,12 +710,12 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.012 - 1 0 days 00:00:00.012 - 2 0 days 00:00:00.022 - 3 0 days 00:00:00.343 - 4 0 days 01:12:33.534 - 5 0 days 00:07:15.342 + 0 0 days 00:00:00.012000 + 1 0 days 00:00:00.012000 + 2 0 days 00:00:00.022000 + 3 0 days 00:00:00.343000 + 4 0 days 01:12:33.534000 + 5 0 days 00:07:15.342000 dtype: timedelta64[ms] """ ), @@ -745,13 +745,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.001 - 1 0 days 00:00:01.132 - 2 0 days 06:27:03.231 - 3 0 days 00:00:00.233 - 4 0 days 00:00:00 - 5 0 days 00:00:00.332 - 6 0 days 00:00:00.323 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:01.132000 + 2 0 days 06:27:03.231000 + 3 0 days 00:00:00.233000 + 4 0 days 00:00:00 + 5 0 days 00:00:00.332000 + 6 0 days 00:00:00.323000 dtype: timedelta64[ms] """ ), @@ -771,13 +771,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 dtype: timedelta64[ms] """ ), @@ -824,13 +824,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 Name: abc, dtype: timedelta64[ms] """ ), diff --git a/python/cudf/cudf/utils/_numba.py 
b/python/cudf/cudf/utils/_numba.py index d9dde58d998..574170d28c6 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. import glob import os @@ -130,9 +130,7 @@ def _setup_numba(): if driver_version < (12, 0): patch_numba_linker_cuda_11() else: - from pynvjitlink.patch import patch_numba_linker - - patch_numba_linker() + numba_config.CUDA_ENABLE_PYNVJITLINK = True class _CUDFNumbaConfig: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index ca8f9cac2d0..9e932acb5fa 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime @@ -11,6 +11,8 @@ import pyarrow as pa from pandas.core.dtypes.common import infer_dtype_from_object +import pylibcudf as plc + import cudf if TYPE_CHECKING: @@ -151,7 +153,7 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif cudf.api.types.is_decimal128_dtype(dtype): return cudf.core.dtypes.Decimal128Dtype - elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: + elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: return dtype.type return infer_dtype_from_object(dtype) @@ -198,7 +200,7 @@ def to_cudf_compatible_scalar(val, dtype=None): If `val` is None, returns None. """ - if cudf._lib.scalar._is_null_host_scalar(val) or isinstance( + if cudf.utils.utils._is_null_host_scalar(val) or isinstance( val, cudf.Scalar ): return val @@ -604,6 +606,66 @@ def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: return dtype.base +def dtype_to_pylibcudf_type(dtype) -> plc.DataType: + if isinstance(dtype, cudf.ListDtype): + return plc.DataType(plc.TypeId.LIST) + elif isinstance(dtype, cudf.StructDtype): + return plc.DataType(plc.TypeId.STRUCT) + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = plc.TypeId.DECIMAL128 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = plc.TypeId.DECIMAL64 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = plc.TypeId.DECIMAL32 + return plc.DataType(tid, -dtype.scale) + # libcudf types don't support timezones so convert to the base type + elif isinstance(dtype, pd.DatetimeTZDtype): + dtype = _get_base_dtype(dtype) + else: + dtype = np.dtype(dtype) + return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) + + +SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { + np.dtype("int8"): plc.types.TypeId.INT8, + np.dtype("int16"): plc.types.TypeId.INT16, + np.dtype("int32"): plc.types.TypeId.INT32, + np.dtype("int64"): plc.types.TypeId.INT64, + np.dtype("uint8"): plc.types.TypeId.UINT8, + np.dtype("uint16"): plc.types.TypeId.UINT16, + np.dtype("uint32"): plc.types.TypeId.UINT32, + np.dtype("uint64"): plc.types.TypeId.UINT64, + np.dtype("float32"): plc.types.TypeId.FLOAT32, + np.dtype("float64"): plc.types.TypeId.FLOAT64, + np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, + np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, + np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, + np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, + np.dtype("object"): plc.types.TypeId.STRING, + np.dtype("bool"): plc.types.TypeId.BOOL8, + np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, + 
np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, +} +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { + plc_type: np_type + for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() +} +# There's no equivalent to EMPTY in cudf. We translate EMPTY +# columns from libcudf to ``int8`` columns of all nulls in Python. +# ``int8`` is chosen because it uses the least amount of memory. +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype( + "object" +) +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") + + +SIZE_TYPE_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] + # Type dispatch loops similar to what are found in `np.add.types` # In NumPy, whether or not an op can be performed between two # operands is determined by checking to see if NumPy has a c/c++ diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c83c1cbe895..0adaaa60654 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -341,6 +341,15 @@ def is_na_like(obj): return obj is None or obj is cudf.NA or obj is cudf.NaT +def _is_null_host_scalar(slr) -> bool: + # slr is NA like or NaT like + return ( + is_na_like(slr) + or (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) + or slr is pd.NaT + ) + + def _warn_no_dask_cudf(fn): @functools.wraps(fn) def wrapper(self): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 2fdf6b34b8f..c6a5887f85d 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -24,7 +24,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.2.*,>=0.0.0a0", - "numba-cuda>=0.0.13,<0.0.18", + "numba-cuda>=0.2.0,<0.3.0", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1c1d4860eec..fd56329a48e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """ DSL nodes for the LogicalPlan of polars. 
@@ -34,9 +34,11 @@ from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: - from collections.abc import Callable, Hashable, MutableMapping, Sequence + from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence from typing import Literal + from polars.polars import _expr_nodes as pl_expr + from cudf_polars.typing import Schema @@ -1019,7 +1021,27 @@ class ConditionalJoin(IR): __slots__ = ("ast_predicate", "options", "predicate") _non_child = ("schema", "predicate", "options") predicate: expr.Expr - options: tuple + """Expression predicate to join on""" + options: tuple[ + tuple[ + str, + pl_expr.Operator | Iterable[pl_expr.Operator], + ], + bool, + tuple[int, int] | None, + str, + bool, + Literal["none", "left", "right", "left_right", "right_left"], + ] + """ + tuple of options: + - predicates: tuple of ir join type (eg. ie_join) and (In)Equality conditions + - join_nulls: do nulls compare equal? + - slice: optional slice to perform after joining. + - suffix: string suffix for right columns if names match + - coalesce: should key columns be coalesced (only makes sense for outer joins) + - maintain_order: which DataFrame row order to preserve, if any + """ def __init__( self, schema: Schema, predicate: expr.Expr, options: tuple, left: IR, right: IR @@ -1029,15 +1051,16 @@ def __init__( self.options = options self.children = (left, right) self.ast_predicate = to_ast(predicate) - _, join_nulls, zlice, suffix, coalesce = self.options + _, join_nulls, zlice, suffix, coalesce, maintain_order = self.options # Preconditions from polars assert not join_nulls assert not coalesce + assert maintain_order == "none" if self.ast_predicate is None: raise NotImplementedError( f"Conditional join with predicate {predicate}" ) # pragma: no cover; polars never delivers expressions we can't handle - self._non_child_args = (self.ast_predicate, zlice, suffix) + self._non_child_args = (self.ast_predicate, zlice, suffix, maintain_order) @classmethod def do_evaluate( @@ -1045,6 +1068,7 @@ def do_evaluate( predicate: plc.expressions.Expression, zlice: tuple[int, int] | None, suffix: str, + maintain_order: Literal["none", "left", "right", "left_right", "right_left"], left: DataFrame, right: DataFrame, ) -> DataFrame: @@ -1088,6 +1112,7 @@ class Join(IR): tuple[int, int] | None, str, bool, + Literal["none", "left", "right", "left_right", "right_left"], ] """ tuple of options: @@ -1096,6 +1121,7 @@ class Join(IR): - slice: optional slice to perform after joining. 
- suffix: string suffix for right columns if names match - coalesce: should key columns be coalesced (only makes sense for outer joins) + - maintain_order: which DataFrame row order to preserve, if any """ def __init__( @@ -1113,6 +1139,9 @@ def __init__( self.options = options self.children = (left, right) self._non_child_args = (self.left_on, self.right_on, self.options) + # TODO: Implement maintain_order + if options[5] != "none": + raise NotImplementedError("maintain_order not implemented yet") if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -1222,12 +1251,13 @@ def do_evaluate( tuple[int, int] | None, str, bool, + Literal["none", "left", "right", "left_right", "right_left"], ], left: DataFrame, right: DataFrame, ) -> DataFrame: """Evaluate and return a dataframe.""" - how, join_nulls, zlice, suffix, coalesce = options + how, join_nulls, zlice, suffix, coalesce, _ = options if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 37cf36dc4dd..2138ac0c700 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Translate polars IR representation to ours.""" @@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - if (version := self.visitor.version()) >= (4, 0): + if (version := self.visitor.version()) >= (4, 3): e = NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 7a759eea2e9..c16df320ceb 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
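# Shape of the six-element options tuple that Join.do_evaluate unpacks above
# (values are illustrative, not taken from a real polars plan):
how, join_nulls, zlice, suffix, coalesce, maintain_order = (
    "inner",   # join type
    False,     # join_nulls: do nulls compare equal?
    None,      # slice: optional post-join slice
    "_right",  # suffix for right columns on name collisions
    False,     # coalesce key columns (outer joins only)
    "none",    # maintain_order: no row-order guarantee requested
)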
# SPDX-License-Identifier: Apache-2.0 """Plugin for running polars test suite setting GPU engine as default.""" @@ -123,6 +123,11 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[read_parquet-]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-0]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-2]": "Need to add include_file_path to IR", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", @@ -140,6 +145,22 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + 
"tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", @@ -174,6 +195,19 @@ def pytest_configure(config: pytest.Config) -> None: } +TESTS_TO_SKIP: Mapping[str, str] = { + # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks + # for obsolete timezone names. However, the chrono_tz package that + # polars uses doesn't read /usr/share/zoneinfo, instead packaging + # the current zoneinfo database from IANA. Consequently, when this + # hypothesis-generated test runs and generates timezones from the + # available zoneinfo-reported timezones, we can get an error from + # polars that the requested timezone is unknown. + # Since this is random, just skip it, rather than xfailing. + "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names", +} + + def pytest_collection_modifyitems( session: pytest.Session, config: pytest.Config, items: list[pytest.Item] ) -> None: @@ -182,5 +216,7 @@ def pytest_collection_modifyitems( # Don't xfail tests if running without fallback return for item in items: - if item.nodeid in EXPECTED_FAILURES: + if item.nodeid in TESTS_TO_SKIP: + item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid])) + elif item.nodeid in EXPECTED_FAILURES: item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 5904942aea2..9fb9bbf391e 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
[build-system] build-backend = "rapids_build_backend.build" @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.15", + "polars>=1.11,<1.18", "pylibcudf==25.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 2fcbbf21f1c..f1f47bfb9f1 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -53,6 +53,15 @@ def right(): ) +@pytest.mark.parametrize( + "maintain_order", ["left", "left_right", "right_left", "right"] +) +def test_join_maintain_order_param_unsupported(left, right, maintain_order): + q = left.join(right, on=pl.col("a"), how="inner", maintain_order=maintain_order) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "join_expr", [ diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 20eb2404b77..863102103ed 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -1,7 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import warnings -from importlib import import_module +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import dask.dataframe as dd from dask import config @@ -9,11 +6,16 @@ import cudf -from . import backends # noqa: F401 +from . import backends, io # noqa: F401 +from ._expr.expr import _patch_dask_expr from ._version import __git_commit__, __version__ # noqa: F401 -from .core import DataFrame, Index, Series, concat, from_cudf +from .core import DataFrame, Index, Series, _deprecated_api, concat, from_cudf -QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED +if not (QUERY_PLANNING_ON := dd._dask_expr_enabled()): + raise ValueError( + "The legacy DataFrame API is not supported in dask_cudf>24.12. " + "Please enable query-planning, or downgrade to dask_cudf<=24.12" + ) def read_csv(*args, **kwargs): @@ -36,46 +38,18 @@ def read_parquet(*args, **kwargs): return dd.read_parquet(*args, **kwargs) -def _deprecated_api(old_api, new_api=None, rec=None): - def inner_func(*args, **kwargs): - if new_api: - # Use alternative - msg = f"{old_api} is now deprecated. " - msg += rec or f"Please use {new_api} instead." - warnings.warn(msg, FutureWarning) - new_attr = new_api.split(".") - module = import_module(".".join(new_attr[:-1])) - return getattr(module, new_attr[-1])(*args, **kwargs) - - # No alternative - raise an error - raise NotImplementedError( - f"{old_api} is no longer supported. " + (rec or "") - ) - - return inner_func - - -if QUERY_PLANNING_ON: - from . import io - from ._expr.expr import _patch_dask_expr - - groupby_agg = _deprecated_api("dask_cudf.groupby_agg") - read_text = DataFrame.read_text - _patch_dask_expr() - -else: - from . 
import io # noqa: F401 - from ._legacy.groupby import groupby_agg # noqa: F401 - from ._legacy.io import read_text # noqa: F401 - - +groupby_agg = _deprecated_api("dask_cudf.groupby_agg") +read_text = DataFrame.read_text to_orc = _deprecated_api( "dask_cudf.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.to_orc", rec="Please use DataFrame.to_orc instead.", ) +_patch_dask_expr() + + __all__ = [ "DataFrame", "Index", diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 5192e6b8171..e8c9a970b7b 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import warnings from functools import cached_property @@ -15,19 +15,11 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.dataframe.dispatch import get_parallel_type from dask.typing import no_default import cudf -_LEGACY_WORKAROUND = ( - "To enable the 'legacy' dask-cudf API, set the " - "global 'dataframe.query-planning' config to " - "`False` before dask is imported. This can also " - "be done by setting an environment variable: " - "`DASK_DATAFRAME__QUERY_PLANNING=False` " -) - - ## ## Custom collection classes ## @@ -103,9 +95,8 @@ def set_index( divisions = None warnings.warn( "Ignoring divisions='quantile'. This option is now " - "deprecated. Please use the legacy API and raise an " - "issue on github if this feature is necessary." - f"\n{_LEGACY_WORKAROUND}", + "deprecated. Please raise an issue on github if this " + "feature is necessary.", FutureWarning, ) @@ -135,9 +126,7 @@ def groupby( if kwargs.pop("as_index") is not True: raise NotImplementedError( - f"{msg} Please reset the index after aggregating, or " - "use the legacy API if `as_index=False` is required.\n" - f"{_LEGACY_WORKAROUND}" + f"{msg} Please reset the index after aggregating." ) else: warnings.warn(msg, FutureWarning) @@ -153,15 +142,15 @@ def groupby( ) def to_orc(self, *args, **kwargs): - from dask_cudf._legacy.io import to_orc + from dask_cudf.io.orc import to_orc as to_orc_impl - return to_orc(self, *args, **kwargs) + return to_orc_impl(self, *args, **kwargs) @staticmethod def read_text(*args, **kwargs): - from dask_cudf._legacy.io.text import read_text as legacy_read_text + from dask_cudf.io.text import read_text as read_text_impl - return legacy_read_text(*args, **kwargs) + return read_text_impl(*args, **kwargs) def clip(self, lower=None, upper=None, axis=1): if axis not in (None, 1): @@ -197,6 +186,13 @@ class Index(DXIndex, CudfFrameBase): pass # Same as pandas (for now) +# dask.dataframe dispatch +get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) +get_parallel_type.register(cudf.Series, lambda _: Series) +get_parallel_type.register(cudf.BaseIndex, lambda _: Index) + + +# dask_expr dispatch (might go away?) get_collection_type.register(cudf.DataFrame, lambda _: DataFrame) get_collection_type.register(cudf.Series, lambda _: Series) get_collection_type.register(cudf.BaseIndex, lambda _: Index) diff --git a/python/dask_cudf/dask_cudf/_expr/expr.py b/python/dask_cudf/dask_cudf/_expr/expr.py index 8b91e53604c..03d1da0d258 100644 --- a/python/dask_cudf/dask_cudf/_expr/expr.py +++ b/python/dask_cudf/dask_cudf/_expr/expr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
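# Effect of the import-time guard added to dask_cudf/__init__.py above
# (sketch; assumes a dask version that still honors the legacy config flag):
#
#     $ DASK_DATAFRAME__QUERY_PLANNING=False python -c "import dask_cudf"
#     ValueError: The legacy DataFrame API is not supported in
#     dask_cudf>24.12. Please enable query-planning, or downgrade to
#     dask_cudf<=24.12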
import functools import dask_expr._shuffle as _shuffle_module @@ -7,13 +7,13 @@ from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var -from dask.dataframe.core import ( - is_dataframe_like, +from dask.dataframe.dispatch import ( + is_categorical_dtype, make_meta, meta_nonempty, ) -from dask.dataframe.dispatch import is_categorical_dtype from dask.typing import no_default +from dask.utils import is_dataframe_like import cudf diff --git a/python/dask_cudf/dask_cudf/_expr/groupby.py b/python/dask_cudf/dask_cudf/_expr/groupby.py index 0242fac6e72..a5cdd43169b 100644 --- a/python/dask_cudf/dask_cudf/_expr/groupby.py +++ b/python/dask_cudf/dask_cudf/_expr/groupby.py @@ -1,6 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import functools +import numpy as np import pandas as pd from dask_expr._collection import new_collection from dask_expr._groupby import ( @@ -16,11 +17,262 @@ from dask.dataframe.groupby import Aggregation from cudf.core.groupby.groupby import _deprecate_collect +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking ## ## Fused groupby aggregations ## +OPTIMIZED_AGGS = ( + "count", + "mean", + "std", + "var", + "sum", + "min", + "max", + list, + "first", + "last", +) + + +def _make_name(col_name, sep="_"): + """Combine the elements of `col_name` into a single string, or return + `col_name` unchanged if it is already a string. + """ + if isinstance(col_name, str): + return col_name + return sep.join(name for name in col_name if name != "") + + +@_dask_cudf_performance_tracking +def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): + """Initial partition-level aggregation task. + + This is the first operation to be executed on each input + partition in `groupby_agg`. Higher-order aggregations ("mean", + "std" and "var") are decomposed here into the primitive + statistics they are built from: "count" and "sum", plus a + sum-of-squares column when "std" or "var" is requested; all + other supported aggregations are applied directly. The partial + results are combined across partitions in `_tree_node_agg` and + finalized in `_finalize_gb_agg`. + """ + + # Modify dict for initial (partition-wise) aggregations + _agg_dict = {} + for col, agg_list in aggs.items(): + _agg_dict[col] = set() + for agg in agg_list: + if agg in ("mean", "std", "var"): + _agg_dict[col].add("count") + _agg_dict[col].add("sum") + else: + _agg_dict[col].add(agg) + _agg_dict[col] = list(_agg_dict[col]) + if set(agg_list).intersection({"std", "var"}): + pow2_name = _make_name((col, "pow2"), sep=sep) + df[pow2_name] = df[col].astype("float64").pow(2) + _agg_dict[pow2_name] = ["sum"] + + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + _agg_dict + ) + output_columns = [_make_name(name, sep=sep) for name in gb.columns] + gb.columns = output_columns + # Return with deterministic column ordering + return gb[sorted(output_columns)] + + +@_dask_cudf_performance_tracking +def _tree_node_agg(df, gb_cols, dropna, sort, sep): + """Node in groupby-aggregation reduction tree. + + The input DataFrame (`df`) corresponds to the + concatenated output of one or more `_groupby_partition_agg` + tasks. In this function, "sum", "min" and/or "max" groupby + aggregations will be used to combine the statistics for + duplicate keys.
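For example (an illustration of the step above): a partition-level "mean" for a column ``x`` arrives here as ``x<sep>count`` and ``x<sep>sum`` columns; both are re-aggregated with "sum", and ``_finalize_gb_agg`` later divides the two to recover the mean.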
+ """ + + agg_dict = {} + for col in df.columns: + if col in gb_cols: + continue + agg = col.split(sep)[-1] + if agg in ("count", "sum"): + agg_dict[col] = ["sum"] + elif agg == "list": + agg_dict[col] = [list] + elif agg in OPTIMIZED_AGGS: + agg_dict[col] = [agg] + else: + raise ValueError(f"Unexpected aggregation: {agg}") + + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + agg_dict + ) + + # Don't include the last aggregation in the column names + output_columns = [ + _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) + for name in gb.columns + ] + gb.columns = output_columns + # Return with deterministic column ordering + return gb[sorted(output_columns)] + + +@_dask_cudf_performance_tracking +def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): + """Calculate variance (given count, sum, and sum-squared columns).""" + + # Select count, sum, and sum-squared + n = df[count_name] + x = df[sum_name] + x2 = df[pow2_sum_name] + + # Use sum-squared approach to get variance + var = x2 - x**2 / n + div = n - ddof + div[div < 1] = 1 # Avoid division by 0 + var /= div + + # Set appropriate NaN elements + # (since we avoided 0-division) + var[(n - ddof) == 0] = np.nan + + return var + + +@_dask_cudf_performance_tracking +def _finalize_gb_agg( + gb_in, + gb_cols, + aggs, + columns, + final_columns, + as_index, + dropna, + sort, + sep, + str_cols_out, + aggs_renames, +): + """Final aggregation task. + + This is the final operation on each output partitions + of the `groupby_agg` algorithm. This function must + take care of higher-order aggregations, like "mean", + "std" and "var". We also need to deal with the column + index, the row index, and final sorting behavior. + """ + + gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) + + # Deal with higher-order aggregations + for col in columns: + agg_list = aggs.get(col, []) + agg_set = set(agg_list) + if agg_set.intersection({"mean", "std", "var"}): + count_name = _make_name((col, "count"), sep=sep) + sum_name = _make_name((col, "sum"), sep=sep) + if agg_set.intersection({"std", "var"}): + pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) + var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) + if "var" in agg_list: + name_var = _make_name((col, "var"), sep=sep) + gb[name_var] = var + if "std" in agg_list: + name_std = _make_name((col, "std"), sep=sep) + gb[name_std] = np.sqrt(var) + gb.drop(columns=[pow2_sum_name], inplace=True) + if "mean" in agg_list: + mean_name = _make_name((col, "mean"), sep=sep) + gb[mean_name] = gb[sum_name] / gb[count_name] + if "sum" not in agg_list: + gb.drop(columns=[sum_name], inplace=True) + if "count" not in agg_list: + gb.drop(columns=[count_name], inplace=True) + if list in agg_list: + collect_name = _make_name((col, "list"), sep=sep) + gb[collect_name] = gb[collect_name].list.concat() + + # Ensure sorted keys if `sort=True` + if sort: + gb = gb.sort_values(gb_cols) + + # Set index if necessary + if as_index: + gb.set_index(gb_cols, inplace=True) + + # Unflatten column names + col_array = [] + agg_array = [] + for col in gb.columns: + if col in gb_cols: + col_array.append(col) + agg_array.append("") + else: + name, agg = col.split(sep) + col_array.append(name) + agg_array.append(aggs_renames.get((name, agg), agg)) + if str_cols_out: + gb.columns = col_array + else: + gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) + + return gb[final_columns] + + +@_dask_cudf_performance_tracking +def _redirect_aggs(arg): + """Redirect 
aggregations to their corresponding name in cuDF""" + redirects = { + sum: "sum", + max: "max", + min: "min", + "collect": list, + "list": list, + } + if isinstance(arg, dict): + new_arg = dict() + for col in arg: + if isinstance(arg[col], list): + new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] + elif isinstance(arg[col], dict): + new_arg[col] = { + k: redirects.get(v, v) for k, v in arg[col].items() + } + else: + new_arg[col] = redirects.get(arg[col], arg[col]) + return new_arg + if isinstance(arg, list): + return [redirects.get(agg, agg) for agg in arg] + return redirects.get(arg, arg) + + +@_dask_cudf_performance_tracking +def _aggs_optimized(arg, supported: set): + """Check that aggregations in `arg` are a subset of `supported`""" + if isinstance(arg, (list, dict)): + if isinstance(arg, dict): + _global_set: set[str] = set() + for col in arg: + if isinstance(arg[col], list): + _global_set = _global_set.union(set(arg[col])) + elif isinstance(arg[col], dict): + _global_set = _global_set.union(set(arg[col].values())) + else: + _global_set.add(arg[col]) + else: + _global_set = set(arg) + + return bool(_global_set.issubset(supported)) + elif isinstance(arg, (str, type)): + return arg in supported + return False + def _get_spec_info(gb): if isinstance(gb.arg, (dict, list)): @@ -105,20 +357,14 @@ def shuffle_by_index(self): @classmethod def chunk(cls, df, *by, **kwargs): - from dask_cudf._legacy.groupby import _groupby_partition_agg - return _groupby_partition_agg(df, **kwargs) @classmethod def combine(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby import _tree_node_agg - return _tree_node_agg(_concat(inputs), **kwargs) @classmethod def aggregate(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby import _finalize_gb_agg - return _finalize_gb_agg(_concat(inputs), **kwargs) @property @@ -193,12 +439,6 @@ def _maybe_get_custom_expr( shuffle_method=None, **kwargs, ): - from dask_cudf._legacy.groupby import ( - OPTIMIZED_AGGS, - _aggs_optimized, - _redirect_aggs, - ) - if kwargs: # Unsupported key-word arguments return None diff --git a/python/dask_cudf/dask_cudf/_legacy/core.py b/python/dask_cudf/dask_cudf/_legacy/core.py deleted file mode 100644 index d6beb775a5e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/core.py +++ /dev/null @@ -1,711 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
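# Behavior sketch for the two helpers added to _expr/groupby.py above,
# traced directly from the code (illustrative):
#
#     _redirect_aggs(sum)                         # -> "sum"
#     _redirect_aggs({"x": ["collect", "mean"]})  # -> {"x": [list, "mean"]}
#     _aggs_optimized({"x": ["mean", "max"]}, set(OPTIMIZED_AGGS))  # -> True
#     _aggs_optimized("median", set(OPTIMIZED_AGGS))                # -> False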
- -import math -import warnings - -import numpy as np -import pandas as pd -from tlz import partition_all - -from dask import dataframe as dd -from dask.base import normalize_token, tokenize -from dask.dataframe.core import ( - Scalar, - handle_out, - make_meta as dask_make_meta, - map_partitions, -) -from dask.dataframe.utils import raise_on_meta_error -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname - -import cudf -from cudf import _lib as libcudf -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._expr.accessors import ListMethods, StructMethods -from dask_cudf._legacy import sorting -from dask_cudf._legacy.sorting import ( - _deprecate_shuffle_kwarg, - _get_shuffle_method, -) - - -class _Frame(dd.core._Frame, OperatorMethodMixin): - """Superclass for DataFrame and Series - - Parameters - ---------- - dsk : dict - The dask graph to compute this DataFrame - name : str - The key prefix that specifies which keys in the dask comprise this - particular DataFrame / Series - meta : cudf.DataFrame, cudf.Series, or cudf.Index - An empty cudf object with names, dtypes, and indices matching the - expected output. - divisions : tuple of index values - Values along which we partition our blocks on the index - """ - - def _is_partition_type(self, meta): - return isinstance(meta, self._partition_type) - - def __repr__(self): - s = "" - return s % (type(self).__name__, len(self.dask), self.npartitions) - - -normalize_token.register(_Frame, lambda a: a._name) - - -class DataFrame(_Frame, dd.core.DataFrame): - """ - A distributed Dask DataFrame where the backing dataframe is a - :class:`cuDF DataFrame `. - - Typically you would not construct this object directly, but rather - use one of Dask-cuDF's IO routines. - - Most operations on :doc:`Dask DataFrames ` are - supported, with many of the same caveats. 
- - """ - - _partition_type = cudf.DataFrame - - @_dask_cudf_performance_tracking - def _assign_column(self, k, v): - def assigner(df, k, v): - out = df.copy() - out[k] = v - return out - - meta = assigner(self._meta, k, dask_make_meta(v)) - return self.map_partitions(assigner, k, v, meta=meta) - - @_dask_cudf_performance_tracking - def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): - import uuid - - if kwargs is None: - kwargs = {} - - if cache_key is None: - cache_key = uuid.uuid4() - - def do_apply_rows(df, func, incols, outcols, kwargs): - return df.apply_rows( - func, incols, outcols, kwargs, cache_key=cache_key - ) - - meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) - return self.map_partitions( - do_apply_rows, func, incols, outcols, kwargs, meta=meta - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def merge(self, other, shuffle_method=None, **kwargs): - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().merge( - other, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def join(self, other, shuffle_method=None, **kwargs): - # CuDF doesn't support "right" join yet - how = kwargs.pop("how", "left") - if how == "right": - return other.join(other=self, how="left", **kwargs) - - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().join( - other, - how=how, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def set_index( - self, - other, - sorted=False, - divisions=None, - shuffle_method=None, - **kwargs, - ): - pre_sorted = sorted - del sorted - - if divisions == "quantile": - warnings.warn( - "Using divisions='quantile' is now deprecated. 
" - "Please raise an issue on github if you believe " - "this feature is necessary.", - FutureWarning, - ) - - if ( - divisions == "quantile" - or isinstance(divisions, (cudf.DataFrame, cudf.Series)) - or ( - isinstance(other, str) - and cudf.api.types.is_string_dtype(self[other].dtype) - ) - ): - # Let upstream-dask handle "pre-sorted" case - if pre_sorted: - return dd.shuffle.set_sorted_index( - self, other, divisions=divisions, **kwargs - ) - - by = other - if not isinstance(other, list): - by = [by] - if len(by) > 1: - raise ValueError("Dask does not support MultiIndex (yet).") - if divisions == "quantile": - divisions = None - - # Use dask_cudf's sort_values - df = self.sort_values( - by, - max_branch=kwargs.get("max_branch", None), - divisions=divisions, - set_divisions=True, - ignore_index=True, - shuffle_method=shuffle_method, - ) - - # Ignore divisions if its a dataframe - if isinstance(divisions, cudf.DataFrame): - divisions = None - - # Set index and repartition - df2 = df.map_partitions( - sorting.set_index_post, - index_name=other, - drop=kwargs.get("drop", True), - column_dtype=df.columns.dtype, - ) - npartitions = kwargs.get("npartitions", self.npartitions) - partition_size = kwargs.get("partition_size", None) - if partition_size: - return df2.repartition(partition_size=partition_size) - if not divisions and df2.npartitions != npartitions: - return df2.repartition(npartitions=npartitions) - if divisions and df2.npartitions != len(divisions) - 1: - return df2.repartition(divisions=divisions) - return df2 - - return super().set_index( - other, - sorted=pre_sorted, - shuffle_method=_get_shuffle_method(shuffle_method), - divisions=divisions, - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def sort_values( - self, - by, - ignore_index=False, - max_branch=None, - divisions=None, - set_divisions=False, - ascending=True, - na_position="last", - sort_function=None, - sort_function_kwargs=None, - shuffle_method=None, - **kwargs, - ): - if kwargs: - raise ValueError( - f"Unsupported input arguments passed : {list(kwargs.keys())}" - ) - - df = sorting.sort_values( - self, - by, - max_branch=max_branch, - divisions=divisions, - set_divisions=set_divisions, - ignore_index=ignore_index, - ascending=ascending, - na_position=na_position, - shuffle_method=shuffle_method, - sort_function=sort_function, - sort_function_kwargs=sort_function_kwargs, - ) - - if ignore_index: - return df.reset_index(drop=True) - return df - - @_dask_cudf_performance_tracking - def to_parquet(self, path, *args, **kwargs): - """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" - from dask_cudf._legacy.io import to_parquet - - return to_parquet(self, path, *args, **kwargs) - - @_dask_cudf_performance_tracking - def to_orc(self, path, **kwargs): - """Calls dask_cudf._legacy.io.to_orc""" - from dask_cudf._legacy.io import to_orc - - return to_orc(self, path, **kwargs) - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - numeric_only=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, 
split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def shuffle(self, *args, shuffle_method=None, **kwargs): - """Wraps dask.dataframe DataFrame.shuffle method""" - return super().shuffle( - *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs - ) - - @_dask_cudf_performance_tracking - def groupby(self, by=None, **kwargs): - from .groupby import CudfDataFrameGroupBy - - return CudfDataFrameGroupBy(self, by=by, **kwargs) - - -@_dask_cudf_performance_tracking -def sum_of_squares(x): - x = x.astype("f8")._column - outcol = libcudf.reduce.reduce("sum_of_squares", x) - return cudf.Series._from_column(outcol) - - -@_dask_cudf_performance_tracking -def var_aggregate(x2, x, n, ddof): - try: - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - result = (x2 / n) - (x / n) ** 2 - if ddof != 0: - result = result * n / (n - ddof) - return result - except ZeroDivisionError: - return np.float64(np.nan) - - -@_dask_cudf_performance_tracking -def nlargest_agg(x, **kwargs): - return cudf.concat(x).nlargest(**kwargs) - - -@_dask_cudf_performance_tracking -def nsmallest_agg(x, **kwargs): - return cudf.concat(x).nsmallest(**kwargs) - - -class Series(_Frame, dd.core.Series): - _partition_type = cudf.Series - - @_dask_cudf_performance_tracking - def count(self, split_every=False): - return reduction( - [self], - chunk=M.count, - aggregate=np.sum, - split_every=split_every, - meta="i8", - ) - - @_dask_cudf_performance_tracking - def mean(self, split_every=False): - sum = self.sum(split_every=split_every) - n = self.count(split_every=split_every) - return sum / n - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var(axis=axis, skipna=skipna) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_dask_cudf_performance_tracking - def groupby(self, *args, **kwargs): - from .groupby import CudfSeriesGroupBy - - return CudfSeriesGroupBy(self, *args, **kwargs) - - @property # type: ignore - @_dask_cudf_performance_tracking - def list(self): - return ListMethods(self) - - @property # type: ignore - @_dask_cudf_performance_tracking - def struct(self): - return StructMethods(self) - - -class Index(Series, dd.core.Index): - _partition_type = cudf.Index # type: ignore - - -@_dask_cudf_performance_tracking -def _naive_var(ddf, meta, skipna, ddof, split_every, out): - num = ddf._get_numeric_data() - x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) - n = num.count(split_every=split_every) - name = ddf._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof - ) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _parallel_var(ddf, meta, skipna, split_every, out): - def _local_var(x, skipna): - if skipna: - n = x.count() - avg = x.mean(skipna=skipna) - else: - # Not 
skipping nulls, so might as well - # avoid the full `count` operation - n = len(x) - avg = x.sum(skipna=skipna) / n - m2 = ((x - avg) ** 2).sum(skipna=skipna) - return n, avg, m2 - - def _aggregate_var(parts): - n, avg, m2 = parts[0] - for i in range(1, len(parts)): - n_a, avg_a, m2_a = n, avg, m2 - n_b, avg_b, m2_b = parts[i] - n = n_a + n_b - avg = (n_a * avg_a + n_b * avg_b) / n - delta = avg_b - avg_a - m2 = m2_a + m2_b + delta**2 * n_a * n_b / n - return n, avg, m2 - - def _finalize_var(vals): - n, _, m2 = vals - return m2 / (n - 1) - - # Build graph - nparts = ddf.npartitions - if not split_every: - split_every = nparts - name = "var-" + tokenize(skipna, split_every, out) - local_name = "local-" + name - num = ddf._get_numeric_data() - dsk = { - (local_name, n, 0): (_local_var, (num._name, n), skipna) - for n in range(nparts) - } - - # Use reduction tree - widths = [nparts] - while nparts > 1: - nparts = math.ceil(nparts / split_every) - widths.append(nparts) - height = len(widths) - for depth in range(1, height): - for group in range(widths[depth]): - p_max = widths[depth - 1] - lstart = split_every * group - lstop = min(lstart + split_every, p_max) - node_list = [ - (local_name, p, depth - 1) for p in range(lstart, lstop) - ] - dsk[(local_name, group, depth)] = (_aggregate_var, node_list) - if height == 1: - group = depth = 0 - dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) - - graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) - result = dd.core.new_dd_object(graph, name, meta, (None, None)) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _extract_meta(x): - """ - Extract internal cache data (``_meta``) from dask_cudf objects - """ - if isinstance(x, (Scalar, _Frame)): - return x._meta - elif isinstance(x, list): - return [_extract_meta(_x) for _x in x] - elif isinstance(x, tuple): - return tuple(_extract_meta(_x) for _x in x) - elif isinstance(x, dict): - return {k: _extract_meta(v) for k, v in x.items()} - return x - - -@_dask_cudf_performance_tracking -def _emulate(func, *args, **kwargs): - """ - Apply a function using args / kwargs. If arguments contain dd.DataFrame / - dd.Series, using internal cache (``_meta``) for calculation - """ - with raise_on_meta_error(funcname(func)): - return func(*_extract_meta(args), **_extract_meta(kwargs)) - - -@_dask_cudf_performance_tracking -def align_partitions(args): - """Align partitions between dask_cudf objects. - - Note that if all divisions are unknown, but have equal npartitions, then - they will be passed through unchanged. - """ - dfs = [df for df in args if isinstance(df, _Frame)] - if not dfs: - return args - - divisions = dfs[0].divisions - if not all(df.divisions == divisions for df in dfs): - raise NotImplementedError("Aligning mismatched partitions") - return args - - -@_dask_cudf_performance_tracking -def reduction( - args, - chunk=None, - aggregate=None, - combine=None, - meta=None, - token=None, - chunk_kwargs=None, - aggregate_kwargs=None, - combine_kwargs=None, - split_every=None, - **kwargs, -): - """Generic tree reduction operation. - - Parameters - ---------- - args : - Positional arguments for the `chunk` function. All `dask.dataframe` - objects should be partitioned and indexed equivalently. 
- chunk : function [block-per-arg] -> block - Function to operate on each block of data - aggregate : function list-of-blocks -> block - Function to operate on the list of results of chunk - combine : function list-of-blocks -> block, optional - Function to operate on intermediate lists of results of chunk - in a tree-reduction. If not provided, defaults to aggregate. - $META - token : str, optional - The name to use for the output keys. - chunk_kwargs : dict, optional - Keywords for the chunk function only. - aggregate_kwargs : dict, optional - Keywords for the aggregate function only. - combine_kwargs : dict, optional - Keywords for the combine function only. - split_every : int, optional - Group partitions into groups of this size while performing a - tree-reduction. If set to False, no tree-reduction will be used, - and all intermediates will be concatenated and passed to ``aggregate``. - Default is 8. - kwargs : - All remaining keywords will be passed to ``chunk``, ``aggregate``, and - ``combine``. - """ - if chunk_kwargs is None: - chunk_kwargs = dict() - if aggregate_kwargs is None: - aggregate_kwargs = dict() - chunk_kwargs.update(kwargs) - aggregate_kwargs.update(kwargs) - - if combine is None: - if combine_kwargs: - raise ValueError("`combine_kwargs` provided with no `combine`") - combine = aggregate - combine_kwargs = aggregate_kwargs - else: - if combine_kwargs is None: - combine_kwargs = dict() - combine_kwargs.update(kwargs) - - if not isinstance(args, (tuple, list)): - args = [args] - - npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)} - if len(npartitions) > 1: - raise ValueError("All arguments must have same number of partitions") - npartitions = npartitions.pop() - - if split_every is None: - split_every = 8 - elif split_every is False: - split_every = npartitions - elif split_every < 2 or not isinstance(split_every, int): - raise ValueError("split_every must be an integer >= 2") - - token_key = tokenize( - token or (chunk, aggregate), - meta, - args, - chunk_kwargs, - aggregate_kwargs, - combine_kwargs, - split_every, - ) - - # Chunk - a = f"{token or funcname(chunk)}-chunk-{token_key}" - if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: - dsk = { - (a, 0, i): (chunk, key) - for i, key in enumerate(args[0].__dask_keys__()) - } - else: - dsk = { - (a, 0, i): ( - apply, - chunk, - [(x._name, i) if isinstance(x, _Frame) else x for x in args], - chunk_kwargs, - ) - for i in range(args[0].npartitions) - } - - # Combine - b = f"{token or funcname(combine)}-combine-{token_key}" - k = npartitions - depth = 0 - while k > split_every: - for part_i, inds in enumerate(partition_all(split_every, range(k))): - conc = (list, [(a, depth, i) for i in inds]) - dsk[(b, depth + 1, part_i)] = ( - (apply, combine, [conc], combine_kwargs) - if combine_kwargs - else (combine, conc) - ) - k = part_i + 1 - a = b - depth += 1 - - # Aggregate - b = f"{token or funcname(aggregate)}-agg-{token_key}" - conc = (list, [(a, depth, i) for i in range(k)]) - if aggregate_kwargs: - dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) - else: - dsk[(b, 0)] = (aggregate, conc) - - if meta is None: - meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) - meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) - meta = dask_make_meta(meta) - - graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) - return dd.core.new_dd_object(graph, b, meta, (None, None)) - - -for name in ( - "add", - "sub", - "mul", - "truediv", - "floordiv", - 
"mod", - "pow", - "radd", - "rsub", - "rmul", - "rtruediv", - "rfloordiv", - "rmod", - "rpow", -): - meth = getattr(cudf.DataFrame, name) - DataFrame._bind_operator_method(name, meth, original=cudf.Series) - - meth = getattr(cudf.Series, name) - Series._bind_operator_method(name, meth, original=cudf.Series) - -for name in ("lt", "gt", "le", "ge", "ne", "eq"): - meth = getattr(cudf.Series, name) - Series._bind_comparison_method(name, meth, original=cudf.Series) diff --git a/python/dask_cudf/dask_cudf/_legacy/groupby.py b/python/dask_cudf/dask_cudf/_legacy/groupby.py deleted file mode 100644 index 7e01e91476d..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/groupby.py +++ /dev/null @@ -1,909 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -from functools import wraps - -import numpy as np -import pandas as pd - -from dask.dataframe.core import ( - DataFrame as DaskDataFrame, - aca, - split_out_on_cols, -) -from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy -from dask.utils import funcname - -import cudf -from cudf.core.groupby.groupby import _deprecate_collect -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._legacy.sorting import _deprecate_shuffle_kwarg - -# aggregations that are dask-cudf optimized -OPTIMIZED_AGGS = ( - "count", - "mean", - "std", - "var", - "sum", - "min", - "max", - list, - "first", - "last", -) - - -def _check_groupby_optimized(func): - """ - Decorator for dask-cudf's groupby methods that returns the dask-cudf - optimized method if the groupby object is supported, otherwise - reverting to the upstream Dask method - """ - - @wraps(func) - def wrapper(*args, **kwargs): - gb = args[0] - if _groupby_optimized(gb): - return func(*args, **kwargs) - # note that we use upstream Dask's default kwargs for this call if - # none are specified; this shouldn't be an issue as those defaults are - # consistent with dask-cudf - return getattr(super(type(gb), gb), func.__name__)(*args[1:], **kwargs) - - return wrapper - - -class CudfDataFrameGroupBy(DataFrameGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - def __getitem__(self, key): - if isinstance(key, list): - g = CudfDataFrameGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - else: - g = CudfSeriesGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - - g._meta = g._meta[key] - return g - - @_dask_cudf_performance_tracking - def _make_groupby_method_aggs(self, agg_name): - """Create aggs dictionary for aggregation methods""" - - if isinstance(self.by, list): - return {c: agg_name for c in self.obj.columns if c not in self.by} - return {c: agg_name for c in self.obj.columns if c != self.by} - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("count"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("mean"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, 
split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("std"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("var"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("sum"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("min"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("max"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs(list), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("first"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("last"), - split_every, - split_out, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - if isinstance(self._meta.grouping.keys, cudf.MultiIndex): - keys = self._meta.grouping.keys.names - else: - keys = self._meta.grouping.keys.name - - return groupby_agg( - self.obj, - keys, - arg, - split_every=split_every, - split_out=split_out, - sep=self.sep, - sort=self.sort, - as_index=self.as_index, - shuffle_method=shuffle_method, - **self.dropna, - ) - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -class CudfSeriesGroupBy(SeriesGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "count"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "mean"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "std"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, 
split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "var"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "sum"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "min"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "max"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - {self._slice: list}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "first"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "last"}, - split_every, - split_out, - )[self._slice] - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if not isinstance(arg, dict): - arg = {self._slice: arg} - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - return _make_groupby_agg_call( - self, arg, split_every, split_out, shuffle_method - )[self._slice] - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -def _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token=None, - sort=None, - shuffle_method=None, -): - # Shuffle-based groupby aggregation - # NOTE: This function is the dask_cudf version of - # dask.dataframe.groupby._shuffle_aggregate - - # Step 1 - Chunkwise groupby operation - chunk_name = f"{token or funcname(chunk)}-chunk" - chunked = ddf.map_partitions( - chunk, - meta=chunk(ddf._meta, **chunk_kwargs), - token=chunk_name, - **chunk_kwargs, - ) - - # Step 2 - Perform global sort or shuffle - shuffle_npartitions = max( - chunked.npartitions // split_every, - split_out, - ) - if sort and split_out > 1: - # Sort-based code path - result = ( - chunked.repartition(npartitions=shuffle_npartitions) - .sort_values( - gb_cols, - ignore_index=True, - shuffle_method=shuffle_method, - ) - .map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - ) - else: - # Hash-based code path - result = chunked.shuffle( - gb_cols, - npartitions=shuffle_npartitions, - ignore_index=True, - shuffle_method=shuffle_method, - ).map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - - # Step 3 - Repartition and return - if split_out < result.npartitions: - return result.repartition(npartitions=split_out) - return result - - -@_dask_cudf_performance_tracking -def 
groupby_agg( - ddf, - gb_cols, - aggs_in, - split_every=None, - split_out=None, - dropna=True, - sep="___", - sort=False, - as_index=True, - shuffle_method=None, -): - """Optimized groupby aggregation for Dask-CuDF. - - Parameters - ---------- - ddf : DataFrame - DataFrame object to perform grouping on. - gb_cols : str or list[str] - Column names to group by. - aggs_in : str, list, or dict - Aggregations to perform. - split_every : int (optional) - How to group intermediate aggregates. - dropna : bool - Drop grouping key values corresponding to NA values. - as_index : bool - Currently ignored. - sort : bool - Sort the group keys, better performance is obtained when - not sorting. - shuffle_method : str (optional) - Control how shuffling of the DataFrame is performed. - sep : str - Internal usage. - - - Notes - ----- - This "optimized" approach is more performant than the algorithm in - implemented in :meth:`DataFrame.apply` because it allows the cuDF - backend to perform multiple aggregations at once. - - This aggregation algorithm only supports the following options - - * "list" - * "count" - * "first" - * "last" - * "max" - * "mean" - * "min" - * "std" - * "sum" - * "var" - - - See Also - -------- - DataFrame.groupby : generic groupby of a DataFrame - dask.dataframe.apply_concat_apply : for more description of the - split_every argument. - - """ - # Assert that aggregations are supported - aggs = _redirect_aggs(aggs_in) - if not _aggs_optimized(aggs, OPTIMIZED_AGGS): - raise ValueError( - f"Supported aggs include {OPTIMIZED_AGGS} for groupby_agg API. " - f"Aggregations must be specified with dict or list syntax." - ) - - # If split_every is False, we use an all-to-one reduction - if split_every is False: - split_every = max(ddf.npartitions, 2) - - # Deal with default split_out and split_every params - split_every = split_every or 8 - split_out = split_out or 1 - - # Standardize `gb_cols`, `columns`, and `aggs` - if isinstance(gb_cols, str): - gb_cols = [gb_cols] - columns = [c for c in ddf.columns if c not in gb_cols] - if not isinstance(aggs, dict): - aggs = {col: aggs for col in columns} - - # Assert if our output will have a MultiIndex; this will be the case if - # any value in the `aggs` dict is not a string (i.e. multiple/named - # aggregations per column) - str_cols_out = True - aggs_renames = {} - for col in aggs: - if isinstance(aggs[col], str) or callable(aggs[col]): - aggs[col] = [aggs[col]] - elif isinstance(aggs[col], dict): - str_cols_out = False - col_aggs = [] - for k, v in aggs[col].items(): - aggs_renames[col, v] = k - col_aggs.append(v) - aggs[col] = col_aggs - else: - str_cols_out = False - if col in gb_cols: - columns.append(col) - - # Construct meta - _aggs = aggs.copy() - if str_cols_out: - # Metadata should use `str` for dict values if that is - # what the user originally specified (column names will - # be str, rather than tuples). 
- for col in aggs: - _aggs[col] = _aggs[col][0] - _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs) - if aggs_renames: - col_array = [] - agg_array = [] - for col, agg in _meta.columns: - col_array.append(col) - agg_array.append(aggs_renames.get((col, agg), agg)) - _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - chunk = _groupby_partition_agg - chunk_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - combine = _tree_node_agg - combine_kwargs = { - "gb_cols": gb_cols, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - aggregate = _finalize_gb_agg - aggregate_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "final_columns": _meta.columns, - "as_index": as_index, - "dropna": dropna, - "sort": sort, - "sep": sep, - "str_cols_out": str_cols_out, - "aggs_renames": aggs_renames, - } - - # Use shuffle_method=True for split_out>1 - if sort and split_out > 1 and shuffle_method is None: - shuffle_method = "tasks" - - # Check if we are using the shuffle-based algorithm - if shuffle_method: - # Shuffle-based aggregation - return _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token="cudf-aggregate", - sort=sort, - shuffle_method=shuffle_method - if isinstance(shuffle_method, str) - else None, - ) - - # Deal with sort/shuffle defaults - if split_out > 1 and sort: - raise ValueError( - "dask-cudf's groupby algorithm does not yet support " - "`sort=True` when `split_out>1`, unless a shuffle-based " - "algorithm is used. Please use `split_out=1`, group " - "with `sort=False`, or set `shuffle_method=True`." - ) - - # Determine required columns to enable column projection - required_columns = list( - set(gb_cols).union(aggs.keys()).intersection(ddf.columns) - ) - - return aca( - [ddf[required_columns]], - chunk=chunk, - chunk_kwargs=chunk_kwargs, - combine=combine, - combine_kwargs=combine_kwargs, - aggregate=aggregate, - aggregate_kwargs=aggregate_kwargs, - token="cudf-aggregate", - split_every=split_every, - split_out=split_out, - split_out_setup=split_out_on_cols, - split_out_setup_kwargs={"cols": gb_cols}, - sort=sort, - ignore_index=True, - ) - - -@_dask_cudf_performance_tracking -def _make_groupby_agg_call( - gb, aggs, split_every, split_out, shuffle_method=None -): - """Helper method to consolidate the common `groupby_agg` call for all - aggregations in one place - """ - - return groupby_agg( - gb.obj, - gb.by, - aggs, - split_every=split_every, - split_out=split_out, - sep=gb.sep, - sort=gb.sort, - as_index=gb.as_index, - shuffle_method=shuffle_method, - **gb.dropna, - ) - - -@_dask_cudf_performance_tracking -def _redirect_aggs(arg): - """Redirect aggregations to their corresponding name in cuDF""" - redirects = { - sum: "sum", - max: "max", - min: "min", - "collect": list, - "list": list, - } - if isinstance(arg, dict): - new_arg = dict() - for col in arg: - if isinstance(arg[col], list): - new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] - elif isinstance(arg[col], dict): - new_arg[col] = { - k: redirects.get(v, v) for k, v in arg[col].items() - } - else: - new_arg[col] = redirects.get(arg[col], arg[col]) - return new_arg - if isinstance(arg, list): - return [redirects.get(agg, agg) for agg in arg] - return redirects.get(arg, arg) - - -@_dask_cudf_performance_tracking -def _aggs_optimized(arg, supported: set): - """Check that aggregations in `arg` are a subset of 
`supported`""" - if isinstance(arg, (list, dict)): - if isinstance(arg, dict): - _global_set: set[str] = set() - for col in arg: - if isinstance(arg[col], list): - _global_set = _global_set.union(set(arg[col])) - elif isinstance(arg[col], dict): - _global_set = _global_set.union(set(arg[col].values())) - else: - _global_set.add(arg[col]) - else: - _global_set = set(arg) - - return bool(_global_set.issubset(supported)) - elif isinstance(arg, (str, type)): - return arg in supported - return False - - -@_dask_cudf_performance_tracking -def _groupby_optimized(gb): - """Check that groupby input can use dask-cudf optimized codepath""" - return isinstance(gb.obj, DaskDataFrame) and ( - isinstance(gb.by, str) - or (isinstance(gb.by, list) and all(isinstance(x, str) for x in gb.by)) - ) - - -def _make_name(col_name, sep="_"): - """Combine elements of `col_name` into a single string, or no-op if - `col_name` is already a string - """ - if isinstance(col_name, str): - return col_name - return sep.join(name for name in col_name if name != "") - - -@_dask_cudf_performance_tracking -def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): - """Initial partition-level aggregation task. - - This is the first operation to be executed on each input - partition in `groupby_agg`. Depending on `aggs`, four possible - groupby aggregations ("count", "sum", "min", and "max") are - performed. The result is then partitioned (by hashing `gb_cols`) - into a number of distinct dictionary elements. The number of - elements in the output dictionary (`split_out`) corresponds to - the number of partitions in the final output of `groupby_agg`. - """ - - # Modify dict for initial (partition-wise) aggregations - _agg_dict = {} - for col, agg_list in aggs.items(): - _agg_dict[col] = set() - for agg in agg_list: - if agg in ("mean", "std", "var"): - _agg_dict[col].add("count") - _agg_dict[col].add("sum") - else: - _agg_dict[col].add(agg) - _agg_dict[col] = list(_agg_dict[col]) - if set(agg_list).intersection({"std", "var"}): - pow2_name = _make_name((col, "pow2"), sep=sep) - df[pow2_name] = df[col].astype("float64").pow(2) - _agg_dict[pow2_name] = ["sum"] - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - _agg_dict - ) - output_columns = [_make_name(name, sep=sep) for name in gb.columns] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _tree_node_agg(df, gb_cols, dropna, sort, sep): - """Node in groupby-aggregation reduction tree. - - The input DataFrame (`df`) corresponds to the - concatenated output of one or more `_groupby_partition_agg` - tasks. In this function, "sum", "min" and/or "max" groupby - aggregations will be used to combine the statistics for - duplicate keys. 
- """ - - agg_dict = {} - for col in df.columns: - if col in gb_cols: - continue - agg = col.split(sep)[-1] - if agg in ("count", "sum"): - agg_dict[col] = ["sum"] - elif agg == "list": - agg_dict[col] = [list] - elif agg in OPTIMIZED_AGGS: - agg_dict[col] = [agg] - else: - raise ValueError(f"Unexpected aggregation: {agg}") - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - agg_dict - ) - - # Don't include the last aggregation in the column names - output_columns = [ - _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) - for name in gb.columns - ] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): - """Calculate variance (given count, sum, and sum-squared columns).""" - - # Select count, sum, and sum-squared - n = df[count_name] - x = df[sum_name] - x2 = df[pow2_sum_name] - - # Use sum-squared approach to get variance - var = x2 - x**2 / n - div = n - ddof - div[div < 1] = 1 # Avoid division by 0 - var /= div - - # Set appropriate NaN elements - # (since we avoided 0-division) - var[(n - ddof) == 0] = np.nan - - return var - - -@_dask_cudf_performance_tracking -def _finalize_gb_agg( - gb_in, - gb_cols, - aggs, - columns, - final_columns, - as_index, - dropna, - sort, - sep, - str_cols_out, - aggs_renames, -): - """Final aggregation task. - - This is the final operation on each output partitions - of the `groupby_agg` algorithm. This function must - take care of higher-order aggregations, like "mean", - "std" and "var". We also need to deal with the column - index, the row index, and final sorting behavior. - """ - - gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) - - # Deal with higher-order aggregations - for col in columns: - agg_list = aggs.get(col, []) - agg_set = set(agg_list) - if agg_set.intersection({"mean", "std", "var"}): - count_name = _make_name((col, "count"), sep=sep) - sum_name = _make_name((col, "sum"), sep=sep) - if agg_set.intersection({"std", "var"}): - pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) - var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) - if "var" in agg_list: - name_var = _make_name((col, "var"), sep=sep) - gb[name_var] = var - if "std" in agg_list: - name_std = _make_name((col, "std"), sep=sep) - gb[name_std] = np.sqrt(var) - gb.drop(columns=[pow2_sum_name], inplace=True) - if "mean" in agg_list: - mean_name = _make_name((col, "mean"), sep=sep) - gb[mean_name] = gb[sum_name] / gb[count_name] - if "sum" not in agg_list: - gb.drop(columns=[sum_name], inplace=True) - if "count" not in agg_list: - gb.drop(columns=[count_name], inplace=True) - if list in agg_list: - collect_name = _make_name((col, "list"), sep=sep) - gb[collect_name] = gb[collect_name].list.concat() - - # Ensure sorted keys if `sort=True` - if sort: - gb = gb.sort_values(gb_cols) - - # Set index if necessary - if as_index: - gb.set_index(gb_cols, inplace=True) - - # Unflatten column names - col_array = [] - agg_array = [] - for col in gb.columns: - if col in gb_cols: - col_array.append(col) - agg_array.append("") - else: - name, agg = col.split(sep) - col_array.append(name) - agg_array.append(aggs_renames.get((name, agg), agg)) - if str_cols_out: - gb.columns = col_array - else: - gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - return gb[final_columns] diff --git a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py 
b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py index 0421bd755f4..c544c32523f 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py @@ -1,11 +1 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from .csv import read_csv # noqa: F401 -from .json import read_json # noqa: F401 -from .orc import read_orc, to_orc # noqa: F401 -from .text import read_text # noqa: F401 - -try: - from .parquet import read_parquet, to_parquet # noqa: F401 -except ImportError: - pass +# Copyright (c) 2018-2025, NVIDIA CORPORATION. diff --git a/python/dask_cudf/dask_cudf/_legacy/io/csv.py b/python/dask_cudf/dask_cudf/_legacy/io/csv.py deleted file mode 100644 index fa5400344f9..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/csv.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -import os -from glob import glob -from warnings import warn - -from fsspec.utils import infer_compression - -from dask import dataframe as dd -from dask.base import tokenize -from dask.dataframe.io.csv import make_reader -from dask.utils import apply, parse_bytes - -import cudf - - -def read_csv(path, blocksize="default", **kwargs): - """ - Read CSV files into a :class:`.DataFrame`. - - This API parallelizes the :func:`cudf:cudf.read_csv` function in - the following ways: - - It supports loading many files at once using globstrings: - - >>> import dask_cudf - >>> df = dask_cudf.read_csv("myfiles.*.csv") - - In some cases it can break up large files: - - >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - - It can read CSV files from external resources (e.g. S3, HTTP, FTP) - - >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") - >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - - Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and - supports many of the same keyword arguments with the same - performance guarantees. See the docstring for - :func:`cudf:cudf.read_csv` for more information on available - keyword arguments. - - Parameters - ---------- - path : str, path object, or file-like object - Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as - builtin :py:func:`open` file handler function or - :py:class:`~io.StringIO`). - blocksize : int or str, default "256 MiB" - The target task partition size. If ``None``, a single block - is used for each file. - **kwargs : dict - Passthrough key-word arguments that are sent to - :func:`cudf:cudf.read_csv`. - - Notes - ----- - If any of `skipfooter`/`skiprows`/`nrows` are passed, - `blocksize` will default to None. - - Examples - -------- - >>> import dask_cudf - >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) - >>> ddf.compute() - a b - 0 1 hi - 1 2 hello - 2 3 ai - - """ - - # Handle `chunksize` deprecation - if "chunksize" in kwargs: - chunksize = kwargs.pop("chunksize", "default") - warn( - "`chunksize` is deprecated and will be removed in the future. " - "Please use `blocksize` instead.", - FutureWarning, - ) - if blocksize == "default": - blocksize = chunksize - - # Set default `blocksize` - if blocksize == "default": - if ( - kwargs.get("skipfooter", 0) != 0 - or kwargs.get("skiprows", 0) != 0 - or kwargs.get("nrows", None) is not None - ): - # Cannot read in blocks if skipfooter, - # skiprows or nrows is passed. 
- blocksize = None - else: - blocksize = "256 MiB" - - if "://" in str(path): - func = make_reader(cudf.read_csv, "read_csv", "CSV") - return func(path, blocksize=blocksize, **kwargs) - else: - return _internal_read_csv(path=path, blocksize=blocksize, **kwargs) - - -def _internal_read_csv(path, blocksize="256 MiB", **kwargs): - if isinstance(blocksize, str): - blocksize = parse_bytes(blocksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - name = "read-csv-" + tokenize( - path, tokenize, **kwargs - ) # TODO: get last modified time - - compression = kwargs.get("compression", "infer") - - if compression == "infer": - # Infer compression from first path by default - compression = infer_compression(filenames[0]) - - if compression and blocksize: - # compressed CSVs reading must read the entire file - kwargs.pop("byte_range", None) - warn( - "Warning %s compression does not support breaking apart files\n" - "Please ensure that each individual file can fit in memory and\n" - "use the keyword ``blocksize=None to remove this message``\n" - "Setting ``blocksize=(size of file)``" % compression - ) - blocksize = None - - if blocksize is None: - return read_csv_without_blocksize(path, **kwargs) - - # Let dask.dataframe generate meta - dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") - kwargs1 = kwargs.copy() - usecols = kwargs1.pop("usecols", None) - dtype = kwargs1.pop("dtype", None) - meta = dask_reader(filenames[0], **kwargs1)._meta - names = meta.columns - if usecols or dtype: - # Regenerate meta with original kwargs if - # `usecols` or `dtype` was specified - meta = dask_reader(filenames[0], **kwargs)._meta - - dsk = {} - i = 0 - dtypes = meta.dtypes.values - - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, blocksize): - kwargs2 = kwargs.copy() - kwargs2["byte_range"] = ( - start, - blocksize, - ) # specify which chunk of the file we care about - if start != 0: - kwargs2["names"] = names # no header in the middle of the file - kwargs2["header"] = None - dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) - - i += 1 - - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) - - -def _read_csv(fn, dtypes=None, **kwargs): - return cudf.read_csv(fn, **kwargs) - - -def read_csv_without_blocksize(path, **kwargs): - """Read entire CSV with optional compression (gzip/zip) - - Parameters - ---------- - path : str - path to files (support for glob) - """ - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - name = "read-csv-" + tokenize(path, **kwargs) - - meta_kwargs = kwargs.copy() - if "skipfooter" in meta_kwargs: - meta_kwargs.pop("skipfooter") - if "nrows" in meta_kwargs: - meta_kwargs.pop("nrows") - # Read "head" of first file (first 5 rows). - # Convert to empty df for metadata. 
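# Sketch of the byte-range planning used by `_internal_read_csv` above: each
# file is cut into blocksize-sized (start, length) windows, and every window
# after the first re-supplies column names because only the head of the file
# carries a header row. `plan_csv_tasks` is an illustrative helper, not an
# API from the patch; each task would be fed to cudf.read_csv(**kwargs).
import os

def plan_csv_tasks(filenames, blocksize, names=None):
    tasks = []
    for fn in filenames:
        size = os.path.getsize(fn)
        for start in range(0, size, blocksize):
            kwargs = {"byte_range": (start, blocksize)}
            if start != 0:
                # no header in the middle of the file
                kwargs.update(names=names, header=None)
            tasks.append((fn, kwargs))
    return tasks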
- meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0] - - graph = { - (name, i): (apply, cudf.read_csv, [fn], kwargs) - for i, fn in enumerate(filenames) - } - - divisions = [None] * (len(filenames) + 1) - - return dd.core.new_dd_object(graph, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/json.py b/python/dask_cudf/dask_cudf/_legacy/io/json.py deleted file mode 100644 index 98c5ceedb76..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/json.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from functools import partial - -import numpy as np -from fsspec.core import get_compression, get_fs_token_paths - -import dask -from dask.utils import parse_bytes - -import cudf -from cudf.core.column import as_column -from cudf.utils.ioutils import _is_local_filesystem - -from dask_cudf.backends import _default_backend - - -def _read_json_partition( - paths, - fs=None, - include_path_column=False, - path_converter=None, - **kwargs, -): - # Transfer all data up front for remote storage - sources = ( - paths - if fs is None - else fs.cat_ranges( - paths, - [0] * len(paths), - fs.sizes(paths), - ) - ) - - if include_path_column: - # Add "path" column. - # Must iterate over sources sequentially - if not isinstance(include_path_column, str): - include_path_column = "path" - converted_paths = ( - paths - if path_converter is None - else [path_converter(path) for path in paths] - ) - dfs = [] - for i, source in enumerate(sources): - df = cudf.read_json(source, **kwargs) - df[include_path_column] = as_column( - converted_paths[i], length=len(df) - ) - dfs.append(df) - return cudf.concat(dfs) - else: - # Pass sources directly to cudf - return cudf.read_json(sources, **kwargs) - - -def read_json( - url_path, - engine="auto", - blocksize=None, - orient="records", - lines=None, - compression="infer", - aggregate_files=True, - **kwargs, -): - """Read JSON data into a :class:`.DataFrame`. - - This function wraps :func:`dask.dataframe.read_json`, and passes - ``engine=partial(cudf.read_json, engine="auto")`` by default. - - Parameters - ---------- - url_path : str, list of str - Location to read from. If a string, can include a glob character to - find a set of file names. - Supports protocol specifications such as ``"s3://"``. - engine : str or Callable, default "auto" - - If str, this value will be used as the ``engine`` argument - when :func:`cudf.read_json` is used to create each partition. - If a :obj:`~collections.abc.Callable`, this value will be used as the - underlying function used to create each partition from JSON - data. The default value is "auto", so that - ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to :func:`dask.dataframe.read_json` by default. - aggregate_files : bool or int - Whether to map multiple files to each output partition. If True, - the `blocksize` argument will be used to determine the number of - files in each partition. If any one file is larger than `blocksize`, - the `aggregate_files` argument will be ignored. If an integer value - is specified, the `blocksize` argument will be ignored, and that - number of files will be mapped to each partition. Default is True. - **kwargs : - Key-word arguments to pass through to :func:`dask.dataframe.read_json`. 
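# The engine plumbing described above, in isolation: a string value is
# wrapped into a partial of cudf.read_json, while a callable is passed
# through untouched. `resolve_engine` is an illustrative name only.
from functools import partial

import cudf

def resolve_engine(engine="auto"):
    if isinstance(engine, str):
        return partial(cudf.read_json, engine=engine)
    return engine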
- - Returns - ------- - :class:`.DataFrame` - - Examples - -------- - Load single file - - >>> from dask_cudf import read_json - >>> read_json('myfile.json') # doctest: +SKIP - - Load large line-delimited JSON files using partitions of approx - 256MB size - - >>> read_json('data/file*.csv', blocksize=2**28) # doctest: +SKIP - - Load nested JSON data - - >>> read_json('myfile.json') # doctest: +SKIP - - See Also - -------- - dask.dataframe.read_json - - """ - - if lines is None: - lines = orient == "records" - if orient != "records" and lines: - raise ValueError( - 'Line-delimited JSON is only available with orient="records".' - ) - if blocksize and (orient != "records" or not lines): - raise ValueError( - "JSON file chunking only allowed for JSON-lines" - "input (orient='records', lines=True)." - ) - - inputs = [] - if aggregate_files and blocksize or int(aggregate_files) > 1: - # Attempt custom read if we are mapping multiple files - # to each output partition. Otherwise, upstream logic - # is sufficient. - - storage_options = kwargs.get("storage_options", {}) - fs, _, paths = get_fs_token_paths( - url_path, mode="rb", storage_options=storage_options - ) - if isinstance(aggregate_files, int) and aggregate_files > 1: - # Map a static file count to each partition - inputs = [ - paths[offset : offset + aggregate_files] - for offset in range(0, len(paths), aggregate_files) - ] - elif aggregate_files is True and blocksize: - # Map files dynamically (using blocksize) - file_sizes = fs.sizes(paths) # NOTE: This can be slow - blocksize = parse_bytes(blocksize) - if all([file_size <= blocksize for file_size in file_sizes]): - counts = np.unique( - np.floor(np.cumsum(file_sizes) / blocksize), - return_counts=True, - )[1] - offsets = np.concatenate([[0], counts.cumsum()]) - inputs = [ - paths[offsets[i] : offsets[i + 1]] - for i in range(len(offsets) - 1) - ] - - if inputs: - # Inputs were successfully populated. - # Use custom _read_json_partition function - # to generate each partition. - - compression = get_compression( - url_path[0] if isinstance(url_path, list) else url_path, - compression, - ) - _kwargs = dict( - orient=orient, - lines=lines, - compression=compression, - include_path_column=kwargs.get("include_path_column", False), - path_converter=kwargs.get("path_converter"), - ) - if not _is_local_filesystem(fs): - _kwargs["fs"] = fs - # TODO: Generate meta more efficiently - meta = _read_json_partition(inputs[0][:1], **_kwargs) - return dask.dataframe.from_map( - _read_json_partition, - inputs, - meta=meta, - **_kwargs, - ) - - # Fall back to dask.dataframe.read_json - return _default_backend( - dask.dataframe.read_json, - url_path, - engine=( - partial(cudf.read_json, engine=engine) - if isinstance(engine, str) - else engine - ), - blocksize=blocksize, - orient=orient, - lines=lines, - compression=compression, - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/orc.py b/python/dask_cudf/dask_cudf/_legacy/io/orc.py deleted file mode 100644 index fcf684fd6c8..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/orc.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
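# The legacy `read_json` deleted above maps several small files to each
# output partition by bucketing cumulative file sizes into blocksize
# windows. A standalone sketch of that bucketing, assuming every file is
# already <= blocksize (the original verifies this before taking the path):
import numpy as np

def bucket_paths(paths, file_sizes, blocksize):
    counts = np.unique(
        np.floor(np.cumsum(file_sizes) / blocksize), return_counts=True
    )[1]
    offsets = np.concatenate([[0], counts.cumsum()])
    return [paths[offsets[i] : offsets[i + 1]] for i in range(len(offsets) - 1)]

# e.g. sizes [100, 100, 100] with blocksize=250 -> [['p0', 'p1'], ['p2']]
print(bucket_paths(["p0", "p1", "p2"], [100, 100, 100], 250))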
- -from io import BufferedWriter, IOBase - -from fsspec.core import get_fs_token_paths -from fsspec.utils import stringify_path -from pyarrow import orc as orc - -from dask import dataframe as dd -from dask.dataframe.io.utils import _get_pyarrow_dtypes - -import cudf - - -def _read_orc_stripe(source, fs, columns=None, kwargs=None): - """Pull out specific columns from specific stripe""" - path, stripe = source - if kwargs is None: - kwargs = {} - with fs.open(path, "rb") as f: - df_stripe = cudf.read_orc( - f, stripes=[stripe], columns=columns, **kwargs - ) - return df_stripe - - -def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read ORC files into a :class:`.DataFrame`. - - Note that this function is mostly borrowed from upstream Dask. - - Parameters - ---------- - path : str or list[str] - Location of file(s), which can be a full URL with protocol specifier, - and may include glob character if a single string. - columns : None or list[str] - Columns to load. If None, loads all. - filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out - row groups using statistics stored for each row group as - Parquet metadata. Row groups that do not match the given - filter predicate are not read. The predicate is expressed in - `disjunctive normal form (DNF) - `__ - like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary - boolean logical combinations of single column predicates. The - innermost tuples each describe a single column predicate. The - list of inner predicates is interpreted as a conjunction - (AND), forming a more selective and multiple column predicate. - Finally, the outermost list combines these filters as a - disjunction (OR). Predicates may also be passed as a list of - tuples. This form is interpreted as a single conjunction. To - express OR in predicates, one must use the (preferred) - notation of list of lists of tuples. - storage_options : None or dict - Further parameters to pass to the bytes backend. 
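# Concrete shape of the DNF `filters` argument described above (values are
# illustrative): the outer list is an OR of inner AND-lists, and each tuple
# is a single-column predicate.
filters = [
    [("x", "=", 0), ("y", ">", 10)],  # x == 0 AND y > 10
    [("z", "<", 5)],                  # OR z < 5
]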
- - See Also - -------- - dask.dataframe.read_orc - - Returns - ------- - dask_cudf.DataFrame - - """ - - storage_options = storage_options or {} - fs, _, paths = get_fs_token_paths( - path, mode="rb", storage_options=storage_options - ) - schema = None - nstripes_per_file = [] - for path in paths: - with fs.open(path, "rb") as f: - o = orc.ORCFile(f) - if schema is None: - schema = o.schema - elif schema != o.schema: - raise ValueError( - "Incompatible schemas while parsing ORC files" - ) - nstripes_per_file.append(o.nstripes) - schema = _get_pyarrow_dtypes(schema, categories=None) - if columns is not None: - ex = set(columns) - set(schema) - if ex: - raise ValueError( - f"Requested columns ({ex}) not in schema ({set(schema)})" - ) - else: - columns = list(schema) - - with fs.open(paths[0], "rb") as f: - meta = cudf.read_orc( - f, - stripes=[0] if nstripes_per_file[0] else None, - columns=columns, - **kwargs, - ) - - sources = [] - for path, n in zip(paths, nstripes_per_file): - for stripe in ( - range(n) - if filters is None - else cudf.io.orc._filter_stripes(filters, path) - ): - sources.append((path, stripe)) - - return dd.from_map( - _read_orc_stripe, - sources, - args=[fs], - columns=columns, - kwargs=kwargs, - meta=meta, - ) - - -def write_orc_partition(df, path, fs, filename, compression="snappy"): - full_path = fs.sep.join([path, filename]) - with fs.open(full_path, mode="wb") as out_file: - if not isinstance(out_file, IOBase): - out_file = BufferedWriter(out_file) - cudf.io.to_orc(df, out_file, compression=compression) - return full_path - - -def to_orc( - df, - path, - write_index=True, - storage_options=None, - compression="snappy", - compute=True, - **kwargs, -): - """ - Write a :class:`.DataFrame` to ORC file(s) (one file per partition). - - Parameters - ---------- - df : DataFrame - path : str or pathlib.Path - Destination directory for data. Prepend with protocol like ``s3://`` - or ``hdfs://`` for remote data. - write_index : boolean, optional - Whether or not to write the index. Defaults to True. - storage_options : None or dict - Further parameters to pass to the bytes backend. - compression : string or dict, optional - compute : bool, optional - If True (default) then the result is computed immediately. If - False then a :class:`~dask.delayed.Delayed` object is returned - for future computation. 
- - """ - - from dask import compute as dask_compute, delayed - - # TODO: Use upstream dask implementation once available - # (see: Dask Issue#5596) - - if hasattr(path, "name"): - path = stringify_path(path) - fs, _, _ = get_fs_token_paths( - path, mode="wb", storage_options=storage_options - ) - # Trim any protocol information from the path before forwarding - path = fs._strip_protocol(path) - - if write_index: - df = df.reset_index() - else: - # Not writing index - might as well drop it - df = df.reset_index(drop=True) - - fs.mkdirs(path, exist_ok=True) - - # Use i_offset and df.npartitions to define file-name list - filenames = ["part.%i.orc" % i for i in range(df.npartitions)] - - # write parts - dwrite = delayed(write_orc_partition) - parts = [ - dwrite(d, path, fs, filename, compression=compression) - for d, filename in zip(df.to_delayed(), filenames) - ] - - if compute: - return dask_compute(*parts) - - return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0638e4a1c3..c0792663c7e 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import itertools import warnings from functools import partial @@ -8,7 +8,7 @@ import pandas as pd from pyarrow import dataset as pa_ds, parquet as pq -from dask import dataframe as dd +import dask.dataframe as dd from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine try: @@ -448,65 +448,7 @@ def set_object_dtypes_from_pa_schema(df, schema): df._data[col_name] = col.astype(typ) -def read_parquet(path, columns=None, **kwargs): - """ - Read parquet files into a :class:`.DataFrame`. - - Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` - to coordinate the execution of :func:`cudf.read_parquet`, and to - ultimately create a :class:`.DataFrame` collection. - - See the :func:`dask.dataframe.read_parquet` documentation for - all available options. - - Examples - -------- - >>> from dask_cudf import read_parquet - >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP - - When dealing with one or more large parquet files having an - in-memory footprint >15% device memory, the ``split_row_groups`` - argument should be used to map Parquet **row-groups** to DataFrame - partitions (instead of **files** to partitions). For example, the - following code will map each row-group to a distinct partition: - - >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP - - To map **multiple** row-groups to each partition, an integer can be - passed to ``split_row_groups`` to specify the **maximum** number of - row-groups allowed in each output partition: - - >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP - - See Also - -------- - cudf.read_parquet - dask.dataframe.read_parquet - """ - if isinstance(columns, str): - columns = [columns] - - # Set "check_file_size" option to determine whether we - # should check the parquet-file size. This check is meant - # to "protect" users from `split_row_groups` default changes - check_file_size = kwargs.pop("check_file_size", 500_000_000) - if ( - check_file_size - and ("split_row_groups" not in kwargs) - and ("chunksize" not in kwargs) - ): - # User is not specifying `split_row_groups` or `chunksize`, - # so we should warn them if/when a file is ~>0.5GB on disk. 
- # They can set `split_row_groups` explicitly to silence/skip - # this check - if "read" not in kwargs: - kwargs["read"] = {} - kwargs["read"]["check_file_size"] = check_file_size - - return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) - - -to_parquet = partial(dd.to_parquet, engine=CudfEngine) +to_parquet = dd.to_parquet if create_metadata_file_dd is None: create_metadata_file = create_metadata_file_dd diff --git a/python/dask_cudf/dask_cudf/_legacy/io/text.py b/python/dask_cudf/dask_cudf/_legacy/io/text.py deleted file mode 100644 index 3757c85c80c..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/text.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -import os -from glob import glob - -import dask.dataframe as dd -from dask.utils import parse_bytes - -import cudf - - -def _read_text(source, **kwargs): - # Wrapper for cudf.read_text operation - fn, byte_range = source - return cudf.read_text(fn, byte_range=byte_range, **kwargs) - - -def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs): - if isinstance(chunksize, str): - chunksize = parse_bytes(chunksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - if chunksize and byte_range: - raise ValueError("Cannot specify both chunksize and byte_range.") - - if chunksize: - sources = [] - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, chunksize): - byte_range = ( - start, - chunksize, - ) # specify which chunk of the file we care about - sources.append((fn, byte_range)) - else: - sources = [(fn, byte_range) for fn in filenames] - - return dd.from_map( - _read_text, - sources, - meta=cudf.Series([], dtype="O"), - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/sorting.py b/python/dask_cudf/dask_cudf/_legacy/sorting.py deleted file mode 100644 index a2ba4d1878e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/sorting.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import warnings -from collections.abc import Iterator -from functools import wraps - -import cupy -import numpy as np -import tlz as toolz - -from dask import config -from dask.base import tokenize -from dask.dataframe import methods -from dask.dataframe.core import DataFrame, Index, Series -from dask.dataframe.shuffle import rearrange_by_column -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M - -import cudf -from cudf.api.types import _is_categorical_dtype -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -_SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported - - -def _deprecate_shuffle_kwarg(func): - @wraps(func) - def wrapper(*args, **kwargs): - old_arg_value = kwargs.pop("shuffle", None) - - if old_arg_value is not None: - new_arg_value = old_arg_value - msg = ( - "the 'shuffle' keyword is deprecated, " - "use 'shuffle_method' instead." - ) - - warnings.warn(msg, FutureWarning) - if kwargs.get("shuffle_method") is not None: - msg = ( - "Can only specify 'shuffle' " - "or 'shuffle_method', not both." 
- ) - raise TypeError(msg) - kwargs["shuffle_method"] = new_arg_value - return func(*args, **kwargs) - - return wrapper - - -@_dask_cudf_performance_tracking -def set_index_post(df, index_name, drop, column_dtype): - df2 = df.set_index(index_name, drop=drop) - df2.columns = df2.columns.astype(column_dtype) - return df2 - - -@_dask_cudf_performance_tracking -def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): - if ascending: - partitions = divisions.searchsorted(s, side="right") - 1 - else: - partitions = ( - len(divisions) - divisions.searchsorted(s, side="right") - 1 - ) - partitions[(partitions < 0) | (partitions >= len(divisions) - 1)] = ( - 0 if ascending else (len(divisions) - 2) - ) - partitions[s._columns[0].isnull().values] = ( - len(divisions) - 2 if na_position == "last" else 0 - ) - return partitions - - -@_dask_cudf_performance_tracking -def _quantile(a, q): - n = len(a) - if not len(a): - return None, n - return ( - a.quantile(q=q.tolist(), interpolation="nearest", method="table"), - n, - ) - - -@_dask_cudf_performance_tracking -def merge_quantiles(finalq, qs, vals): - """Combine several quantile calculations of different data. - [NOTE: Same logic as dask.array merge_percentiles] - """ - if isinstance(finalq, Iterator): - finalq = list(finalq) - finalq = np.array(finalq) - qs = list(map(list, qs)) - vals = list(vals) - vals, Ns = zip(*vals) - Ns = list(Ns) - - L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N])) - if not L: - raise ValueError("No non-trivial arrays found") - qs, vals, Ns = L - - if len(vals) != len(qs) or len(Ns) != len(qs): - raise ValueError("qs, vals, and Ns parameters must be the same length") - - # transform qs and Ns into number of observations between quantiles - counts = [] - for q, N in zip(qs, Ns): - count = np.empty(len(q)) - count[1:] = np.diff(q) - count[0] = q[0] - count *= N - counts.append(count) - - def _append_counts(val, count): - val["_counts"] = count - return val - - # Sort by calculated quantile values, then number of observations. - combined_vals_counts = cudf.core.reshape._merge_sorted( - [*map(_append_counts, vals, counts)] - ) - combined_counts = cupy.asnumpy(combined_vals_counts["_counts"].values) - combined_vals = combined_vals_counts.drop(columns=["_counts"]) - - # quantile-like, but scaled by total number of observations - combined_q = np.cumsum(combined_counts) - - # rescale finalq quantiles to match combined_q - desired_q = finalq * sum(Ns) - - # TODO: Support other interpolation methods - # For now - Always use "nearest" for interpolation - left = np.searchsorted(combined_q, desired_q, side="left") - right = np.searchsorted(combined_q, desired_q, side="right") - 1 - np.minimum(left, len(combined_vals) - 1, left) # don't exceed max index - lower = np.minimum(left, right) - upper = np.maximum(left, right) - lower_residual = np.abs(combined_q[lower] - desired_q) - upper_residual = np.abs(combined_q[upper] - desired_q) - mask = lower_residual > upper_residual - index = lower # alias; we no longer need lower - index[mask] = upper[mask] - rv = combined_vals.iloc[index] - return rv.reset_index(drop=True) - - -@_dask_cudf_performance_tracking -def _approximate_quantile(df, q): - """Approximate quantiles of DataFrame or Series. 
- [NOTE: Same logic as dask.dataframe Series quantile] - """ - # current implementation needs q to be sorted so - # sort if array-like, otherwise leave it alone - q_ndarray = np.array(q) - if q_ndarray.ndim > 0: - q_ndarray.sort(kind="mergesort") - q = q_ndarray - - # Lets assume we are dealing with a DataFrame throughout - if isinstance(df, (Series, Index)): - df = df.to_frame() - assert isinstance(df, DataFrame) - final_type = df._meta._constructor - - # Create metadata - meta = df._meta_nonempty.quantile(q=q, method="table") - - # Define final action (create df with quantiles as index) - def finalize_tsk(tsk): - return (final_type, tsk) - - return_type = df.__class__ - - # pandas/cudf uses quantile in [0, 1] - # numpy / cupy uses [0, 100] - qs = np.asarray(q) - token = tokenize(df, qs) - - if len(qs) == 0: - name = "quantiles-" + token - empty_index = cudf.Index([], dtype=float) - return Series( - { - (name, 0): final_type( - {col: [] for col in df.columns}, - name=df.name, - index=empty_index, - ) - }, - name, - df._meta, - [None, None], - ) - else: - new_divisions = [np.min(q), np.max(q)] - - name = "quantiles-1-" + token - val_dsk = { - (name, i): (_quantile, key, qs) - for i, key in enumerate(df.__dask_keys__()) - } - - name2 = "quantiles-2-" + token - merge_dsk = { - (name2, 0): finalize_tsk( - (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk)) - ) - } - dsk = toolz.merge(val_dsk, merge_dsk) - graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df]) - df = return_type(graph, name2, meta, new_divisions) - - def set_quantile_index(df): - df.index = q - return df - - df = df.map_partitions(set_quantile_index, meta=meta) - return df - - -@_dask_cudf_performance_tracking -def quantile_divisions(df, by, npartitions): - qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() - divisions = _approximate_quantile(df[by], qn).compute() - columns = divisions.columns - - # TODO: Make sure divisions are correct for all dtypes.. 
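# Sketch of how `_set_partitions_pre` above maps rows onto the divisions
# produced by `quantile_divisions`: a right-sided searchsorted against the
# boundary values yields a partition id per row. Toy numpy version; the real
# code additionally handles descending order, out-of-range ids and NA
# placement with explicit masking.
import numpy as np

def assign_partitions(values, divisions):
    parts = np.searchsorted(divisions, values, side="right") - 1
    return np.clip(parts, 0, len(divisions) - 2)

# divisions [0, 10, 20] define two partitions: [0, 10) and [10, 20]
print(assign_partitions(np.array([3, 10, 25]), np.array([0, 10, 20])))  # [0 1 1]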
- if ( - len(columns) == 1 - and df[columns[0]].dtype != "object" - and not _is_categorical_dtype(df[columns[0]].dtype) - ): - dtype = df[columns[0]].dtype - divisions = divisions[columns[0]].astype("int64") - divisions.iloc[-1] += 1 - divisions = sorted( - divisions.drop_duplicates().astype(dtype).to_arrow().tolist(), - key=lambda x: (x is None, x), - ) - else: - for col in columns: - dtype = df[col].dtype - if dtype != "object": - divisions[col] = divisions[col].astype("int64") - divisions[col].iloc[-1] += 1 - divisions[col] = divisions[col].astype(dtype) - else: - if last := divisions[col].iloc[-1]: - val = chr(ord(last[0]) + 1) - else: - val = "this string intentionally left empty" # any but "" - divisions[col].iloc[-1] = val - divisions = divisions.drop_duplicates().sort_index() - return divisions - - -@_deprecate_shuffle_kwarg -@_dask_cudf_performance_tracking -def sort_values( - df, - by, - max_branch=None, - divisions=None, - set_divisions=False, - ignore_index=False, - ascending=True, - na_position="last", - shuffle_method=None, - sort_function=None, - sort_function_kwargs=None, -): - """Sort by the given list/tuple of column names.""" - - if not isinstance(ascending, bool): - raise ValueError("ascending must be either True or False") - if na_position not in ("first", "last"): - raise ValueError("na_position must be either 'first' or 'last'") - - npartitions = df.npartitions - if isinstance(by, tuple): - by = list(by) - elif not isinstance(by, list): - by = [by] - - # parse custom sort function / kwargs if provided - sort_kwargs = { - "by": by, - "ascending": ascending, - "na_position": na_position, - } - if sort_function is None: - sort_function = M.sort_values - if sort_function_kwargs is not None: - sort_kwargs.update(sort_function_kwargs) - - # handle single partition case - if npartitions == 1: - return df.map_partitions(sort_function, **sort_kwargs) - - # Step 1 - Calculate new divisions (if necessary) - if divisions is None: - divisions = quantile_divisions(df, by, npartitions) - - # Step 2 - Perform repartitioning shuffle - meta = df._meta._constructor_sliced([0]) - if not isinstance(divisions, (cudf.Series, cudf.DataFrame)): - dtype = df[by[0]].dtype - divisions = df._meta._constructor_sliced(divisions, dtype=dtype) - - partitions = df[by].map_partitions( - _set_partitions_pre, - divisions=divisions, - ascending=ascending, - na_position=na_position, - meta=meta, - ) - - df2 = df.assign(_partitions=partitions) - df3 = rearrange_by_column( - df2, - "_partitions", - max_branch=max_branch, - npartitions=len(divisions) - 1, - shuffle_method=_get_shuffle_method(shuffle_method), - ignore_index=ignore_index, - ).drop(columns=["_partitions"]) - df3.divisions = (None,) * (df3.npartitions + 1) - - # Step 3 - Return final sorted df - df4 = df3.map_partitions(sort_function, **sort_kwargs) - if not isinstance(divisions, cudf.DataFrame) and set_divisions: - # Can't have multi-column divisions elsewhere in dask (yet) - df4.divisions = tuple(methods.tolist(divisions)) - - return df4 - - -def get_default_shuffle_method(): - # Note that `dask.utils.get_default_shuffle_method` - # will return "p2p" by default when a distributed - # client is present. 
Dask-cudf supports "p2p", but - # will not use it by default (yet) - default = config.get("dataframe.shuffle.method", "tasks") - if default not in _SHUFFLE_SUPPORT: - default = "tasks" - return default - - -def _get_shuffle_method(shuffle_method): - # Utility to set the shuffle_method-kwarg default - # and to validate user-specified options - shuffle_method = shuffle_method or get_default_shuffle_method() - if shuffle_method not in _SHUFFLE_SUPPORT: - raise ValueError( - "Dask-cudf only supports the following shuffle " - f"methods: {_SHUFFLE_SUPPORT}. Got shuffle_method={shuffle_method}" - ) - - return shuffle_method diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index fceaaf185e8..f33733d9583 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import warnings from collections.abc import Iterator @@ -11,14 +11,12 @@ from packaging.version import Version from pandas.api.types import is_scalar -import dask.dataframe as dd from dask import config from dask.array.dispatch import percentile_lookup from dask.dataframe.backends import ( DataFrameBackendEntrypoint, PandasBackendEntrypoint, ) -from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( categorical_dtype_dispatch, concat_dispatch, @@ -28,6 +26,8 @@ hash_object_dispatch, is_categorical_dtype_dispatch, make_meta_dispatch, + meta_nonempty, + partd_encode_dispatch, pyarrow_schema_dispatch, to_pyarrow_table_dispatch, tolist_dispatch, @@ -46,13 +46,6 @@ from cudf.api.types import is_string_dtype from cudf.utils.performance_tracking import _dask_cudf_performance_tracking -from ._legacy.core import DataFrame, Index, Series - -get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) -get_parallel_type.register(cudf.Series, lambda _: Series) -get_parallel_type.register(cudf.BaseIndex, lambda _: Index) - - # Required for Arrow filesystem support in read_parquet PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0") @@ -318,7 +311,7 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( - (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) + (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype) # , Series) ) @_dask_cudf_performance_tracking def is_categorical_dtype_cudf(obj): @@ -464,28 +457,21 @@ def sizeof_cudf_series_index(obj): return obj.memory_usage() -# TODO: Remove try/except when cudf is pinned to dask>=2023.10.0 -try: - from dask.dataframe.dispatch import partd_encode_dispatch - - @partd_encode_dispatch.register(cudf.DataFrame) - def _simple_cudf_encode(_): - # Basic pickle-based encoding for a partd k-v store - import pickle +@partd_encode_dispatch.register(cudf.DataFrame) +def _simple_cudf_encode(_): + # Basic pickle-based encoding for a partd k-v store + import pickle - import partd + import partd - def join(dfs): - if not dfs: - return cudf.DataFrame() - else: - return cudf.concat(dfs) - - dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) - return partial(partd.Encode, dumps, pickle.loads, join) + def join(dfs): + if not dfs: + return cudf.DataFrame() + else: + return cudf.concat(dfs) -except ImportError: - pass + dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) + return partial(partd.Encode, dumps, pickle.loads, join) def _default_backend(func, *args, **kwargs): @@ -557,105 +543,22 @@ def to_cudf_dispatch_from_cudf(data, 
**kwargs): return data -# Define "cudf" backend engine to be registered with Dask -class CudfBackendEntrypoint(DataFrameBackendEntrypoint): - """Backend-entrypoint class for Dask-DataFrame +# Define the "cudf" backend for "legacy" Dask DataFrame +class LegacyCudfBackendEntrypoint(DataFrameBackendEntrypoint): + """Backend-entrypoint class for legacy Dask-DataFrame This class is registered under the name "cudf" for the - ``dask.dataframe.backends`` entrypoint in ``setup.cfg``. - Dask-DataFrame will use the methods defined in this class - in place of ``dask.dataframe.`` when the - "dataframe.backend" configuration is set to "cudf": - - Examples - -------- - >>> import dask - >>> import dask.dataframe as dd - >>> with dask.config.set({"dataframe.backend": "cudf"}): - ... ddf = dd.from_dict({"a": range(10)}) - >>> type(ddf) - + ``dask.dataframe.backends`` entrypoint in ``pyproject.toml``. + This "legacy" backend is only used for CSV support. """ - @classmethod - def to_backend_dispatch(cls): - return to_cudf_dispatch - - @classmethod - def to_backend(cls, data: dd.core._Frame, **kwargs): - if isinstance(data._meta, (cudf.DataFrame, cudf.Series, cudf.Index)): - # Already a cudf-backed collection - _unsupported_kwargs("cudf", "cudf", kwargs) - return data - return data.map_partitions(cls.to_backend_dispatch(), **kwargs) - - @staticmethod - def from_dict( - data, - npartitions, - orient="columns", - dtype=None, - columns=None, - constructor=cudf.DataFrame, - ): - return _default_backend( - dd.from_dict, - data, - npartitions=npartitions, - orient=orient, - dtype=dtype, - columns=columns, - constructor=constructor, - ) - - @staticmethod - def read_parquet(*args, engine=None, **kwargs): - from dask_cudf._legacy.io.parquet import CudfEngine - - _raise_unsupported_parquet_kwargs(**kwargs) - return _default_backend( - dd.read_parquet, - *args, - engine=CudfEngine, - **kwargs, - ) - - @staticmethod - def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json - - return read_json(*args, **kwargs) - @staticmethod - def read_orc(*args, **kwargs): - from dask_cudf._legacy.io import read_orc - - return read_orc(*args, **kwargs) - - @staticmethod - def read_csv(*args, **kwargs): - from dask_cudf._legacy.io import read_csv - - return read_csv(*args, **kwargs) - - @staticmethod - def read_hdf(*args, **kwargs): - # HDF5 reader not yet implemented in cudf - warnings.warn( - "read_hdf is not yet implemented in cudf/dask_cudf. " - "Moving to cudf from pandas. Expect poor performance!" - ) - return _default_backend(dd.read_hdf, *args, **kwargs).to_backend( - "cudf" - ) - - -# Define "cudf" backend entrypoint for dask-expr -class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint): +# Define the "cudf" backend for expr-based Dask DataFrame +class CudfBackendEntrypoint(DataFrameBackendEntrypoint): """Backend-entrypoint class for Dask-Expressions This class is registered under the name "cudf" for the - ``dask-expr.dataframe.backends`` entrypoint in ``setup.cfg``. + ``dask_expr.dataframe.backends`` entrypoint in ``pyproject.toml``. 
Dask-DataFrame will use the methods defined in this class in place of ``dask_expr.`` when the "dataframe.backend" configuration is set to "cudf": @@ -746,12 +649,12 @@ def read_csv( @staticmethod def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json as read_json_impl + from dask_cudf.io.json import read_json as read_json_impl return read_json_impl(*args, **kwargs) @staticmethod def read_orc(*args, **kwargs): - from dask_cudf._legacy.io.orc import read_orc as legacy_read_orc + from dask_cudf.io.orc import read_orc as legacy_read_orc return legacy_read_orc(*args, **kwargs) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 5fd217209ec..32461104ef9 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,56 +1,41 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import textwrap +import warnings +from importlib import import_module import dask.dataframe as dd -from dask.tokenize import tokenize import cudf from cudf.utils.performance_tracking import _dask_cudf_performance_tracking # This module provides backward compatibility for legacy import patterns. -if dd.DASK_EXPR_ENABLED: - from dask_cudf._expr.collection import ( - DataFrame, - Index, - Series, - ) -else: - from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 - +from dask_cudf._expr.collection import ( + DataFrame, # noqa: F401 + Index, # noqa: F401 + Series, # noqa: F401 +) concat = dd.concat @_dask_cudf_performance_tracking def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): - from dask_cudf import QUERY_PLANNING_ON - if isinstance(getattr(data, "index", None), cudf.MultiIndex): raise NotImplementedError( "dask_cudf does not support MultiIndex Dataframes." ) - # Dask-expr doesn't support the `name` argument - name = {} - if not QUERY_PLANNING_ON: - name = { - "name": name - or ("from_cudf-" + tokenize(data, npartitions or chunksize)) - } - return dd.from_pandas( data, npartitions=npartitions, chunksize=chunksize, sort=sort, - **name, ) -from_cudf.__doc__ = ( - textwrap.dedent( - """ +from_cudf.__doc__ = textwrap.dedent( + """ Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`. This function is a thin wrapper around @@ -58,9 +43,23 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): arguments (described below) excepting that it operates on cuDF rather than pandas objects.\n """ - ) - # TODO: `dd.from_pandas.__doc__` is empty when - # `DASK_DATAFRAME__QUERY_PLANNING=True` - # since dask-expr does not provide a docstring for from_pandas. - + textwrap.dedent(dd.from_pandas.__doc__ or "") -) +) + textwrap.dedent(dd.from_pandas.__doc__) + + +def _deprecated_api(old_api, new_api=None, rec=None): + def inner_func(*args, **kwargs): + if new_api: + # Use alternative + msg = f"{old_api} is now deprecated. " + msg += rec or f"Please use {new_api} instead." + warnings.warn(msg, FutureWarning) + new_attr = new_api.split(".") + module = import_module(".".join(new_attr[:-1])) + return getattr(module, new_attr[-1])(*args, **kwargs) + + # No alternative - raise an error + raise NotImplementedError( + f"{old_api} is no longer supported. 
" + (rec or "") + ) + + return inner_func diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 9bca33e414a..a5175c9bbe7 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api +from dask_cudf.core import _deprecated_api from . import csv, json, orc, parquet, text # noqa: F401 @@ -15,20 +15,13 @@ ) to_orc = _deprecated_api( "dask_cudf.io.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.orc.to_orc", rec="Please use the DataFrame.to_orc method instead.", ) read_text = _deprecated_api( "dask_cudf.io.read_text", new_api="dask_cudf.read_text" ) -if QUERY_PLANNING_ON: - read_parquet = parquet.read_parquet -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = parquet.read_parquet to_parquet = _deprecated_api( "dask_cudf.io.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 29f98b14511..e36ee04d827 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import os from glob import glob @@ -25,11 +25,11 @@ def read_csv(path, blocksize="default", **kwargs): >>> import dask_cudf >>> df = dask_cudf.read_csv("myfiles.*.csv") - In some cases it can break up large files: + It can break up large files if blocksize is specified: >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - It can read CSV files from external resources (e.g. S3, HTTP, FTP) + It can read CSV files from external resources (e.g. S3, HTTP, FTP): >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") @@ -44,15 +44,15 @@ def read_csv(path, blocksize="default", **kwargs): ---------- path : str, path object, or file-like object Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as + ``py._path.local.LocalPath``), URL (including HTTP, FTP, and S3 + locations), or any object with a ``read()`` method (such as builtin :py:func:`open` file handler function or :py:class:`~io.StringIO`). blocksize : int or str, default "256 MiB" The target task partition size. If ``None``, a single block is used for each file. **kwargs : dict - Passthrough key-word arguments that are sent to + Passthrough keyword arguments that are sent to :func:`cudf:cudf.read_csv`. Notes diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 8f85ea54c0a..3022ebb2a5b 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -1,8 +1,209 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
-from dask_cudf import _deprecated_api +from functools import partial -read_json = _deprecated_api( - "dask_cudf.io.json.read_json", - new_api="dask_cudf.read_json", -) +import numpy as np +from fsspec.core import get_compression, get_fs_token_paths + +import dask +from dask.utils import parse_bytes + +import cudf +from cudf.core.column import as_column +from cudf.utils.ioutils import _is_local_filesystem + +from dask_cudf.backends import _default_backend + + +def _read_json_partition( + paths, + fs=None, + include_path_column=False, + path_converter=None, + **kwargs, +): + # Transfer all data up front for remote storage + sources = ( + paths + if fs is None + else fs.cat_ranges( + paths, + [0] * len(paths), + fs.sizes(paths), + ) + ) + + if include_path_column: + # Add "path" column. + # Must iterate over sources sequentially + if not isinstance(include_path_column, str): + include_path_column = "path" + converted_paths = ( + paths + if path_converter is None + else [path_converter(path) for path in paths] + ) + dfs = [] + for i, source in enumerate(sources): + df = cudf.read_json(source, **kwargs) + df[include_path_column] = as_column( + converted_paths[i], length=len(df) + ) + dfs.append(df) + return cudf.concat(dfs) + else: + # Pass sources directly to cudf + return cudf.read_json(sources, **kwargs) + + +def read_json( + url_path, + engine="auto", + blocksize=None, + orient="records", + lines=None, + compression="infer", + aggregate_files=True, + **kwargs, +): + """Read JSON data into a :class:`.DataFrame`. + + This function wraps :func:`dask.dataframe.read_json`, and passes + ``engine=partial(cudf.read_json, engine="auto")`` by default. + + Parameters + ---------- + url_path : str, list of str + Location to read from. If a string, can include a glob character to + find a set of file names. + Supports protocol specifications such as ``"s3://"``. + engine : str or Callable, default "auto" + + If str, this value will be used as the ``engine`` argument + when :func:`cudf.read_json` is used to create each partition. + If a :obj:`~collections.abc.Callable`, this value will be used as the + underlying function used to create each partition from JSON + data. The default value is "auto", so that + ``engine=partial(cudf.read_json, engine="auto")`` will be + passed to :func:`dask.dataframe.read_json` by default. + aggregate_files : bool or int + Whether to map multiple files to each output partition. If True, + the `blocksize` argument will be used to determine the number of + files in each partition. If any one file is larger than `blocksize`, + the `aggregate_files` argument will be ignored. If an integer value + is specified, the `blocksize` argument will be ignored, and that + number of files will be mapped to each partition. Default is True. + **kwargs : + Keyword arguments to pass through to :func:`dask.dataframe.read_json`. + + Returns + ------- + :class:`.DataFrame` + + Examples + -------- + Load single file + + >>> from dask_cudf import read_json + >>> read_json('myfile.json') # doctest: +SKIP + + Load large line-delimited JSON files using partitions of approximately + 256 MB in size + + >>> read_json('data/file*.json', blocksize=2**28) # doctest: +SKIP + + Load nested JSON data + + >>> read_json('myfile.json') # doctest: +SKIP + + See Also + -------- + dask.dataframe.read_json + + """ + + if lines is None: + lines = orient == "records" + if orient != "records" and lines: + raise ValueError( + 'Line-delimited JSON is only available with orient="records".'
+ ) + if blocksize and (orient != "records" or not lines): + raise ValueError( + "JSON file chunking only allowed for JSON-lines " + "input (orient='records', lines=True)." + ) + + inputs = [] + if (aggregate_files and blocksize) or int(aggregate_files) > 1: + # Attempt custom read if we are mapping multiple files + # to each output partition. Otherwise, upstream logic + # is sufficient. + + storage_options = kwargs.get("storage_options", {}) + fs, _, paths = get_fs_token_paths( + url_path, mode="rb", storage_options=storage_options + ) + if isinstance(aggregate_files, int) and aggregate_files > 1: + # Map a static file count to each partition + inputs = [ + paths[offset : offset + aggregate_files] + for offset in range(0, len(paths), aggregate_files) + ] + elif aggregate_files is True and blocksize: + # Map files dynamically (using blocksize) + file_sizes = fs.sizes(paths) # NOTE: This can be slow + blocksize = parse_bytes(blocksize) + if all([file_size <= blocksize for file_size in file_sizes]): + counts = np.unique( + np.floor(np.cumsum(file_sizes) / blocksize), + return_counts=True, + )[1] + offsets = np.concatenate([[0], counts.cumsum()]) + inputs = [ + paths[offsets[i] : offsets[i + 1]] + for i in range(len(offsets) - 1) + ] + + if inputs: + # Inputs were successfully populated. + # Use custom _read_json_partition function + # to generate each partition. + + compression = get_compression( + url_path[0] if isinstance(url_path, list) else url_path, + compression, + ) + _kwargs = dict( + orient=orient, + lines=lines, + compression=compression, + include_path_column=kwargs.get("include_path_column", False), + path_converter=kwargs.get("path_converter"), + ) + if not _is_local_filesystem(fs): + _kwargs["fs"] = fs + # TODO: Generate meta more efficiently + meta = _read_json_partition(inputs[0][:1], **_kwargs) + return dask.dataframe.from_map( + _read_json_partition, + inputs, + meta=meta, + **_kwargs, + ) + + # Fall back to dask.dataframe.read_json + return _default_backend( + dask.dataframe.read_json, + url_path, + engine=( + partial(cudf.read_json, engine=engine) + if isinstance(engine, str) + else engine + ), + blocksize=blocksize, + orient=orient, + lines=lines, + compression=compression, + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index 5219cdacc31..5de28751912 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,13 +1,195 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from dask_cudf import _deprecated_api - -read_orc = _deprecated_api( - "dask_cudf.io.orc.read_orc", - new_api="dask_cudf.read_orc", -) -to_orc = _deprecated_api( - "dask_cudf.io.orc.to_orc", - new_api="dask_cudf._legacy.io.orc.to_orc", - rec="Please use the DataFrame.to_orc method instead.", -) +# Copyright (c) 2020-2025, NVIDIA CORPORATION.
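A short usage sketch for the read_json implementation above (paths hypothetical). An integer aggregate_files maps a fixed number of files to each output partition; the default aggregate_files=True combined with a blocksize packs whole files into partitions of roughly that size, falling back to upstream logic if any single file exceeds blocksize:

    import dask_cudf

    # Two line-delimited JSON files per output partition (static mapping).
    ddf = dask_cudf.read_json("data/records-*.jsonl", lines=True, aggregate_files=2)

    # Pack whole files into ~256 MiB partitions (dynamic mapping by file size).
    ddf = dask_cudf.read_json(
        "data/records-*.jsonl",
        lines=True,
        blocksize="256 MiB",
        aggregate_files=True,
    )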
+ +from io import BufferedWriter, IOBase + +from fsspec.core import get_fs_token_paths +from fsspec.utils import stringify_path +from pyarrow import orc + +from dask import dataframe as dd +from dask.dataframe.io.utils import _get_pyarrow_dtypes + +import cudf + + +def _read_orc_stripe(source, fs, columns=None, kwargs=None): + """Pull out specific columns from a specific stripe""" + path, stripe = source + if kwargs is None: + kwargs = {} + with fs.open(path, "rb") as f: + df_stripe = cudf.read_orc( + f, stripes=[stripe], columns=columns, **kwargs + ) + return df_stripe + + +def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): + """Read ORC files into a :class:`.DataFrame`. + + Note that this function is mostly borrowed from upstream Dask. + + Parameters + ---------- + path : str or list[str] + Location of file(s), which can be a full URL with protocol specifier, + and may include glob character if a single string. + columns : None or list[str] + Columns to load. If None, loads all. + filters : None or list of tuple or list of lists of tuples + If not None, specifies a filter predicate used to filter out + stripes using statistics stored for each stripe in the + ORC metadata. Stripes that do not match the given + filter predicate are not read. The predicate is expressed in + `disjunctive normal form (DNF) + <https://en.wikipedia.org/wiki/Disjunctive_normal_form>`__ + like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary + boolean logical combinations of single column predicates. The + innermost tuples each describe a single column predicate. The + list of inner predicates is interpreted as a conjunction + (AND), forming a more selective, multiple-column predicate. + Finally, the outermost list combines these filters as a + disjunction (OR). Predicates may also be passed as a list of + tuples. This form is interpreted as a single conjunction. To + express OR in predicates, one must use the (preferred) + notation of list of lists of tuples. + storage_options : None or dict + Further parameters to pass to the bytes backend.
+ + See Also + -------- + dask.dataframe.read_orc + + Returns + ------- + dask_cudf.DataFrame + + """ + + storage_options = storage_options or {} + fs, _, paths = get_fs_token_paths( + path, mode="rb", storage_options=storage_options + ) + schema = None + nstripes_per_file = [] + for path in paths: + with fs.open(path, "rb") as f: + o = orc.ORCFile(f) + if schema is None: + schema = o.schema + elif schema != o.schema: + raise ValueError( + "Incompatible schemas while parsing ORC files" + ) + nstripes_per_file.append(o.nstripes) + schema = _get_pyarrow_dtypes(schema, categories=None) + if columns is not None: + ex = set(columns) - set(schema) + if ex: + raise ValueError( + f"Requested columns ({ex}) not in schema ({set(schema)})" + ) + else: + columns = list(schema) + + with fs.open(paths[0], "rb") as f: + meta = cudf.read_orc( + f, + stripes=[0] if nstripes_per_file[0] else None, + columns=columns, + **kwargs, + ) + + sources = [] + for path, n in zip(paths, nstripes_per_file): + for stripe in ( + range(n) + if filters is None + else cudf.io.orc._filter_stripes(filters, path) + ): + sources.append((path, stripe)) + + return dd.from_map( + _read_orc_stripe, + sources, + args=[fs], + columns=columns, + kwargs=kwargs, + meta=meta, + ) + + +def write_orc_partition(df, path, fs, filename, compression="snappy"): + full_path = fs.sep.join([path, filename]) + with fs.open(full_path, mode="wb") as out_file: + if not isinstance(out_file, IOBase): + out_file = BufferedWriter(out_file) + cudf.io.to_orc(df, out_file, compression=compression) + return full_path + + +def to_orc( + df, + path, + write_index=True, + storage_options=None, + compression="snappy", + compute=True, + **kwargs, +): + """ + Write a :class:`.DataFrame` to ORC file(s) (one file per partition). + + Parameters + ---------- + df : DataFrame + path : str or pathlib.Path + Destination directory for data. Prepend with protocol like ``s3://`` + or ``hdfs://`` for remote data. + write_index : boolean, optional + Whether or not to write the index. Defaults to True. + storage_options : None or dict + Further parameters to pass to the bytes backend. + compression : string or dict, optional + compute : bool, optional + If True (default) then the result is computed immediately. If + False then a :class:`~dask.delayed.Delayed` object is returned + for future computation. 
+ + """ + + from dask import compute as dask_compute, delayed + + # TODO: Use upstream dask implementation once available + # (see: Dask Issue#5596) + + if hasattr(path, "name"): + path = stringify_path(path) + fs, _, _ = get_fs_token_paths( + path, mode="wb", storage_options=storage_options + ) + # Trim any protocol information from the path before forwarding + path = fs._strip_protocol(path) + + if write_index: + df = df.reset_index() + else: + # Not writing index - might as well drop it + df = df.reset_index(drop=True) + + fs.mkdirs(path, exist_ok=True) + + # Use df.npartitions to define the file-name list + filenames = ["part.%i.orc" % i for i in range(df.npartitions)] + + # write parts + dwrite = delayed(write_orc_partition) + parts = [ + dwrite(d, path, fs, filename, compression=compression) + for d, filename in zip(df.to_delayed(), filenames) + ] + + if compute: + return dask_compute(*parts) + + return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ba6209c4820..a953dce787d 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -37,10 +37,9 @@ def TaskList(*x): import cudf -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api - # Dask-expr imports CudfEngine from this module from dask_cudf._legacy.io.parquet import CudfEngine +from dask_cudf.core import _deprecated_api if TYPE_CHECKING: from collections.abc import MutableMapping @@ -832,15 +831,8 @@ def read_parquet_expr( ) -if QUERY_PLANNING_ON: - read_parquet = read_parquet_expr - read_parquet.__doc__ = read_parquet_expr.__doc__ -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.parquet.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = read_parquet_expr +read_parquet.__doc__ = read_parquet_expr.__doc__ to_parquet = _deprecated_api( "dask_cudf.io.parquet.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index f5509cf91c3..48eca13e16f 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import math import os @@ -11,10 +11,6 @@ from dask.utils import tmpfile import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") def test_read_json_backend_dispatch(tmp_path): @@ -137,7 +133,3 @@ def test_deprecated_api_paths(tmp_path): with pytest.warns(match="dask_cudf.io.read_json is now deprecated"): df2 = dask_cudf.io.read_json(path) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.json.read_json is now deprecated"): - df2 = dask_cudf.io.json.read_json(path) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index b6064d851ca..4aac463420b 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION.
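A round-trip sketch for the ORC reader and writer above (output directory hypothetical). to_orc writes one part.N.orc file per partition, while read_orc maps one task per ORC stripe and supports column projection and stripe-level filters:

    import cudf
    import numpy as np

    import dask_cudf

    ddf = dask_cudf.from_cudf(
        cudf.DataFrame({"a": np.arange(1000), "b": np.arange(1000) % 7}),
        npartitions=4,
    )
    ddf.to_orc("orc_data", write_index=False)  # orc_data/part.0.orc ... part.3.orc

    back = dask_cudf.read_orc("orc_data/*.orc", columns=["a"])
    assert back.npartitions >= 4  # at least one stripe (task) per file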
import glob import os @@ -12,10 +12,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) sample_orc = os.path.join(cur_dir, "data/orc/sample.orc") @@ -159,7 +155,3 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.read_orc is now deprecated"): df2 = dask_cudf.io.read_orc(paths) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.orc.read_orc is now deprecated"): - df2 = dask_cudf.io.orc.read_orc(paths) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 6efe6c4f388..9f7031f4d2a 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import glob import math @@ -16,11 +16,6 @@ import dask_cudf from dask_cudf._legacy.io.parquet import create_metadata_file -from dask_cudf.tests.utils import ( - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) # Check if create_metadata_file is supported by # the current dask.dataframe version @@ -450,7 +445,6 @@ def test_create_metadata_file(tmpdir, partition_on): dd.assert_eq(ddf1, ddf2) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @need_create_meta def test_create_metadata_file_inconsistent_schema(tmpdir): # NOTE: This test demonstrates that the CudfEngine @@ -531,19 +525,6 @@ def test_cudf_list_struct_write(tmpdir): dd.assert_eq(df, new_ddf) -@skip_dask_expr("Not necessary in dask-expr") -def test_check_file_size(tmpdir): - # Test simple file-size check to help warn users - # of upstream change to `split_row_groups` default - fn = str(tmpdir.join("test.parquet")) - cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) - with pytest.warns(match="large parquet file"): - # Need to use `dask_cudf._legacy.io` path - # TODO: Remove outdated `check_file_size` functionality - dask_cudf._legacy.io.read_parquet(fn, check_file_size=1).compute() - - -@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0") def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning @@ -626,7 +607,6 @@ def test_timezone_column(tmpdir): dd.assert_eq(got, expect) -@require_dask_expr() @pytest.mark.skipif( not dask_cudf.backends.PYARROW_GE_15, reason="Requires pyarrow 15", @@ -677,17 +657,8 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.to_parquet is now deprecated"): dask_cudf.io.to_parquet(df, tmpdir) - if dask_cudf.QUERY_PLANNING_ON: - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - else: - with pytest.warns(match="legacy dask_cudf.io.read_parquet"): - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) - with pytest.warns(match="legacy dask_cudf.io.parquet.read_parquet"): - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.parquet.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) diff --git 
a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index 90907f6fb99..7c53b89a883 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import os import socket @@ -14,7 +14,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import QUERY_PLANNING_ON moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") @@ -136,7 +135,7 @@ def test_read_parquet_open_file_options_raises(): pytest.param( "arrow", marks=pytest.mark.skipif( - not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15, + not dask_cudf.backends.PYARROW_GE_15, reason="Not supported", ), ), diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index e35b6411a9d..f4d59334e03 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. import os @@ -9,10 +9,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) text_file = os.path.join(cur_dir, "data/text/sample.pgn") @@ -42,7 +38,3 @@ def test_deprecated_api_paths(): with pytest.warns(match="dask_cudf.io.read_text is now deprecated"): df2 = dask_cudf.io.read_text(text_file, delimiter=".") dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.text.read_text is now deprecated"): - df2 = dask_cudf.io.text.read_text(text_file, delimiter=".") - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py index 1caf4e81d8e..eb1d007cc16 100644 --- a/python/dask_cudf/dask_cudf/io/text.py +++ b/python/dask_cudf/dask_cudf/io/text.py @@ -1,8 +1,56 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. -from dask_cudf import _deprecated_api +import os +from glob import glob -read_text = _deprecated_api( - "dask_cudf.io.text.read_text", - new_api="dask_cudf.read_text", -) +import dask.dataframe as dd +from dask.utils import parse_bytes + +import cudf + + +def _read_text(source, **kwargs): + # Wrapper for cudf.read_text operation + fn, byte_range = source + return cudf.read_text(fn, byte_range=byte_range, **kwargs) + + +def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs): + if isinstance(chunksize, str): + chunksize = parse_bytes(chunksize) + + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood: {type(path)}") + + if not filenames: + msg = f"No files found matching path: {path}"
+ raise FileNotFoundError(msg) + + if chunksize and byte_range: + raise ValueError("Cannot specify both chunksize and byte_range.") + + if chunksize: + sources = [] + for fn in filenames: + size = os.path.getsize(fn) + for start in range(0, size, chunksize): + byte_range = ( + start, + chunksize, + ) # specify which chunk of the file we care about + sources.append((fn, byte_range)) + else: + sources = [(fn, byte_range) for fn in filenames] + + return dd.from_map( + _read_text, + sources, + meta=cudf.Series([], dtype="O"), + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 3fbb2aacd2c..c6b01a648eb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -13,7 +13,6 @@ from cudf.testing._utils import does_not_raise import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr ############################################################################# # Datetime Accessor # @@ -112,7 +111,6 @@ def test_categorical_accessor_initialization2(data): dsr.cat -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_basic(data): cat = data.copy() diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7101fb7e00a..31957a106ff 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
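A usage sketch for the read_text implementation above (paths hypothetical). Each file is split into chunksize-sized byte ranges, one partition per range, and extra keyword arguments such as delimiter are forwarded to cudf.read_text:

    import dask_cudf

    # One partition per 256 MiB chunk of each matching file.
    ser = dask_cudf.read_text("games/*.pgn", chunksize="256 MiB", delimiter=".")

    # Or read one explicit (offset, size) byte range of a single file;
    # chunksize must be disabled before byte_range can be used.
    head = dask_cudf.read_text(
        "games/sample.pgn", chunksize=None, byte_range=(0, 2**20), delimiter="."
    )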
import random @@ -9,18 +9,12 @@ import dask from dask import dataframe as dd -from dask.dataframe.core import make_meta as dask_make_meta, meta_nonempty +from dask.dataframe.dispatch import make_meta as dask_make_meta, meta_nonempty from dask.utils import M import cudf import dask_cudf -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) rng = np.random.default_rng(seed=0) @@ -299,37 +293,6 @@ def test_set_index_sorted(): gddf1.set_index("val", sorted=True) -@pytest.mark.parametrize("nelem", [10, 200, 1333]) -@pytest.mark.parametrize("index", [None, "myindex"]) -def test_rearrange_by_divisions(nelem, index): - with dask.config.set(scheduler="single-threaded"): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "x": rng.integers(0, 20, size=nelem), - "y": rng.normal(size=nelem), - "z": rng.choice(["dog", "cat", "bird"], nelem), - } - ) - df["z"] = df["z"].astype("category") - - ddf1 = dd.from_pandas(df, npartitions=4) - gdf1 = dask_cudf.from_cudf( - cudf.DataFrame.from_pandas(df), npartitions=4 - ) - ddf1.index.name = index - gdf1.index.name = index - divisions = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20) - - expect = dd.shuffle.rearrange_by_divisions( - ddf1, "x", divisions=divisions, shuffle_method="tasks" - ) - result = dd.shuffle.rearrange_by_divisions( - gdf1, "x", divisions=divisions, shuffle_method="tasks" - ) - dd.assert_eq(expect, result) - - def test_assign(): rng = np.random.default_rng(seed=0) df = pd.DataFrame( @@ -393,44 +356,6 @@ def test_setitem_scalar_datetime(): np.testing.assert_array_equal(got["z"], df["z"]) -@skip_dask_expr("Not relevant for dask-expr") -@pytest.mark.parametrize( - "func", - [ - lambda: pd.DataFrame( - {"A": rng.random(10), "B": rng.random(10)}, - index=list("abcdefghij"), - ), - lambda: pd.DataFrame( - { - "A": rng.random(10), - "B": list("a" * 10), - "C": pd.Series( - [str(20090101 + i) for i in range(10)], - dtype="datetime64[ns]", - ), - }, - index=list("abcdefghij"), - ), - lambda: pd.Series(list("abcdefghijklmnop")), - lambda: pd.Series( - rng.random(10), - index=pd.Index( - [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]" - ), - ), - ], -) -def test_repr(func): - pdf = func() - gdf = cudf.from_pandas(pdf) - gddf = dd.from_pandas(gdf, npartitions=3, sort=False) - - assert repr(gddf) - if hasattr(pdf, "_repr_html_"): - assert gddf._repr_html_() - - @pytest.mark.skip(reason="datetime indexes not fully supported in cudf") @pytest.mark.parametrize("start", ["1d", "5d", "1w", "12h"]) @pytest.mark.parametrize("stop", ["1d", "3d", "8h"]) @@ -657,20 +582,20 @@ def test_hash_object_dispatch(index): ) # DataFrame - result = dd.core.hash_object_dispatch(obj, index=index) + result = dd.dispatch.hash_object_dispatch(obj, index=index) expected = dask_cudf.backends.hash_object_cudf(obj, index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # Series - result = dd.core.hash_object_dispatch(obj["x"], index=index) + result = dd.dispatch.hash_object_dispatch(obj["x"], index=index) expected = dask_cudf.backends.hash_object_cudf(obj["x"], index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # DataFrame with MultiIndex obj_multi = obj.set_index(["x", "z"], drop=True) - result = dd.core.hash_object_dispatch(obj_multi, index=index) + result = dd.dispatch.hash_object_dispatch(obj_multi, index=index) expected = dask_cudf.backends.hash_object_cudf(obj_multi, index=index) assert isinstance(result, cudf.Series) 
dd.assert_eq(result, expected) @@ -784,7 +709,6 @@ def test_dataframe_set_index(): assert_eq(ddf.compute(), pddf.compute()) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_series_describe(): random.seed(0) sr = cudf.datasets.randomdata(20)["x"] @@ -800,7 +724,6 @@ def test_series_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_dataframe_describe(): random.seed(0) df = cudf.datasets.randomdata(20) @@ -814,7 +737,6 @@ def test_dataframe_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_zero_std_describe(): num = 84886781 df = cudf.DataFrame( @@ -864,7 +786,7 @@ def test_merging_categorical_columns(): ddf_1 = dask_cudf.from_cudf(df_1, npartitions=2) - ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"]) + ddf_1 = ddf_1.categorize(columns=["cat_col"]) df_2 = cudf.DataFrame( {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]} @@ -872,7 +794,7 @@ def test_merging_categorical_columns(): ddf_2 = dask_cudf.from_cudf(df_2, npartitions=2) - ddf_2 = dd.categorical.categorize(ddf_2, columns=["cat_col"]) + ddf_2 = ddf_2.categorize(columns=["cat_col"]) expected = cudf.DataFrame( { @@ -932,14 +854,9 @@ def func(x): result = ds.map_partitions(func, meta=s.values) - if QUERY_PLANNING_ON: - # Check Array and round-tripped DataFrame - dask.array.assert_eq(result, func(s)) - dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) - else: - # Legacy version still carries numpy metadata - # See: https://github.com/dask/dask/issues/11017 - dask.array.assert_eq(result.compute(), func(s)) + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) def test_implicit_array_conversion_cupy_sparse(): @@ -981,7 +898,6 @@ def test_series_isin_error(): ddf.isin([1, 5, "a"]).compute() -@require_dask_expr() def test_to_backend_simplify(): # Check that column projection is not blocked by to_backend with dask.config.set({"dataframe.backend": "pandas"}): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 9bd3b506db0..11ca0c6a783 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
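The test updates above track two upstream Dask moves: hash_object_dispatch now lives in dask.dataframe.dispatch rather than dask.dataframe.core, and categorization is invoked as a collection method instead of dd.categorical.categorize. A minimal sketch of the new-style calls (data illustrative):

    import cudf
    import dask.dataframe as dd

    import dask_cudf  # noqa: F401 (registers the cudf dispatch implementations)

    gdf = cudf.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "b"]})

    # Dispatches to dask_cudf.backends.hash_object_cudf for cudf objects.
    hashes = dd.dispatch.hash_object_dispatch(gdf, index=False)

    # Categorize via the collection method rather than dd.categorical.categorize.
    ddf = dask_cudf.from_cudf(gdf, npartitions=2).categorize(columns=["y"])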
import numpy as np import pandas as pd @@ -13,12 +13,7 @@ from cudf.testing._utils import expect_warning_if import dask_cudf -from dask_cudf._legacy.groupby import OPTIMIZED_AGGS, _aggs_optimized -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - xfail_dask_expr, -) +from dask_cudf._expr.groupby import OPTIMIZED_AGGS, _aggs_optimized def assert_cudf_groupby_layers(ddf): @@ -78,18 +73,12 @@ def test_groupby_basic(series, aggregation, pdf): expect = getattr(gdf_grouped, aggregation)() actual = getattr(ddf_grouped, aggregation)() - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) if not series: expect = gdf_grouped.agg({"x": aggregation}) actual = ddf_grouped.agg({"x": aggregation}) - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) @@ -134,13 +123,6 @@ def test_groupby_agg(func, aggregation, pdf): check_dtype = aggregation != "count" - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - - # groupby.agg should add an explicit getitem layer - # to improve/enable column projection - assert hlg_layer(actual.dask, "getitem") - dd.assert_eq(expect, actual, check_names=False, check_dtype=check_dtype) @@ -556,20 +538,13 @@ def test_groupby_categorical_key(): True, pytest.param( False, - marks=xfail_dask_expr("as_index not supported in dask-expr"), - ), - ], -) -@pytest.mark.parametrize( - "fused", - [ - True, - pytest.param( - False, - marks=require_dask_expr("Not supported by legacy API"), + marks=pytest.mark.xfail( + reason="as_index not supported in dask-expr" + ), ), ], ) +@pytest.mark.parametrize("fused", [True, False]) @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) @@ -590,19 +565,16 @@ def test_groupby_agg_params( "c": ["mean", "std", "var"], } - fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {} + fused_kwarg = {"fused": fused} split_kwargs = {"split_every": split_every, "split_out": split_out} if split_out == "use_dask_default": split_kwargs.pop("split_out") # Avoid using as_index when query-planning is enabled - if QUERY_PLANNING_ON: - with pytest.warns(FutureWarning, match="argument is now deprecated"): - # Should warn when `as_index` is used - ddf.groupby(["name", "a"], sort=False, as_index=as_index) - maybe_as_index = {"as_index": as_index} if as_index is False else {} - else: - maybe_as_index = {"as_index": as_index} + with pytest.warns(FutureWarning, match="argument is now deprecated"): + # Should warn when `as_index` is used + ddf.groupby(["name", "a"], sort=False, as_index=as_index) + maybe_as_index = {"as_index": as_index} if as_index is False else {} # Check `sort=True` behavior if split_out == 1: @@ -671,7 +643,6 @@ def test_groupby_agg_params( dd.assert_eq(gf, pf) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] ) @@ -711,7 +682,6 @@ def test_is_supported(arg, supported): assert _aggs_optimized(arg, OPTIMIZED_AGGS) is supported -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_groupby_unique_lists(): df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]}) gdf = cudf.from_pandas(df) @@ -758,7 +728,7 @@ def test_groupby_first_last(data, agg): ) -@xfail_dask_expr("Co-alignment check fails in dask-expr") 
+@pytest.mark.xfail(reason="Co-alignment check fails in dask-expr") def test_groupby_with_list_of_series(): df = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}) gdf = dask_cudf.from_cudf(df, npartitions=2) @@ -773,7 +743,6 @@ def test_groupby_with_list_of_series(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "func", [ @@ -833,7 +802,7 @@ def test_groupby_all_columns(func): expect = func(ddf) actual = func(gddf) - dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON) + dd.assert_eq(expect, actual, check_names=False) def test_groupby_shuffle(): @@ -870,15 +839,3 @@ def test_groupby_shuffle(): # NOTE: `shuffle_method=True` should be default got = gddf.groupby("a", sort=False).agg(spec, split_out=2) dd.assert_eq(expect, got.compute().sort_index()) - - if not QUERY_PLANNING_ON: - # Sorted aggregation fails with split_out>1 when shuffle is False - # (sort=True, split_out=2, shuffle_method=False) - with pytest.raises(ValueError): - gddf.groupby("a", sort=True).agg( - spec, shuffle_method=False, split_out=2 - ) - - # Check shuffle kwarg deprecation - with pytest.warns(match="'shuffle' keyword is deprecated"): - gddf.groupby("a", sort=True).agg(spec, shuffle=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py index 0b7c7855e07..2d05345bc4a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_onehot.py +++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import pandas as pd import pytest @@ -8,12 +8,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr - -# No dask-expr support -pytestmark = xfail_dask_expr( - "Newer dask version needed", lt_version="2024.5.0" -) def test_get_dummies_cat(): diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 02c815427f3..68d6e72660e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -10,7 +10,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr @pytest.mark.parametrize("ascending", [True, False]) @@ -67,7 +66,6 @@ def test_sort_repartition(): dd.assert_eq(len(new_ddf), len(ddf)) -@xfail_dask_expr("missing null support", lt_version="2024.5.1") @pytest.mark.parametrize("na_position", ["first", "last"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index b44b3f939e7..ef6765f39d1 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -1,22 +1,12 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
import numpy as np import pandas as pd -import pytest -from packaging.version import Version -import dask import dask.dataframe as dd import cudf -from dask_cudf import QUERY_PLANNING_ON - -if QUERY_PLANNING_ON: - DASK_VERSION = Version(dask.__version__) -else: - DASK_VERSION = None - def _make_random_frame(nelem, npartitions=2, include_na=False): rng = np.random.default_rng(seed=0) @@ -30,26 +20,3 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): gdf = cudf.DataFrame.from_pandas(df) dgf = dd.from_pandas(gdf, npartitions=npartitions) return df, dgf - - -_default_reason = "Not compatible with dask-expr" - - -def skip_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - skip = QUERY_PLANNING_ON - return pytest.mark.skipif(skip, reason=reason) - - -def xfail_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - xfail = QUERY_PLANNING_ON - return pytest.mark.xfail(xfail, reason=reason) - - -def require_dask_expr(reason="requires dask-expr"): - return pytest.mark.skipif(not QUERY_PLANNING_ON, reason=reason) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index a8cb696d7f6..5b8b98c2b55 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -39,15 +39,15 @@ classifiers = [ ] [project.entry-points."dask.dataframe.backends"] -cudf = "dask_cudf.backends:CudfBackendEntrypoint" +cudf = "dask_cudf.backends:LegacyCudfBackendEntrypoint" [project.entry-points."dask_expr.dataframe.backends"] -cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" +cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==25.2.*,>=0.0.0a0", - "numba-cuda>=0.0.13,<0.0.18", + "numba-cuda>=0.2.0,<0.3.0", "pytest-cov", "pytest-xdist", "pytest<8", @@ -102,8 +102,5 @@ filterwarnings = [ # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask", - # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437 - # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False` - "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning", ] xfail_strict = true diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd index 2d070ddda69..fbd478f963f 100644 --- a/python/pylibcudf/pylibcudf/hashing.pxd +++ b/python/pylibcudf/pylibcudf/hashing.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libc.stdint cimport uint32_t, uint64_t @@ -16,6 +16,10 @@ cpdef Table murmurhash3_x64_128( uint64_t seed=* ) +cpdef Column xxhash_32( + Table input, + uint32_t seed=* +) cpdef Column xxhash_64( Table input, diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi index a849f5d0729..d535d842a18 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyi +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -9,6 +9,7 @@ LIBCUDF_DEFAULT_HASH_SEED: Final[int] def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_32(input: Table, seed: int = ...) -> Column: ... def xxhash_64(input: Table, seed: int = ...) -> Column: ... def md5(input: Table) -> Column: ... def sha1(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 548cffc0ce8..1f093b20c6b 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -13,6 +13,7 @@ from pylibcudf.libcudf.hash cimport ( sha256 as cpp_sha256, sha384 as cpp_sha384, sha512 as cpp_sha512, + xxhash_32 as cpp_xxhash_32, xxhash_64 as cpp_xxhash_64, ) from pylibcudf.libcudf.table.table cimport table @@ -30,6 +31,7 @@ __all__ = [ "sha256", "sha384", "sha512", + "xxhash_32", "xxhash_64", ] @@ -95,6 +97,37 @@ cpdef Table murmurhash3_x64_128( return Table.from_libcudf(move(c_result)) +cpdef Column xxhash_32( + Table input, + uint32_t seed=DEFAULT_HASH_SEED +): + """Computes the xxHash 32-bit hash value of each row in the given table. + + For details, see :cpp:func:`xxhash_32`. + + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_xxhash_32( + input.view(), + seed + ) + + return Column.from_libcudf(move(c_result)) + + cpdef Column xxhash_64( Table input, uint64_t seed=DEFAULT_HASH_SEED diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd index 4e8a01b41a5..46fdf62cd6b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -44,6 +44,11 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const table_view& input ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_32( + const table_view& input, + const uint32_t seed + ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_64( const table_view& input, const uint64_t seed diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 83fb50fa4ef..7096dbe14ff 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
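A quick usage sketch for the new xxhash_32 binding (data illustrative; assumes the pylibcudf.interop.from_arrow conversion helper). It hashes each row of a table into a uint32 column, here using libcudf's default seed:

    import pyarrow as pa

    import pylibcudf as plc

    tbl = plc.interop.from_arrow(pa.table({"s": ["foo", "bar", None]}))
    hashed = plc.hashing.xxhash_32(tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)
    # 'hashed' is a pylibcudf.Column with one uint32 hash per input row.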
import hashlib import struct @@ -34,7 +34,9 @@ def hash_single_uint32(val, seed=0): def hash_combine_32(lhs, rhs): - return np.uint32(lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2))) + return np.uint32( + int((lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)))) % 2**32 + ) def uint_hash_combine_32(lhs, rhs): @@ -80,22 +82,6 @@ def list_struct_table(): return data -def python_hash_value(x, method): - if method == "murmurhash3_x86_32": - return libcudf_mmh3_x86_32(x) - elif method == "murmurhash3_x64_128": - hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) - hasher.update(x) - # libcudf returns a tuple of two 64-bit integers - return hasher.utupledigest() - elif method == "xxhash_64": - return xxhash.xxh64( - x, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED - ).intdigest() - else: - return getattr(hashlib, method)(x).hexdigest() - - @pytest.mark.parametrize( "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"] ) @@ -115,6 +101,23 @@ def py_hasher(val): assert_column_eq(got, expect) +def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl): + def py_hasher(val): + return xxhash.xxh32( + scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ).intdigest() + + expect = pa.array( + [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], + type=pa.uint32(), + ) + got = plc.hashing.xxhash_32( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) + + assert_column_eq(got, expect) + + def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): return xxhash.xxh64( @@ -125,7 +128,9 @@ def py_hasher(val): [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], type=pa.uint64(), ) - got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0) + got = plc.hashing.xxhash_64( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) assert_column_eq(got, expect)
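Closing note on the hash_combine_32 tweak above: routing through a Python int and reducing modulo 2**32 keeps the boost-style combine overflow-safe, presumably because NumPy 2 raises OverflowError when an out-of-range Python integer is converted to np.uint32. A standalone sketch of the wrap-around:

    import numpy as np

    def hash_combine_32(lhs: int, rhs: int) -> np.uint32:
        # Boost-style hash combine, wrapped to 32 bits before the uint32 cast.
        combined = lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2))
        return np.uint32(int(combined) % 2**32)

    # The raw combine of two large 32-bit values exceeds 2**32 ...
    raw = 0xFFFFFFFF ^ (0xFFFFFFFF + 0x9E3779B9 + (0xFFFFFFFF << 6) + (0xFFFFFFFF >> 2))
    assert raw > 2**32
    # ... but the wrapped version still fits in a uint32.
    assert int(hash_combine_32(0xFFFFFFFF, 0xFFFFFFFF)) < 2**32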