diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 49ca5ca0fb9..9d79733703c 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -13,6 +13,7 @@ jobs:
   # Please keep pr-builder as the top job here
   pr-builder:
     needs:
+      - check-nightly-ci
       - changed-files
       - checks
       - conda-cpp-build
@@ -54,6 +55,18 @@ jobs:
       - name: Telemetry setup
         if: ${{ vars.TELEMETRY_ENABLED == 'true' }}
         uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main
+  check-nightly-ci:
+    # Switch to ubuntu-latest once it defaults to a version of Ubuntu that
+    # provides at least Python 3.11 (see
+    # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat)
+    runs-on: ubuntu-24.04
+    env:
+      RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Check if nightly CI is passing
+        uses: rapidsai/shared-actions/check_nightly_success/dispatch@main
+        with:
+          repo: cudf
   changed-files:
     secrets: inherit
     needs: telemetry-setup
@@ -328,16 +341,11 @@ jobs:
         run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh"
 
   telemetry-summarize:
-    runs-on: ubuntu-latest
+    # This job must use a self-hosted runner to record telemetry traces.
+    runs-on: linux-amd64-cpu4
     needs: pr-builder
     if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }}
     continue-on-error: true
     steps:
-      - name: Load stashed telemetry env vars
-        uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main
-        with:
-            load_service_name: true
       - name: Telemetry summarize
-        uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main
-        with:
-          cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}"
+        uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 4290d013fe4..52d8f659611 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -35,6 +35,10 @@ rapids-mamba-retry install \
 
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
+EXITCODE=0
+trap "EXITCODE=1" ERR
+set +e
+
 rapids-logger "Build CPP docs"
 pushd cpp/doxygen
 aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag"
@@ -58,3 +62,5 @@ mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html"
 popd
 
 RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs
+
+exit ${EXITCODE}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2f17b57b0a4..9cbacee8e8d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -276,7 +276,7 @@ rapids_cpm_init()
 
 # Not using rapids-cmake since we never want to find, always download.
 CPMAddPackage(
-  NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG
+  NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW FALSE GIT_TAG
   c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 VERSION c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55
 )
 rapids_make_logger(cudf EXPORT_SET cudf-exports)
@@ -916,7 +916,9 @@ if(CUDF_LARGE_STRINGS_DISABLED)
 endif()
 
 # Define logging level
-target_compile_definitions(cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL}")
+target_compile_definitions(
+  cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=CUDF_LOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}"
+)
 
 # Enable remote IO through KvikIO
 target_compile_definitions(cudf PRIVATE $<$<BOOL:${CUDF_KVIKIO_REMOTE_IO}>:CUDF_KVIKIO_REMOTE_IO>)
@@ -1105,7 +1107,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL)
       ${_tgt} PRIVATE "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>>"
     )
     target_include_directories(${_tgt} PRIVATE "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/include>")
-    target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm)
+    target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm rmm::rmm_logger rmm::rmm_logger_impl)
     if(CUDF_BUILD_STACKTRACE_DEBUG)
       target_link_libraries(${_tgt} PRIVATE cudf_backtrace)
     endif()
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 8e5ea900efa..749e1b628ee 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
 endfunction()
 
 # ##################################################################################################
-# * column benchmarks -----------------------------------------------------------------------------
-ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp)
+# * copying benchmarks
+# -----------------------------------------------------------------------------
+ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp)
 
 # ##################################################################################################
 # * gather benchmark ------------------------------------------------------------------------------
@@ -351,11 +352,18 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
 
 # ##################################################################################################
 # * nvtext benchmark -------------------------------------------------------------------
-ConfigureBench(TEXT_BENCH text/subword.cpp)
-
 ConfigureNVBench(
-  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
-  text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
+  TEXT_NVBENCH
+  text/edit_distance.cpp
+  text/hash_ngrams.cpp
+  text/jaccard.cpp
+  text/minhash.cpp
+  text/ngrams.cpp
+  text/normalize.cpp
+  text/replace.cpp
+  text/subword.cpp
+  text/tokenize.cpp
+  text/vocab.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp
deleted file mode 100644
index 51106c72137..00000000000
--- a/cpp/benchmarks/column/concatenate.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/fixture/templated_benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
-
-#include <cudf_test/column_wrapper.hpp>
-
-#include <cudf/concatenate.hpp>
-#include <cudf/table/table.hpp>
-#include <cudf/utilities/default_stream.hpp>
-
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-
-#include <algorithm>
-#include <vector>
-
-class Concatenate : public cudf::benchmark {};
-
-template <typename T, bool Nullable>
-static void BM_concatenate(benchmark::State& state)
-{
-  cudf::size_type const num_rows = state.range(0);
-  cudf::size_type const num_cols = state.range(1);
-
-  auto input         = create_sequence_table(cycle_dtypes({cudf::type_to_id<T>()}, num_cols),
-                                     row_count{num_rows},
-                                     Nullable ? std::optional<double>{2.0 / 3.0} : std::nullopt);
-  auto input_columns = input->view();
-  std::vector<cudf::column_view> column_views(input_columns.begin(), input_columns.end());
-
-  CUDF_CHECK_CUDA(0);
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    auto result = cudf::concatenate(column_views);
-  }
-
-  state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T));
-}
-
-#define CONCAT_BENCHMARK_DEFINE(type, nullable)                             \
-  BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable)   \
-  (::benchmark::State & st) { BM_concatenate<type, nullable>(st); }         \
-  BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \
-    ->RangeMultiplier(8)                                                    \
-    ->Ranges({{1 << 6, 1 << 18}, {2, 1024}})                                \
-    ->Unit(benchmark::kMillisecond)                                         \
-    ->UseManualTime();
-
-CONCAT_BENCHMARK_DEFINE(int64_t, false)
-CONCAT_BENCHMARK_DEFINE(int64_t, true)
-
-template <typename T, bool Nullable>
-static void BM_concatenate_tables(benchmark::State& state)
-{
-  cudf::size_type const num_rows   = state.range(0);
-  cudf::size_type const num_cols   = state.range(1);
-  cudf::size_type const num_tables = state.range(2);
-
-  std::vector<std::unique_ptr<cudf::table>> tables(num_tables);
-  std::generate_n(tables.begin(), num_tables, [&]() {
-    return create_sequence_table(cycle_dtypes({cudf::type_to_id<T>()}, num_cols),
-                                 row_count{num_rows},
-                                 Nullable ? std::optional<double>{2.0 / 3.0} : std::nullopt);
-  });
-
-  // Generate table views
-  std::vector<cudf::table_view> table_views(num_tables);
-  std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable {
-    return table->view();
-  });
-
-  CUDF_CHECK_CUDA(0);
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    auto result = cudf::concatenate(table_views);
-  }
-
-  state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T));
-}
-
-#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable)                             \
-  BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable)   \
-  (::benchmark::State & st) { BM_concatenate_tables<type, nullable>(st); }         \
-  BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \
-    ->RangeMultiplier(8)                                                           \
-    ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}})                               \
-    ->Unit(benchmark::kMillisecond)                                                \
-    ->UseManualTime();
-
-CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false)
-CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true)
-
-class ConcatenateStrings : public cudf::benchmark {};
-
-template <bool Nullable>
-static void BM_concatenate_strings(benchmark::State& state)
-{
-  using column_wrapper = cudf::test::strings_column_wrapper;
-
-  auto const num_rows  = state.range(0);
-  auto const num_chars = state.range(1);
-  auto const num_cols  = state.range(2);
-
-  std::string str(num_chars, 'a');
-
-  // Create owning columns
-  std::vector<column_wrapper> columns;
-  columns.reserve(num_cols);
-  std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() {
-    auto iter = thrust::make_constant_iterator(c_str);
-    if (Nullable) {
-      auto count_it = thrust::make_counting_iterator(0);
-      auto valid_iter =
-        thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; });
-      return column_wrapper(iter, iter + num_rows, valid_iter);
-    } else {
-      return column_wrapper(iter, iter + num_rows);
-    }
-  });
-
-  // Generate column views
-  std::vector<cudf::column_view> column_views;
-  column_views.reserve(columns.size());
-  std::transform(
-    columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) {
-      return static_cast<cudf::column_view>(col);
-    });
-
-  CUDF_CHECK_CUDA(0);
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    auto result = cudf::concatenate(column_views);
-  }
-
-  state.SetBytesProcessed(state.iterations() * num_cols * num_rows *
-                          (sizeof(int32_t) + num_chars));  // offset + chars
-}
-
-#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable)                                   \
-  BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable)   \
-  (::benchmark::State & st) { BM_concatenate_strings<nullable>(st); }               \
-  BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \
-    ->RangeMultiplier(8)                                                            \
-    ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}})                               \
-    ->Unit(benchmark::kMillisecond)                                                 \
-    ->UseManualTime();
-
-CONCAT_STRINGS_BENCHMARK_DEFINE(false)
-CONCAT_STRINGS_BENCHMARK_DEFINE(true)
diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp
new file mode 100644
index 00000000000..586b479d0ad
--- /dev/null
+++ b/cpp/benchmarks/copying/concatenate.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/concatenate.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <vector>
+
+static void bench_concatenate(nvbench::state& state)
+{
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_cols = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const nulls    = static_cast<cudf::size_type>(state.get_float64("nulls"));
+
+  auto input = create_sequence_table(
+    cycle_dtypes({cudf::type_to_id<int64_t>()}, num_cols), row_count{num_rows}, nulls);
+  auto input_columns = input->view();
+  auto column_views  = std::vector<cudf::column_view>(input_columns.begin(), input_columns.end());
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.add_global_memory_reads<int64_t>(num_rows * num_cols);
+  state.add_global_memory_writes<int64_t>(num_rows * num_cols);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); });
+}
+
+NVBENCH_BENCH(bench_concatenate)
+  .set_name("concatenate")
+  .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144})
+  .add_int64_axis("num_cols", {2, 8, 64, 512, 1024})
+  .add_float64_axis("nulls", {0.0, 0.3});
+
+static void bench_concatenate_strings(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_cols  = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const nulls     = static_cast<cudf::size_type>(state.get_float64("nulls"));
+
+  data_profile const profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .null_probability(nulls);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
+  auto const input  = column->view();
+
+  auto column_views = std::vector<cudf::column_view>(num_cols, input);
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto const sv = cudf::strings_column_view(input);
+  state.add_global_memory_reads<int8_t>(sv.chars_size(stream) * num_cols);
+  state.add_global_memory_writes<int64_t>(sv.chars_size(stream) * num_cols);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); });
+}
+
+NVBENCH_BENCH(bench_concatenate_strings)
+  .set_name("concatenate_strings")
+  .add_int64_axis("num_rows", {256, 512, 4096, 16384})
+  .add_int64_axis("num_cols", {2, 8, 64, 256})
+  .add_int64_axis("row_width", {32, 128})
+  .add_float64_axis("nulls", {0.0, 0.3});
diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp
index 6ffa90edb8f..0ad1ae30f8c 100644
--- a/cpp/benchmarks/text/edit_distance.cpp
+++ b/cpp/benchmarks/text/edit_distance.cpp
@@ -27,15 +27,11 @@
 static void bench_edit_distance(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
-
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
 
   data_profile const strings_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const strings_table = create_random_table(
     {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
   cudf::strings_column_view input1(strings_table->view().column(0));
@@ -55,5 +51,6 @@ static void bench_edit_distance(nvbench::state& state)
 
 NVBENCH_BENCH(bench_edit_distance)
   .set_name("edit_distance")
-  .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144})
-  .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256});
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144});
diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp
index 4e5daf83a3c..7577cf00c0f 100644
--- a/cpp/benchmarks/text/hash_ngrams.cpp
+++ b/cpp/benchmarks/text/hash_ngrams.cpp
@@ -27,16 +27,12 @@
 static void bench_hash_ngrams(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const ngrams    = static_cast<cudf::size_type>(state.get_int64("ngrams"));
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const strings_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const strings_table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
   cudf::strings_column_view input(strings_table->view().column(0));
@@ -55,6 +51,7 @@ static void bench_hash_ngrams(nvbench::state& state)
 
 NVBENCH_BENCH(bench_hash_ngrams)
   .set_name("hash_ngrams")
-  .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144})
-  .add_int64_axis("row_width", {128, 512, 2048})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {128, 512, 2048})
+  .add_int64_axis("num_rows", {16384, 32768, 262144})
   .add_int64_axis("ngrams", {5, 10});
diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp
index d5b74da6773..5506501138b 100644
--- a/cpp/benchmarks/text/jaccard.cpp
+++ b/cpp/benchmarks/text/jaccard.cpp
@@ -28,17 +28,13 @@
 static void bench_jaccard(nvbench::state& state)
 {
   auto const num_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width       = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width       = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width       = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const substring_width = static_cast<cudf::size_type>(state.get_int64("substring_width"));
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const strings_profile =
     data_profile_builder()
-      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width)
       .no_validity();
   auto const input_table = create_random_table(
     {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
@@ -59,6 +55,7 @@ static void bench_jaccard(nvbench::state& state)
 
 NVBENCH_BENCH(bench_jaccard)
   .set_name("jaccard")
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {128, 512, 1024, 2048})
   .add_int64_axis("num_rows", {32768, 131072, 262144})
-  .add_int64_axis("row_width", {128, 512, 1024, 2048})
   .add_int64_axis("substring_width", {5, 10});
diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp
index 71bccd80d39..594dc0de28a 100644
--- a/cpp/benchmarks/text/normalize.cpp
+++ b/cpp/benchmarks/text/normalize.cpp
@@ -28,16 +28,12 @@
 static void bench_normalize(nvbench::state& state)
 {
   auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width      = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width      = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width      = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const normalize_type = state.get_string("type");
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
 
@@ -60,6 +56,7 @@ static void bench_normalize(nvbench::state& state)
 
 NVBENCH_BENCH(bench_normalize)
   .set_name("normalize")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"spaces", "characters", "to_lower"});
diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp
index 767ebab3eee..24ca4e5dfd7 100644
--- a/cpp/benchmarks/text/replace.cpp
+++ b/cpp/benchmarks/text/replace.cpp
@@ -31,11 +31,6 @@ static void bench_replace(nvbench::state& state)
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   std::vector<std::string> words{" ",        "one  ",    "two ",       "three ",     "four ",
                                  "five ",    "six  ",    "sevén  ",    "eight ",     "nine ",
                                  "ten   ",   "eleven ",  "twelve ",    "thirteen  ", "fourteen ",
@@ -71,5 +66,5 @@ static void bench_replace(nvbench::state& state)
 
 NVBENCH_BENCH(bench_replace)
   .set_name("replace")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp
index dd8df695d3e..0b4e3bdefa5 100644
--- a/cpp/benchmarks/text/subword.cpp
+++ b/cpp/benchmarks/text/subword.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,6 @@
  * limitations under the License.
  */
 
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
-
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/file_utilities.hpp>
 
@@ -24,6 +21,8 @@
 
 #include <nvtext/subword_tokenize.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 #include <filesystem>
 #include <fstream>
 #include <iostream>
@@ -54,40 +53,33 @@ static std::string create_hash_vocab_file()
   return hash_file;
 }
 
-static void BM_subword_tokenizer(benchmark::State& state)
+static void bench_subword_tokenizer(nvbench::state& state)
 {
-  auto const nrows = static_cast<cudf::size_type>(state.range(0));
-  std::vector<char const*> h_strings(nrows, "This is a test ");
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+
+  std::vector<char const*> h_strings(num_rows, "This is a test ");
   cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   static std::string hash_file = create_hash_vocab_file();
   std::vector<uint32_t> offsets{14};
-  uint32_t max_sequence_length = 64;
-  uint32_t stride              = 48;
-  uint32_t do_truncate         = 0;
-  uint32_t do_lower            = 1;
-  //
-  auto vocab = nvtext::load_vocabulary_file(hash_file);
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);
-    auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                           *vocab,
-                                           max_sequence_length,
-                                           stride,
-                                           do_lower,
-                                           do_truncate);
-  }
-}
+  uint32_t max_sequence = 64;
+  uint32_t stride       = 48;
+  uint32_t do_truncate  = 0;
+  uint32_t do_lower     = 1;
 
-class Subword : public cudf::benchmark {};
+  auto input = cudf::strings_column_view{strings};
 
-#define SUBWORD_BM_BENCHMARK_DEFINE(name)                                                        \
-  BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \
-  BENCHMARK_REGISTER_F(Subword, name)                                                            \
-    ->RangeMultiplier(2)                                                                         \
-    ->Range(1 << 10, 1 << 17)                                                                    \
-    ->UseManualTime()                                                                            \
-    ->Unit(benchmark::kMillisecond);
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  auto chars_size = input.chars_size(cudf::get_default_stream());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int32_t>(num_rows * max_sequence);
 
-SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result =
+      nvtext::subword_tokenize(input, *vocab, max_sequence, stride, do_lower, do_truncate);
+  });
+}
 
-// BENCHMARK_MAIN();
+NVBENCH_BENCH(bench_subword_tokenizer)
+  .set_name("subword_tokenize")
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp
index e83310e0343..b9590c5539f 100644
--- a/cpp/benchmarks/text/tokenize.cpp
+++ b/cpp/benchmarks/text/tokenize.cpp
@@ -31,17 +31,13 @@
 static void bench_tokenize(nvbench::state& state)
 {
   auto const num_rows      = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width     = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width     = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width     = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const tokenize_type = state.get_string("type");
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const profile =
     data_profile_builder()
-      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width)
       .no_validity();
   auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
@@ -82,6 +78,7 @@ static void bench_tokenize(nvbench::state& state)
 
 NVBENCH_BENCH(bench_tokenize)
   .set_name("tokenize")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"});
diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp
index 523d277df18..0502f375d99 100644
--- a/cpp/benchmarks/text/vocab.cpp
+++ b/cpp/benchmarks/text/vocab.cpp
@@ -33,16 +33,12 @@ static void bench_vocab_tokenize(nvbench::state& state)
 {
   auto const stream    = cudf::get_default_stream();
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
-  auto const column = [num_rows, row_width] {
+  auto const column = [num_rows, min_width, max_width] {
     data_profile const profile = data_profile_builder().no_validity().distribution(
-      cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+      cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
     auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
     return cudf::strings::filter_characters_of_type(
       cudf::strings_column_view(col->view()),
@@ -85,5 +81,6 @@ static void bench_vocab_tokenize(nvbench::state& state)
 
 NVBENCH_BENCH(bench_vocab_tokenize)
   .set_name("vocab_tokenize")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216});
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp
deleted file mode 100644
index adc3dddc59c..00000000000
--- a/cpp/benchmarks/text/word_minhash.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <benchmarks/common/generate_input.hpp>
-
-#include <cudf/column/column_factories.hpp>
-#include <cudf/filling.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/strings/strings_column_view.hpp>
-
-#include <nvtext/minhash.hpp>
-
-#include <rmm/device_buffer.hpp>
-
-#include <nvbench/nvbench.cuh>
-
-static void bench_word_minhash(nvbench::state& state)
-{
-  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
-  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
-  auto const base64     = state.get_int64("hash_type") == 64;
-
-  data_profile const strings_profile =
-    data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
-  auto strings_table =
-    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
-
-  auto const num_offsets = (num_rows / row_width) + 1;
-  auto offsets           = cudf::sequence(num_offsets,
-                                cudf::numeric_scalar<cudf::size_type>(0),
-                                cudf::numeric_scalar<cudf::size_type>(row_width));
-
-  auto source = cudf::make_lists_column(num_offsets - 1,
-                                        std::move(offsets),
-                                        std::move(strings_table->release().front()),
-                                        0,
-                                        rmm::device_buffer{});
-
-  data_profile const seeds_profile = data_profile_builder().no_validity().distribution(
-    cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, 256);
-  auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
-  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
-  auto seeds             = seeds_table->get_column(0);
-
-  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
-
-  cudf::strings_column_view input(cudf::lists_column_view(source->view()).child());
-  auto chars_size = input.chars_size(cudf::get_default_stream());
-  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
-  state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
-
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view())
-                         : nvtext::word_minhash(source->view(), seeds.view());
-  });
-}
-
-NVBENCH_BENCH(bench_word_minhash)
-  .set_name("word_minhash")
-  .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152})
-  .add_int64_axis("row_width", {10, 100, 1000})
-  .add_int64_axis("seed_count", {2, 25})
-  .add_int64_axis("hash_type", {32, 64});
diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
index c440643037b..b0c48e04710 100644
--- a/cpp/cmake/thirdparty/get_nanoarrow.cmake
+++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -14,11 +14,6 @@
 
 # This function finds nanoarrow and sets any additional necessary environment variables.
 function(find_and_configure_nanoarrow)
-  include(${rapids-cmake-dir}/cpm/package_override.cmake)
-
-  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
-  rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json")
-
   if(NOT BUILD_SHARED_LIBS)
     set(_exclude_from_all EXCLUDE_FROM_ALL FALSE)
   else()
@@ -31,6 +26,9 @@ function(find_and_configure_nanoarrow)
     nanoarrow 0.6.0.dev
     GLOBAL_TARGETS nanoarrow
     CPM_ARGS
+    GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
+    GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb
+    GIT_SHALLOW FALSE
     OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all}
   )
   set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff
deleted file mode 100644
index e9a36fcb567..00000000000
--- a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff
+++ /dev/null
@@ -1,38 +0,0 @@
-diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h
-index caa6be4..70ec8a2 100644
---- a/src/nanoarrow/common/inline_buffer.h
-+++ b/src/nanoarrow/common/inline_buffer.h
-@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) {
- }
- 
- static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) {
--  *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) |
-+  *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT
-                    ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) |
-                    ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) |
-                    ((values[7] + 0x7f) & 0x80));
-@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l
-     // set bits within a single byte
-     const uint8_t only_byte_mask =
-         i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask);
--    bits[bytes_begin] &= only_byte_mask;
-+    bits[bytes_begin] &= only_byte_mask;  // NOLINT
-     bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask);
-     return;
-   }
- 
-   // set/clear trailing bits of first byte
--  bits[bytes_begin] &= first_byte_mask;
-+  bits[bytes_begin] &= first_byte_mask;  // NOLINT
-   bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask);
- 
-   if (bytes_end - bytes_begin > 2) {
-@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap,
-   n_remaining -= n_full_bytes * 8;
-   if (n_remaining > 0) {
-     // Zero out the last byte
--    *out_cursor = 0x00;
-+    *out_cursor = 0x00;  // NOLINT
-     for (int i = 0; i < n_remaining; i++) {
-       ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]);
-     }
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json
deleted file mode 100644
index d529787e7c8..00000000000
--- a/cpp/cmake/thirdparty/patches/nanoarrow_override.json
+++ /dev/null
@@ -1,18 +0,0 @@
-
-{
-  "packages" : {
-    "nanoarrow" : {
-      "version" : "0.6.0.dev",
-      "git_url" : "https://github.com/apache/arrow-nanoarrow.git",
-      "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb",
-      "git_shallow" : false,
-      "patches" : [
-        {
-          "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff",
-          "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537",
-          "fixed_in" : ""
-        }
-      ]
-    }
-  }
-}
diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index db6d5255616..aacb5ccfede 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -33,11 +33,13 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <cuda/std/optional>
+#include <cuda/std/type_traits>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/pair.h>
 
 #include <algorithm>
+#include <type_traits>
 
 /**
  * @file column_device_view.cuh
@@ -56,8 +58,8 @@ namespace CUDF_EXPORT cudf {
  *
  */
 struct nullate {
-  struct YES : std::bool_constant<true> {};
-  struct NO : std::bool_constant<false> {};
+  struct YES : cuda::std::bool_constant<true> {};
+  struct NO : cuda::std::bool_constant<false> {};
   /**
    * @brief `nullate::DYNAMIC` defers the determination of nullability to run time rather than
    * compile time. The calling code is responsible for specifying whether or not nulls are
@@ -80,7 +82,7 @@ struct nullate {
      * @return `true` if nulls are expected in the operation in which this object is applied,
      * otherwise false
      */
-    constexpr operator bool() const noexcept { return value; }
+    CUDF_HOST_DEVICE constexpr operator bool() const noexcept { return value; }
     bool value;  ///< True if nulls are expected
   };
 };
@@ -319,14 +321,14 @@ class alignas(16) column_device_view_base {
   }
 
   template <typename C, typename T, typename = void>
-  struct has_element_accessor_impl : std::false_type {};
+  struct has_element_accessor_impl : cuda::std::false_type {};
 
   template <typename C, typename T>
   struct has_element_accessor_impl<
     C,
     T,
-    void_t<decltype(std::declval<C>().template element<T>(std::declval<size_type>()))>>
-    : std::true_type {};
+    void_t<decltype(cuda::std::declval<C>().template element<T>(cuda::std::declval<size_type>()))>>
+    : cuda::std::true_type {};
 };
 // @cond
 // Forward declaration
@@ -442,7 +444,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return string_view instance representing this element at this index
    */
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, string_view>)>
-  __device__ [[nodiscard]] T element(size_type element_index) const noexcept
+  [[nodiscard]] __device__ T element(size_type element_index) const noexcept
   {
     size_type index       = element_index + offset();  // account for this view's _offset
     char const* d_strings = static_cast<char const*>(_data);
@@ -501,7 +503,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return dictionary32 instance representing this element at this index
    */
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, dictionary32>)>
-  __device__ [[nodiscard]] T element(size_type element_index) const noexcept
+  [[nodiscard]] __device__ T element(size_type element_index) const noexcept
   {
     size_type index    = element_index + offset();  // account for this view's _offset
     auto const indices = d_children[0];
@@ -519,7 +521,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return numeric::fixed_point representing the element at this index
    */
   template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_point<T>())>
-  __device__ [[nodiscard]] T element(size_type element_index) const noexcept
+  [[nodiscard]] __device__ T element(size_type element_index) const noexcept
   {
     using namespace numeric;
     using rep        = typename T::rep;
@@ -534,7 +536,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return `true` if `column_device_view::element<T>()` has a valid overload, `false` otherwise
    */
   template <typename T>
-  static constexpr bool has_element_accessor()
+  CUDF_HOST_DEVICE static constexpr bool has_element_accessor()
   {
     return has_element_accessor_impl<column_device_view, T>::value;
   }
@@ -1032,7 +1034,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    * @return Reference to the element at the specified index
    */
   template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
-  __device__ [[nodiscard]] T& element(size_type element_index) const noexcept
+  [[nodiscard]] __device__ T& element(size_type element_index) const noexcept
   {
     return data<T>()[element_index];
   }
@@ -1044,7 +1046,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    * @return `true` if `mutable_column_device_view::element<T>()` has a valid overload, `false`
    */
   template <typename T>
-  static constexpr bool has_element_accessor()
+  CUDF_HOST_DEVICE static constexpr bool has_element_accessor()
   {
     return has_element_accessor_impl<mutable_column_device_view, T>::value;
   }
@@ -1425,13 +1427,13 @@ struct pair_rep_accessor {
 
  private:
   template <typename R, std::enable_if_t<std::is_same_v<R, rep_type>, void>* = nullptr>
-  __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const
+  [[nodiscard]] __device__ inline auto get_rep(cudf::size_type i) const
   {
     return col.element<R>(i);
   }
 
   template <typename R, std::enable_if_t<not std::is_same_v<R, rep_type>, void>* = nullptr>
-  __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const
+  [[nodiscard]] __device__ inline auto get_rep(cudf::size_type i) const
   {
     return col.element<R>(i).value();
   }
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index de53e7586cd..59011f7b138 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -23,6 +23,7 @@
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -31,12 +32,11 @@
 #include <thrust/fill.h>
 
 #include <type_traits>
-#include <vector>
 
 namespace cudf {
 namespace detail {
 template <typename T>
-constexpr bool is_product_supported()
+CUDF_HOST_DEVICE constexpr bool is_product_supported()
 {
   return is_numeric<T>();
 }
@@ -216,12 +216,12 @@ struct identity_initializer {
  * @throw cudf::logic_error if column type is not fixed-width
  *
  * @param table The table of columns to initialize.
- * @param aggs A vector of aggregation operations corresponding to the table
+ * @param aggs A span of aggregation operations corresponding to the table
  * columns. The aggregations determine the identity value for each column.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 void initialize_with_identity(mutable_table_view& table,
-                              std::vector<aggregation::Kind> const& aggs,
+                              host_span<cudf::aggregation::Kind const> aggs,
                               rmm::cuda_stream_view stream);
 
 }  // namespace detail
diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp
index 16ca06c6561..090dc8b62b6 100644
--- a/cpp/include/cudf/detail/device_scalar.hpp
+++ b/cpp/include/cudf/detail/device_scalar.hpp
@@ -78,7 +78,7 @@ class device_scalar : public rmm::device_scalar<T> {
   [[nodiscard]] T value(rmm::cuda_stream_view stream) const
   {
     cuda_memcpy<T>(bounce_buffer, device_span<T const>{this->data(), 1}, stream);
-    return bounce_buffer[0];
+    return std::move(bounce_buffer[0]);
   }
 
   void set_value_async(T const& value, rmm::cuda_stream_view stream)
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index 61a8e9f7ec3..72cdc3d8067 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -74,9 +74,10 @@ class grid_1d {
    * @param num_threads_per_block The number of threads per block
    * @return thread_index_type The global thread index
    */
-  static constexpr thread_index_type global_thread_id(thread_index_type thread_id,
-                                                      thread_index_type block_id,
-                                                      thread_index_type num_threads_per_block)
+  __device__ static constexpr thread_index_type global_thread_id(
+    thread_index_type thread_id,
+    thread_index_type block_id,
+    thread_index_type num_threads_per_block)
   {
     return thread_id + block_id * num_threads_per_block;
   }
@@ -114,8 +115,8 @@ class grid_1d {
    * @param num_threads_per_block The number of threads per block
    * @return thread_index_type The global thread index
    */
-  static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block,
-                                                 thread_index_type num_blocks_per_grid)
+  __device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block,
+                                                            thread_index_type num_blocks_per_grid)
   {
     return num_threads_per_block * num_blocks_per_grid;
   }
diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index d16be5e22dd..923cd04479d 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -29,6 +29,8 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
 
+#include <cuda/std/functional>
+
 #include <type_traits>
 
 namespace cudf {
@@ -42,7 +44,7 @@ template <typename LHS,
           std::enable_if_t<cudf::is_relationally_comparable<LHS, RHS>()>* = nullptr>
 CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs)
 {
-  return std::min(lhs, rhs);
+  return cuda::std::min(lhs, rhs);
 }
 
 /**
@@ -53,7 +55,7 @@ template <typename LHS,
           std::enable_if_t<cudf::is_relationally_comparable<LHS, RHS>()>* = nullptr>
 CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs)
 {
-  return std::max(lhs, rhs);
+  return cuda::std::max(lhs, rhs);
 }
 }  // namespace detail
 
@@ -68,20 +70,20 @@ struct DeviceSum {
   }
 
   template <typename T, std::enable_if_t<cudf::is_timestamp<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return T{typename T::duration{0}};
   }
 
   template <typename T,
             std::enable_if_t<!cudf::is_timestamp<T>() && !cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return T{0};
   }
 
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("fixed_point does not yet support device operator identity");
@@ -109,7 +111,7 @@ struct DeviceCount {
   }
 
   template <typename T>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return T{};
   }
@@ -129,7 +131,7 @@ struct DeviceMin {
   template <typename T,
             std::enable_if_t<!std::is_same_v<T, cudf::string_view> && !cudf::is_dictionary<T>() &&
                              !cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     // chrono types do not have std::numeric_limits specializations and should use T::max()
     // https://eel.is/c++draft/numeric.limits.general#6
@@ -143,7 +145,7 @@ struct DeviceMin {
   }
 
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("fixed_point does not yet support DeviceMin identity");
@@ -161,7 +163,7 @@ struct DeviceMin {
   }
 
   template <typename T, std::enable_if_t<cudf::is_dictionary<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return static_cast<T>(T::max_value());
   }
@@ -181,7 +183,7 @@ struct DeviceMax {
   template <typename T,
             std::enable_if_t<!std::is_same_v<T, cudf::string_view> && !cudf::is_dictionary<T>() &&
                              !cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     // chrono types do not have std::numeric_limits specializations and should use T::min()
     // https://eel.is/c++draft/numeric.limits.general#6
@@ -195,7 +197,7 @@ struct DeviceMax {
   }
 
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("fixed_point does not yet support DeviceMax identity");
@@ -212,7 +214,7 @@ struct DeviceMax {
   }
 
   template <typename T, std::enable_if_t<cudf::is_dictionary<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return static_cast<T>(T::lowest_value());
   }
@@ -229,13 +231,13 @@ struct DeviceProduct {
   }
 
   template <typename T, std::enable_if_t<!cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return T{1};
   }
 
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("fixed_point does not yet support DeviceProduct identity");
diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp
index 957b6b70fe2..2e3d71815c0 100644
--- a/cpp/include/cudf/detail/utilities/integer_utils.hpp
+++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp
@@ -86,7 +86,7 @@ constexpr S round_down_safe(S number_to_round, S modulus) noexcept
  * `modulus` is positive and does not check for overflow.
  */
 template <typename S>
-constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept
+CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept
 {
   auto remainder = number_to_round % modulus;
   if (remainder == 0) { return number_to_round; }
@@ -187,7 +187,7 @@ constexpr bool is_a_power_of_two(I val) noexcept
  * @return Absolute value if value type is signed.
  */
 template <typename T>
-constexpr auto absolute_value(T value) -> T
+CUDF_HOST_DEVICE constexpr auto absolute_value(T value) -> T
 {
   if constexpr (cuda::std::is_signed<T>()) return numeric::detail::abs(value);
   return value;
diff --git a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
index fce08b4a5c4..9e68bafb09a 100644
--- a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
+++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
@@ -22,6 +22,7 @@
 #include <cuda/std/cmath>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
+#include <cuda/std/utility>
 
 #include <cstring>
 
@@ -183,7 +184,7 @@ struct floating_converter {
    * @param integer_rep The bit-casted floating value to extract the exponent from
    * @return The stored base-2 exponent and significand, shifted for denormals
    */
-  CUDF_HOST_DEVICE inline static std::pair<IntegralType, int> get_significand_and_pow2(
+  CUDF_HOST_DEVICE inline static cuda::std::pair<IntegralType, int> get_significand_and_pow2(
     IntegralType integer_rep)
   {
     // Extract the significand
@@ -1008,7 +1009,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int
   }
 
   // Our shifting_rep is now the integer mantissa, return it and the powers of 2
-  return std::pair{shifting_rep, pow2};
+  return cuda::std::pair{shifting_rep, pow2};
 }
 
 /**
@@ -1075,7 +1076,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int
   }
 
   // Our shifting_rep is now the integer mantissa, return it and the powers of 2
-  return std::pair{shifting_rep, pow2};
+  return cuda::std::pair{shifting_rep, pow2};
 }
 
 /**
diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh
index 0ec41a20ef1..fd3455e761d 100644
--- a/cpp/include/cudf/hashing/detail/hash_functions.cuh
+++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh
@@ -18,7 +18,8 @@
 
 #include <cudf/utilities/traits.hpp>
 
-#include <limits>
+#include <cuda/std/cmath>
+#include <cuda/std/limits>
 
 namespace cudf::hashing::detail {
 
@@ -29,7 +30,7 @@ template <typename T>
 T __device__ inline normalize_nans(T const& key)
 {
   if constexpr (cudf::is_floating_point<T>()) {
-    if (std::isnan(key)) { return std::numeric_limits<T>::quiet_NaN(); }
+    if (cuda::std::isnan(key)) { return cuda::std::numeric_limits<T>::quiet_NaN(); }
   }
   return key;
 }
diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp
index a978e54a1b9..7cb80081a95 100644
--- a/cpp/include/cudf/hashing/detail/hashing.hpp
+++ b/cpp/include/cudf/hashing/detail/hashing.hpp
@@ -82,7 +82,7 @@ std::unique_ptr<column> xxhash_64(table_view const& input,
  * @param rhs The second hash value
  * @return Combined hash value
  */
-constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs)
+CUDF_HOST_DEVICE constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs)
 {
   return lhs ^ (rhs + 0x9e37'79b9 + (lhs << 6) + (lhs >> 2));
 }
diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp
index 85349a421b1..84957ab9f1d 100644
--- a/cpp/include/cudf/strings/detail/utf8.hpp
+++ b/cpp/include/cudf/strings/detail/utf8.hpp
@@ -31,7 +31,7 @@ namespace strings::detail {
  * @param chr Any single byte from a valid UTF-8 character
  * @return true if this is not the first byte of the character
  */
-constexpr bool is_utf8_continuation_char(unsigned char chr)
+CUDF_HOST_DEVICE constexpr bool is_utf8_continuation_char(unsigned char chr)
 {
   // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character.
   return (chr & 0xC0) == 0x80;
@@ -43,7 +43,10 @@ constexpr bool is_utf8_continuation_char(unsigned char chr)
  * @param chr Any single byte from a valid UTF-8 character
  * @return true if this the first byte of the character
  */
-constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_continuation_char(chr); }
+CUDF_HOST_DEVICE constexpr bool is_begin_utf8_char(unsigned char chr)
+{
+  return not is_utf8_continuation_char(chr);
+}
 
 /**
  * @brief This will return true if the passed in byte could be the start of
@@ -55,7 +58,7 @@ constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_contin
  * @param byte The byte to be tested
  * @return true if this can be the first byte of a character
  */
-constexpr bool is_valid_begin_utf8_char(uint8_t byte)
+CUDF_HOST_DEVICE constexpr bool is_valid_begin_utf8_char(uint8_t byte)
 {
   // to be the first byte of a valid (up to 4 byte) UTF-8 char, byte must be one of:
   //  0b0vvvvvvv a 1 byte character
@@ -72,7 +75,7 @@ constexpr bool is_valid_begin_utf8_char(uint8_t byte)
  * @param character Single character
  * @return Number of bytes
  */
-constexpr size_type bytes_in_char_utf8(char_utf8 character)
+CUDF_HOST_DEVICE constexpr size_type bytes_in_char_utf8(char_utf8 character)
 {
   return 1 + static_cast<size_type>((character & 0x0000'FF00u) > 0) +
          static_cast<size_type>((character & 0x00FF'0000u) > 0) +
@@ -89,7 +92,7 @@ constexpr size_type bytes_in_char_utf8(char_utf8 character)
  * @param byte Byte from an encoded character.
  * @return Number of bytes.
  */
-constexpr size_type bytes_in_utf8_byte(uint8_t byte)
+CUDF_HOST_DEVICE constexpr size_type bytes_in_utf8_byte(uint8_t byte)
 {
   return 1 + static_cast<size_type>((byte & 0xF0) == 0xF0)  // 4-byte character prefix
          + static_cast<size_type>((byte & 0xE0) == 0xE0)    // 3-byte character prefix
@@ -104,7 +107,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte)
  * @param[out] character Single char_utf8 value.
  * @return The number of bytes in the character
  */
-constexpr size_type to_char_utf8(char const* str, char_utf8& character)
+CUDF_HOST_DEVICE constexpr size_type to_char_utf8(char const* str, char_utf8& character)
 {
   size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));
 
@@ -131,7 +134,7 @@ constexpr size_type to_char_utf8(char const* str, char_utf8& character)
  * @param[out] str Output array.
  * @return The number of bytes in the character
  */
-constexpr inline size_type from_char_utf8(char_utf8 character, char* str)
+CUDF_HOST_DEVICE constexpr inline size_type from_char_utf8(char_utf8 character, char* str)
 {
   size_type const chr_width = bytes_in_char_utf8(character);
   for (size_type idx = 0; idx < chr_width; ++idx) {
@@ -148,7 +151,7 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str)
  * @param utf8_char Single UTF-8 character to convert.
  * @return Code-point for the UTF-8 character.
  */
-constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char)
+CUDF_HOST_DEVICE constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char)
 {
   uint32_t unchr = 0;
   if (utf8_char < 0x0000'0080)  // single-byte pass thru
@@ -178,7 +181,7 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char)
  * @param unchr Character code-point to convert.
  * @return Single UTF-8 character.
  */
-constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr)
+CUDF_HOST_DEVICE constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr)
 {
   cudf::char_utf8 utf8 = 0;
   if (unchr < 0x0000'0080)  // single byte utf8
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index 1ae4c3703b2..f0040e069d8 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -31,6 +31,8 @@
 #include <thrust/execution_policy.h>
 #endif
 
+#include <cuda/std/utility>
+
 #include <algorithm>
 
 // This file should only include device code logic.
@@ -75,8 +77,8 @@ __device__ inline size_type characters_in_string(char const* str, size_type byte
  * @param pos Character position to count to
  * @return The number of bytes and the left over non-counted position value
  */
-__device__ inline std::pair<size_type, size_type> bytes_to_character_position(string_view d_str,
-                                                                              size_type pos)
+__device__ inline cuda::std::pair<size_type, size_type> bytes_to_character_position(
+  string_view d_str, size_type pos)
 {
   size_type bytes    = 0;
   auto ptr           = d_str.data();
@@ -303,7 +305,7 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const
 __device__ inline size_type string_view::byte_offset(size_type pos) const
 {
   if (length() == size_bytes()) return pos;
-  return std::get<0>(strings::detail::bytes_to_character_position(*this, pos));
+  return cuda::std::get<0>(strings::detail::bytes_to_character_position(*this, pos));
 }
 
 __device__ inline int string_view::compare(string_view const& in) const
diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp
index 504c31057ae..33f3176d2c6 100644
--- a/cpp/include/cudf/strings/string_view.hpp
+++ b/cpp/include/cudf/strings/string_view.hpp
@@ -54,7 +54,7 @@ class string_view {
    *
    * @return The number of characters in this string
    */
-  __device__ [[nodiscard]] inline size_type length() const;
+  [[nodiscard]] __device__ inline size_type length() const;
   /**
    * @brief Return a pointer to the internal device array
    *
@@ -119,13 +119,13 @@ class string_view {
    *
    * @return new iterator pointing to the beginning of this string
    */
-  __device__ [[nodiscard]] inline const_iterator begin() const;
+  [[nodiscard]] __device__ inline const_iterator begin() const;
   /**
    * @brief Return new iterator pointing past the end of this string
    *
    * @return new iterator pointing past the end of this string
    */
-  __device__ [[nodiscard]] inline const_iterator end() const;
+  [[nodiscard]] __device__ inline const_iterator end() const;
 
   /**
    * @brief Return single UTF-8 character at the given character position
@@ -140,7 +140,7 @@ class string_view {
    * @param pos Character position
    * @return Byte offset from data() for a given character position
    */
-  __device__ [[nodiscard]] inline size_type byte_offset(size_type pos) const;
+  [[nodiscard]] __device__ inline size_type byte_offset(size_type pos) const;
 
   /**
    * @brief Comparing target string with this string. Each character is compared
@@ -155,7 +155,7 @@ class string_view {
    *            not match is greater in the arg string, or all compared characters
    *            match but the arg string is longer.
    */
-  __device__ [[nodiscard]] inline int compare(string_view const& str) const;
+  [[nodiscard]] __device__ inline int compare(string_view const& str) const;
   /**
    * @brief Comparing target string with this string. Each character is compared
    * as a UTF-8 code-point value.
@@ -225,7 +225,7 @@ class string_view {
    *              Specify -1 to indicate to the end of the string.
    * @return npos if str is not found in this string.
    */
-  __device__ [[nodiscard]] inline size_type find(string_view const& str,
+  [[nodiscard]] __device__ inline size_type find(string_view const& str,
                                                  size_type pos   = 0,
                                                  size_type count = -1) const;
   /**
@@ -253,7 +253,7 @@ class string_view {
    *              Specify -1 to indicate to the end of the string.
    * @return npos if arg string is not found in this string.
    */
-  __device__ [[nodiscard]] inline size_type find(char_utf8 character,
+  [[nodiscard]] __device__ inline size_type find(char_utf8 character,
                                                  size_type pos   = 0,
                                                  size_type count = -1) const;
   /**
@@ -266,7 +266,7 @@ class string_view {
    *              Specify -1 to indicate to the end of the string.
    * @return npos if arg string is not found in this string.
    */
-  __device__ [[nodiscard]] inline size_type rfind(string_view const& str,
+  [[nodiscard]] __device__ inline size_type rfind(string_view const& str,
                                                   size_type pos   = 0,
                                                   size_type count = -1) const;
   /**
@@ -294,7 +294,7 @@ class string_view {
    *              Specify -1 to indicate to the end of the string.
    * @return npos if arg string is not found in this string.
    */
-  __device__ [[nodiscard]] inline size_type rfind(char_utf8 character,
+  [[nodiscard]] __device__ inline size_type rfind(char_utf8 character,
                                                   size_type pos   = 0,
                                                   size_type count = -1) const;
 
@@ -306,7 +306,7 @@ class string_view {
    * @param length Number of characters from start to include in the sub-string.
    * @return New instance pointing to a subset of the characters within this instance.
    */
-  __device__ [[nodiscard]] inline string_view substr(size_type start, size_type length) const;
+  [[nodiscard]] __device__ inline string_view substr(size_type start, size_type length) const;
 
   /**
    * @brief Return minimum value associated with the string type
@@ -386,7 +386,7 @@ class string_view {
    * @param bytepos Byte position from start of _data.
    * @return The character position for the specified byte.
    */
-  __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const;
+  [[nodiscard]] __device__ inline size_type character_offset(size_type bytepos) const;
 
   /**
    * @brief Common internal implementation for string_view::find and string_view::rfind.
diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
index 3f33c70c29a..8214ea6e83b 100644
--- a/cpp/include/cudf/table/experimental/row_operators.cuh
+++ b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -33,6 +33,8 @@
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+#include <cuda/std/limits>
+#include <cuda/std/optional>
 #include <cuda/std/tuple>
 #include <cuda/std/utility>
 #include <thrust/detail/use_default.h>
@@ -48,11 +50,8 @@
 #include <thrust/swap.h>
 #include <thrust/transform_reduce.h>
 
-#include <limits>
 #include <memory>
-#include <optional>
 #include <type_traits>
-#include <utility>
 
 namespace CUDF_EXPORT cudf {
 
@@ -287,15 +286,16 @@ class device_row_comparator {
    * `null_order::BEFORE` for all columns.
    * @param comparator Physical element relational comparison functor.
    */
-  device_row_comparator(Nullate check_nulls,
-                        table_device_view lhs,
-                        table_device_view rhs,
-                        device_span<detail::dremel_device_view const> l_dremel_device_views,
-                        device_span<detail::dremel_device_view const> r_dremel_device_views,
-                        std::optional<device_span<int const>> depth                  = std::nullopt,
-                        std::optional<device_span<order const>> column_order         = std::nullopt,
-                        std::optional<device_span<null_order const>> null_precedence = std::nullopt,
-                        PhysicalElementComparator comparator                         = {}) noexcept
+  device_row_comparator(
+    Nullate check_nulls,
+    table_device_view lhs,
+    table_device_view rhs,
+    device_span<detail::dremel_device_view const> l_dremel_device_views,
+    device_span<detail::dremel_device_view const> r_dremel_device_views,
+    cuda::std::optional<device_span<int const>> depth                  = cuda::std::nullopt,
+    cuda::std::optional<device_span<order const>> column_order         = cuda::std::nullopt,
+    cuda::std::optional<device_span<null_order const>> null_precedence = cuda::std::nullopt,
+    PhysicalElementComparator comparator                               = {}) noexcept
     : _lhs{lhs},
       _rhs{rhs},
       _l_dremel(l_dremel_device_views),
@@ -331,9 +331,9 @@ class device_row_comparator {
     Nullate check_nulls,
     table_device_view lhs,
     table_device_view rhs,
-    std::optional<device_span<order const>> column_order         = std::nullopt,
-    std::optional<device_span<null_order const>> null_precedence = std::nullopt,
-    PhysicalElementComparator comparator                         = {}) noexcept
+    cuda::std::optional<device_span<order const>> column_order         = cuda::std::nullopt,
+    cuda::std::optional<device_span<null_order const>> null_precedence = cuda::std::nullopt,
+    PhysicalElementComparator comparator                               = {}) noexcept
     : _lhs{lhs},
       _rhs{rhs},
       _l_dremel{},
@@ -410,7 +410,7 @@ class device_row_comparator {
 
       return cuda::std::pair(_comparator(_lhs.element<Element>(lhs_element_index),
                                          _rhs.element<Element>(rhs_element_index)),
-                             std::numeric_limits<int>::max());
+                             cuda::std::numeric_limits<int>::max());
     }
 
     /**
@@ -455,7 +455,7 @@ class device_row_comparator {
         }
 
         if (lcol.num_child_columns() == 0) {
-          return cuda::std::pair(weak_ordering::EQUIVALENT, std::numeric_limits<int>::max());
+          return cuda::std::pair(weak_ordering::EQUIVALENT, cuda::std::numeric_limits<int>::max());
         }
 
         // Non-empty structs have been modified to only have 1 child when using this.
@@ -607,7 +607,7 @@ class device_row_comparator {
   __device__ constexpr weak_ordering operator()(size_type const lhs_index,
                                                 size_type const rhs_index) const noexcept
   {
-    int last_null_depth = std::numeric_limits<int>::max();
+    int last_null_depth = cuda::std::numeric_limits<int>::max();
     size_type list_column_index{-1};
     for (size_type i = 0; i < _lhs.num_columns(); ++i) {
       if (_lhs.column(i).type().id() == type_id::LIST) { ++list_column_index; }
@@ -626,9 +626,9 @@ class device_row_comparator {
       // here, otherwise the current code would be failing.
       auto const [l_dremel_i, r_dremel_i] =
         _lhs.column(i).type().id() == type_id::LIST
-          ? std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]),
-                            optional_dremel_view(_r_dremel[list_column_index]))
-          : std::make_tuple(optional_dremel_view{}, optional_dremel_view{});
+          ? cuda::std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]),
+                                  optional_dremel_view(_r_dremel[list_column_index]))
+          : cuda::std::make_tuple(optional_dremel_view{}, optional_dremel_view{});
 
       auto element_comp = element_comparator{_check_nulls,
                                              _lhs.column(i),
@@ -658,9 +658,9 @@ class device_row_comparator {
   device_span<detail::dremel_device_view const> const _l_dremel;
   device_span<detail::dremel_device_view const> const _r_dremel;
   Nullate const _check_nulls;
-  std::optional<device_span<int const>> const _depth;
-  std::optional<device_span<order const>> const _column_order;
-  std::optional<device_span<null_order const>> const _null_precedence;
+  cuda::std::optional<device_span<int const>> const _depth;
+  cuda::std::optional<device_span<order const>> const _column_order;
+  cuda::std::optional<device_span<null_order const>> const _null_precedence;
   PhysicalElementComparator const _comparator;
 };  // class device_row_comparator
 
@@ -882,10 +882,10 @@ struct preprocessed_table {
    * @return Device array containing respective column orders. If no explicit column orders were
    * specified during the creation of this object then this will be `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<order const>> column_order() const
+  [[nodiscard]] cuda::std::optional<device_span<order const>> column_order() const
   {
-    return _column_order.size() ? std::optional<device_span<order const>>(_column_order)
-                                : std::nullopt;
+    return _column_order.size() ? cuda::std::optional<device_span<order const>>(_column_order)
+                                : cuda::std::nullopt;
   }
 
   /**
@@ -895,10 +895,11 @@ struct preprocessed_table {
    * @return Device array containing respective column null precedence. If no explicit column null
    * precedences were specified during the creation of this object then this will be `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<null_order const>> null_precedence() const
+  [[nodiscard]] cuda::std::optional<device_span<null_order const>> null_precedence() const
   {
-    return _null_precedence.size() ? std::optional<device_span<null_order const>>(_null_precedence)
-                                   : std::nullopt;
+    return _null_precedence.size()
+             ? cuda::std::optional<device_span<null_order const>>(_null_precedence)
+             : cuda::std::nullopt;
   }
 
   /**
@@ -909,9 +910,10 @@ struct preprocessed_table {
    * @return std::optional<device_span<int const>> Device array containing respective column depths.
    * If there are no nested columns in the table then this will be `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<int const>> depths() const
+  [[nodiscard]] cuda::std::optional<device_span<int const>> depths() const
   {
-    return _depths.size() ? std::optional<device_span<int const>>(_depths) : std::nullopt;
+    return _depths.size() ? cuda::std::optional<device_span<int const>>(_depths)
+                          : cuda::std::nullopt;
   }
 
   [[nodiscard]] device_span<detail::dremel_device_view const> dremel_device_views() const
@@ -940,8 +942,8 @@ struct preprocessed_table {
   rmm::device_uvector<size_type> const _depths;
 
   // Dremel encoding of list columns used for the comparison algorithm
-  std::optional<std::vector<detail::dremel_data>> _dremel_data;
-  std::optional<rmm::device_uvector<detail::dremel_device_view>> _dremel_device_views;
+  cuda::std::optional<std::vector<detail::dremel_data>> _dremel_data;
+  cuda::std::optional<rmm::device_uvector<detail::dremel_device_view>> _dremel_device_views;
 
   // Intermediate columns generated from transforming nested children columns into
   // integers columns using `cudf::rank()`, need to be kept alive.
@@ -1808,7 +1810,7 @@ class element_hasher {
   __device__ element_hasher(
     Nullate nulls,
     uint32_t seed             = DEFAULT_HASH_SEED,
-    hash_value_type null_hash = std::numeric_limits<hash_value_type>::max()) noexcept
+    hash_value_type null_hash = cuda::std::numeric_limits<hash_value_type>::max()) noexcept
     : _check_nulls(nulls), _seed(seed), _null_hash(null_hash)
   {
   }
@@ -1892,7 +1894,7 @@ class device_row_hasher {
    */
   template <template <typename> class hash_fn>
   class element_hasher_adapter {
-    static constexpr hash_value_type NULL_HASH     = std::numeric_limits<hash_value_type>::max();
+    static constexpr hash_value_type NULL_HASH = cuda::std::numeric_limits<hash_value_type>::max();
     static constexpr hash_value_type NON_NULL_HASH = 0;
 
    public:
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index 409b8c825bb..9443bd5cb52 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -266,7 +266,7 @@ class data_type {
    *
    * @param id The type's identifier
    */
-  explicit constexpr data_type(type_id id) : _id{id} {}
+  CUDF_HOST_DEVICE explicit constexpr data_type(type_id id) : _id{id} {}
 
   /**
    * @brief Construct a new `data_type` object for `numeric::fixed_point`
@@ -284,14 +284,17 @@ class data_type {
    *
    * @return The type identifier
    */
-  [[nodiscard]] constexpr type_id id() const noexcept { return _id; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr type_id id() const noexcept { return _id; }
 
   /**
    * @brief Returns the scale (for fixed_point types)
    *
    * @return The scale
    */
-  [[nodiscard]] constexpr int32_t scale() const noexcept { return _fixed_point_scale; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t scale() const noexcept
+  {
+    return _fixed_point_scale;
+  }
 
  private:
   type_id _id{type_id::EMPTY};
diff --git a/cpp/include/cudf/utilities/memory_resource.hpp b/cpp/include/cudf/utilities/memory_resource.hpp
index b562574fd79..eaba466557b 100644
--- a/cpp/include/cudf/utilities/memory_resource.hpp
+++ b/cpp/include/cudf/utilities/memory_resource.hpp
@@ -16,8 +16,6 @@
 
 #pragma once
 
-#include <cudf/utilities/memory_resource.hpp>
-
 #include <rmm/cuda_device.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index 2273a89892b..e7b76946248 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/detail/utilities/host_vector.hpp>
+#include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
 
 #include <rmm/device_buffer.hpp>
@@ -69,52 +70,22 @@ class span_base {
 
   static constexpr std::size_t extent = Extent;  ///< The extent of the span
 
-  constexpr span_base() noexcept {}
+  CUDF_HOST_DEVICE constexpr span_base() noexcept {}
   /**
    * @brief Constructs a span from a pointer and a size.
    *
    * @param data Pointer to the first element in the span.
    * @param size The number of elements in the span.
    */
-  constexpr span_base(pointer data, size_type size) : _data(data), _size(size) {}
+  CUDF_HOST_DEVICE constexpr span_base(pointer data, size_type size) : _data(data), _size(size) {}
   // constexpr span_base(pointer begin, pointer end) : _data(begin), _size(end - begin) {}
-  constexpr span_base(span_base const&) noexcept = default;  ///< Copy constructor
+  CUDF_HOST_DEVICE constexpr span_base(span_base const&) noexcept = default;  ///< Copy constructor
   /**
    * @brief Copy assignment operator.
    *
    * @return Reference to this span.
    */
-  constexpr span_base& operator=(span_base const&) noexcept = default;
-
-  // not noexcept due to undefined behavior when size = 0
-  /**
-   * @brief Returns a reference to the first element in the span.
-   *
-   * Calling front on an empty span results in undefined behavior.
-   *
-   * @return Reference to the first element in the span
-   */
-  [[nodiscard]] constexpr reference front() const { return _data[0]; }
-  // not noexcept due to undefined behavior when size = 0
-  /**
-   * @brief Returns a reference to the last element in the span.
-   *
-   * Calling last on an empty span results in undefined behavior.
-   *
-   * @return Reference to the last element in the span
-   */
-  [[nodiscard]] constexpr reference back() const { return _data[_size - 1]; }
-  // not noexcept due to undefined behavior when idx < 0 || idx >= size
-  /**
-   * @brief Returns a reference to the idx-th element of the sequence.
-   *
-   * The behavior is undefined if idx is out of range (i.e., if it is greater than or equal to
-   * size()).
-   *
-   * @param idx the index of the element to access
-   * @return A reference to the idx-th element of the sequence, i.e., `data()[idx]`
-   */
-  constexpr reference operator[](size_type idx) const { return _data[idx]; }
+  CUDF_HOST_DEVICE constexpr span_base& operator=(span_base const&) noexcept = default;
 
   /**
    * @brief Returns an iterator to the first element of the span.
@@ -123,7 +94,7 @@ class span_base {
    *
    * @return An iterator to the first element of the span
    */
-  [[nodiscard]] constexpr iterator begin() const noexcept { return _data; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr iterator begin() const noexcept { return _data; }
   /**
    * @brief Returns an iterator to the element following the last element of the span.
    *
@@ -131,32 +102,36 @@ class span_base {
    *
    * @return An iterator to the element following the last element of the span
    */
-  [[nodiscard]] constexpr iterator end() const noexcept { return _data + _size; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr iterator end() const noexcept { return _data + _size; }
   /**
    * @brief Returns a pointer to the beginning of the sequence.
    *
    * @return A pointer to the first element of the span
    */
-  [[nodiscard]] constexpr pointer data() const noexcept { return _data; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr pointer data() const noexcept { return _data; }
 
   /**
    * @brief Returns the number of elements in the span.
    *
    * @return The number of elements in the span
    */
-  [[nodiscard]] constexpr size_type size() const noexcept { return _size; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr size_type size() const noexcept { return _size; }
   /**
    * @brief Returns the size of the sequence in bytes.
    *
    * @return The size of the sequence in bytes
    */
-  [[nodiscard]] constexpr size_type size_bytes() const noexcept { return sizeof(T) * _size; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr size_type size_bytes() const noexcept
+  {
+    return sizeof(T) * _size;
+  }
+
   /**
    * @brief Checks if the span is empty.
    *
    * @return True if the span is empty, false otherwise
    */
-  [[nodiscard]] constexpr bool empty() const noexcept { return _size == 0; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool empty() const noexcept { return _size == 0; }
 
   /**
    * @brief Obtains a subspan consisting of the first N elements of the sequence
@@ -180,9 +155,9 @@ class span_base {
     return Derived(_data + _size - count, count);
   }
 
- private:
-  pointer _data{nullptr};
-  size_type _size{0};
+ protected:
+  pointer _data{nullptr};  ///< Pointer to the first element in the span
+  size_type _size{0};      ///< The number of elements in the span
 };
 
 }  // namespace detail
@@ -288,6 +263,39 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
     : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()}
   {
   }
+  // not noexcept due to undefined behavior when idx < 0 || idx >= size
+  /**
+   * @brief Returns a reference to the idx-th element of the sequence.
+   *
+   * The behavior is undefined if idx is out of range (i.e., if it is greater than or equal to
+   * size()).
+   *
+   * @param idx the index of the element to access
+   * @return A reference to the idx-th element of the sequence, i.e., `data()[idx]`
+   */
+  constexpr typename base::reference operator[](size_type idx) const { return this->_data[idx]; }
+
+  // not noexcept due to undefined behavior when size = 0
+  /**
+   * @brief Returns a reference to the first element in the span.
+   *
+   * Calling front on an empty span results in undefined behavior.
+   *
+   * @return Reference to the first element in the span
+   */
+  [[nodiscard]] constexpr typename base::reference front() const { return this->_data[0]; }
+  // not noexcept due to undefined behavior when size = 0
+  /**
+   * @brief Returns a reference to the last element in the span.
+   *
+   * Calling last on an empty span results in undefined behavior.
+   *
+   * @return Reference to the last element in the span
+   */
+  [[nodiscard]] constexpr typename base::reference back() const
+  {
+    return this->_data[this->_size - 1];
+  }
 
   /**
    * @brief Returns whether the data is device accessible (e.g. pinned memory)
@@ -339,7 +347,7 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Ex
   using base = cudf::detail::span_base<T, Extent, device_span<T, Extent>>;  ///< Base type
   using base::base;
 
-  constexpr device_span() noexcept : base() {}  // required to compile on centos
+  CUDF_HOST_DEVICE constexpr device_span() noexcept : base() {}  // required to compile on centos
 
   /// Constructor from container
   /// @param in The container to construct the span from
@@ -374,11 +382,51 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Ex
             std::enable_if_t<(Extent == OtherExtent || Extent == dynamic_extent) &&
                                std::is_convertible_v<OtherT (*)[], T (*)[]>,  // NOLINT
                              void>* = nullptr>
-  constexpr device_span(device_span<OtherT, OtherExtent> const& other) noexcept
+  CUDF_HOST_DEVICE constexpr device_span(device_span<OtherT, OtherExtent> const& other) noexcept
     : base(other.data(), other.size())
   {
   }
 
+  // not noexcept due to undefined behavior when idx < 0 || idx >= size
+  /**
+   * @brief Returns a reference to the idx-th element of the sequence.
+   *
+   * The behavior is undefined if idx is out of range (i.e., if it is greater than or equal to
+   * size()).
+   *
+   * @param idx the index of the element to access
+   * @return A reference to the idx-th element of the sequence, i.e., `data()[idx]`
+   */
+  __device__ constexpr typename base::reference operator[](size_type idx) const
+  {
+    return this->_data[idx];
+  }
+
+  // not noexcept due to undefined behavior when size = 0
+  /**
+   * @brief Returns a reference to the first element in the span.
+   *
+   * Calling front on an empty span results in undefined behavior.
+   *
+   * @return Reference to the first element in the span
+   */
+  [[nodiscard]] __device__ constexpr typename base::reference front() const
+  {
+    return this->_data[0];
+  }
+  // not noexcept due to undefined behavior when size = 0
+  /**
+   * @brief Returns a reference to the last element in the span.
+   *
+   * Calling last on an empty span results in undefined behavior.
+   *
+   * @return Reference to the last element in the span
+   */
+  [[nodiscard]] __device__ constexpr typename base::reference back() const
+  {
+    return this->_data[this->_size - 1];
+  }
+
   /**
    * @brief Obtains a span that is a view over the `count` elements of this span starting at offset
    *
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index dae1cd38832..0f4bde204fa 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -169,7 +169,7 @@ bool is_equality_comparable(data_type type);
  * @return false  `T` is not numeric
  */
 template <typename T>
-constexpr inline bool is_numeric()
+CUDF_HOST_DEVICE constexpr inline bool is_numeric()
 {
   return cuda::std::is_arithmetic<T>();
 }
@@ -271,9 +271,9 @@ bool is_unsigned(data_type type);
  * @return true if the iterator's value type is unsigned
  */
 template <typename Iterator>
-constexpr inline bool is_signed_iterator()
+CUDF_HOST_DEVICE constexpr inline bool is_signed_iterator()
 {
-  return std::is_signed_v<typename std::iterator_traits<Iterator>::value_type>;
+  return cuda::std::is_signed_v<typename cuda::std::iterator_traits<Iterator>::value_type>;
 }
 
 /**
@@ -356,9 +356,9 @@ bool is_numeric_not_bool(data_type type);
  * @return false  `T` is not floating point
  */
 template <typename T>
-constexpr inline bool is_floating_point()
+CUDF_HOST_DEVICE constexpr inline bool is_floating_point()
 {
-  return std::is_floating_point_v<T>;
+  return cuda::std::is_floating_point_v<T>;
 }
 
 /**
@@ -415,7 +415,7 @@ bool is_boolean(data_type type);
  * @return false  `T` is not a timestamp
  */
 template <typename T>
-constexpr inline bool is_timestamp()
+CUDF_HOST_DEVICE constexpr inline bool is_timestamp()
 {
   return is_timestamp_t<T>::value;
 }
@@ -439,13 +439,14 @@ bool is_timestamp(data_type type);
  * @return false  `T` is not a fixed-point type
  */
 template <typename T>
-constexpr inline bool is_fixed_point()
+CUDF_HOST_DEVICE constexpr inline bool is_fixed_point()
 {
-  return std::is_same_v<numeric::decimal32, T> || std::is_same_v<numeric::decimal64, T> ||
-         std::is_same_v<numeric::decimal128, T> ||
-         std::is_same_v<numeric::fixed_point<int32_t, numeric::Radix::BASE_2>, T> ||
-         std::is_same_v<numeric::fixed_point<int64_t, numeric::Radix::BASE_2>, T> ||
-         std::is_same_v<numeric::fixed_point<__int128_t, numeric::Radix::BASE_2>, T>;
+  return cuda::std::is_same_v<numeric::decimal32, T> ||
+         cuda::std::is_same_v<numeric::decimal64, T> ||
+         cuda::std::is_same_v<numeric::decimal128, T> ||
+         cuda::std::is_same_v<numeric::fixed_point<int32_t, numeric::Radix::BASE_2>, T> ||
+         cuda::std::is_same_v<numeric::fixed_point<int64_t, numeric::Radix::BASE_2>, T> ||
+         cuda::std::is_same_v<numeric::fixed_point<__int128_t, numeric::Radix::BASE_2>, T>;
 }
 
 /**
@@ -465,7 +466,7 @@ bool is_fixed_point(data_type type);
  * @return false  `T` is not a duration
  */
 template <typename T>
-constexpr inline bool is_duration()
+CUDF_HOST_DEVICE constexpr inline bool is_duration()
 {
   return is_duration_t<T>::value;
 }
@@ -489,7 +490,7 @@ bool is_duration(data_type type);
  * @return false  `T` is neither a duration nor a timestamp type
  */
 template <typename T>
-constexpr inline bool is_chrono()
+CUDF_HOST_DEVICE constexpr inline bool is_chrono()
 {
   return is_duration<T>() || is_timestamp<T>();
 }
@@ -557,7 +558,7 @@ bool is_dictionary(data_type type);
  * @return false `T` corresponds to a variable-width element type
  */
 template <typename T>
-constexpr inline bool is_fixed_width()
+CUDF_HOST_DEVICE constexpr inline bool is_fixed_width()
 {
   // TODO Add fixed width wrapper types
   // Is a category fixed width?
@@ -590,10 +591,11 @@ class string_view;
  * @return false `T` corresponds to a "simple" type
  */
 template <typename T>
-constexpr inline bool is_compound()
+CUDF_HOST_DEVICE constexpr inline bool is_compound()
 {
-  return std::is_same_v<T, cudf::string_view> or std::is_same_v<T, cudf::dictionary32> or
-         std::is_same_v<T, cudf::list_view> or std::is_same_v<T, cudf::struct_view>;
+  return cuda::std::is_same_v<T, cudf::string_view> or
+         cuda::std::is_same_v<T, cudf::dictionary32> or cuda::std::is_same_v<T, cudf::list_view> or
+         cuda::std::is_same_v<T, cudf::struct_view>;
 }
 
 /**
@@ -622,9 +624,9 @@ bool is_compound(data_type type);
  * @return false T is not a nested type
  */
 template <typename T>
-constexpr inline bool is_nested()
+CUDF_HOST_DEVICE constexpr inline bool is_nested()
 {
-  return std::is_same_v<T, cudf::list_view> || std::is_same_v<T, cudf::struct_view>;
+  return cuda::std::is_same_v<T, cudf::list_view> || cuda::std::is_same_v<T, cudf::struct_view>;
 }
 
 /**
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index d915c85bf85..3a6ff36c424 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -17,15 +17,14 @@
 #include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <vector>
-
 namespace cudf {
 namespace detail {
 void initialize_with_identity(mutable_table_view& table,
-                              std::vector<aggregation::Kind> const& aggs,
+                              host_span<cudf::aggregation::Kind const> aggs,
                               rmm::cuda_stream_view stream)
 {
   // TODO: Initialize all the columns in a single kernel instead of invoking one
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index ec63504a414..0e31a0b6cf5 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -32,10 +32,10 @@ namespace binops {
 namespace compiled {
 
 template <typename BinaryOperator, typename TypeLhs, typename TypeRhs>
-constexpr bool is_bool_result()
+CUDF_HOST_DEVICE constexpr bool is_bool_result()
 {
-  using ReturnType = std::invoke_result_t<BinaryOperator, TypeLhs, TypeRhs>;
-  return std::is_same_v<bool, ReturnType>;
+  using ReturnType = cuda::std::invoke_result_t<BinaryOperator, TypeLhs, TypeRhs>;
+  return cuda::std::is_same_v<bool, ReturnType>;
 }
 
 /**
diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp
index 7eb80c4249e..d36dacca739 100644
--- a/cpp/src/bitmask/is_element_valid.cpp
+++ b/cpp/src/bitmask/is_element_valid.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/detail/is_element_valid.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -30,15 +31,14 @@ bool is_element_valid_sync(column_view const& col_view,
   CUDF_EXPECTS(element_index >= 0 and element_index < col_view.size(), "invalid index.");
   if (!col_view.nullable()) { return true; }
 
-  bitmask_type word = 0;
   // null_mask() returns device ptr to bitmask without offset
   size_type const index = element_index + col_view.offset();
-  CUDF_CUDA_TRY(cudaMemcpyAsync(&word,
-                                col_view.null_mask() + word_index(index),
-                                sizeof(bitmask_type),
-                                cudaMemcpyDefault,
-                                stream.value()));
-  stream.synchronize();
+
+  auto const word =
+    cudf::detail::make_host_vector_sync(
+      device_span<bitmask_type const>{col_view.null_mask() + word_index(index), 1}, stream)
+      .front();
+
   return static_cast<bool>(word & (bitmask_type{1} << intra_word_index(index)));
 }
 
diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu
index fc244521617..9dc39f01ab3 100644
--- a/cpp/src/column/column_device_view.cu
+++ b/cpp/src/column/column_device_view.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -60,13 +61,12 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str
   // A buffer of CPU memory is allocated to hold the ColumnDeviceView
   // objects. Once filled, the CPU memory is copied to device memory
   // and then set into the d_children member pointer.
-  std::vector<char> staging_buffer(descendant_storage_bytes);
+  auto staging_buffer = detail::make_host_vector<char>(descendant_storage_bytes, stream);
 
   // Each ColumnDeviceView instance may have child objects that
   // require setting some internal device pointers before being copied
   // from CPU to device.
-  rmm::device_buffer* const descendant_storage =
-    new rmm::device_buffer(descendant_storage_bytes, stream);
+  auto const descendant_storage = new rmm::device_uvector<char>(descendant_storage_bytes, stream);
 
   auto deleter = [descendant_storage](ColumnDeviceView* v) {
     v->destroy();
@@ -77,13 +77,7 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str
     new ColumnDeviceView(source, staging_buffer.data(), descendant_storage->data()), deleter};
 
   // copy the CPU memory with all the children into device memory
-  CUDF_CUDA_TRY(cudaMemcpyAsync(descendant_storage->data(),
-                                staging_buffer.data(),
-                                descendant_storage->size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  stream.synchronize();
+  detail::cuda_memcpy<char>(*descendant_storage, staging_buffer, stream);
 
   return result;
 }
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index d8419760120..6fc49afd7ac 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -308,7 +308,11 @@ std::unique_ptr<column> for_each_concatenate(host_span<column_view const> views,
 
   auto count = 0;
   for (auto& v : views) {
-    thrust::copy(rmm::exec_policy(stream), v.begin<T>(), v.end<T>(), m_view.begin<T>() + count);
+    cudaMemcpyAsync(m_view.begin<T>() + count,
+                    v.begin<T>(),
+                    v.size() * sizeof(T),
+                    cudaMemcpyDeviceToDevice,
+                    stream.value());
     count += v.size();
   }
 
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index e9443980320..3413f75357b 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -35,6 +35,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/functional>
+#include <cuda/std/functional>
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
@@ -997,7 +998,8 @@ struct packed_split_indices_and_src_buf_info {
       src_buf_info_size(
         cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)),
       // host-side
-      h_indices_and_source_info(indices_size + src_buf_info_size),
+      h_indices_and_source_info{
+        detail::make_host_vector<uint8_t>(indices_size + src_buf_info_size, stream)},
       h_indices{reinterpret_cast<size_type*>(h_indices_and_source_info.data())},
       h_src_buf_info{
         reinterpret_cast<src_buf_info*>(h_indices_and_source_info.data() + indices_size)}
@@ -1024,15 +1026,18 @@ struct packed_split_indices_and_src_buf_info {
       reinterpret_cast<size_type*>(reinterpret_cast<uint8_t*>(d_indices_and_source_info.data()) +
                                    indices_size + src_buf_info_size);
 
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value()));
+    detail::cuda_memcpy_async<uint8_t>(
+      device_span<uint8_t>{static_cast<uint8_t*>(d_indices_and_source_info.data()),
+                           h_indices_and_source_info.size()},
+      h_indices_and_source_info,
+      stream);
   }
 
   size_type const indices_size;
   std::size_t const src_buf_info_size;
   std::size_t offset_stack_size;
 
-  std::vector<uint8_t> h_indices_and_source_info;
+  detail::host_vector<uint8_t> h_indices_and_source_info;
   rmm::device_buffer d_indices_and_source_info;
 
   size_type* const h_indices;
@@ -1054,27 +1059,26 @@ struct packed_partition_buf_size_and_dst_buf_info {
       buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)},
       dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)},
       // host-side
-      h_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size),
+      h_buf_sizes_and_dst_info{
+        detail::make_host_vector<uint8_t>(buf_sizes_size + dst_buf_info_size, stream)},
       h_buf_sizes{reinterpret_cast<std::size_t*>(h_buf_sizes_and_dst_info.data())},
       h_dst_buf_info{
-        reinterpret_cast<dst_buf_info*>(h_buf_sizes_and_dst_info.data() + buf_sizes_size)},
+        reinterpret_cast<dst_buf_info*>(h_buf_sizes_and_dst_info.data() + buf_sizes_size),
+        num_bufs,
+        h_buf_sizes_and_dst_info.get_allocator().is_device_accessible()},
       // device-side
-      d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, temp_mr),
+      d_buf_sizes_and_dst_info(h_buf_sizes_and_dst_info.size(), stream, temp_mr),
       d_buf_sizes{reinterpret_cast<std::size_t*>(d_buf_sizes_and_dst_info.data())},
       // destination buffer info
-      d_dst_buf_info{reinterpret_cast<dst_buf_info*>(
-        static_cast<uint8_t*>(d_buf_sizes_and_dst_info.data()) + buf_sizes_size)}
+      d_dst_buf_info{
+        reinterpret_cast<dst_buf_info*>(d_buf_sizes_and_dst_info.data() + buf_sizes_size), num_bufs}
   {
   }
 
   void copy_to_host()
   {
     // DtoH buf sizes and col info back to the host
-    CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes,
-                                  d_buf_sizes,
-                                  buf_sizes_size + dst_buf_info_size,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    detail::cuda_memcpy_async<uint8_t>(h_buf_sizes_and_dst_info, d_buf_sizes_and_dst_info, stream);
   }
 
   rmm::cuda_stream_view const stream;
@@ -1083,13 +1087,13 @@ struct packed_partition_buf_size_and_dst_buf_info {
   std::size_t const buf_sizes_size;
   std::size_t const dst_buf_info_size;
 
-  std::vector<uint8_t> h_buf_sizes_and_dst_info;
+  detail::host_vector<uint8_t> h_buf_sizes_and_dst_info;
   std::size_t* const h_buf_sizes;
-  dst_buf_info* const h_dst_buf_info;
+  host_span<dst_buf_info> const h_dst_buf_info;
 
-  rmm::device_buffer d_buf_sizes_and_dst_info;
+  rmm::device_uvector<uint8_t> d_buf_sizes_and_dst_info;
   std::size_t* const d_buf_sizes;
-  dst_buf_info* const d_dst_buf_info;
+  device_span<dst_buf_info> const d_dst_buf_info;
 };
 
 // Packed block of memory 3:
@@ -1105,11 +1109,12 @@ struct packed_src_and_dst_pointers {
       src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)},
       dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)},
       // host-side
-      h_src_and_dst_buffers(src_bufs_size + dst_bufs_size),
+      h_src_and_dst_buffers{
+        detail::make_host_vector<uint8_t>(src_bufs_size + dst_bufs_size, stream)},
       h_src_bufs{reinterpret_cast<uint8_t const**>(h_src_and_dst_buffers.data())},
       h_dst_bufs{reinterpret_cast<uint8_t**>(h_src_and_dst_buffers.data() + src_bufs_size)},
       // device-side
-      d_src_and_dst_buffers{rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, temp_mr)},
+      d_src_and_dst_buffers{h_src_and_dst_buffers.size(), stream, temp_mr},
       d_src_bufs{reinterpret_cast<uint8_t const**>(d_src_and_dst_buffers.data())},
       d_dst_bufs{reinterpret_cast<uint8_t**>(
         reinterpret_cast<uint8_t*>(d_src_and_dst_buffers.data()) + src_bufs_size)}
@@ -1120,18 +1125,18 @@ struct packed_src_and_dst_pointers {
 
   void copy_to_device()
   {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(d_src_and_dst_buffers.data(),
-                                  h_src_and_dst_buffers.data(),
-                                  src_bufs_size + dst_bufs_size,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    detail::cuda_memcpy_async<uint8_t>(
+      device_span<uint8_t>{static_cast<uint8_t*>(d_src_and_dst_buffers.data()),
+                           d_src_and_dst_buffers.size()},
+      h_src_and_dst_buffers,
+      stream);
   }
 
   rmm::cuda_stream_view const stream;
   std::size_t const src_bufs_size;
   std::size_t const dst_bufs_size;
 
-  std::vector<uint8_t> h_src_and_dst_buffers;
+  detail::host_vector<uint8_t> h_src_and_dst_buffers;
   uint8_t const** const h_src_bufs;
   uint8_t** const h_dst_bufs;
 
@@ -1204,7 +1209,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits(
     std::make_unique<packed_partition_buf_size_and_dst_buf_info>(
       num_partitions, num_bufs, stream, temp_mr);
 
-  auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info;
+  auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info.begin();
   auto const d_buf_sizes    = partition_buf_size_and_dst_buf_info->d_buf_sizes;
 
   auto const split_indices_and_src_buf_info = packed_split_indices_and_src_buf_info(
@@ -1517,26 +1522,19 @@ std::unique_ptr<chunk_iteration_state> chunk_iteration_state::create(
    */
   if (user_buffer_size != 0) {
     // copy the batch offsets back to host
-    std::vector<std::size_t> h_offsets(num_batches + 1);
-    {
-      rmm::device_uvector<std::size_t> offsets(h_offsets.size(), stream, temp_mr);
+    auto const h_offsets = [&] {
+      rmm::device_uvector<std::size_t> offsets(num_batches + 1, stream, temp_mr);
       auto const batch_byte_size_iter = cudf::detail::make_counting_transform_iterator(
         0, batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin()});
 
-      thrust::exclusive_scan(rmm::exec_policy(stream, temp_mr),
+      thrust::exclusive_scan(rmm::exec_policy_nosync(stream, temp_mr),
                              batch_byte_size_iter,
-                             batch_byte_size_iter + num_batches + 1,
+                             batch_byte_size_iter + offsets.size(),
                              offsets.begin());
 
-      CUDF_CUDA_TRY(cudaMemcpyAsync(h_offsets.data(),
-                                    offsets.data(),
-                                    sizeof(std::size_t) * offsets.size(),
-                                    cudaMemcpyDefault,
-                                    stream.value()));
-
       // the next part is working on the CPU, so we want to synchronize here
-      stream.synchronize();
-    }
+      return detail::make_host_vector_sync(offsets, stream);
+    }();
 
     std::vector<std::size_t> num_batches_per_iteration;
     std::vector<std::size_t> size_of_batches_per_iteration;
@@ -1675,7 +1673,7 @@ std::unique_ptr<chunk_iteration_state> compute_batches(int num_bufs,
         if (bytes == 0) { return {1, 0}; }
 
         // The number of batches we want to subdivide this buffer into
-        std::size_t const num_batches = std::max(
+        std::size_t const num_batches = cuda::std::max(
           std::size_t{1}, util::round_up_unsafe(bytes, desired_batch_size) / desired_batch_size);
 
         // NOTE: leaving batch size as a separate parameter for future tuning
@@ -1698,7 +1696,7 @@ void copy_data(int num_batches_to_copy,
                int starting_batch,
                uint8_t const** d_src_bufs,
                uint8_t** d_dst_bufs,
-               rmm::device_uvector<dst_buf_info>& d_dst_buf_info,
+               device_span<dst_buf_info> d_dst_buf_info,
                uint8_t* user_buffer,
                rmm::cuda_stream_view stream)
 {
@@ -1832,15 +1830,9 @@ struct contiguous_split_state {
                           keys + num_batches_total,
                           values,
                           thrust::make_discard_iterator(),
-                          dst_valid_count_output_iterator{d_orig_dst_buf_info});
-
-    CUDF_CUDA_TRY(cudaMemcpyAsync(h_orig_dst_buf_info,
-                                  d_orig_dst_buf_info,
-                                  partition_buf_size_and_dst_buf_info->dst_buf_info_size,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+                          dst_valid_count_output_iterator{d_orig_dst_buf_info.begin()});
 
-    stream.synchronize();
+    detail::cuda_memcpy<dst_buf_info>(h_orig_dst_buf_info, d_orig_dst_buf_info, stream);
 
     // not necessary for the non-chunked case, but it makes it so further calls to has_next
     // return false, just in case
@@ -1888,7 +1880,7 @@ struct contiguous_split_state {
     }
 
     auto& h_dst_buf_info  = partition_buf_size_and_dst_buf_info->h_dst_buf_info;
-    auto cur_dst_buf_info = h_dst_buf_info;
+    auto cur_dst_buf_info = h_dst_buf_info.data();
     detail::metadata_builder mb{input.num_columns()};
 
     populate_metadata(input.begin(), input.end(), cur_dst_buf_info, mb);
@@ -1926,7 +1918,7 @@ struct contiguous_split_state {
 
     // Second pass: uses `dst_buf_info` to break down the work into 1MB batches.
     chunk_iter_state = compute_batches(num_bufs,
-                                       partition_buf_size_and_dst_buf_info->d_dst_buf_info,
+                                       partition_buf_size_and_dst_buf_info->d_dst_buf_info.data(),
                                        partition_buf_size_and_dst_buf_info->h_buf_sizes,
                                        num_partitions,
                                        user_buffer_size,
@@ -1962,7 +1954,7 @@ struct contiguous_split_state {
     auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info;
     auto& h_dst_bufs     = src_and_dst_pointers->h_dst_bufs;
 
-    auto cur_dst_buf_info = h_dst_buf_info;
+    auto cur_dst_buf_info = h_dst_buf_info.data();
     detail::metadata_builder mb(input.num_columns());
 
     for (std::size_t idx = 0; idx < num_partitions; idx++) {
diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh
index e8b29a0e7a8..9c9a4c97bff 100644
--- a/cpp/src/groupby/hash/compute_aggregations.cuh
+++ b/cpp/src/groupby/hash/compute_aggregations.cuh
@@ -60,7 +60,7 @@ rmm::device_uvector<cudf::size_type> compute_aggregations(
   rmm::cuda_stream_view stream)
 {
   // flatten the aggs to a table that can be operated on by aggregate_row
-  auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
+  auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests, stream);
   auto const d_agg_kinds                   = cudf::detail::make_device_uvector_async(
     agg_kinds, stream, rmm::mr::get_current_device_resource());
 
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
index 6025686953e..d2830f7d905 100644
--- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
@@ -24,7 +24,7 @@ template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<global_
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   global_set_t& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
index 00db149c6d9..671ee2ea31f 100644
--- a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
@@ -25,6 +25,7 @@
 #include <cudf/groupby.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -44,7 +45,7 @@ rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   SetType& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
index 0777b9ffd93..437823a3fea 100644
--- a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
@@ -19,6 +19,7 @@
 #include <cudf/groupby.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -34,7 +35,7 @@ rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   SetType& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
index 209e2b7f20a..7cb3f8f190b 100644
--- a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
@@ -24,7 +24,7 @@ template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<nullabl
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   nullable_global_set_t& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,
diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu
index e1dbf2a3d9e..9648d942513 100644
--- a/cpp/src/groupby/hash/compute_groupby.cu
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -61,7 +61,7 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
     d_row_equal,
     probing_scheme_t{d_row_hash},
     cuco::thread_scope_device,
-    cuco::storage<GROUPBY_WINDOW_SIZE>{},
+    cuco::storage<GROUPBY_BUCKET_SIZE>{},
     cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
     stream.value()};
 
diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh
index d353830780f..f86a93109be 100644
--- a/cpp/src/groupby/hash/compute_mapping_indices.cuh
+++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh
@@ -106,15 +106,15 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows,
   __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];
 
   // Shared set initialization
-  __shared__ cuco::window<cudf::size_type, GROUPBY_WINDOW_SIZE> windows[window_extent.value()];
+  __shared__ cuco::bucket<cudf::size_type, GROUPBY_BUCKET_SIZE> buckets[bucket_extent.value()];
 
   auto raw_set = cuco::static_set_ref{
     cuco::empty_key<cudf::size_type>{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
     global_set.key_eq(),
     probing_scheme_t{global_set.hash_function()},
     cuco::thread_scope_block,
-    cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, decltype(window_extent)>{
-      window_extent, windows}};
+    cuco::bucket_storage_ref<cudf::size_type, GROUPBY_BUCKET_SIZE, decltype(bucket_extent)>{
+      bucket_extent, buckets}};
   auto shared_set = raw_set.rebind_operators(cuco::insert_and_find);
 
   auto const block = cooperative_groups::this_thread_block();
diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu
index bc32e306b3f..a835736235c 100644
--- a/cpp/src/groupby/hash/create_sparse_results_table.cu
+++ b/cpp/src/groupby/hash/create_sparse_results_table.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -48,7 +49,7 @@ void extract_populated_keys(SetType const& key_set,
 template <typename GlobalSetType>
 cudf::table create_sparse_results_table(cudf::table_view const& flattened_values,
                                         cudf::aggregation::Kind const* d_agg_kinds,
-                                        std::vector<cudf::aggregation::Kind> agg_kinds,
+                                        host_span<cudf::aggregation::Kind const> agg_kinds,
                                         bool direct_aggregations,
                                         GlobalSetType const& global_set,
                                         rmm::device_uvector<cudf::size_type>& populated_keys,
@@ -107,7 +108,7 @@ template void extract_populated_keys<nullable_global_set_t>(
 template cudf::table create_sparse_results_table<global_set_t>(
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   bool direct_aggregations,
   global_set_t const& global_set,
   rmm::device_uvector<cudf::size_type>& populated_keys,
@@ -116,7 +117,7 @@ template cudf::table create_sparse_results_table<global_set_t>(
 template cudf::table create_sparse_results_table<nullable_global_set_t>(
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   bool direct_aggregations,
   nullable_global_set_t const& global_set,
   rmm::device_uvector<cudf::size_type>& populated_keys,
diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp
index 8155ce852e0..4e2fa81bdb7 100644
--- a/cpp/src/groupby/hash/create_sparse_results_table.hpp
+++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp
@@ -20,12 +20,11 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 
-#include <vector>
-
 namespace cudf::groupby::detail::hash {
 /**
  * @brief Computes and returns a device vector containing all populated keys in
@@ -47,7 +46,7 @@ void extract_populated_keys(SetType const& key_set,
 template <typename GlobalSetType>
 cudf::table create_sparse_results_table(cudf::table_view const& flattened_values,
                                         cudf::aggregation::Kind const* d_agg_kinds,
-                                        std::vector<cudf::aggregation::Kind> agg_kinds,
+                                        host_span<cudf::aggregation::Kind const> agg_kinds,
                                         bool direct_aggregations,
                                         GlobalSetType const& global_set,
                                         rmm::device_uvector<cudf::size_type>& populated_keys,
diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp
index b2048a9fbb8..a533f7a6448 100644
--- a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp
+++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/types.hpp>
@@ -102,12 +103,15 @@ class groupby_simple_aggregations_collector final
 };
 
 // flatten aggs to filter in single pass aggs
-std::tuple<table_view, std::vector<aggregation::Kind>, std::vector<std::unique_ptr<aggregation>>>
-flatten_single_pass_aggs(host_span<aggregation_request const> requests)
+std::tuple<table_view,
+           cudf::detail::host_vector<aggregation::Kind>,
+           std::vector<std::unique_ptr<aggregation>>>
+flatten_single_pass_aggs(host_span<aggregation_request const> requests,
+                         rmm::cuda_stream_view stream)
 {
   std::vector<column_view> columns;
   std::vector<std::unique_ptr<aggregation>> aggs;
-  std::vector<aggregation::Kind> agg_kinds;
+  auto agg_kinds = cudf::detail::make_empty_host_vector<aggregation::Kind>(requests.size(), stream);
 
   for (auto const& request : requests) {
     auto const& agg_v = request.aggregations;
diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
index dfad51f27d4..e3c17ca972c 100644
--- a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
@@ -26,7 +26,10 @@
 namespace cudf::groupby::detail::hash {
 
 // flatten aggs to filter in single pass aggs
-std::tuple<table_view, std::vector<aggregation::Kind>, std::vector<std::unique_ptr<aggregation>>>
-flatten_single_pass_aggs(host_span<aggregation_request const> requests);
+std::tuple<table_view,
+           cudf::detail::host_vector<aggregation::Kind>,
+           std::vector<std::unique_ptr<aggregation>>>
+flatten_single_pass_aggs(host_span<aggregation_request const> requests,
+                         rmm::cuda_stream_view stream);
 
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu
index 37a61c1a22c..b71e20938d6 100644
--- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu
+++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu
@@ -170,7 +170,8 @@ void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::var_aggregation c
     cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream);
   auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream);
   mutable_table_view var_table_view{{var_result->mutable_view()}};
-  cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream);
+  cudf::detail::initialize_with_identity(
+    var_table_view, host_span<cudf::aggregation::Kind const>(&agg.kind, 1), stream);
 
   thrust::for_each_n(
     rmm::exec_policy_nosync(stream),
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
index f950e03e0fb..92925e11bac 100644
--- a/cpp/src/groupby/hash/helpers.cuh
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -27,7 +27,7 @@ namespace cudf::groupby::detail::hash {
 CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1;
 
 /// Number of slots per thread
-CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1;
+CUDF_HOST_DEVICE auto constexpr GROUPBY_BUCKET_SIZE = 1;
 
 /// Thread block size
 CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128;
@@ -48,9 +48,9 @@ using shmem_extent_t =
   cuco::extent<cudf::size_type,
                static_cast<cudf::size_type>(static_cast<double>(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>;
 
-/// Number of windows needed by each shared memory hash set
-CUDF_HOST_DEVICE auto constexpr window_extent =
-  cuco::make_window_extent<GROUPBY_CG_SIZE, GROUPBY_WINDOW_SIZE>(shmem_extent_t{});
+/// Number of buckets needed by each shared memory hash set
+CUDF_HOST_DEVICE auto constexpr bucket_extent =
+  cuco::make_bucket_extent<GROUPBY_CG_SIZE, GROUPBY_BUCKET_SIZE>(shmem_extent_t{});
 
 using row_hash_t =
   cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
@@ -75,7 +75,7 @@ using global_set_t = cuco::static_set<cudf::size_type,
                                       row_comparator_t,
                                       probing_scheme_t,
                                       cudf::detail::cuco_allocator<char>,
-                                      cuco::storage<GROUPBY_WINDOW_SIZE>>;
+                                      cuco::storage<GROUPBY_BUCKET_SIZE>>;
 
 using nullable_global_set_t = cuco::static_set<cudf::size_type,
                                                cuco::extent<int64_t>,
@@ -83,7 +83,7 @@ using nullable_global_set_t = cuco::static_set<cudf::size_type,
                                                nullable_row_comparator_t,
                                                probing_scheme_t,
                                                cudf::detail::cuco_allocator<char>,
-                                               cuco::storage<GROUPBY_WINDOW_SIZE>>;
+                                               cuco::storage<GROUPBY_BUCKET_SIZE>>;
 
 template <typename Op>
 using hash_set_ref_t = cuco::static_set_ref<
@@ -91,7 +91,7 @@ using hash_set_ref_t = cuco::static_set_ref<
   cuda::thread_scope_device,
   row_comparator_t,
   probing_scheme_t,
-  cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>,
+  cuco::bucket_storage_ref<cudf::size_type, GROUPBY_BUCKET_SIZE, cuco::bucket_extent<int64_t>>,
   Op>;
 
 template <typename Op>
@@ -100,6 +100,6 @@ using nullable_hash_set_ref_t = cuco::static_set_ref<
   cuda::thread_scope_device,
   nullable_row_comparator_t,
   probing_scheme_t,
-  cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>,
+  cuco::bucket_storage_ref<cudf::size_type, GROUPBY_BUCKET_SIZE, cuco::bucket_extent<int64_t>>,
   Op>;
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu
index 65bd5ac408f..583357d9090 100644
--- a/cpp/src/groupby/sort/group_rank_scan.cu
+++ b/cpp/src/groupby/sort/group_rank_scan.cu
@@ -29,6 +29,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/limits>
 #include <thrust/functional.h>
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/pair.h>
@@ -185,7 +186,7 @@ std::unique_ptr<column> max_rank_scan(column_view const& grouped_values,
     group_labels,
     group_offsets,
     [] __device__(bool unequal, auto row_index_in_group) {
-      return unequal ? row_index_in_group + 1 : std::numeric_limits<size_type>::max();
+      return unequal ? row_index_in_group + 1 : cuda::std::numeric_limits<size_type>::max();
     },
     DeviceMin{},
     has_nested_nulls(table_view{{grouped_values}}),
diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index 86835ea8a67..5082ad01327 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -107,7 +107,10 @@ struct group_scan_functor<K, T, std::enable_if_t<is_group_scan_supported<K, T>()
     if (values.is_empty()) { return result; }
 
     auto result_table = mutable_table_view({*result});
-    cudf::detail::initialize_with_identity(result_table, {K}, stream);
+    // Need an address of the aggregation kind to pass to the span
+    auto const kind = K;
+    cudf::detail::initialize_with_identity(
+      result_table, host_span<aggregation::Kind const>(&kind, 1), stream);
 
     auto result_view = mutable_column_device_view::create(result->mutable_view(), stream);
     auto values_view = column_device_view::create(values, stream);
diff --git a/cpp/src/hash/murmurhash3_x64_128.cu b/cpp/src/hash/murmurhash3_x64_128.cu
index 43df7f325ac..ccdd097fa9c 100644
--- a/cpp/src/hash/murmurhash3_x64_128.cu
+++ b/cpp/src/hash/murmurhash3_x64_128.cu
@@ -25,6 +25,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/std/array>
+#include <cuda/std/limits>
 #include <thrust/for_each.h>
 
 namespace cudf {
@@ -83,7 +84,8 @@ class murmur_device_row_hasher {
                                           hash_value_type const seed) const noexcept
     {
       if (check_nulls && col.is_null(row_index)) {
-        return {std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::max()};
+        return {cuda::std::numeric_limits<uint64_t>::max(),
+                cuda::std::numeric_limits<uint64_t>::max()};
       }
       auto const hasher = MurmurHash3_x64_128<T>{seed[0]};
       return hasher(col.element<T>(row_index));
diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh
index eb002cf9c6f..52f31667ff0 100644
--- a/cpp/src/hash/sha_hash.cuh
+++ b/cpp/src/hash/sha_hash.cuh
@@ -30,6 +30,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/limits>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
 #include <thrust/for_each.h>
@@ -37,7 +38,6 @@
 #include <thrust/iterator/counting_iterator.h>
 
 #include <algorithm>
-#include <limits>
 #include <memory>
 #include <type_traits>
 #include <utility>
@@ -252,7 +252,7 @@ struct HasherDispatcher {
   {
     Element const& key = input_col.element<Element>(row_index);
     if (isnan(key)) {
-      Element nan = std::numeric_limits<Element>::quiet_NaN();
+      Element nan = cuda::std::numeric_limits<Element>::quiet_NaN();
       hasher->process_fixed_width(nan);
     } else if (key == Element{0.0}) {
       hasher->process_fixed_width(Element{0.0});
diff --git a/cpp/src/hash/xxhash_64.cu b/cpp/src/hash/xxhash_64.cu
index bdbe13b1ffb..5e74148ceaf 100644
--- a/cpp/src/hash/xxhash_64.cu
+++ b/cpp/src/hash/xxhash_64.cu
@@ -25,6 +25,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/limits>
 #include <thrust/tabulate.h>
 
 namespace cudf {
@@ -72,7 +73,7 @@ class device_row_hasher {
                                           hash_value_type const _seed) const noexcept
     {
       if (_check_nulls && col.is_null(row_index)) {
-        return std::numeric_limits<hash_value_type>::max();
+        return cuda::std::numeric_limits<hash_value_type>::max();
       }
       auto const hasher = XXHash_64<T>{_seed};
       return hasher(col.element<T>(row_index));
diff --git a/cpp/src/io/avro/avro_common.hpp b/cpp/src/io/avro/avro_common.hpp
index 9bf66369d6a..4c05d78292b 100644
--- a/cpp/src/io/avro/avro_common.hpp
+++ b/cpp/src/io/avro/avro_common.hpp
@@ -142,7 +142,7 @@ enum logicaltype_kind_e {
  *
  * @return true if the logical type is supported, false otherwise.
  */
-inline constexpr bool is_supported_logical_type(logicaltype_kind_e logical_kind)
+CUDF_HOST_DEVICE inline constexpr bool is_supported_logical_type(logicaltype_kind_e logical_kind)
 {
   switch (logical_kind) {
     case logicaltype_date: return true;
diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu
index b48e49ffd78..9b01272ac70 100644
--- a/cpp/src/io/comp/unsnap.cu
+++ b/cpp/src/io/comp/unsnap.cu
@@ -65,7 +65,8 @@ struct unsnap_queue_s {
  * @brief snappy decompression state
  */
 struct unsnap_state_s {
-  constexpr unsnap_state_s() noexcept {}  // required to compile on ctk-12.2 + aarch64
+  CUDF_HOST_DEVICE constexpr unsnap_state_s() noexcept {
+  }  // required to compile on ctk-12.2 + aarch64
 
   uint8_t const* base{};           ///< base ptr of compressed stream
   uint8_t const* end{};            ///< end of compressed stream
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 7f0b5e07b09..e05353ee822 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -21,13 +21,13 @@
 
 #include "csv_common.hpp"
 #include "csv_gpu.hpp"
-#include "cudf/detail/utilities/cuda_memcpy.hpp"
 #include "io/comp/io_uncomp.hpp"
 #include "io/utilities/column_buffer.hpp"
 #include "io/utilities/hostdevice_vector.hpp"
 #include "io/utilities/parsing_utils.cuh"
 
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/io/csv.hpp>
diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh
index 0e70984b39c..2a75c034dc8 100644
--- a/cpp/src/io/fst/agent_dfa.cuh
+++ b/cpp/src/io/fst/agent_dfa.cuh
@@ -18,6 +18,7 @@
 #include "in_reg_array.cuh"
 
 #include <cub/cub.cuh>
+#include <cuda/std/array>
 #include <cuda/std/type_traits>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/discard_iterator.h>
@@ -342,8 +343,9 @@ class WriteCoalescingCallbackWrapper {
 template <int32_t NUM_INSTANCES, typename TransitionTableT>
 class StateVectorTransitionOp {
  public:
-  __device__ __forceinline__ StateVectorTransitionOp(
-    TransitionTableT const& transition_table, std::array<StateIndexT, NUM_INSTANCES>& state_vector)
+  __device__ __forceinline__
+  StateVectorTransitionOp(TransitionTableT const& transition_table,
+                          cuda::std::array<StateIndexT, NUM_INSTANCES>& state_vector)
     : transition_table(transition_table), state_vector(state_vector)
   {
   }
@@ -360,7 +362,7 @@ class StateVectorTransitionOp {
   }
 
  public:
-  std::array<StateIndexT, NUM_INSTANCES>& state_vector;
+  cuda::std::array<StateIndexT, NUM_INSTANCES>& state_vector;
   TransitionTableT const& transition_table;
 };
 
@@ -620,7 +622,7 @@ struct AgentDFA {
     SymbolItT d_chars,
     OffsetT const block_offset,
     OffsetT const num_total_symbols,
-    std::array<StateIndexT, NUM_STATES>& state_vector)
+    cuda::std::array<StateIndexT, NUM_STATES>& state_vector)
   {
     using StateVectorTransitionOpT = StateVectorTransitionOp<NUM_STATES, TransitionTableT>;
 
@@ -796,10 +798,10 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
   // Stage 1: Compute the state-transition vector
   if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) {
     // Keeping track of the state for each of the <NUM_STATES> state machines
-    std::array<StateIndexT, NUM_STATES> state_vector;
+    cuda::std::array<StateIndexT, NUM_STATES> state_vector;
 
     // Initialize the seed state transition vector with the identity vector
-    thrust::sequence(thrust::seq, std::begin(state_vector), std::end(state_vector));
+    thrust::sequence(thrust::seq, cuda::std::begin(state_vector), cuda::std::end(state_vector));
 
     // Compute the state transition vector
     agent_dfa.GetThreadStateTransitionVector<NUM_STATES>(symbol_matcher,
diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 0cb5c382631..7facc6497ed 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -180,9 +180,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
   for (size_type i = 0; i < dict.map_slots.size(); i += block_size) {
     if (t + i < dict.map_slots.size()) {
-      auto window = dict.map_slots.begin() + t + i;
-      // Collect all slots from each window.
-      for (auto& slot : *window) {
+      auto bucket = dict.map_slots.begin() + t + i;
+      // Collect all slots from each bucket.
+      for (auto& slot : *bucket) {
         auto const key = slot.first;
         if (key != KEY_SENTINEL) {
           auto loc       = counter.fetch_add(1, memory_order_relaxed);
diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp
index 0949fafe9a4..654ee1e012c 100644
--- a/cpp/src/io/orc/orc_gpu.hpp
+++ b/cpp/src/io/orc/orc_gpu.hpp
@@ -47,16 +47,16 @@ using slot_type   = cuco::pair<key_type, mapped_type>;
 auto constexpr map_cg_size =
   1;  ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset.
       ///< Note: Adjust insert and find loops to use `cg::tile<map_cg_size>` if increasing this.
-auto constexpr window_size =
+auto constexpr bucket_size =
   1;  ///< Number of concurrent slots (set for best performance) handled by each thread.
 auto constexpr occupancy_factor = 1.43f;  ///< cuCollections suggests using a hash map of size
                                           ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
-using storage_type     = cuco::aow_storage<slot_type,
-                                       window_size,
-                                       cuco::extent<std::size_t>,
-                                       cudf::detail::cuco_allocator<char>>;
+using storage_type     = cuco::bucket_storage<slot_type,
+                                          bucket_size,
+                                          cuco::extent<std::size_t>,
+                                          cudf::detail::cuco_allocator<char>>;
 using storage_ref_type = typename storage_type::ref_type;
-using window_type      = typename storage_type::window_type;
+using bucket_type      = typename storage_type::bucket_type;
 using slot_type        = cuco::pair<key_type, mapped_type>;
 
 auto constexpr KEY_SENTINEL   = size_type{-1};
@@ -193,7 +193,7 @@ struct StripeStream {
  */
 struct stripe_dictionary {
   // input
-  device_span<window_type> map_slots;  // hash map (windows) storage
+  device_span<bucket_type> map_slots;  // hash map (buckets) storage
   uint32_t column_idx      = 0;        // column index
   size_type start_row      = 0;        // first row in the stripe
   size_type start_rowgroup = 0;        // first rowgroup in the stripe
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 8e532b01788..6b9c19368dc 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -19,7 +19,6 @@
  * @brief cuDF-IO ORC writer class implementation
  */
 
-#include "cudf/detail/utilities/cuda_memcpy.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/orc/orc_gpu.hpp"
 #include "io/statistics/column_statistics.cuh"
@@ -30,6 +29,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/batched_memcpy.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/logger.hpp>
diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index b85ebf2fa1a..b5f9b894c46 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -210,7 +210,7 @@ struct map_find_fn {
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  populate_chunk_hash_maps_kernel(device_span<window_type> const map_storage,
+  populate_chunk_hash_maps_kernel(device_span<bucket_type> const map_storage,
                                   cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto const col_idx = blockIdx.y;
@@ -239,7 +239,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  collect_map_entries_kernel(device_span<window_type> const map_storage,
+  collect_map_entries_kernel(device_span<bucket_type> const map_storage,
                              device_span<EncColumnChunk> chunks)
 {
   auto& chunk = chunks[blockIdx.x];
@@ -251,11 +251,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   if (t == 0) { new (&counter) cuda::atomic<size_type, SCOPE>{0}; }
   __syncthreads();
 
-  // Iterate over all windows in the map.
+  // Iterate over all buckets in the map.
   for (; t < chunk.dict_map_size; t += block_size) {
-    auto window = map_storage.data() + chunk.dict_map_offset + t;
-    // Collect all slots from each window.
-    for (auto& slot : *window) {
+    auto bucket = map_storage.data() + chunk.dict_map_offset + t;
+    // Collect all slots from each bucket.
+    for (auto& slot : *bucket) {
       auto const key = slot.first;
       if (key != KEY_SENTINEL) {
         auto const loc = counter.fetch_add(1, memory_order_relaxed);
@@ -272,7 +272,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  get_dictionary_indices_kernel(device_span<window_type> const map_storage,
+  get_dictionary_indices_kernel(device_span<bucket_type> const map_storage,
                                 cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto const col_idx = blockIdx.y;
@@ -302,7 +302,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
                   s_ck_start_val_idx);
 }
 
-void populate_chunk_hash_maps(device_span<window_type> const map_storage,
+void populate_chunk_hash_maps(device_span<bucket_type> const map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream)
 {
@@ -311,7 +311,7 @@ void populate_chunk_hash_maps(device_span<window_type> const map_storage,
     <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_storage, frags);
 }
 
-void collect_map_entries(device_span<window_type> const map_storage,
+void collect_map_entries(device_span<bucket_type> const map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream)
 {
@@ -320,7 +320,7 @@ void collect_map_entries(device_span<window_type> const map_storage,
     <<<chunks.size(), block_size, 0, stream.value()>>>(map_storage, chunks);
 }
 
-void get_dictionary_indices(device_span<window_type> const map_storage,
+void get_dictionary_indices(device_span<bucket_type> const map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream)
 {
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 7c09764da2d..800875f7448 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -34,7 +34,7 @@ using slot_type   = cuco::pair<key_type, mapped_type>;
 auto constexpr map_cg_size =
   1;  ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset.
       ///< Note: Adjust insert and find loops to use `cg::tile<map_cg_size>` if increasing this.
-auto constexpr window_size =
+auto constexpr bucket_size =
   1;  ///< Number of concurrent slots (set for best performance) handled by each thread.
 auto constexpr occupancy_factor = 1.43f;  ///< cuCollections suggests using a hash map of size
                                           ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
@@ -43,12 +43,12 @@ auto constexpr KEY_SENTINEL   = key_type{-1};
 auto constexpr VALUE_SENTINEL = mapped_type{-1};
 auto constexpr SCOPE          = cuda::thread_scope_block;
 
-using storage_type     = cuco::aow_storage<slot_type,
-                                       window_size,
-                                       cuco::extent<std::size_t>,
-                                       cudf::detail::cuco_allocator<char>>;
+using storage_type     = cuco::bucket_storage<slot_type,
+                                          bucket_size,
+                                          cuco::extent<std::size_t>,
+                                          cudf::detail::cuco_allocator<char>>;
 using storage_ref_type = typename storage_type::ref_type;
-using window_type      = typename storage_type::window_type;
+using bucket_type      = typename storage_type::bucket_type;
 
 /**
  * @brief Return the byte length of parquet dtypes that are physically represented by INT32
@@ -100,7 +100,7 @@ inline size_type __device__ row_to_value_idx(size_type idx,
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void populate_chunk_hash_maps(device_span<window_type> const map_storage,
+void populate_chunk_hash_maps(device_span<bucket_type> const map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream);
 
@@ -111,7 +111,7 @@ void populate_chunk_hash_maps(device_span<window_type> const map_storage,
  * @param chunks Flat span of chunks to compact hash maps for
  * @param stream CUDA stream to use
  */
-void collect_map_entries(device_span<window_type> const map_storage,
+void collect_map_entries(device_span<bucket_type> const map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream);
 
@@ -128,7 +128,7 @@ void collect_map_entries(device_span<window_type> const map_storage,
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void get_dictionary_indices(device_span<window_type> const map_storage,
+void get_dictionary_indices(device_span<bucket_type> const map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream);
 
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index bcdae4cbd3b..d9ea10a695e 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -550,7 +550,7 @@ void decode_page_headers(pass_intermediate_data& pass,
 {
   CUDF_FUNC_RANGE();
 
-  auto iter = thrust::make_counting_iterator(0);
+  auto iter = thrust::counting_iterator<size_t>(0);
   rmm::device_uvector<size_t> chunk_page_counts(pass.chunks.size() + 1, stream);
   thrust::transform_exclusive_scan(
     rmm::exec_policy_nosync(stream),
@@ -562,7 +562,7 @@ void decode_page_headers(pass_intermediate_data& pass,
         return static_cast<size_t>(
           i >= num_chunks ? 0 : chunks[i].num_data_pages + chunks[i].num_dict_pages);
       }),
-    0,
+    size_t{0},
     thrust::plus<size_t>{});
   rmm::device_uvector<chunk_page_info> d_chunk_page_info(pass.chunks.size(), stream);
   thrust::for_each(rmm::exec_policy_nosync(stream),
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 188e6a8c0d8..6db92462498 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1302,7 +1302,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     } else {
       chunk.use_dictionary = true;
       chunk.dict_map_size =
-        static_cast<cudf::size_type>(cuco::make_window_extent<map_cg_size, window_size>(
+        static_cast<cudf::size_type>(cuco::make_bucket_extent<map_cg_size, bucket_size>(
           static_cast<cudf::size_type>(occupancy_factor * chunk.num_values)));
       chunk.dict_map_offset = total_map_storage_size;
       total_map_storage_size += chunk.dict_map_size;
@@ -1317,7 +1317,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     total_map_storage_size,
     cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream}};
   // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer.
-  device_span<window_type> const map_storage_data{map_storage.data(), total_map_storage_size};
+  device_span<bucket_type> const map_storage_data{map_storage.data(), total_map_storage_size};
 
   // Synchronize
   chunks.host_to_device_async(stream);
diff --git a/cpp/src/io/statistics/byte_array_view.cuh b/cpp/src/io/statistics/byte_array_view.cuh
index 58698c6a19d..50d823ade88 100644
--- a/cpp/src/io/statistics/byte_array_view.cuh
+++ b/cpp/src/io/statistics/byte_array_view.cuh
@@ -18,6 +18,8 @@
 
 #include <cudf/utilities/span.hpp>
 
+#include <cuda/std/limits>
+
 namespace cudf::io::statistics {
 
 /**
@@ -30,15 +32,19 @@ class byte_array_view {
  public:
   using element_type = std::byte const;  ///< The type of the elements in the byte array
 
-  constexpr byte_array_view() noexcept {}
+  CUDF_HOST_DEVICE constexpr byte_array_view() noexcept {}
   /**
    * @brief Constructs a byte_array_view from a pointer and a size.
    *
    * @param data Pointer to the first element in the byte array.
    * @param size The number of elements in the byte array.
    */
-  constexpr byte_array_view(element_type* data, std::size_t size) : _data(data, size) {}
-  constexpr byte_array_view(byte_array_view const&) noexcept = default;  ///< Copy constructor
+  CUDF_HOST_DEVICE constexpr byte_array_view(element_type* data, std::size_t size)
+    : _data(data, size)
+  {
+  }
+  CUDF_HOST_DEVICE constexpr byte_array_view(byte_array_view const&) noexcept =
+    default;  ///< Copy constructor
   /**
    * @brief Copy assignment operator.
    *
@@ -55,14 +61,20 @@ class byte_array_view {
    * @param idx The index of the element to access.
    * @return A reference to the idx-th element of the byte_array_view, i.e., `_data.data()[idx]`.
    */
-  [[nodiscard]] constexpr element_type& operator[](std::size_t idx) const { return _data[idx]; }
+  [[nodiscard]] __device__ constexpr element_type& operator[](std::size_t idx) const
+  {
+    return _data[idx];
+  }
 
   /**
    * @brief Returns a pointer to the beginning of the byte_array_view.
    *
    * @return A pointer to the first element of the byte_array_view.
    */
-  [[nodiscard]] constexpr element_type* data() const noexcept { return _data.data(); }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr element_type* data() const noexcept
+  {
+    return _data.data();
+  }
 
   /**
    * @brief Returns the number of elements in the byte_array_view.
@@ -76,7 +88,10 @@ class byte_array_view {
    *
    * @return The size of the byte_array_view in bytes
    */
-  [[nodiscard]] constexpr std::size_t size_bytes() const noexcept { return _data.size_bytes(); }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr std::size_t size_bytes() const noexcept
+  {
+    return _data.size_bytes();
+  }
 
   /**
    * @brief Comparing target byte_array_view with this byte_array_view. Each byte in the array is
@@ -98,9 +113,9 @@ class byte_array_view {
     auto const* ptr2 = rhs.data();
     if ((ptr1 == ptr2) && (len1 == len2)) { return 0; }
     // if I am max, I am greater than the argument
-    if (ptr1 == nullptr && len1 == std::numeric_limits<std::size_t>::max()) { return 1; }
+    if (ptr1 == nullptr && len1 == cuda::std::numeric_limits<std::size_t>::max()) { return 1; }
     // if the argument is max, it is greater than me
-    if (ptr2 == nullptr && len2 == std::numeric_limits<std::size_t>::max()) { return -1; }
+    if (ptr2 == nullptr && len2 == cuda::std::numeric_limits<std::size_t>::max()) { return -1; }
     std::size_t idx = 0;
     for (; (idx < len1) && (idx < len2); ++idx) {
       if (ptr1[idx] != ptr2[idx]) {
@@ -170,7 +185,7 @@ class byte_array_view {
    */
   [[nodiscard]] __device__ inline static byte_array_view max()
   {
-    return {nullptr, std::numeric_limits<std::size_t>::max()};
+    return {nullptr, cuda::std::numeric_limits<std::size_t>::max()};
   }
 
  private:
diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh
index 01db781c766..dc023e69423 100644
--- a/cpp/src/io/statistics/typed_statistics_chunk.cuh
+++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh
@@ -30,6 +30,7 @@
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
+#include <cuda/std/limits>
 #include <math_constants.h>
 #include <thrust/extrema.h>
 
@@ -246,9 +247,9 @@ get_untyped_chunk(typed_statistics_chunk<T, include_aggregate> const& chunk)
     // invalidate the sum if overflow or underflow is possible
     if constexpr (std::is_floating_point_v<E> or std::is_integral_v<E>) {
       if (!chunk.has_minmax) { return true; }
-      return std::numeric_limits<E>::max() / chunk.non_nulls >=
+      return cuda::std::numeric_limits<E>::max() / chunk.non_nulls >=
                static_cast<E>(chunk.maximum_value) and
-             std::numeric_limits<E>::lowest() / chunk.non_nulls <=
+             cuda::std::numeric_limits<E>::lowest() / chunk.non_nulls <=
                static_cast<E>(chunk.minimum_value);
     }
     return true;
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 734067582f7..75e45a68842 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -30,12 +30,11 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <cuda/std/optional>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/mismatch.h>
 
-#include <optional>
-
 using cudf::device_span;
 
 namespace cudf {
@@ -183,7 +182,7 @@ constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' -
  * @param end Pointer to the first element after the string
  * @return true if string is valid infinity, else false.
  */
-constexpr bool is_infinity(char const* begin, char const* end)
+CUDF_HOST_DEVICE constexpr bool is_infinity(char const* begin, char const* end)
 {
   if (*begin == '-' || *begin == '+') begin++;
   char const* cinf = "infinity";
@@ -208,9 +207,9 @@ constexpr bool is_infinity(char const* begin, char const* end)
  * @return The parsed and converted value
  */
 template <typename T, int base = 10>
-__host__ __device__ std::optional<T> parse_numeric(char const* begin,
-                                                   char const* end,
-                                                   parse_options_view const& opts)
+__host__ __device__ cuda::std::optional<T> parse_numeric(char const* begin,
+                                                         char const* end,
+                                                         parse_options_view const& opts)
 {
   T value{};
   bool all_digits_valid = true;
@@ -267,7 +266,7 @@ __host__ __device__ std::optional<T> parse_numeric(char const* begin,
       if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
     }
   }
-  if (!all_digits_valid) { return std::optional<T>{}; }
+  if (!all_digits_valid) { return cuda::std::optional<T>{}; }
 
   return value * sign;
 }
@@ -524,7 +523,7 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex = false)
   {
-    auto const value = [as_hex, &opts, begin, end]() -> std::optional<T> {
+    auto const value = [as_hex, &opts, begin, end]() -> cuda::std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
@@ -573,7 +572,7 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    auto const value = [&opts, begin, end]() -> std::optional<T> {
+    auto const value = [&opts, begin, end]() -> cuda::std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) {
@@ -602,7 +601,7 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    auto const value = [&opts, begin, end]() -> std::optional<T> {
+    auto const value = [&opts, begin, end]() -> cuda::std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) {
diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh
index caea8dabb88..c0efc5b6f20 100644
--- a/cpp/src/io/utilities/trie.cuh
+++ b/cpp/src/io/utilities/trie.cuh
@@ -82,8 +82,8 @@ CUDF_EXPORT trie create_serialized_trie(std::vector<std::string> const& keys,
  *
  * @return Boolean value; true if string is found, false otherwise
  */
-__host__ __device__ inline bool serialized_trie_contains(device_span<serial_trie_node const> trie,
-                                                         device_span<char const> key)
+CUDF_HOST_DEVICE inline bool serialized_trie_contains(device_span<serial_trie_node const> trie,
+                                                      device_span<char const> key)
 {
   if (trie.empty()) { return false; }
   if (key.empty()) { return trie.front().is_leaf; }
diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp
index 47864c25c5f..a60cbbb8db2 100644
--- a/cpp/src/quantiles/quantiles_util.hpp
+++ b/cpp/src/quantiles/quantiles_util.hpp
@@ -20,7 +20,8 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <cmath>
+#include <cuda/std/cmath>
+#include <cuda/std/functional>
 
 namespace cudf {
 namespace detail {
@@ -96,12 +97,12 @@ struct quantile_index {
 
   CUDF_HOST_DEVICE inline quantile_index(size_type count, double quantile)
   {
-    quantile = std::min(std::max(quantile, 0.0), 1.0);
+    quantile = cuda::std::min(cuda::std::max(quantile, 0.0), 1.0);
 
     double val = quantile * (count - 1);
     lower      = std::floor(val);
-    higher     = static_cast<size_type>(std::ceil(val));
-    nearest    = static_cast<size_type>(std::nearbyint(val));
+    higher     = static_cast<size_type>(cuda::std::ceil(val));
+    nearest    = static_cast<size_type>(cuda::std::nearbyint(val));
     fraction   = val - lower;
   }
 };
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 98fd9f679c8..21d8c95e199 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -218,9 +218,8 @@ struct minmax_functor {
     auto dev_result = reduce<cudf::string_view>(col, stream);
     // copy the minmax_pair to the host; does not copy the strings
     using OutputType = minmax_pair<cudf::string_view>;
-    OutputType host_result;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDefault, stream.value()));
+
+    auto const host_result = dev_result.value(stream);
     // strings are copied to create the scalars here
     return {std::make_unique<string_scalar>(host_result.min_val, true, stream, mr),
             std::make_unique<string_scalar>(host_result.max_val, true, stream, mr)};
@@ -236,10 +235,8 @@ struct minmax_functor {
     // compute minimum and maximum values
     auto dev_result = reduce<T>(col, stream);
     // copy the minmax_pair to the host to call get_element
-    using OutputType = minmax_pair<T>;
-    OutputType host_result;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDefault, stream.value()));
+    using OutputType       = minmax_pair<T>;
+    OutputType host_result = dev_result.value(stream);
     // get the keys for those indexes
     auto const keys = dictionary_column_view(col).keys();
     return {detail::get_element(keys, static_cast<size_type>(host_result.min_val), stream, mr),
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 4ec2174a96f..4b0b08fe251 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -114,11 +114,10 @@ string_scalar::operator std::string() const { return this->to_string(cudf::get_d
 
 std::string string_scalar::to_string(rmm::cuda_stream_view stream) const
 {
-  std::string result;
-  result.resize(_data.size());
-  CUDF_CUDA_TRY(
-    cudaMemcpyAsync(&result[0], _data.data(), _data.size(), cudaMemcpyDefault, stream.value()));
-  stream.synchronize();
+  std::string result(size(), '\0');
+  detail::cuda_memcpy(host_span<char>{result.data(), result.size()},
+                      device_span<char const>{data(), _data.size()},
+                      stream);
   return result;
 }
 
diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh
index 2df404048f7..d22fb04696c 100644
--- a/cpp/src/strings/regex/regex.cuh
+++ b/cpp/src/strings/regex/regex.cuh
@@ -186,7 +186,7 @@ class reprog_device {
    *            Specify -1 to match any virtual positions past the end of the string.
    * @return If match found, returns character positions of the matches.
    */
-  __device__ [[nodiscard]] inline match_result find(int32_t const thread_idx,
+  [[nodiscard]] __device__ inline match_result find(int32_t const thread_idx,
                                                     string_view const d_str,
                                                     string_view::const_iterator begin,
                                                     cudf::size_type end = -1) const;
@@ -205,7 +205,7 @@ class reprog_device {
    * @param group_id The specific group to return its matching position values.
    * @return If valid, returns the character position of the matched group in the given string,
    */
-  __device__ [[nodiscard]] inline match_result extract(int32_t const thread_idx,
+  [[nodiscard]] __device__ inline match_result extract(int32_t const thread_idx,
                                                        string_view const d_str,
                                                        string_view::const_iterator begin,
                                                        cudf::size_type end,
@@ -225,17 +225,17 @@ class reprog_device {
   /**
    * @brief Returns the regex instruction object for a given id.
    */
-  __device__ [[nodiscard]] inline reinst get_inst(int32_t id) const;
+  [[nodiscard]] __device__ inline reinst get_inst(int32_t id) const;
 
   /**
    * @brief Returns the regex class object for a given id.
    */
-  __device__ [[nodiscard]] inline reclass_device get_class(int32_t id) const;
+  [[nodiscard]] __device__ inline reclass_device get_class(int32_t id) const;
 
   /**
    * @brief Executes the regex pattern on the given string.
    */
-  __device__ [[nodiscard]] inline match_result regexec(string_view const d_str,
+  [[nodiscard]] __device__ inline match_result regexec(string_view const d_str,
                                                        reljunk jnk,
                                                        string_view::const_iterator begin,
                                                        cudf::size_type end,
@@ -244,7 +244,7 @@ class reprog_device {
   /**
    * @brief Utility wrapper to setup state memory structures for calling regexec
    */
-  __device__ [[nodiscard]] inline match_result call_regexec(
+  [[nodiscard]] __device__ inline match_result call_regexec(
     int32_t const thread_idx,
     string_view const d_str,
     string_view::const_iterator begin,
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index e34a1e12015..906f09e4d82 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -81,11 +81,11 @@ struct alignas(8) relist {
     return true;
   }
 
-  __device__ [[nodiscard]] __forceinline__ restate get_state(int16_t idx) const
+  [[nodiscard]] __device__ __forceinline__ restate get_state(int16_t idx) const
   {
     return restate{ranges[idx * stride], inst_ids[idx * stride]};
   }
-  __device__ [[nodiscard]] __forceinline__ int16_t get_size() const { return size; }
+  [[nodiscard]] __device__ __forceinline__ int16_t get_size() const { return size; }
 
  private:
   int16_t size{};
@@ -101,7 +101,7 @@ struct alignas(8) relist {
     mask[pos >> 3] |= uc;
   }
 
-  __device__ [[nodiscard]] __forceinline__ bool readMask(int32_t pos) const
+  [[nodiscard]] __device__ __forceinline__ bool readMask(int32_t pos) const
   {
     u_char const uc = mask[pos >> 3];
     return static_cast<bool>((uc >> (pos & 7)) & 1);
diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp
index 3d11b641b3f..902e13fe75e 100644
--- a/cpp/src/strings/regex/regexec.cpp
+++ b/cpp/src/strings/regex/regexec.cpp
@@ -17,7 +17,9 @@
 #include "strings/regex/regcomp.h"
 #include "strings/regex/regex.cuh"
 
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -66,10 +68,11 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
                        cudf::util::round_up_safe(classes_size, sizeof(char32_t));
 
   // allocate memory to store all the prog data in a flat contiguous buffer
-  std::vector<u_char> h_buffer(memsize);                        // copy everything into here;
-  auto h_ptr    = h_buffer.data();                              // this is our running host ptr;
-  auto d_buffer = new rmm::device_buffer(memsize, stream);      // output device memory;
-  auto d_ptr    = reinterpret_cast<u_char*>(d_buffer->data());  // running device pointer
+  auto h_buffer =
+    cudf::detail::make_host_vector<u_char>(memsize, stream);  // copy everything into here;
+  auto h_ptr    = h_buffer.data();                            // this is our running host ptr;
+  auto d_buffer = new rmm::device_uvector<u_char>(memsize, stream);  // output device memory;
+  auto d_ptr    = d_buffer->data();                                  // running device pointer
 
   // create our device object; this is managed separately and returned to the caller
   auto* d_prog = new reprog_device(h_prog);
@@ -113,8 +116,7 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
   d_prog->_prog_size = memsize + sizeof(reprog_device);
 
   // copy flat prog to device memory
-  CUDF_CUDA_TRY(
-    cudaMemcpyAsync(d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyDefault, stream.value()));
+  cudf::detail::cuda_memcpy_async<u_char>(*d_buffer, h_buffer, stream);
 
   // build deleter to cleanup device memory
   auto deleter = [d_buffer](reprog_device* t) {
diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
index 94bc81ec933..4ed66622508 100644
--- a/cpp/src/strings/search/find.cu
+++ b/cpp/src/strings/search/find.cu
@@ -34,6 +34,7 @@
 
 #include <cooperative_groups.h>
 #include <cuda/atomic>
+#include <cuda/std/utility>
 #include <thrust/binary_search.h>
 #include <thrust/fill.h>
 #include <thrust/for_each.h>
@@ -142,7 +143,7 @@ CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings,
     if (stop < 0) { return d_str.size_bytes(); }
     if (stop <= start) { return begin; }
     // we count from `begin` instead of recounting from the beginning of the string
-    return begin + std::get<0>(bytes_to_character_position(
+    return begin + cuda::std::get<0>(bytes_to_character_position(
                      string_view(d_str.data() + begin, d_str.size_bytes() - begin), stop - start));
   }();
 
diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index 4c39fc96397..a74b19aae28 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -35,6 +35,7 @@
 
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
+#include <cuda/std/utility>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -140,14 +141,16 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings,
     auto first_byte = start_counts.second;
     if (start_counts.first < start) {
       auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte);
-      first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first));
+      first_byte +=
+        cuda::std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first));
     }
 
     stop           = min(stop, char_count);
     auto last_byte = stop_counts.second;
     if (stop_counts.first < stop) {
       auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte);
-      last_byte += std::get<0>(bytes_to_character_position(sub_str, stop - stop_counts.first));
+      last_byte +=
+        cuda::std::get<0>(bytes_to_character_position(sub_str, stop - stop_counts.first));
     }
 
     d_output[str_idx] = (first_byte < last_byte)
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index b13ad0a7de8..ee51a426eac 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -19,6 +19,8 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -198,8 +200,8 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::getline(hash_file, line);
   result.num_bins = str_to_uint32(line, line_no++);
 
-  std::vector<uint64_t> bin_coefficients(result.num_bins);
-  std::vector<uint16_t> bin_offsets(result.num_bins);
+  auto bin_coefficients = cudf::detail::make_host_vector<uint64_t>(result.num_bins, stream);
+  auto bin_offsets      = cudf::detail::make_host_vector<uint16_t>(result.num_bins, stream);
 
   for (int i = 0; i < result.num_bins; ++i) {
     std::getline(hash_file, line);
@@ -216,7 +218,7 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
 
   std::getline(hash_file, line);
   uint64_t hash_table_length = str_to_uint64(line, line_no++);
-  std::vector<uint64_t> table(hash_table_length);
+  auto table                 = cudf::detail::make_host_vector<uint64_t>(hash_table_length, stream);
 
   std::generate(table.begin(), table.end(), [&hash_file, &line_no]() {
     std::string line;
@@ -239,33 +241,32 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
                                            cudf::mask_state::UNALLOCATED,
                                            stream,
                                            mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.table->mutable_view().data<uint64_t>(),
-                                table.data(),
-                                table.size() * sizeof(uint64_t),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  cudf::detail::cuda_memcpy_async<uint64_t>(
+    cudf::device_span<uint64_t>(result.table->mutable_view().data<uint64_t>(), table.size()),
+    table,
+    stream);
 
   result.bin_coefficients = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64},
                                                       bin_coefficients.size(),
                                                       cudf::mask_state::UNALLOCATED,
                                                       stream,
                                                       mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_coefficients->mutable_view().data<uint64_t>(),
-                                bin_coefficients.data(),
-                                bin_coefficients.size() * sizeof(uint64_t),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  cudf::detail::cuda_memcpy_async<uint64_t>(
+    cudf::device_span<uint64_t>(result.bin_coefficients->mutable_view().data<uint64_t>(),
+                                bin_coefficients.size()),
+    bin_coefficients,
+    stream);
 
   result.bin_offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT16},
                                                  bin_offsets.size(),
                                                  cudf::mask_state::UNALLOCATED,
                                                  stream,
                                                  mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_offsets->mutable_view().data<uint16_t>(),
-                                bin_offsets.data(),
-                                bin_offsets.size() * sizeof(uint16_t),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  cudf::detail::cuda_memcpy_async<uint16_t>(
+    cudf::device_span<uint16_t>(result.bin_offsets->mutable_view().data<uint16_t>(),
+                                bin_offsets.size()),
+    bin_offsets,
+    stream);
 
   auto cp_metadata            = detail::get_codepoint_metadata(stream);
   auto const cp_metadata_size = static_cast<cudf::size_type>(cp_metadata.size());
diff --git a/cpp/tests/streams/replace_test.cpp b/cpp/tests/streams/replace_test.cpp
index 89f76237de6..e3fdc177b50 100644
--- a/cpp/tests/streams/replace_test.cpp
+++ b/cpp/tests/streams/replace_test.cpp
@@ -104,9 +104,9 @@ TEST_F(ReplaceTest, NormalizeNansAndZeros)
 
 TEST_F(ReplaceTest, NormalizeNansAndZerosMutable)
 {
-  auto nan          = std::numeric_limits<double>::quiet_NaN();
-  auto input_column = cudf::test::make_type_param_vector<double>({-0.0, 0.0, -nan, nan, nan});
-  cudf::test::fixed_width_column_wrapper<double> input(input_column.begin(), input_column.end());
-  cudf::mutable_column_view mutable_view = cudf::column(input, cudf::test::get_default_stream());
-  cudf::normalize_nans_and_zeros(mutable_view, cudf::test::get_default_stream());
+  auto nan   = std::numeric_limits<double>::quiet_NaN();
+  auto data  = cudf::test::make_type_param_vector<double>({-0.0, 0.0, -nan, nan, nan});
+  auto input = cudf::test::fixed_width_column_wrapper<double>(data.begin(), data.end()).release();
+  auto view  = input->mutable_view();
+  cudf::normalize_nans_and_zeros(view, cudf::test::get_default_stream());
 }
diff --git a/dependencies.yaml b/dependencies.yaml
index 44767f1e9d3..7a83efc6e3d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -878,7 +878,7 @@ dependencies:
           - matrix: {dependencies: "oldest"}
             packages:
               - numpy==1.23.*
-              - pyarrow==14.0.0
+              - pyarrow==14.*
           - matrix:
             packages:
       - output_types: conda
@@ -903,7 +903,7 @@ dependencies:
           - matrix: {dependencies: "oldest"}
             packages:
               - numpy==1.24.*
-              - pyarrow==14.0.1
+              - pyarrow==14.*
           - matrix:
             packages:
   test_python_cudf_polars:
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 7aa8f9f4a1c..09214803c0c 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -439,6 +439,8 @@ def _generate_namespaces(namespaces):
     # Sphinx doesn't know how to distinguish between the ORC and Parquet
     # definitions because Breathe doesn't to preserve namespaces for enums.
     "TypeKind",
+    # Span subclasses access base class members
+    "base::",
 }
 
 _domain_objects = None
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 8cec8af3c67..410fd57691e 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -12,9 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx sort.pyx
-                   stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx
-)
+set(cython_sources column.pyx groupby.pyx scalar.pyx strings_udf.pyx types.pyx utils.pyx)
 set(linked_libraries cudf::cudf)
 
 rapids_cython_create_modules(
@@ -24,8 +22,3 @@ rapids_cython_create_modules(
 )
 
 target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
-target_include_directories(interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
-
-include(${rapids-cmake-dir}/export/find_package_root.cmake)
-include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
-target_link_libraries(interop PUBLIC nanoarrow)
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 001e5cbb676..6b5a7814e48 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -2,12 +2,7 @@
 import numpy as np
 
 from . import (
-    copying,
     groupby,
-    interop,
-    sort,
-    stream_compaction,
-    string_casting,
     strings_udf,
 )
 
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
deleted file mode 100644
index ef544dc89eb..00000000000
--- a/python/cudf/cudf/_lib/copying.pyx
+++ /dev/null
@@ -1,451 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp cimport bool
-import pylibcudf
-
-import cudf
-from cudf.core.buffer import acquire_spill_lock, as_buffer
-from cudf.core.abc import Serializable
-from cudf._lib.column cimport Column
-
-from cudf._lib.scalar import as_device_scalar
-
-from cudf._lib.scalar cimport DeviceScalar
-
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_pylibcudf_table
-import pylibcudf as plc
-from pylibcudf.contiguous_split cimport PackedColumns as PlcPackedColumns
-
-
-def _gather_map_is_valid(
-    gather_map: "cudf.core.column.ColumnBase",
-    nrows: int,
-    check_bounds: bool,
-    nullify: bool,
-) -> bool:
-    """Returns true if gather map is valid.
-
-    A gather map is valid if empty or all indices are within the range
-    ``[-nrows, nrows)``, except when ``nullify`` is specified.
-    """
-    if not check_bounds or nullify or len(gather_map) == 0:
-        return True
-    gm_min, gm_max = gather_map.minmax()
-    return gm_min >= -nrows and gm_max < nrows
-
-
-@acquire_spill_lock()
-def copy_column(Column input_column):
-    """
-    Deep copies a column
-
-    Parameters
-    ----------
-    input_columns : column to be copied
-
-    Returns
-    -------
-    Deep copied column
-    """
-    return Column.from_pylibcudf(
-        input_column.to_pylibcudf(mode="read").copy()
-    )
-
-
-@acquire_spill_lock()
-def _copy_range_in_place(Column input_column,
-                         Column target_column,
-                         size_type input_begin,
-                         size_type input_end,
-                         size_type target_begin):
-    pylibcudf.copying.copy_range(
-        input_column.to_pylibcudf(mode="write"),
-        target_column.to_pylibcudf(mode="write"),
-        input_begin,
-        input_end,
-        target_begin
-    )
-
-
-def _copy_range(Column input_column,
-                Column target_column,
-                size_type input_begin,
-                size_type input_end,
-                size_type target_begin):
-    return Column.from_pylibcudf(
-        pylibcudf.copying.copy_range(
-            input_column.to_pylibcudf(mode="read"),
-            target_column.to_pylibcudf(mode="read"),
-            input_begin,
-            input_end,
-            target_begin
-        )
-    )
-
-
-@acquire_spill_lock()
-def copy_range(Column source_column,
-               Column target_column,
-               size_type source_begin,
-               size_type source_end,
-               size_type target_begin,
-               size_type target_end,
-               bool inplace):
-    """
-    Copy a contiguous range from a source to a target column
-
-    Notes
-    -----
-    Expects the source and target ranges to have been sanitised to be
-    in-range for the source and target column respectively. For
-    example via ``slice.indices``.
-    """
-
-    msg = "Source and target ranges must be same length"
-    assert source_end - source_begin == target_end - target_begin, msg
-    if target_end >= target_begin and inplace:
-        # FIXME: Are we allowed to do this when inplace=False?
-        return target_column
-
-    if inplace:
-        _copy_range_in_place(source_column, target_column,
-                             source_begin, source_end, target_begin)
-    else:
-        return _copy_range(source_column, target_column,
-                           source_begin, source_end, target_begin)
-
-
-@acquire_spill_lock()
-def gather(
-    list columns,
-    Column gather_map,
-    bool nullify=False
-):
-    tbl = pylibcudf.copying.gather(
-        pylibcudf.Table([col.to_pylibcudf(mode="read") for col in columns]),
-        gather_map.to_pylibcudf(mode="read"),
-        pylibcudf.copying.OutOfBoundsPolicy.NULLIFY if nullify
-        else pylibcudf.copying.OutOfBoundsPolicy.DONT_CHECK
-    )
-    return columns_from_pylibcudf_table(tbl)
-
-
-@acquire_spill_lock()
-def scatter(list sources, Column scatter_map, list target_columns,
-            bool bounds_check=True):
-    """
-    Scattering source into target as per the scatter map.
-    `source` can be a list of scalars, or a list of columns. The number of
-    items in `sources` must equal the number of `target_columns` to scatter.
-    """
-    # TODO: Only single column scatter is used, we should explore multi-column
-    # scatter for frames for performance increase.
-
-    if len(sources) != len(target_columns):
-        raise ValueError("Mismatched number of source and target columns.")
-
-    if len(sources) == 0:
-        return []
-
-    if bounds_check:
-        n_rows = len(target_columns[0])
-        if not (
-            (scatter_map >= -n_rows).all()
-            and (scatter_map < n_rows).all()
-        ):
-            raise IndexError(
-                f"index out of bounds for column of size {n_rows}"
-            )
-
-    tbl = pylibcudf.copying.scatter(
-        pylibcudf.Table([col.to_pylibcudf(mode="read") for col in sources])
-        if isinstance(sources[0], Column)
-        else [(<DeviceScalar> as_device_scalar(slr)).c_value for slr in sources],
-        scatter_map.to_pylibcudf(mode="read"),
-        pylibcudf.Table([col.to_pylibcudf(mode="read") for col in target_columns]),
-    )
-
-    return columns_from_pylibcudf_table(tbl)
-
-
-@acquire_spill_lock()
-def column_empty_like(Column input_column):
-    return Column.from_pylibcudf(
-        pylibcudf.copying.empty_like(
-            input_column.to_pylibcudf(mode="read")
-        )
-    )
-
-
-@acquire_spill_lock()
-def column_allocate_like(Column input_column, size=None):
-    return Column.from_pylibcudf(
-        pylibcudf.copying.allocate_like(
-            input_column.to_pylibcudf(mode="read"),
-            size,
-        )
-    )
-
-
-@acquire_spill_lock()
-def columns_empty_like(list input_columns):
-    return columns_from_pylibcudf_table(
-        pylibcudf.copying.empty_like(
-            pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns])
-        )
-    )
-
-
-@acquire_spill_lock()
-def column_slice(Column input_column, object indices):
-    return [
-        Column.from_pylibcudf(c)
-        for c in pylibcudf.copying.slice(
-            input_column.to_pylibcudf(mode="read"),
-            list(indices),
-        )
-    ]
-
-
-@acquire_spill_lock()
-def columns_slice(list input_columns, object indices):
-    return [
-        columns_from_pylibcudf_table(tbl)
-        for tbl in pylibcudf.copying.slice(
-            pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]),
-            list(indices),
-        )
-    ]
-
-
-@acquire_spill_lock()
-def column_split(Column input_column, object splits):
-    return [
-        Column.from_pylibcudf(c)
-        for c in pylibcudf.copying.split(
-            input_column.to_pylibcudf(mode="read"),
-            list(splits),
-        )
-    ]
-
-
-@acquire_spill_lock()
-def columns_split(list input_columns, object splits):
-    return [
-        columns_from_pylibcudf_table(tbl)
-        for tbl in pylibcudf.copying.split(
-            pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]),
-            list(splits),
-        )
-    ]
-
-
-@acquire_spill_lock()
-def copy_if_else(object lhs, object rhs, Column boolean_mask):
-    return Column.from_pylibcudf(
-        pylibcudf.copying.copy_if_else(
-            lhs.to_pylibcudf(mode="read") if isinstance(lhs, Column)
-            else (<DeviceScalar> as_device_scalar(lhs)).c_value,
-            rhs.to_pylibcudf(mode="read") if isinstance(rhs, Column)
-            else (<DeviceScalar> as_device_scalar(rhs)).c_value,
-            boolean_mask.to_pylibcudf(mode="read"),
-        )
-    )
-
-
-@acquire_spill_lock()
-def boolean_mask_scatter(list input_, list target_columns,
-                         Column boolean_mask):
-    """Copy the target columns, replacing masked rows with input data.
-
-    The ``input_`` data can be a list of columns or as a list of scalars.
-    A list of input columns will be used to replace corresponding rows in the
-    target columns for which the boolean mask is ``True``. For the nth ``True``
-    in the boolean mask, the nth row in ``input_`` is used to replace. A list
-    of input scalars will replace all rows in the target columns for which the
-    boolean mask is ``True``.
-    """
-    if len(input_) != len(target_columns):
-        raise ValueError("Mismatched number of input and target columns.")
-
-    if len(input_) == 0:
-        return []
-
-    tbl = pylibcudf.copying.boolean_mask_scatter(
-        pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_])
-        if isinstance(input_[0], Column)
-        else [(<DeviceScalar> as_device_scalar(i)).c_value for i in input_],
-        pylibcudf.Table([col.to_pylibcudf(mode="read") for col in target_columns]),
-        boolean_mask.to_pylibcudf(mode="read"),
-    )
-
-    return columns_from_pylibcudf_table(tbl)
-
-
-@acquire_spill_lock()
-def shift(Column input, int offset, object fill_value=None):
-    cdef DeviceScalar fill
-
-    if isinstance(fill_value, DeviceScalar):
-        fill = fill_value
-    else:
-        fill = as_device_scalar(fill_value, input.dtype)
-
-    col = pylibcudf.copying.shift(
-        input.to_pylibcudf(mode="read"),
-        offset,
-        fill.c_value,
-    )
-    return Column.from_pylibcudf(col)
-
-
-@acquire_spill_lock()
-def get_element(Column input_column, size_type index):
-    return DeviceScalar.from_pylibcudf(
-        pylibcudf.copying.get_element(
-            input_column.to_pylibcudf(mode="read"),
-            index,
-        ),
-        dtype=input_column.dtype,
-    )
-
-
-class PackedColumns(Serializable):
-    """
-    A packed representation of a Frame, with all columns residing
-    in a single GPU memory buffer.
-    """
-
-    def __init__(
-        self,
-        PlcPackedColumns data,
-        object column_names = None,
-        object index_names = None,
-        object column_dtypes = None
-    ):
-        self._metadata, self._gpu_data = data.release()
-        self.column_names=column_names
-        self.index_names=index_names
-        self.column_dtypes=column_dtypes
-
-    def __reduce__(self):
-        return self.deserialize, self.serialize()
-
-    @property
-    def __cuda_array_interface__(self):
-        return self._gpu_data.__cuda_array_interface__
-
-    def serialize(self):
-        header = {}
-        frames = []
-        gpu_data = as_buffer(
-            data = self._gpu_data.obj.ptr,
-            size = self._gpu_data.obj.size,
-            owner=self,
-            exposed=True
-        )
-        data_header, data_frames = gpu_data.serialize()
-        header["data"] = data_header
-        frames.extend(data_frames)
-
-        header["column-names"] = self.column_names
-        header["index-names"] = self.index_names
-        header["metadata"] = self._metadata.tobytes()
-        for name, dtype in self.column_dtypes.items():
-            dtype_header, dtype_frames = dtype.device_serialize()
-            self.column_dtypes[name] = (
-                dtype_header,
-                (len(frames), len(frames) + len(dtype_frames)),
-            )
-            frames.extend(dtype_frames)
-        header["column-dtypes"] = self.column_dtypes
-        return header, frames
-
-    @classmethod
-    def deserialize(cls, header, frames):
-        column_dtypes = {}
-        for name, dtype in header["column-dtypes"].items():
-            dtype_header, (start, stop) = dtype
-            column_dtypes[name] = Serializable.device_deserialize(
-                dtype_header, frames[start:stop]
-            )
-        return cls(
-            plc.contiguous_split.pack(
-                plc.contiguous_split.unpack_from_memoryviews(
-                    memoryview(header["metadata"]),
-                    plc.gpumemoryview(frames[0]),
-                )
-            ),
-            header["column-names"],
-            header["index-names"],
-            column_dtypes,
-        )
-
-    @classmethod
-    def from_py_table(cls, input_table, keep_index=True):
-        if keep_index and (
-            not isinstance(input_table.index, cudf.RangeIndex)
-            or input_table.index.start != 0
-            or input_table.index.stop != len(input_table)
-            or input_table.index.step != 1
-        ):
-            columns = input_table._index._columns + input_table._columns
-            index_names = input_table._index_names
-        else:
-            columns = input_table._columns
-            index_names = None
-
-        column_names = input_table._column_names
-        column_dtypes = {}
-        for name, col in input_table._column_labels_and_values:
-            if isinstance(
-                col.dtype,
-                (cudf.core.dtypes._BaseDtype, cudf.core.dtypes.CategoricalDtype)
-            ):
-                column_dtypes[name] = col.dtype
-
-        return cls(
-            plc.contiguous_split.pack(
-                plc.Table(
-                    [
-                        col.to_pylibcudf(mode="read") for col in columns
-                    ]
-                )
-            ),
-            column_names,
-            index_names,
-            column_dtypes,
-        )
-
-    def unpack(self):
-        output_table = cudf.DataFrame._from_data(*data_from_pylibcudf_table(
-            plc.contiguous_split.unpack_from_memoryviews(
-                self._metadata,
-                self._gpu_data
-            ),
-            self.column_names,
-            self.index_names
-        ))
-        for name, dtype in self.column_dtypes.items():
-            output_table._data[name] = (
-                output_table._data[name]._with_type_metadata(dtype)
-            )
-
-        return output_table
-
-
-def pack(input_table, keep_index=True):
-    """
-    Pack the columns of a cudf Frame into a single GPU memory buffer.
-    """
-    return PackedColumns.from_py_table(input_table, keep_index)
-
-
-def unpack(packed):
-    """
-    Unpack the results of packing a cudf Frame returning a new
-    cudf Frame in the process.
-    """
-    return packed.unpack()
diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
deleted file mode 100644
index 1c9d3a01b80..00000000000
--- a/python/cudf/cudf/_lib/interop.pyx
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import pylibcudf
-
-from cudf._lib.utils cimport columns_from_pylibcudf_table
-
-from cudf.core.buffer import acquire_spill_lock
-from cudf.core.dtypes import ListDtype, StructDtype
-
-
-def from_dlpack(object dlpack_capsule):
-    """
-    Converts a DLPack Tensor PyCapsule into a list of columns.
-
-    DLPack Tensor PyCapsule is expected to have the name "dltensor".
-    """
-    return columns_from_pylibcudf_table(
-        pylibcudf.interop.from_dlpack(dlpack_capsule)
-    )
-
-
-def to_dlpack(list source_columns):
-    """
-    Converts a list of columns into a DLPack Tensor PyCapsule.
-
-    DLPack Tensor PyCapsule will have the name "dltensor".
-    """
-    return pylibcudf.interop.to_dlpack(
-        pylibcudf.Table(
-            [col.to_pylibcudf(mode="read") for col in source_columns]
-        )
-    )
-
-
-def gather_metadata(object cols_dtypes):
-    """
-    Generates a ColumnMetadata vector for each column.
-
-    Parameters
-    ----------
-    cols_dtypes : iterable
-        An iterable of ``(column_name, dtype)`` pairs.
-    """
-    cpp_metadata = []
-    if cols_dtypes is not None:
-        for idx, (col_name, col_dtype) in enumerate(cols_dtypes):
-            cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name))
-            if isinstance(col_dtype, (ListDtype, StructDtype)):
-                _set_col_children_metadata(col_dtype, cpp_metadata[idx])
-    else:
-        raise TypeError(
-            "An iterable of (column_name, dtype) pairs is required to "
-            "construct column_metadata"
-        )
-    return cpp_metadata
-
-
-def _set_col_children_metadata(dtype, col_meta):
-    if isinstance(dtype, StructDtype):
-        for name, value in dtype.fields.items():
-            element_metadata = pylibcudf.interop.ColumnMetadata(name)
-            _set_col_children_metadata(value, element_metadata)
-            col_meta.children_meta.append(element_metadata)
-    elif isinstance(dtype, ListDtype):
-        # Offsets - child 0
-        col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())
-
-        # Element column - child 1
-        element_metadata = pylibcudf.interop.ColumnMetadata()
-        _set_col_children_metadata(dtype.element_type, element_metadata)
-        col_meta.children_meta.append(element_metadata)
-    else:
-        col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())
-
-
-@acquire_spill_lock()
-def to_arrow(list source_columns, object column_dtypes):
-    """Convert a list of columns from
-    cudf Frame to a PyArrow Table.
-
-    Parameters
-    ----------
-    source_columns : a list of columns to convert
-    column_dtypes : Iterable of ``(column_name, column_dtype)`` pairs
-
-    Returns
-    -------
-    pyarrow table
-    """
-    cpp_metadata = gather_metadata(column_dtypes)
-    return pylibcudf.interop.to_arrow(
-        pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
-        cpp_metadata,
-    )
-
-
-@acquire_spill_lock()
-def from_arrow(object input_table):
-    """Convert from PyArrow Table to a list of columns.
-
-    Parameters
-    ----------
-    input_table : PyArrow table
-
-    Returns
-    -------
-    A list of columns to construct Frame object
-    """
-    return columns_from_pylibcudf_table(
-        pylibcudf.interop.from_arrow(input_table)
-    )
diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd
index 0f9820ed1db..a3a8a14e70f 100644
--- a/python/cudf/cudf/_lib/scalar.pxd
+++ b/python/cudf/cudf/_lib/scalar.pxd
@@ -17,9 +17,6 @@ cdef class DeviceScalar:
     @staticmethod
     cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=*)
 
-    @staticmethod
-    cdef DeviceScalar from_pylibcudf(pscalar, dtype=*)
-
     cdef void _set_dtype(self, dtype=*)
 
     cpdef bool is_valid(DeviceScalar s)
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 56712402919..3d3bdd730a8 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -218,7 +218,7 @@ cdef class DeviceScalar:
         return s
 
     @staticmethod
-    cdef DeviceScalar from_pylibcudf(pscalar, dtype=None):
+    def from_pylibcudf(pscalar, dtype=None):
         cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar)
         s.c_value = pscalar
         s._set_dtype(dtype)
diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx
deleted file mode 100644
index eefe37d9880..00000000000
--- a/python/cudf/cudf/_lib/sort.pyx
+++ /dev/null
@@ -1,365 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from itertools import repeat
-
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp cimport bool
-
-from pylibcudf.libcudf.aggregation cimport rank_method
-from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_pylibcudf_table
-
-import pylibcudf
-
-
-@acquire_spill_lock()
-def is_sorted(
-    list source_columns, object ascending=None, object null_position=None
-):
-    """
-    Checks whether the rows of a `table` are sorted in lexicographical order.
-
-    Parameters
-    ----------
-    source_columns : list of columns
-        columns to be checked for sort order
-    ascending : None or list-like of booleans
-        None or list-like of boolean values indicating expected sort order of
-        each column. If list-like, size of list-like must be len(columns). If
-        None, all columns expected sort order is set to ascending. False (0) -
-        descending, True (1) - ascending.
-    null_position : None or list-like of booleans
-        None or list-like of boolean values indicating desired order of nulls
-        compared to other elements. If list-like, size of list-like must be
-        len(columns). If None, null order is set to before. False (0) - after,
-        True (1) - before.
-
-    Returns
-    -------
-    returns : boolean
-        Returns True, if sorted as expected by ``ascending`` and
-        ``null_position``, False otherwise.
-    """
-
-    if ascending is None:
-        column_order = [pylibcudf.types.Order.ASCENDING] * len(source_columns)
-    else:
-        if len(ascending) != len(source_columns):
-            raise ValueError(
-                f"Expected a list-like of length {len(source_columns)}, "
-                f"got length {len(ascending)} for `ascending`"
-            )
-        column_order = [pylibcudf.types.Order.DESCENDING] * len(source_columns)
-        for idx, val in enumerate(ascending):
-            if val:
-                column_order[idx] = pylibcudf.types.Order.ASCENDING
-
-    if null_position is None:
-        null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns)
-    else:
-        if len(null_position) != len(source_columns):
-            raise ValueError(
-                f"Expected a list-like of length {len(source_columns)}, "
-                f"got length {len(null_position)} for `null_position`"
-            )
-        null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns)
-        for idx, val in enumerate(null_position):
-            if val:
-                null_precedence[idx] = pylibcudf.types.NullOrder.BEFORE
-
-    return pylibcudf.sorting.is_sorted(
-        pylibcudf.Table(
-            [c.to_pylibcudf(mode="read") for c in source_columns]
-        ),
-        column_order,
-        null_precedence
-    )
-
-
-def ordering(column_order, null_precedence):
-    """
-    Construct order and null order vectors
-
-    Parameters
-    ----------
-    column_order
-        Iterable of bool (True for ascending order, False for descending)
-    null_precedence
-        Iterable string for null positions ("first" for start, "last" for end)
-
-    Both iterables must be the same length (not checked)
-
-    Returns
-    -------
-    pair of vectors (order, and null_order)
-    """
-    c_column_order = []
-    c_null_precedence = []
-    for asc, null in zip(column_order, null_precedence):
-        c_column_order.append(
-            pylibcudf.types.Order.ASCENDING if asc else pylibcudf.types.Order.DESCENDING
-        )
-        if asc ^ (null == "first"):
-            c_null_precedence.append(pylibcudf.types.NullOrder.AFTER)
-        elif asc ^ (null == "last"):
-            c_null_precedence.append(pylibcudf.types.NullOrder.BEFORE)
-        else:
-            raise ValueError(f"Invalid null precedence {null}")
-    return c_column_order, c_null_precedence
-
-
-@acquire_spill_lock()
-def order_by(
-    list columns_from_table,
-    object ascending,
-    str na_position,
-    *,
-    bool stable
-):
-    """
-    Get index to sort the table in ascending/descending order.
-
-    Parameters
-    ----------
-    columns_from_table : list[Column]
-        Columns from the table which will be sorted
-    ascending : sequence[bool]
-         Sequence of boolean values which correspond to each column
-         in the table to be sorted signifying the order of each column
-         True - Ascending and False - Descending
-    na_position : str
-        Whether null values should show up at the "first" or "last"
-        position of **all** sorted column.
-    stable : bool
-        Should the sort be stable? (no default)
-
-    Returns
-    -------
-    Column of indices that sorts the table
-    """
-    order = ordering(ascending, repeat(na_position))
-    func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sorted_order")
-
-    return Column.from_pylibcudf(
-        func(
-            pylibcudf.Table(
-                [c.to_pylibcudf(mode="read") for c in columns_from_table],
-            ),
-            order[0],
-            order[1],
-        )
-    )
-
-
-@acquire_spill_lock()
-def sort(
-    list values,
-    list column_order=None,
-    list null_precedence=None,
-):
-    """
-    Sort the table in ascending/descending order.
-
-    Parameters
-    ----------
-    values : list[Column]
-        Columns of the table which will be sorted
-    column_order : list[bool], optional
-        Sequence of boolean values which correspond to each column in
-        keys providing the sort order (default all True).
-        With True <=> ascending; False <=> descending.
-    null_precedence : list[str], optional
-        Sequence of "first" or "last" values (default "first")
-        indicating the position of null values when sorting the keys.
-    """
-    ncol = len(values)
-    order = ordering(
-        column_order or repeat(True, ncol),
-        null_precedence or repeat("first", ncol),
-    )
-    return columns_from_pylibcudf_table(
-        pylibcudf.sorting.sort(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]),
-            order[0],
-            order[1],
-        )
-    )
-
-
-@acquire_spill_lock()
-def sort_by_key(
-    list values,
-    list keys,
-    object ascending,
-    object na_position,
-    *,
-    bool stable,
-):
-    """
-    Sort a table by given keys
-
-    Parameters
-    ----------
-    values : list[Column]
-        Columns of the table which will be sorted
-    keys : list[Column]
-        Columns making up the sort key
-    ascending : list[bool]
-        Sequence of boolean values which correspond to each column
-        in the table to be sorted signifying the order of each column
-        True - Ascending and False - Descending
-    na_position : list[str]
-        Sequence of "first" or "last" values (default "first")
-        indicating the position of null values when sorting the keys.
-    stable : bool
-        Should the sort be stable? (no default)
-
-    Returns
-    -------
-    list[Column]
-        list of value columns sorted by keys
-    """
-    order = ordering(ascending, na_position)
-    func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sort_by_key")
-    return columns_from_pylibcudf_table(
-        func(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]),
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]),
-            order[0],
-            order[1],
-        )
-    )
-
-
-@acquire_spill_lock()
-def segmented_sort_by_key(
-    list values,
-    list keys,
-    Column segment_offsets,
-    list column_order=None,
-    list null_precedence=None,
-    *,
-    bool stable,
-):
-    """
-    Sort segments of a table by given keys
-
-    Parameters
-    ----------
-    values : list[Column]
-        Columns of the table which will be sorted
-    keys : list[Column]
-        Columns making up the sort key
-    offsets : Column
-        Segment offsets
-    column_order : list[bool], optional
-        Sequence of boolean values which correspond to each column in
-        keys providing the sort order (default all True).
-        With True <=> ascending; False <=> descending.
-    null_precedence : list[str], optional
-        Sequence of "first" or "last" values (default "first")
-        indicating the position of null values when sorting the keys.
-    stable : bool
-        Should the sort be stable? (no default)
-
-    Returns
-    -------
-    list[Column]
-        list of value columns sorted by keys
-    """
-    ncol = len(values)
-    order = ordering(
-        column_order or repeat(True, ncol),
-        null_precedence or repeat("first", ncol),
-    )
-    func = getattr(
-        pylibcudf.sorting,
-        f"{'stable_' if stable else ''}segmented_sort_by_key"
-    )
-    return columns_from_pylibcudf_table(
-        func(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]),
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]),
-            segment_offsets.to_pylibcudf(mode="read"),
-            order[0],
-            order[1],
-        )
-    )
-
-
-@acquire_spill_lock()
-def digitize(list source_columns, list bins, bool right=False):
-    """
-    Return the indices of the bins to which each value in source_table belongs.
-
-    Parameters
-    ----------
-    source_columns : Input columns to be binned.
-    bins : List containing columns of bins
-    right : Indicating whether the intervals include the
-            right or the left bin edge.
-    """
-    return Column.from_pylibcudf(
-        getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")(
-            pylibcudf.Table(
-                [c.to_pylibcudf(mode="read") for c in bins]
-            ),
-            pylibcudf.Table(
-                [c.to_pylibcudf(mode="read") for c in source_columns]
-            ),
-            [pylibcudf.types.Order.ASCENDING]*len(bins),
-            [pylibcudf.types.NullOrder.BEFORE]*len(bins)
-        )
-    )
-
-
-@acquire_spill_lock()
-def rank_columns(list source_columns, rank_method method, str na_option,
-                 bool ascending, bool pct
-                 ):
-    """
-    Compute numerical data ranks (1 through n) of each column in the dataframe
-    """
-    column_order = (
-        pylibcudf.types.Order.ASCENDING
-        if ascending
-        else pylibcudf.types.Order.DESCENDING
-    )
-    # ascending
-    #    #top    = na_is_smallest
-    #    #bottom = na_is_largest
-    #    #keep   = na_is_largest
-    # descending
-    #    #top    = na_is_largest
-    #    #bottom = na_is_smallest
-    #    #keep   = na_is_smallest
-    if ascending:
-        if na_option == 'top':
-            null_precedence = pylibcudf.types.NullOrder.BEFORE
-        else:
-            null_precedence = pylibcudf.types.NullOrder.AFTER
-    else:
-        if na_option == 'top':
-            null_precedence = pylibcudf.types.NullOrder.AFTER
-        else:
-            null_precedence = pylibcudf.types.NullOrder.BEFORE
-    c_null_handling = (
-        pylibcudf.types.NullPolicy.EXCLUDE
-        if na_option == 'keep'
-        else pylibcudf.types.NullPolicy.INCLUDE
-    )
-
-    return [
-        Column.from_pylibcudf(
-            pylibcudf.sorting.rank(
-                col.to_pylibcudf(mode="read"),
-                method,
-                column_order,
-                c_null_handling,
-                null_precedence,
-                pct,
-            )
-        )
-        for col in source_columns
-    ]
diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx
deleted file mode 100644
index 1b8831940e3..00000000000
--- a/python/cudf/cudf/_lib/stream_compaction.pyx
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp cimport bool
-
-from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_pylibcudf_table
-
-import pylibcudf
-
-
-@acquire_spill_lock()
-def drop_nulls(list columns, how="any", keys=None, thresh=None):
-    """
-    Drops null rows from cols depending on key columns.
-
-    Parameters
-    ----------
-    columns : list of columns
-    how  : "any" or "all". If thresh is None, drops rows of cols that have any
-           nulls or all nulls (respectively) in subset (default: "any")
-    keys : List of column indices. If set, then these columns are checked for
-           nulls rather than all of columns (optional)
-    thresh : Minimum number of non-nulls required to keep a row (optional)
-
-    Returns
-    -------
-    columns with null rows dropped
-    """
-    if how not in {"any", "all"}:
-        raise ValueError("how must be 'any' or 'all'")
-
-    keys = list(keys if keys is not None else range(len(columns)))
-
-    # Note: If how == "all" and thresh is specified this prioritizes thresh
-    if thresh is not None:
-        keep_threshold = thresh
-    elif how == "all":
-        keep_threshold = 1
-    else:
-        keep_threshold = len(keys)
-
-    return columns_from_pylibcudf_table(
-        pylibcudf.stream_compaction.drop_nulls(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]),
-            keys,
-            keep_threshold,
-        )
-    )
-
-
-@acquire_spill_lock()
-def apply_boolean_mask(list columns, Column boolean_mask):
-    """
-    Drops the rows which correspond to False in boolean_mask.
-
-    Parameters
-    ----------
-    columns : list of columns whose rows are dropped as per boolean_mask
-    boolean_mask : a boolean column of same size as source_table
-
-    Returns
-    -------
-    columns obtained from applying mask
-    """
-    return columns_from_pylibcudf_table(
-        pylibcudf.stream_compaction.apply_boolean_mask(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]),
-            boolean_mask.to_pylibcudf(mode="read"),
-        )
-    )
-
-
-_keep_options = {
-    "first": pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
-    "last": pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_LAST,
-    False: pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-}
-
-
-@acquire_spill_lock()
-def drop_duplicates(list columns,
-                    object keys=None,
-                    object keep='first',
-                    bool nulls_are_equal=True):
-    """
-    Drops rows in source_table as per duplicate rows in keys.
-
-    Parameters
-    ----------
-    columns : List of columns
-    keys : List of column indices. If set, then these columns are checked for
-           duplicates rather than all of columns (optional)
-    keep : keep 'first' or 'last' or none of the duplicate rows
-    nulls_are_equal : if True, nulls are treated equal else not.
-
-    Returns
-    -------
-    columns with duplicate dropped
-    """
-    if (keep_option := _keep_options.get(keep)) is None:
-        raise ValueError('keep must be either "first", "last" or False')
-
-    return columns_from_pylibcudf_table(
-        pylibcudf.stream_compaction.stable_distinct(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]),
-            list(keys if keys is not None else range(len(columns))),
-            keep_option,
-            pylibcudf.types.NullEquality.EQUAL
-            if nulls_are_equal else pylibcudf.types.NullEquality.UNEQUAL,
-            pylibcudf.types.NanEquality.ALL_EQUAL,
-        )
-    )
-
-
-@acquire_spill_lock()
-def distinct_indices(
-    list columns,
-    object keep="first",
-    bool nulls_equal=True,
-    bool nans_equal=True,
-):
-    """
-    Return indices of the distinct rows in a table.
-
-    Parameters
-    ----------
-    columns : list of columns to check for duplicates
-    keep : treat "first", "last", or (False) none of any duplicate
-        rows as distinct
-    nulls_equal : Should nulls compare equal
-    nans_equal: Should nans compare equal
-
-    Returns
-    -------
-    Column of indices
-
-    See Also
-    --------
-    drop_duplicates
-    """
-    if (keep_option := _keep_options.get(keep)) is None:
-        raise ValueError('keep must be either "first", "last" or False')
-
-    return Column.from_pylibcudf(
-        pylibcudf.stream_compaction.distinct_indices(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]),
-            keep_option,
-            pylibcudf.types.NullEquality.EQUAL
-            if nulls_equal else pylibcudf.types.NullEquality.UNEQUAL,
-            pylibcudf.types.NanEquality.ALL_EQUAL
-            if nans_equal else pylibcudf.types.NanEquality.UNEQUAL,
-        )
-    )
-
-
-@acquire_spill_lock()
-def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False):
-    """
-    Finds number of unique rows in `source_column`
-
-    Parameters
-    ----------
-    source_column : source table checked for unique rows
-    ignore_nulls : If True nulls are ignored,
-                   else counted as one more distinct value
-    nan_as_null  : If True, NAN is considered NULL,
-                   else counted as one more distinct value
-
-    Returns
-    -------
-    Count of number of unique rows in `source_column`
-    """
-    return pylibcudf.stream_compaction.distinct_count(
-        source_column.to_pylibcudf(mode="read"),
-        pylibcudf.types.NullPolicy.EXCLUDE
-        if ignore_nulls else pylibcudf.types.NullPolicy.INCLUDE,
-        pylibcudf.types.NanPolicy.NAN_IS_NULL
-        if nan_as_null else pylibcudf.types.NanPolicy.NAN_IS_VALID,
-    )
diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
deleted file mode 100644
index 06ee07d8e2b..00000000000
--- a/python/cudf/cudf/_lib/string_casting.pyx
+++ /dev/null
@@ -1,598 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-from pylibcudf.types cimport DataType
-
-from cudf._lib.scalar import as_device_scalar
-
-from cudf._lib.types cimport dtype_to_pylibcudf_type
-
-
-def floating_to_string(Column input_col):
-    plc_column = plc.strings.convert.convert_floats.from_floats(
-        input_col.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def string_to_floating(Column input_col, DataType out_type):
-    plc_column = plc.strings.convert.convert_floats.to_floats(
-        input_col.to_pylibcudf(mode="read"),
-        out_type
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def dtos(Column input_col):
-    """
-    Converting/Casting input column of type double to string column
-
-    Parameters
-    ----------
-    input_col : input column of type double
-
-    Returns
-    -------
-    A Column with double values cast to string
-    """
-
-    return floating_to_string(input_col)
-
-
-def stod(Column input_col):
-    """
-    Converting/Casting input column of type string to double
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to double
-    """
-
-    return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64))
-
-
-def ftos(Column input_col):
-    """
-    Converting/Casting input column of type float to string column
-
-    Parameters
-    ----------
-    input_col : input column of type double
-
-    Returns
-    -------
-    A Column with float values cast to string
-    """
-
-    return floating_to_string(input_col)
-
-
-def stof(Column input_col):
-    """
-    Converting/Casting input column of type string to float
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to float
-    """
-
-    return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32))
-
-
-def integer_to_string(Column input_col):
-    plc_column = plc.strings.convert.convert_integers.from_integers(
-        input_col.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def string_to_integer(Column input_col, DataType out_type):
-    plc_column = plc.strings.convert.convert_integers.to_integers(
-        input_col.to_pylibcudf(mode="read"),
-        out_type
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def i8tos(Column input_col):
-    """
-    Converting/Casting input column of type int8 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type int8
-
-    Returns
-    -------
-    A Column with int8 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoi8(Column input_col):
-    """
-    Converting/Casting input column of type string to int8
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to int8
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8))
-
-
-def i16tos(Column input_col):
-    """
-    Converting/Casting input column of type int16 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type int16
-
-    Returns
-    -------
-    A Column with int16 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoi16(Column input_col):
-    """
-    Converting/Casting input column of type string to int16
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to int16
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16))
-
-
-def itos(Column input_col):
-    """
-    Converting/Casting input column of type int32 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type int32
-
-    Returns
-    -------
-    A Column with int32 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoi(Column input_col):
-    """
-    Converting/Casting input column of type string to int32
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to int32
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32))
-
-
-def ltos(Column input_col):
-    """
-    Converting/Casting input column of type int64 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type int64
-
-    Returns
-    -------
-    A Column with int64 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stol(Column input_col):
-    """
-    Converting/Casting input column of type string to int64
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to int64
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64))
-
-
-def ui8tos(Column input_col):
-    """
-    Converting/Casting input column of type uint8 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type uint8
-
-    Returns
-    -------
-    A Column with uint8 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoui8(Column input_col):
-    """
-    Converting/Casting input column of type string to uint8
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to uint8
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8))
-
-
-def ui16tos(Column input_col):
-    """
-    Converting/Casting input column of type uint16 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type uint16
-
-    Returns
-    -------
-    A Column with uint16 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoui16(Column input_col):
-    """
-    Converting/Casting input column of type string to uint16
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to uint16
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT16))
-
-
-def uitos(Column input_col):
-    """
-    Converting/Casting input column of type uint32 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type uint32
-
-    Returns
-    -------
-    A Column with uint32 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoui(Column input_col):
-    """
-    Converting/Casting input column of type string to uint32
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to uint32
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32))
-
-
-def ultos(Column input_col):
-    """
-    Converting/Casting input column of type uint64 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type uint64
-
-    Returns
-    -------
-    A Column with uint64 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoul(Column input_col):
-    """
-    Converting/Casting input column of type string to uint64
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to uint64
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT64))
-
-
-def to_booleans(Column input_col):
-    plc_column = plc.strings.convert.convert_booleans.to_booleans(
-        input_col.to_pylibcudf(mode="read"),
-        as_device_scalar("True").c_value,
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def from_booleans(Column input_col):
-    plc_column = plc.strings.convert.convert_booleans.from_booleans(
-        input_col.to_pylibcudf(mode="read"),
-        as_device_scalar("True").c_value,
-        as_device_scalar("False").c_value,
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def int2timestamp(
-        Column input_col,
-        str format,
-        Column names):
-    """
-    Converting/Casting input date-time column to string
-    column with specified format
-
-    Parameters
-    ----------
-    input_col : input column of type timestamp in integer format
-    format : The string specifying output format
-    names : The string names to use for weekdays ("%a", "%A") and
-    months ("%b", "%B")
-
-    Returns
-    -------
-    A Column with date-time represented in string format
-
-    """
-    return Column.from_pylibcudf(
-        plc.strings.convert.convert_datetime.from_timestamps(
-            input_col.to_pylibcudf(mode="read"),
-            format,
-            names.to_pylibcudf(mode="read")
-        )
-    )
-
-
-def timestamp2int(Column input_col, dtype, format):
-    """
-    Converting/Casting input string column to date-time column with specified
-    timestamp_format
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with string represented in date-time format
-
-    """
-    dtype = dtype_to_pylibcudf_type(dtype)
-    return Column.from_pylibcudf(
-        plc.strings.convert.convert_datetime.to_timestamps(
-            input_col.to_pylibcudf(mode="read"),
-            dtype,
-            format
-        )
-    )
-
-
-def istimestamp(Column input_col, str format):
-    """
-    Check input string column matches the specified timestamp format
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    format : format string of timestamp specifiers
-
-    Returns
-    -------
-    A Column of boolean values identifying strings that matched the format.
-
-    """
-    plc_column = plc.strings.convert.convert_datetime.is_timestamp(
-        input_col.to_pylibcudf(mode="read"),
-        format
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def timedelta2int(Column input_col, dtype, format):
-    """
-    Converting/Casting input string column to TimeDelta column with specified
-    format
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with string represented in TimeDelta format
-
-    """
-    dtype = dtype_to_pylibcudf_type(dtype)
-    return Column.from_pylibcudf(
-        plc.strings.convert.convert_durations.to_durations(
-            input_col.to_pylibcudf(mode="read"),
-            dtype,
-            format
-        )
-    )
-
-
-def int2timedelta(Column input_col, str format):
-    """
-    Converting/Casting input Timedelta column to string
-    column with specified format
-
-    Parameters
-    ----------
-    input_col : input column of type Timedelta in integer format
-
-    Returns
-    -------
-    A Column with Timedelta represented in string format
-
-    """
-    return Column.from_pylibcudf(
-        plc.strings.convert.convert_durations.from_durations(
-            input_col.to_pylibcudf(mode="read"),
-            format
-        )
-    )
-
-
-def int2ip(Column input_col):
-    """
-    Converting/Casting integer column to string column in ipv4 format
-
-    Parameters
-    ----------
-    input_col : input integer column
-
-    Returns
-    -------
-    A Column with integer represented in string ipv4 format
-
-    """
-    plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4(
-        input_col.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def ip2int(Column input_col):
-    """
-    Converting string ipv4 column to integer column
-
-    Parameters
-    ----------
-    input_col : input string column
-
-    Returns
-    -------
-    A Column with ipv4 represented as integer
-
-    """
-    plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers(
-        input_col.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def is_ipv4(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn
-    where nnn is integer digits in [0,255].
-    """
-    plc_column = plc.strings.convert.convert_ipv4.is_ipv4(
-        source_strings.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def htoi(Column input_col):
-    """
-    Converting input column of type string having hex values
-    to integer of out_type
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column of integers parsed from hexadecimal string values.
-    """
-    plc_column = plc.strings.convert.convert_integers.hex_to_integers(
-        input_col.to_pylibcudf(mode="read"),
-        plc.DataType(plc.TypeId.INT64)
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def is_hex(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have hex characters.
-    """
-    plc_column = plc.strings.convert.convert_integers.is_hex(
-        source_strings.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def itoh(Column input_col):
-    """
-    Converting input column of type integer to a string
-    column with hexadecimal character digits.
-
-    Parameters
-    ----------
-    input_col : input column of type integer
-
-    Returns
-    -------
-    A Column of strings with hexadecimal characters.
-    """
-    plc_column = plc.strings.convert.convert_integers.integers_to_hex(
-        input_col.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index 6cc52d046af..900be721c9a 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -1,22 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.vector cimport vector
-
-from pylibcudf.libcudf.column.column cimport column_view
-from pylibcudf.libcudf.table.table cimport table, table_view
-
-
-cdef data_from_unique_ptr(
-    unique_ptr[table] c_tbl, column_names, index_names=*)
 cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
 cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *)
-cdef data_from_table_view(
-    table_view tv, object owner, object column_names, object index_names=*)
-cdef table_view table_view_from_columns(columns) except *
-cdef table_view table_view_from_table(tbl, ignore_index=*) except*
-cdef columns_from_unique_ptr(unique_ptr[table] c_tbl)
-cdef columns_from_table_view(table_view tv, object owners)
 cpdef columns_from_pylibcudf_table(tbl)
 cpdef _data_from_columns(columns, column_names, index_names=*)
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index ff032656f80..975c9eb741c 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -1,233 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import numpy as np
-import pyarrow as pa
-
 import cudf
 
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
-
-from pylibcudf.libcudf.column.column cimport column, column_view
-from pylibcudf.libcudf.table.table cimport table
-from pylibcudf.libcudf.table.table_view cimport table_view
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
-from pylibcudf cimport Column as plc_Column
-try:
-    import ujson as json
-except ImportError:
-    import json
-
-from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype
-
-PARQUET_META_TYPE_MAP = {
-    str(cudf_dtype): str(pandas_dtype)
-    for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items()
-}
-
-cdef table_view table_view_from_columns(columns) except*:
-    """Create a cudf::table_view from an iterable of Columns."""
-    cdef vector[column_view] column_views
-
-    cdef Column col
-    for col in columns:
-        column_views.push_back(col.view())
-
-    return table_view(column_views)
-
-
-cdef table_view table_view_from_table(tbl, ignore_index=False) except*:
-    """Create a cudf::table_view from a Table.
-
-    Parameters
-    ----------
-    ignore_index : bool, default False
-        If True, don't include the index in the columns.
-    """
-    return table_view_from_columns(
-        tbl._index._columns + tbl._columns
-        if not ignore_index and tbl._index is not None
-        else tbl._columns
-    )
-
-
-cpdef generate_pandas_metadata(table, index):
-    col_names = []
-    types = []
-    index_levels = []
-    index_descriptors = []
-    columns_to_convert = list(table._columns)
-    # Columns
-    for name, col in table._column_labels_and_values:
-        if cudf.get_option("mode.pandas_compatible"):
-            # in pandas-compat mode, non-string column names are stringified.
-            col_names.append(str(name))
-        else:
-            col_names.append(name)
-
-        if isinstance(col.dtype, cudf.CategoricalDtype):
-            raise ValueError(
-                "'category' column dtypes are currently not "
-                + "supported by the gpu accelerated parquet writer"
-            )
-        elif isinstance(col.dtype, (
-            cudf.ListDtype,
-            cudf.StructDtype,
-            cudf.core.dtypes.DecimalDtype
-        )):
-            types.append(col.dtype.to_arrow())
-        else:
-            # A boolean element takes 8 bits in cudf and 1 bit in
-            # pyarrow. To make sure the cudf format is interperable
-            # in arrow, we use `int8` type when converting from a
-            # cudf boolean array.
-            if col.dtype.type == np.bool_:
-                types.append(pa.int8())
-            else:
-                types.append(np_to_pa_dtype(col.dtype))
-
-    # Indexes
-    materialize_index = False
-    if index is not False:
-        for level, name in enumerate(table._index.names):
-            if isinstance(table._index, cudf.MultiIndex):
-                idx = table.index.get_level_values(level)
-            else:
-                idx = table.index
-
-            if isinstance(idx, cudf.RangeIndex):
-                if index is None:
-                    descr = {
-                        "kind": "range",
-                        "name": table.index.name,
-                        "start": table.index.start,
-                        "stop": table.index.stop,
-                        "step": table.index.step,
-                    }
-                else:
-                    materialize_index = True
-                    # When `index=True`, RangeIndex needs to be materialized.
-                    materialized_idx = idx._as_int_index()
-                    descr = _index_level_name(
-                        index_name=materialized_idx.name,
-                        level=level,
-                        column_names=col_names
-                    )
-                    index_levels.append(materialized_idx)
-                    columns_to_convert.append(materialized_idx._values)
-                    col_names.append(descr)
-                    types.append(np_to_pa_dtype(materialized_idx.dtype))
-            else:
-                descr = _index_level_name(
-                    index_name=idx.name,
-                    level=level,
-                    column_names=col_names
-                )
-                columns_to_convert.append(idx._values)
-                col_names.append(descr)
-                if isinstance(idx.dtype, cudf.CategoricalDtype):
-                    raise ValueError(
-                        "'category' column dtypes are currently not "
-                        + "supported by the gpu accelerated parquet writer"
-                    )
-                elif isinstance(idx.dtype, cudf.ListDtype):
-                    types.append(col.dtype.to_arrow())
-                else:
-                    # A boolean element takes 8 bits in cudf and 1 bit in
-                    # pyarrow. To make sure the cudf format is interperable
-                    # in arrow, we use `int8` type when converting from a
-                    # cudf boolean array.
-                    if idx.dtype.type == np.bool_:
-                        types.append(pa.int8())
-                    else:
-                        types.append(np_to_pa_dtype(idx.dtype))
-
-                index_levels.append(idx)
-            index_descriptors.append(descr)
-
-    df_meta = table.head(0)
-    if materialize_index:
-        df_meta.index = df_meta.index._as_int_index()
-    metadata = pa.pandas_compat.construct_metadata(
-        columns_to_convert=columns_to_convert,
-        # It is OKAY to do `.head(0).to_pandas()` because
-        # this method will extract `.columns` metadata only
-        df=df_meta.to_pandas(),
-        column_names=col_names,
-        index_levels=index_levels,
-        index_descriptors=index_descriptors,
-        preserve_index=index,
-        types=types,
-    )
-
-    md_dict = json.loads(metadata[b"pandas"])
-
-    # correct metadata for list and struct and nullable numeric types
-    for col_meta in md_dict["columns"]:
-        if (
-            col_meta["name"] in table._column_names
-            and table._data[col_meta["name"]].nullable
-            and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP
-            and col_meta["pandas_type"] != "decimal"
-        ):
-            col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[
-                col_meta["numpy_type"]
-            ]
-        if col_meta["numpy_type"] in ("list", "struct"):
-            col_meta["numpy_type"] = "object"
-
-    return json.dumps(md_dict)
-
-
-def _index_level_name(index_name, level, column_names):
-    """
-    Return the name of an index level or a default name
-    if `index_name` is None or is already a column name.
-
-    Parameters
-    ----------
-    index_name : name of an Index object
-    level : level of the Index object
-
-    Returns
-    -------
-    name : str
-    """
-    if index_name is not None and index_name not in column_names:
-        return index_name
-    else:
-        return f"__index_level_{level}__"
-
-
-cdef columns_from_unique_ptr(
-    unique_ptr[table] c_tbl
-):
-    """Convert a libcudf table into list of columns.
-
-    Parameters
-    ----------
-    c_tbl : unique_ptr[cudf::table]
-        The libcudf table whose columns will be extracted
-
-    Returns
-    -------
-    list[Column]
-        A list of columns.
-    """
-    cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release())
-    cdef vector[unique_ptr[column]].iterator it = c_columns.begin()
-
-    cdef size_t i
-
-    return [
-        Column.from_pylibcudf(
-            plc_Column.from_libcudf(move(dereference(it+i)))
-        ) for i in range(c_columns.size())
-    ]
 
 
 cpdef columns_from_pylibcudf_table(tbl):
@@ -281,8 +55,7 @@ cpdef _data_from_columns(columns, column_names, index_names=None):
         # the data while actually constructing the Index object here (instead
         # of just returning a dict for that as well). As we clean up the
         # Frame factories we may want to look for a less dissonant approach
-        # that does not impose performance penalties. The same applies to
-        # data_from_table_view below.
+        # that does not impose performance penalties.
         cudf.core.index._index_from_data(
             {
                 name: columns[i]
@@ -300,16 +73,6 @@ cpdef _data_from_columns(columns, column_names, index_names=None):
     return data, index
 
 
-cdef data_from_unique_ptr(
-    unique_ptr[table] c_tbl, column_names, index_names=None
-):
-    return _data_from_columns(
-        columns_from_unique_ptr(move(c_tbl)),
-        column_names,
-        index_names
-    )
-
-
 cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
     return _data_from_columns(
         columns_from_pylibcudf_table(tbl),
@@ -329,73 +92,3 @@ cpdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None)
         column_names=column_names,
         index_names=index_names
     )
-
-cdef columns_from_table_view(
-    table_view tv,
-    object owners,
-):
-    """
-    Given a ``cudf::table_view``, constructs a list of columns from it,
-    along with referencing an owner Python object that owns the memory
-    lifetime. owner must be either None or a list of column. If owner
-    is a list of columns, the owner of the `i`th ``cudf::column_view``
-    in the table view is ``owners[i]``. For more about memory ownership,
-    see ``Column.from_column_view``.
-    """
-
-    return [
-        Column.from_column_view(
-            tv.column(i), owners[i] if isinstance(owners, list) else None
-        ) for i in range(tv.num_columns())
-    ]
-
-cdef data_from_table_view(
-    table_view tv,
-    object owner,
-    object column_names,
-    object index_names=None
-):
-    """
-    Given a ``cudf::table_view``, constructs a Frame from it,
-    along with referencing an ``owner`` Python object that owns the memory
-    lifetime. If ``owner`` is a Frame we reach inside of it and
-    reach inside of each ``cudf.Column`` to make the owner of each newly
-    created ``Buffer`` underneath the ``cudf.Column`` objects of the
-    created Frame the respective ``Buffer`` from the relevant
-    ``cudf.Column`` of the ``owner`` Frame
-    """
-    cdef size_type column_idx = 0
-    table_owner = isinstance(owner, cudf.core.frame.Frame)
-
-    # First construct the index, if any
-    index = None
-    if index_names is not None:
-        index_columns = []
-        for _ in index_names:
-            column_owner = owner
-            if table_owner:
-                column_owner = owner._index._columns[column_idx]
-            index_columns.append(
-                Column.from_column_view(
-                    tv.column(column_idx),
-                    column_owner
-                )
-            )
-            column_idx += 1
-        index = cudf.core.index._index_from_data(
-            dict(zip(index_names, index_columns)))
-
-    # Construct the data dict
-    cdef size_type source_column_idx = 0
-    data_columns = []
-    for _ in column_names:
-        column_owner = owner
-        if table_owner:
-            column_owner = owner._columns[source_column_idx]
-        data_columns.append(
-            Column.from_column_view(tv.column(column_idx), column_owner)
-        )
-        column_idx += 1
-        source_column_idx += 1
-
-    return dict(zip(column_names, data_columns)), index
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 1b6152b81ca..f4543bc6156 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -10,17 +10,18 @@
 from typing_extensions import Self
 
 import cudf
-from cudf._lib.copying import _gather_map_is_valid, gather
-from cudf._lib.stream_compaction import (
+from cudf._lib.types import size_type_dtype
+from cudf.api.extensions import no_default
+from cudf.api.types import is_integer, is_list_like, is_scalar
+from cudf.core._internals import copying
+from cudf.core._internals.stream_compaction import (
     apply_boolean_mask,
     drop_duplicates,
     drop_nulls,
 )
-from cudf._lib.types import size_type_dtype
-from cudf.api.extensions import no_default
-from cudf.api.types import is_integer, is_list_like, is_scalar
 from cudf.core.abc import Serializable
 from cudf.core.column import ColumnBase, column
+from cudf.core.copy_types import GatherMap
 from cudf.errors import MixedTypeError
 from cudf.utils import ioutils
 from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype
@@ -413,7 +414,7 @@ def hasnans(self):
         raise NotImplementedError
 
     @property
-    def nlevels(self):
+    def nlevels(self) -> int:
         """
         Number of levels.
         """
@@ -1943,7 +1944,6 @@ def drop_duplicates(
         return self._from_columns_like_self(
             drop_duplicates(
                 list(self._columns),
-                keys=range(len(self._columns)),
                 keep=keep,
                 nulls_are_equal=nulls_are_equal,
             ),
@@ -2032,7 +2032,6 @@ def dropna(self, how="any"):
             drop_nulls(
                 data_columns,
                 how=how,
-                keys=range(len(data_columns)),
             ),
             self._column_names,
         )
@@ -2050,13 +2049,9 @@ def _gather(self, gather_map, nullify=False, check_bounds=True):
         if gather_map.dtype.kind not in "iu":
             gather_map = gather_map.astype(size_type_dtype)
 
-        if not _gather_map_is_valid(
-            gather_map, len(self), check_bounds, nullify
-        ):
-            raise IndexError("Gather map index is out of bounds.")
-
+        GatherMap(gather_map, len(self), nullify=not check_bounds or nullify)
         return self._from_columns_like_self(
-            gather(list(self._columns), gather_map, nullify=nullify),
+            copying.gather(self._columns, gather_map, nullify=nullify),
             self._column_names,
         )
 
diff --git a/python/cudf/cudf/core/_internals/copying.py b/python/cudf/cudf/core/_internals/copying.py
new file mode 100644
index 00000000000..34c1850cb72
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/copying.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pylibcudf as plc
+
+import cudf
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from cudf.core.column import ColumnBase
+    from cudf.core.column.numerical import NumericalColumn
+
+
+@acquire_spill_lock()
+def gather(
+    columns: Iterable[ColumnBase],
+    gather_map: NumericalColumn,
+    nullify: bool = False,
+) -> list[ColumnBase]:
+    plc_tbl = plc.copying.gather(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        gather_map.to_pylibcudf(mode="read"),
+        plc.copying.OutOfBoundsPolicy.NULLIFY
+        if nullify
+        else plc.copying.OutOfBoundsPolicy.DONT_CHECK,
+    )
+    return [
+        cudf._lib.column.Column.from_pylibcudf(col)
+        for col in plc_tbl.columns()
+    ]
+
+
+@acquire_spill_lock()
+def scatter(
+    sources: list[ColumnBase | cudf.Scalar],
+    scatter_map: NumericalColumn,
+    target_columns: list[ColumnBase],
+    bounds_check: bool = True,
+):
+    """
+    Scattering source into target as per the scatter map.
+    `source` can be a list of scalars, or a list of columns. The number of
+    items in `sources` must equal the number of `target_columns` to scatter.
+    """
+    # TODO: Only single column scatter is used, we should explore multi-column
+    # scatter for frames for performance increase.
+
+    if len(sources) != len(target_columns):
+        raise ValueError("Mismatched number of source and target columns.")
+
+    if len(sources) == 0:
+        return []
+
+    if bounds_check:
+        n_rows = len(target_columns[0])
+        if not (
+            (scatter_map >= -n_rows).all() and (scatter_map < n_rows).all()
+        ):
+            raise IndexError(
+                f"index out of bounds for column of size {n_rows}"
+            )
+
+    plc_tbl = plc.copying.scatter(
+        plc.Table([col.to_pylibcudf(mode="read") for col in sources])  # type: ignore[union-attr]
+        if isinstance(sources[0], cudf._lib.column.Column)
+        else [slr.device_value.c_value for slr in sources],  # type: ignore[union-attr]
+        scatter_map.to_pylibcudf(mode="read"),
+        plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]),
+    )
+
+    return [
+        cudf._lib.column.Column.from_pylibcudf(col)
+        for col in plc_tbl.columns()
+    ]
+
+
+@acquire_spill_lock()
+def columns_split(
+    input_columns: Iterable[ColumnBase], splits: list[int]
+) -> list[list[ColumnBase]]:
+    return [
+        [
+            cudf._lib.column.Column.from_pylibcudf(col)
+            for col in plc_tbl.columns()
+        ]
+        for plc_tbl in plc.copying.split(
+            plc.Table(
+                [col.to_pylibcudf(mode="read") for col in input_columns]
+            ),
+            splits,
+        )
+    ]
diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py
new file mode 100644
index 00000000000..69f9e7664b1
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/sorting.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+import itertools
+from typing import TYPE_CHECKING, Literal
+
+import pylibcudf as plc
+
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from cudf.core.column import ColumnBase
+
+
+@acquire_spill_lock()
+def is_sorted(
+    source_columns: list[ColumnBase],
+    ascending: list[bool] | None = None,
+    null_position: list[bool] | None = None,
+) -> bool:
+    """
+    Checks whether the rows of a `table` are sorted in lexicographical order.
+
+    Parameters
+    ----------
+    source_columns : list of columns
+        columns to be checked for sort order
+    ascending : None or list-like of booleans
+        None or list-like of boolean values indicating expected sort order of
+        each column. If list-like, size of list-like must be len(columns). If
+        None, all columns expected sort order is set to ascending. False (0) -
+        descending, True (1) - ascending.
+    null_position : None or list-like of booleans
+        None or list-like of boolean values indicating desired order of nulls
+        compared to other elements. If list-like, size of list-like must be
+        len(columns). If None, null order is set to before. False (0) - after,
+        True (1) - before.
+
+    Returns
+    -------
+    returns : boolean
+        Returns True, if sorted as expected by ``ascending`` and
+        ``null_position``, False otherwise.
+    """
+    if ascending is None:
+        column_order = [plc.types.Order.ASCENDING] * len(source_columns)
+    else:
+        if len(ascending) != len(source_columns):
+            raise ValueError(
+                f"Expected a list-like of length {len(source_columns)}, "
+                f"got length {len(ascending)} for `ascending`"
+            )
+        column_order = [
+            plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING
+            for asc in ascending
+        ]
+
+    if null_position is None:
+        null_precedence = [plc.types.NullOrder.AFTER] * len(source_columns)
+    else:
+        if len(null_position) != len(source_columns):
+            raise ValueError(
+                f"Expected a list-like of length {len(source_columns)}, "
+                f"got length {len(null_position)} for `null_position`"
+            )
+        null_precedence = [
+            plc.types.NullOrder.BEFORE if null else plc.types.NullOrder.AFTER
+            for null in null_position
+        ]
+
+    return plc.sorting.is_sorted(
+        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]),
+        column_order,
+        null_precedence,
+    )
+
+
+def ordering(
+    column_order: list[bool],
+    null_precedence: Iterable[Literal["first", "last"]],
+) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]:
+    """
+    Construct order and null order vectors
+
+    Parameters
+    ----------
+    column_order
+        Iterable of bool (True for ascending order, False for descending)
+    null_precedence
+        Iterable string for null positions ("first" for start, "last" for end)
+
+    Both iterables must be the same length (not checked)
+
+    Returns
+    -------
+    pair of vectors (order, and null_order)
+    """
+    c_column_order = []
+    c_null_precedence = []
+    for asc, null in zip(column_order, null_precedence):
+        c_column_order.append(
+            plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING
+        )
+        if asc ^ (null == "first"):
+            c_null_precedence.append(plc.types.NullOrder.AFTER)
+        elif asc ^ (null == "last"):
+            c_null_precedence.append(plc.types.NullOrder.BEFORE)
+        else:
+            raise ValueError(f"Invalid null precedence {null}")
+    return c_column_order, c_null_precedence
+
+
+@acquire_spill_lock()
+def order_by(
+    columns_from_table: list[ColumnBase],
+    ascending: list[bool],
+    na_position: Literal["first", "last"],
+    *,
+    stable: bool,
+):
+    """
+    Get index to sort the table in ascending/descending order.
+
+    Parameters
+    ----------
+    columns_from_table : list[Column]
+        Columns from the table which will be sorted
+    ascending : sequence[bool]
+         Sequence of boolean values which correspond to each column
+         in the table to be sorted signifying the order of each column
+         True - Ascending and False - Descending
+    na_position : str
+        Whether null values should show up at the "first" or "last"
+        position of **all** sorted column.
+    stable : bool
+        Should the sort be stable? (no default)
+
+    Returns
+    -------
+    Column of indices that sorts the table
+    """
+    order = ordering(ascending, itertools.repeat(na_position))
+    func = (
+        plc.sorting.stable_sorted_order if stable else plc.sorting.sorted_order
+    )
+    return Column.from_pylibcudf(
+        func(
+            plc.Table(
+                [col.to_pylibcudf(mode="read") for col in columns_from_table],
+            ),
+            order[0],
+            order[1],
+        )
+    )
+
+
+@acquire_spill_lock()
+def sort_by_key(
+    values: list[ColumnBase],
+    keys: list[ColumnBase],
+    ascending: list[bool],
+    na_position: list[Literal["first", "last"]],
+    *,
+    stable: bool,
+) -> list[ColumnBase]:
+    """
+    Sort a table by given keys
+
+    Parameters
+    ----------
+    values : list[Column]
+        Columns of the table which will be sorted
+    keys : list[Column]
+        Columns making up the sort key
+    ascending : list[bool]
+        Sequence of boolean values which correspond to each column
+        in the table to be sorted signifying the order of each column
+        True - Ascending and False - Descending
+    na_position : list[str]
+        Sequence of "first" or "last" values (default "first")
+        indicating the position of null values when sorting the keys.
+    stable : bool
+        Should the sort be stable? (no default)
+
+    Returns
+    -------
+    list[Column]
+        list of value columns sorted by keys
+    """
+    order = ordering(ascending, na_position)
+    func = (
+        plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key
+    )
+    return [
+        Column.from_pylibcudf(col)
+        for col in func(
+            plc.Table([col.to_pylibcudf(mode="read") for col in values]),
+            plc.Table([col.to_pylibcudf(mode="read") for col in keys]),
+            order[0],
+            order[1],
+        ).columns()
+    ]
diff --git a/python/cudf/cudf/core/_internals/stream_compaction.py b/python/cudf/cudf/core/_internals/stream_compaction.py
new file mode 100644
index 00000000000..4ccc26c2a1c
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/stream_compaction.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+import pylibcudf as plc
+
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from cudf.core.column import ColumnBase
+
+
+@acquire_spill_lock()
+def drop_nulls(
+    columns: list[ColumnBase],
+    how: Literal["any", "all"] = "any",
+    keys: list[int] | None = None,
+    thresh: int | None = None,
+) -> list[ColumnBase]:
+    """
+    Drops null rows from cols depending on key columns.
+
+    Parameters
+    ----------
+    columns : list of columns
+    how  : "any" or "all". If thresh is None, drops rows of cols that have any
+           nulls or all nulls (respectively) in subset (default: "any")
+    keys : List of column indices. If set, then these columns are checked for
+           nulls rather than all of columns (optional)
+    thresh : Minimum number of non-nulls required to keep a row (optional)
+
+    Returns
+    -------
+    columns with null rows dropped
+    """
+    if how not in {"any", "all"}:
+        raise ValueError("how must be 'any' or 'all'")
+
+    keys = keys if keys is not None else list(range(len(columns)))
+
+    # Note: If how == "all" and thresh is specified this prioritizes thresh
+    if thresh is not None:
+        keep_threshold = thresh
+    elif how == "all":
+        keep_threshold = 1
+    else:
+        keep_threshold = len(keys)
+
+    plc_table = plc.stream_compaction.drop_nulls(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        keys,
+        keep_threshold,
+    )
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()]
+
+
+@acquire_spill_lock()
+def apply_boolean_mask(
+    columns: list[ColumnBase], boolean_mask: ColumnBase
+) -> list[ColumnBase]:
+    """
+    Drops the rows which correspond to False in boolean_mask.
+
+    Parameters
+    ----------
+    columns : list of columns whose rows are dropped as per boolean_mask
+    boolean_mask : a boolean column of same size as source_table
+
+    Returns
+    -------
+    columns obtained from applying mask
+    """
+    plc_table = plc.stream_compaction.apply_boolean_mask(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        boolean_mask.to_pylibcudf(mode="read"),
+    )
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()]
+
+
+@acquire_spill_lock()
+def drop_duplicates(
+    columns: list[ColumnBase],
+    keys: list[int] | None = None,
+    keep: Literal["first", "last", False] = "first",
+    nulls_are_equal: bool = True,
+) -> list[ColumnBase]:
+    """
+    Drops rows in source_table as per duplicate rows in keys.
+
+    Parameters
+    ----------
+    columns : List of columns
+    keys : List of column indices. If set, then these columns are checked for
+           duplicates rather than all of columns (optional)
+    keep : keep 'first' or 'last' or none of the duplicate rows
+    nulls_are_equal : if True, nulls are treated equal else not.
+
+    Returns
+    -------
+    columns with duplicate dropped
+    """
+    _keep_options = {
+        "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
+        "last": plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
+        False: plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
+    }
+    if (keep_option := _keep_options.get(keep)) is None:
+        raise ValueError('keep must be either "first", "last" or False')
+
+    plc_table = plc.stream_compaction.stable_distinct(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        keys if keys is not None else list(range(len(columns))),
+        keep_option,
+        plc.types.NullEquality.EQUAL
+        if nulls_are_equal
+        else plc.types.NullEquality.UNEQUAL,
+        plc.types.NanEquality.ALL_EQUAL,
+    )
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()]
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 71ec11e75af..d9b54008e85 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1095,17 +1095,22 @@ def as_categorical_column(self, dtype: Dtype) -> Self:
             raise ValueError("dtype must be CategoricalDtype")
 
         if not isinstance(self.categories, type(dtype.categories._column)):
-            # If both categories are of different Column types,
-            # return a column full of Nulls.
-            codes = cast(
-                cudf.core.column.numerical.NumericalColumn,
-                column.as_column(
-                    _DEFAULT_CATEGORICAL_VALUE,
-                    length=self.size,
-                    dtype=self.codes.dtype,
-                ),
-            )
-            codes = as_unsigned_codes(len(dtype.categories), codes)
+            if isinstance(
+                self.categories.dtype, cudf.StructDtype
+            ) and isinstance(dtype.categories.dtype, cudf.IntervalDtype):
+                codes = self.codes
+            else:
+                # Otherwise if both categories are of different Column types,
+                # return a column full of nulls.
+                codes = cast(
+                    cudf.core.column.numerical.NumericalColumn,
+                    column.as_column(
+                        _DEFAULT_CATEGORICAL_VALUE,
+                        length=self.size,
+                        dtype=self.codes.dtype,
+                    ),
+                )
+                codes = as_unsigned_codes(len(dtype.categories), codes)
             return type(self)(
                 data=self.data,  # type: ignore[arg-type]
                 size=self.size,
@@ -1193,7 +1198,7 @@ def _concat(
                 f"size > {libcudf.MAX_COLUMN_SIZE_STR}"
             )
         elif newsize == 0:
-            codes_col = column.column_empty(0, head.codes.dtype, masked=True)
+            codes_col = column.column_empty(0, head.codes.dtype)
         else:
             codes_col = column.concat_columns(codes)  # type: ignore[arg-type]
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 42b4fda8be2..cccafaeba88 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -26,12 +26,6 @@
 from cudf import _lib as libcudf
 from cudf._lib.column import Column
 from cudf._lib.scalar import as_device_scalar
-from cudf._lib.stream_compaction import (
-    apply_boolean_mask,
-    distinct_count as cpp_distinct_count,
-    drop_duplicates,
-    drop_nulls,
-)
 from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
@@ -42,7 +36,12 @@
     is_string_dtype,
 )
 from cudf.core._compat import PANDAS_GE_210
-from cudf.core._internals import aggregation, unary
+from cudf.core._internals import aggregation, copying, sorting, unary
+from cudf.core._internals.stream_compaction import (
+    apply_boolean_mask,
+    drop_duplicates,
+    drop_nulls,
+)
 from cudf.core._internals.timezones import get_compatible_timezone
 from cudf.core.abc import Serializable
 from cudf.core.buffer import (
@@ -51,6 +50,7 @@
     as_buffer,
     cuda_array_interface_wrapper,
 )
+from cudf.core.copy_types import GatherMap
 from cudf.core.dtypes import (
     CategoricalDtype,
     DecimalDtype,
@@ -77,6 +77,7 @@
     import builtins
 
     from cudf._typing import ColumnLike, Dtype, ScalarLike
+    from cudf.core.column.numerical import NumericalColumn
 
 if PANDAS_GE_210:
     NumpyExtensionArray = pd.arrays.NumpyExtensionArray
@@ -274,10 +275,11 @@ def any(self, skipna: bool = True) -> bool:
 
     def dropna(self) -> Self:
         if self.has_nulls():
-            return drop_nulls([self])[0]._with_type_metadata(self.dtype)
+            return drop_nulls([self])[0]._with_type_metadata(self.dtype)  # type: ignore[return-value]
         else:
             return self.copy()
 
+    @acquire_spill_lock()
     def to_arrow(self) -> pa.Array:
         """Convert to PyArrow Array
 
@@ -294,9 +296,7 @@ def to_arrow(self) -> pa.Array:
           4
         ]
         """
-        return libcudf.interop.to_arrow([self], [("None", self.dtype)])[
-            "None"
-        ].chunk(0)
+        return plc.interop.to_arrow(self.to_pylibcudf(mode="read")).chunk(0)
 
     @classmethod
     def from_arrow(cls, array: pa.Array) -> ColumnBase:
@@ -333,26 +333,33 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
 
         if isinstance(array.type, pa.DictionaryType):
             indices_table = pa.table(
-                {
-                    "None": pa.chunked_array(
-                        [chunk.indices for chunk in data["None"].chunks],
+                [
+                    pa.chunked_array(
+                        [chunk.indices for chunk in data.column(0).chunks],
                         type=array.type.index_type,
                     )
-                }
+                ],
+                [None],
             )
             dictionaries_table = pa.table(
-                {
-                    "None": pa.chunked_array(
-                        [chunk.dictionary for chunk in data["None"].chunks],
+                [
+                    pa.chunked_array(
+                        [chunk.dictionary for chunk in data.column(0).chunks],
                         type=array.type.value_type,
                     )
-                }
+                ],
+                [None],
             )
-
-            codes = libcudf.interop.from_arrow(indices_table)[0]
-            categories = libcudf.interop.from_arrow(dictionaries_table)[0]
+            with acquire_spill_lock():
+                codes = cls.from_pylibcudf(
+                    plc.interop.from_arrow(indices_table).columns()[0]
+                )
+                categories = cls.from_pylibcudf(
+                    plc.interop.from_arrow(dictionaries_table).columns()[0]
+                )
             codes = cudf.core.column.categorical.as_unsigned_codes(
-                len(categories), codes
+                len(categories),
+                codes,  # type: ignore[arg-type]
             )
             return cudf.core.column.CategoricalColumn(
                 data=None,
@@ -363,10 +370,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
                 mask=codes.base_mask,
                 children=(codes,),
             )
-
-        result = libcudf.interop.from_arrow(data)[0]
-
-        return result._with_type_metadata(cudf_dtype_from_pa_type(array.type))
+        else:
+            result = cls.from_pylibcudf(
+                plc.interop.from_arrow(data).columns()[0]
+            )
+            # TODO: cudf_dtype_from_pa_type may be less necessary for some types
+            return result._with_type_metadata(
+                cudf_dtype_from_pa_type(array.type)
+            )
 
     @acquire_spill_lock()
     def _get_mask_as_column(self) -> ColumnBase:
@@ -431,8 +442,16 @@ def _fill(
             )
         return self
 
-    def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase:
-        return libcudf.copying.shift(self, offset, fill_value)
+    @acquire_spill_lock()
+    def shift(self, offset: int, fill_value: ScalarLike) -> Self:
+        if not isinstance(fill_value, cudf.Scalar):
+            fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
+        plc_col = plc.copying.shift(
+            self.to_pylibcudf(mode="read"),
+            offset,
+            fill_value.device_value.c_value,
+        )
+        return type(self).from_pylibcudf(plc_col)  # type: ignore[return-value]
 
     @property
     def nullmask(self) -> Buffer:
@@ -460,8 +479,11 @@ def copy(self, deep: bool = True) -> Self:
             them.
         """
         if deep:
-            result = libcudf.copying.copy_column(self)
-            return result._with_type_metadata(self.dtype)
+            with acquire_spill_lock():
+                result = type(self).from_pylibcudf(
+                    self.to_pylibcudf(mode="read").copy()
+                )
+            return result._with_type_metadata(self.dtype)  # type: ignore[return-value]
         else:
             return cast(
                 Self,
@@ -542,7 +564,15 @@ def element_indexing(self, index: int):
             idx = len(self) + idx
         if idx > len(self) - 1 or idx < 0:
             raise IndexError("single positional indexer is out-of-bounds")
-        return libcudf.copying.get_element(self, idx).value
+        with acquire_spill_lock():
+            dscalar = libcudf.scalar.DeviceScalar.from_pylibcudf(
+                plc.copying.get_element(
+                    self.to_pylibcudf(mode="read"),
+                    idx,
+                ),
+                dtype=self.dtype,
+            )
+        return dscalar.value
 
     def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
         stride = 1 if stride is None else stride
@@ -551,12 +581,18 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
         if stop < 0 and not (stride < 0 and stop == -1):
             stop = stop + len(self)
         if (stride > 0 and start >= stop) or (stride < 0 and start <= stop):
-            return cast(Self, column_empty(0, self.dtype, masked=True))
+            return cast(Self, column_empty(0, self.dtype))
         # compute mask slice
         if stride == 1:
-            return libcudf.copying.column_slice(self, [start, stop])[
-                0
-            ]._with_type_metadata(self.dtype)
+            with acquire_spill_lock():
+                result = [
+                    type(self).from_pylibcudf(col)
+                    for col in plc.copying.slice(
+                        self.to_pylibcudf(mode="read"),
+                        [start, stop],
+                    )
+                ]
+            return result[0]._with_type_metadata(self.dtype)  # type: ignore[return-value]
         else:
             # Need to create a gather map for given slice with stride
             gather_map = as_column(
@@ -625,9 +661,16 @@ def _scatter_by_slice(
             if isinstance(value, cudf.core.scalar.Scalar):
                 return self._fill(value, start, stop, inplace=True)
             else:
-                return libcudf.copying.copy_range(
-                    value, self, 0, num_keys, start, stop, False
-                )
+                with acquire_spill_lock():
+                    return type(self).from_pylibcudf(  # type: ignore[return-value]
+                        plc.copying.copy_range(
+                            value.to_pylibcudf(mode="read"),
+                            self.to_pylibcudf(mode="read"),
+                            0,
+                            num_keys,
+                            start,
+                        )
+                    )
 
         # step != 1, create a scatter map with arange
         scatter_map = cast(
@@ -671,11 +714,21 @@ def _scatter_by_column(
         self._check_scatter_key_length(num_keys, value)
 
         if key.dtype.kind == "b":
-            return libcudf.copying.boolean_mask_scatter([value], [self], key)[
-                0
-            ]._with_type_metadata(self.dtype)
+            with acquire_spill_lock():
+                plc_table = plc.copying.boolean_mask_scatter(
+                    plc.Table([value.to_pylibcudf(mode="read")])
+                    if isinstance(value, Column)
+                    else [value.device_value.c_value],
+                    plc.Table([self.to_pylibcudf(mode="read")]),
+                    key.to_pylibcudf(mode="read"),
+                )
+                return (
+                    type(self)  # type: ignore[return-value]
+                    .from_pylibcudf(plc_table.columns()[0])
+                    ._with_type_metadata(self.dtype)
+                )
         else:
-            return libcudf.copying.scatter([value], key, [self])[
+            return copying.scatter([value], key, [self])[
                 0
             ]._with_type_metadata(self.dtype)
 
@@ -805,7 +858,7 @@ def indices_of(
         else:
             value = as_column(value, dtype=self.dtype, length=1)
         mask = value.contains(self)
-        return apply_boolean_mask(
+        return apply_boolean_mask(  # type: ignore[return-value]
             [as_column(range(0, len(self)), dtype=size_type_dtype)], mask
         )[0]
 
@@ -887,14 +940,9 @@ def take(
         # be done by the caller. This check will be removed in future release.
         if indices.dtype.kind not in {"u", "i"}:
             indices = indices.astype(libcudf.types.size_type_dtype)
-        if not libcudf.copying._gather_map_is_valid(
-            indices, len(self), check_bounds, nullify
-        ):
-            raise IndexError("Gather map index is out of bounds.")
-
-        return libcudf.copying.gather([self], indices, nullify=nullify)[
-            0
-        ]._with_type_metadata(self.dtype)
+        GatherMap(indices, len(self), nullify=not check_bounds or nullify)
+        gathered = copying.gather([self], indices, nullify=nullify)  # type: ignore[arg-type]
+        return gathered[0]._with_type_metadata(self.dtype)  # type: ignore[return-value]
 
     def isin(self, values: Sequence) -> ColumnBase:
         """Check whether values are contained in the Column.
@@ -996,13 +1044,13 @@ def is_unique(self) -> bool:
 
     @cached_property
     def is_monotonic_increasing(self) -> bool:
-        return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
+        return not self.has_nulls(include_nan=True) and sorting.is_sorted(
             [self], [True], None
         )
 
     @cached_property
     def is_monotonic_decreasing(self) -> bool:
-        return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
+        return not self.has_nulls(include_nan=True) and sorting.is_sorted(
             [self], [False], None
         )
 
@@ -1026,23 +1074,34 @@ def contains(self, other: ColumnBase) -> ColumnBase:
     def sort_values(
         self: Self,
         ascending: bool = True,
-        na_position: str = "last",
+        na_position: Literal["first", "last"] = "last",
     ) -> Self:
         if (not ascending and self.is_monotonic_decreasing) or (
             ascending and self.is_monotonic_increasing
         ):
             return self.copy()
-        return libcudf.sort.sort(
-            [self], column_order=[ascending], null_precedence=[na_position]
-        )[0]
+        order = sorting.ordering([ascending], [na_position])
+        with acquire_spill_lock():
+            plc_table = plc.sorting.sort(
+                plc.Table([self.to_pylibcudf(mode="read")]),
+                order[0],
+                order[1],
+            )
+            return type(self).from_pylibcudf(plc_table.columns()[0])  # type: ignore[return-value]
 
     def distinct_count(self, dropna: bool = True) -> int:
         try:
             return self._distinct_count[dropna]
         except KeyError:
-            self._distinct_count[dropna] = cpp_distinct_count(
-                self, ignore_nulls=dropna
-            )
+            with acquire_spill_lock():
+                result = plc.stream_compaction.distinct_count(
+                    self.to_pylibcudf(mode="read"),
+                    plc.types.NullPolicy.EXCLUDE
+                    if dropna
+                    else plc.types.NullPolicy.INCLUDE,
+                    plc.types.NanPolicy.NAN_IS_VALID,
+                )
+            self._distinct_count[dropna] = result
             return self._distinct_count[dropna]
 
     def can_cast_safely(self, to_dtype: Dtype) -> bool:
@@ -1054,7 +1113,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
             if self.dtype == dtype:
                 result = self
             else:
-                result = column_empty(0, dtype=dtype, masked=self.nullable)
+                result = column_empty(0, dtype=dtype)
         elif dtype == "category":
             # TODO: Figure out why `cudf.dtype("category")`
             # astype's different than just the string
@@ -1204,7 +1263,7 @@ def argsort(
                 as_column(range(len(self) - 1, -1, -1)),
             )
         else:
-            return libcudf.sort.order_by(
+            return sorting.order_by(
                 [self], [ascending], na_position, stable=True
             )
 
@@ -1271,7 +1330,7 @@ def unique(self) -> Self:
         if self.is_unique:
             return self.copy()
         else:
-            return drop_duplicates([self], keep="first")[
+            return drop_duplicates([self], keep="first")[  # type: ignore[return-value]
                 0
             ]._with_type_metadata(self.dtype)
 
@@ -1502,20 +1561,33 @@ def _return_sentinel_column():
         left_gather_map = type(self).from_pylibcudf(left_rows)
         right_gather_map = type(self).from_pylibcudf(right_rows)
 
-        codes = libcudf.copying.gather(
-            [as_column(range(len(cats)), dtype=dtype)],
-            right_gather_map,
-            nullify=True,
+        codes = as_column(range(len(cats)), dtype=dtype).take(
+            right_gather_map, nullify=True
         )
         del right_gather_map
         del right_rows
         # reorder `codes` so that its values correspond to the
         # values of `self`:
-        (codes,) = libcudf.sort.sort_by_key(
-            codes, [left_gather_map], [True], ["last"], stable=True
+        (codes,) = sorting.sort_by_key(
+            [codes], [left_gather_map], [True], ["last"], stable=True
         )
         return codes.fillna(na_sentinel.value)
 
+    @acquire_spill_lock()
+    def copy_if_else(
+        self, other: Self | cudf.Scalar, boolean_mask: NumericalColumn
+    ) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.copying.copy_if_else(
+                self.to_pylibcudf(mode="read"),
+                other.device_value.c_value
+                if isinstance(other, cudf.Scalar)
+                else other.to_pylibcudf(mode="read"),
+                boolean_mask.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
     def one_hot_encode(
         self, categories: ColumnBase
     ) -> abc.Generator[ColumnBase]:
@@ -1625,7 +1697,6 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool:
 def column_empty(
     row_count: int,
     dtype: Dtype = "object",
-    masked: bool = False,
     for_numba: bool = False,
 ) -> ColumnBase:
     """
@@ -1642,9 +1713,6 @@ def column_empty(
     dtype : Dtype
         Type of the column.
 
-    masked : bool
-        Unused.
-
     for_numba : bool, default False
         If True, don't allocate a mask as it's not supported by numba.
     """
@@ -2023,18 +2091,26 @@ def as_column(
             if isinstance(arbitrary.dtype, pd.DatetimeTZDtype):
                 new_tz = get_compatible_timezone(arbitrary.dtype)
                 arbitrary = arbitrary.astype(new_tz)
-            if isinstance(arbitrary.dtype, pd.CategoricalDtype) and isinstance(
-                arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
-            ):
-                new_tz = get_compatible_timezone(
-                    arbitrary.dtype.categories.dtype
-                )
-                new_cats = arbitrary.dtype.categories.astype(new_tz)
-                new_dtype = pd.CategoricalDtype(
-                    categories=new_cats, ordered=arbitrary.dtype.ordered
-                )
-                arbitrary = arbitrary.astype(new_dtype)
-
+            if isinstance(arbitrary.dtype, pd.CategoricalDtype):
+                if isinstance(
+                    arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
+                ):
+                    new_tz = get_compatible_timezone(
+                        arbitrary.dtype.categories.dtype
+                    )
+                    new_cats = arbitrary.dtype.categories.astype(new_tz)
+                    new_dtype = pd.CategoricalDtype(
+                        categories=new_cats, ordered=arbitrary.dtype.ordered
+                    )
+                    arbitrary = arbitrary.astype(new_dtype)
+                elif (
+                    isinstance(
+                        arbitrary.dtype.categories.dtype, pd.IntervalDtype
+                    )
+                    and dtype is None
+                ):
+                    # Conversion to arrow converts IntervalDtype to StructDtype
+                    dtype = cudf.CategoricalDtype.from_pandas(arbitrary.dtype)
             return as_column(
                 pa.array(arbitrary, from_pandas=True),
                 nan_as_null=nan_as_null,
@@ -2420,7 +2496,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
     """Concatenate a sequence of columns."""
     if len(objs) == 0:
         dtype = cudf.dtype(None)
-        return column_empty(0, dtype=dtype, masked=True)
+        return column_empty(0, dtype=dtype)
 
     # If all columns are `NumericalColumn` with different dtypes,
     # we cast them to a common dtype.
@@ -2467,7 +2543,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
             f"size > {libcudf.MAX_COLUMN_SIZE_STR}"
         )
     elif newsize == 0:
-        return column_empty(0, head.dtype, masked=True)
+        return column_empty(0, head.dtype)
 
     # Filter out inputs that have 0 length, then concatenate.
     objs_with_len = [o for o in objs if len(o)]
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index b526a6efa51..b6a4122ebb9 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -19,7 +19,6 @@
 
 import cudf
 import cudf.core.column.column as column
-import cudf.core.column.string as string
 from cudf import _lib as libcudf
 from cudf.core._compat import PANDAS_GE_220
 from cudf.core._internals import binaryop, unary
@@ -354,9 +353,7 @@ def is_year_end(self) -> ColumnBase:
 
         leap = day_of_year == cudf.Scalar(366)
         non_leap = day_of_year == cudf.Scalar(365)
-        return libcudf.copying.copy_if_else(leap, non_leap, leap_dates).fillna(
-            False
-        )
+        return leap.copy_if_else(non_leap, leap_dates).fillna(False)
 
     @property
     def is_leap_year(self) -> ColumnBase:
@@ -598,17 +595,20 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn:
         if len(self) == 0:
             return cast(
                 cudf.core.column.StringColumn,
-                column.column_empty(0, dtype="object", masked=False),
+                column.column_empty(0, dtype="object"),
             )
         if format in _DATETIME_SPECIAL_FORMATS:
             names = as_column(_DATETIME_NAMES)
         else:
-            names = cudf.core.column.column_empty(
-                0, dtype="object", masked=False
+            names = column.column_empty(0, dtype="object")
+        with acquire_spill_lock():
+            return type(self).from_pylibcudf(  # type: ignore[return-value]
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    self.to_pylibcudf(mode="read"),
+                    format,
+                    names.to_pylibcudf(mode="read"),
+                )
             )
-        return string._datetime_to_str_typecast_functions[self.dtype](
-            self, format, names
-        )
 
     def as_string_column(self) -> cudf.core.column.StringColumn:
         format = _dtype_to_format_conversion.get(
@@ -1016,7 +1016,7 @@ def to_pandas(
                 self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
             )
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         return pa.compute.assume_timezone(
             self._local_time.to_arrow(), str(self.dtype.tz)
         )
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 9e6a73f1a9c..09941665ba2 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -269,8 +269,8 @@ def from_arrow(cls, data: pa.Array):
             mask=mask,
         )
 
-    def to_arrow(self):
-        data_buf_32 = np.array(self.base_data.memoryview()).view("int32")
+    def to_arrow(self) -> pa.Array:
+        data_buf_32 = np.array(self.base_data.memoryview()).view("int32")  # type: ignore[union-attr]
         data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")
 
         # use striding to set the first 32 bits of each 128-bit chunk:
@@ -337,7 +337,7 @@ def from_arrow(cls, data: pa.Array):
         result.dtype.precision = data.type.precision
         return result
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         return super().to_arrow().cast(self.dtype.to_arrow())
 
     def _with_type_metadata(
@@ -396,8 +396,8 @@ def from_arrow(cls, data: pa.Array):
             mask=mask,
         )
 
-    def to_arrow(self):
-        data_buf_64 = np.array(self.base_data.memoryview()).view("int64")
+    def to_arrow(self) -> pa.Array:
+        data_buf_64 = np.array(self.base_data.memoryview()).view("int64")  # type: ignore[union-attr]
         data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")
 
         # use striding to set the first 64 bits of each 128-bit chunk:
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index ba98e28f6a2..3d9440cdf21 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -150,7 +150,7 @@ def offsets(self) -> NumericalColumn:
         """
         return cast(NumericalColumn, self.children[0])
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         offsets = self.offsets.to_arrow()
         elements = (
             pa.nulls(len(self.elements))
@@ -160,7 +160,7 @@ def to_arrow(self):
         pa_type = pa.list_(elements.type)
 
         if self.nullable:
-            nbuf = pa.py_buffer(self.mask.memoryview())
+            nbuf = pa.py_buffer(self.mask.memoryview())  # type: ignore[union-attr]
             buffers = (nbuf, offsets.buffers()[1])
         else:
             buffers = offsets.buffers()
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 28a2bd7fa6c..4405e153b0c 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -14,8 +14,6 @@
 
 import cudf
 import cudf.core.column.column as column
-import cudf.core.column.string as string
-from cudf import _lib as libcudf
 from cudf.api.types import is_integer, is_scalar
 from cudf.core._internals import binaryop, unary
 from cudf.core.buffer import acquire_spill_lock, as_buffer
@@ -366,22 +364,42 @@ def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar:
         else:
             return NotImplemented
 
-    def int2ip(self) -> "cudf.core.column.StringColumn":
-        if self.dtype != cudf.dtype("uint32"):
+    @acquire_spill_lock()
+    def int2ip(self) -> cudf.core.column.StringColumn:
+        if self.dtype != np.dtype(np.uint32):
             raise TypeError("Only uint32 type can be converted to ip")
-
-        return libcudf.string_casting.int2ip(self)
+        plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
 
     def as_string_column(self) -> cudf.core.column.StringColumn:
-        if len(self) > 0:
-            return string._numeric_to_str_typecast_functions[
-                cudf.dtype(self.dtype)
-            ](self)
-        else:
+        if len(self) == 0:
             return cast(
                 cudf.core.column.StringColumn,
                 column.column_empty(0, dtype="object"),
             )
+        elif self.dtype.kind == "b":
+            conv_func = functools.partial(
+                plc.strings.convert.convert_booleans.from_booleans,
+                true_string=cudf.Scalar(
+                    "True", dtype="str"
+                ).device_value.c_value,
+                false_string=cudf.Scalar(
+                    "False", dtype="str"
+                ).device_value.c_value,
+            )
+        elif self.dtype.kind in {"i", "u"}:
+            conv_func = plc.strings.convert.convert_integers.from_integers
+        elif self.dtype.kind == "f":
+            conv_func = plc.strings.convert.convert_floats.from_floats
+        else:
+            raise ValueError(f"No string conversion from type {self.dtype}")
+
+        with acquire_spill_lock():
+            return type(self).from_pylibcudf(  # type: ignore[return-value]
+                conv_func(self.to_pylibcudf(mode="read"))
+            )
 
     def as_datetime_column(
         self, dtype: Dtype
@@ -718,6 +736,40 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
 
         return super()._reduction_result_dtype(reduction_op)
 
+    @acquire_spill_lock()
+    def digitize(self, bins: np.ndarray, right: bool = False) -> Self:
+        """Return the indices of the bins to which each value in column belongs.
+
+        Parameters
+        ----------
+        bins : np.ndarray
+            1-D column-like object of bins with same type as `column`, should be
+            monotonically increasing.
+        right : bool
+            Indicates whether interval contains the right or left bin edge.
+
+        Returns
+        -------
+        A column containing the indices
+        """
+        if self.dtype != bins.dtype:
+            raise ValueError(
+                "digitize() expects bins and input column have the same dtype."
+            )
+
+        bin_col = as_column(bins, dtype=bins.dtype)
+        if bin_col.nullable:
+            raise ValueError("`bins` cannot contain null entries.")
+
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            getattr(plc.search, "lower_bound" if right else "upper_bound")(
+                plc.Table([bin_col.to_pylibcudf(mode="read")]),
+                plc.Table([self.to_pylibcudf(mode="read")]),
+                [plc.types.Order.ASCENDING],
+                [plc.types.NullOrder.BEFORE],
+            )
+        )
+
 
 def _normalize_find_and_replace_input(
     input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list
@@ -772,34 +824,3 @@ def _normalize_find_and_replace_input(
     if not normalized_column.can_cast_safely(input_column_dtype):
         return normalized_column
     return normalized_column.astype(input_column_dtype)
-
-
-def digitize(
-    column: ColumnBase, bins: np.ndarray, right: bool = False
-) -> ColumnBase:
-    """Return the indices of the bins to which each value in column belongs.
-
-    Parameters
-    ----------
-    column : Column
-        Input column.
-    bins : Column-like
-        1-D column-like object of bins with same type as `column`, should be
-        monotonically increasing.
-    right : bool
-        Indicates whether interval contains the right or left bin edge.
-
-    Returns
-    -------
-    A column containing the indices
-    """
-    if not column.dtype == bins.dtype:
-        raise ValueError(
-            "Digitize() expects bins and input column have the same dtype."
-        )
-
-    bin_col = as_column(bins, dtype=bins.dtype)
-    if bin_col.nullable:
-        raise ValueError("`bins` cannot contain null entries.")
-
-    return as_column(libcudf.sort.digitize([column], [bin_col], right))
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index e06a0447f5c..689d5132d45 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -10,7 +10,7 @@
 import pylibcudf as plc
 
 import cudf
-from cudf import _lib as libcudf
+from cudf.core._internals import sorting
 from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column.column import ColumnBase
 from cudf.core.missing import NA
@@ -139,17 +139,18 @@ def quantile(
             result = cast(
                 NumericalBaseColumn,
                 cudf.core.column.column_empty(
-                    row_count=len(q), dtype=self.dtype, masked=True
+                    row_count=len(q), dtype=self.dtype
                 ),
             )
         else:
+            no_nans = self.nans_to_nulls()
             # get sorted indices and exclude nulls
-            indices = libcudf.sort.order_by(
-                [self], [True], "first", stable=True
-            ).slice(self.null_count, len(self))
+            indices = sorting.order_by(
+                [no_nans], [True], "first", stable=True
+            ).slice(no_nans.null_count, len(no_nans))
             with acquire_spill_lock():
                 plc_column = plc.quantiles.quantile(
-                    self.to_pylibcudf(mode="read"),
+                    no_nans.to_pylibcudf(mode="read"),
                     q,
                     plc.types.Interpolation[interpolation.upper()],
                     indices.to_pylibcudf(mode="read"),
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index c021554f3bd..fcdcb789f23 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -20,9 +20,8 @@
 import cudf.core.column.column as column
 import cudf.core.column.datetime as datetime
 from cudf import _lib as libcudf
-from cudf._lib import string_casting as str_cast
 from cudf._lib.column import Column
-from cudf._lib.types import size_type_dtype
+from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype
 from cudf.api.types import is_integer, is_scalar, is_string_dtype
 from cudf.core._internals import binaryop
 from cudf.core.buffer import acquire_spill_lock
@@ -49,62 +48,7 @@
     from cudf.core.column.numerical import NumericalColumn
 
 
-def str_to_boolean(column: StringColumn):
-    """Takes in string column and returns boolean column"""
-    with acquire_spill_lock():
-        plc_column = plc.strings.attributes.count_characters(
-            column.to_pylibcudf(mode="read")
-        )
-        result = Column.from_pylibcudf(plc_column)
-    return (result > cudf.Scalar(0, dtype="int8")).fillna(False)
-
-
-_str_to_numeric_typecast_functions = {
-    cudf.api.types.dtype("int8"): str_cast.stoi8,
-    cudf.api.types.dtype("int16"): str_cast.stoi16,
-    cudf.api.types.dtype("int32"): str_cast.stoi,
-    cudf.api.types.dtype("int64"): str_cast.stol,
-    cudf.api.types.dtype("uint8"): str_cast.stoui8,
-    cudf.api.types.dtype("uint16"): str_cast.stoui16,
-    cudf.api.types.dtype("uint32"): str_cast.stoui,
-    cudf.api.types.dtype("uint64"): str_cast.stoul,
-    cudf.api.types.dtype("float32"): str_cast.stof,
-    cudf.api.types.dtype("float64"): str_cast.stod,
-    cudf.api.types.dtype("bool"): str_to_boolean,
-}
-
-_numeric_to_str_typecast_functions = {
-    cudf.api.types.dtype("int8"): str_cast.i8tos,
-    cudf.api.types.dtype("int16"): str_cast.i16tos,
-    cudf.api.types.dtype("int32"): str_cast.itos,
-    cudf.api.types.dtype("int64"): str_cast.ltos,
-    cudf.api.types.dtype("uint8"): str_cast.ui8tos,
-    cudf.api.types.dtype("uint16"): str_cast.ui16tos,
-    cudf.api.types.dtype("uint32"): str_cast.uitos,
-    cudf.api.types.dtype("uint64"): str_cast.ultos,
-    cudf.api.types.dtype("float32"): str_cast.ftos,
-    cudf.api.types.dtype("float64"): str_cast.dtos,
-    cudf.api.types.dtype("bool"): str_cast.from_booleans,
-}
-
-_datetime_to_str_typecast_functions = {
-    # TODO: support Date32 UNIX days
-    # cudf.api.types.dtype("datetime64[D]"): str_cast.int2timestamp,
-    cudf.api.types.dtype("datetime64[s]"): str_cast.int2timestamp,
-    cudf.api.types.dtype("datetime64[ms]"): str_cast.int2timestamp,
-    cudf.api.types.dtype("datetime64[us]"): str_cast.int2timestamp,
-    cudf.api.types.dtype("datetime64[ns]"): str_cast.int2timestamp,
-}
-
-_timedelta_to_str_typecast_functions = {
-    cudf.api.types.dtype("timedelta64[s]"): str_cast.int2timedelta,
-    cudf.api.types.dtype("timedelta64[ms]"): str_cast.int2timedelta,
-    cudf.api.types.dtype("timedelta64[us]"): str_cast.int2timedelta,
-    cudf.api.types.dtype("timedelta64[ns]"): str_cast.int2timedelta,
-}
-
-
-def _is_supported_regex_flags(flags):
+def _is_supported_regex_flags(flags: int) -> bool:
     return flags == 0 or (
         (flags & (re.MULTILINE | re.DOTALL) != 0)
         and (flags & ~(re.MULTILINE | re.DOTALL) == 0)
@@ -155,10 +99,7 @@ def htoi(self) -> SeriesOrIndex:
         3       51966
         dtype: int64
         """
-
-        out = str_cast.htoi(self._column)
-
-        return self._return_or_inplace(out, inplace=False)
+        return self._return_or_inplace(self._column.hex_to_integers())
 
     hex_to_int = htoi
 
@@ -188,10 +129,7 @@ def ip2int(self) -> SeriesOrIndex:
         2            0
         dtype: int64
         """
-
-        out = str_cast.ip2int(self._column)
-
-        return self._return_or_inplace(out, inplace=False)
+        return self._return_or_inplace(self._column.ipv4_to_integers())
 
     ip_to_int = ip2int
 
@@ -1380,7 +1318,7 @@ def ishex(self) -> SeriesOrIndex:
         4     True
         dtype: bool
         """
-        return self._return_or_inplace(str_cast.is_hex(self._column))
+        return self._return_or_inplace(self._column.is_hex())
 
     def istimestamp(self, format: str) -> SeriesOrIndex:
         """
@@ -1404,9 +1342,7 @@ def istimestamp(self, format: str) -> SeriesOrIndex:
         3    False
         dtype: bool
         """
-        return self._return_or_inplace(
-            str_cast.istimestamp(self._column, format)
-        )
+        return self._return_or_inplace(self._column.is_timestamp(format))
 
     def isfloat(self) -> SeriesOrIndex:
         r"""
@@ -1957,7 +1893,7 @@ def isipv4(self) -> SeriesOrIndex:
         3    False
         dtype: bool
         """
-        return self._return_or_inplace(str_cast.is_ipv4(self._column))
+        return self._return_or_inplace(self._column.is_ipv4())
 
     def lower(self) -> SeriesOrIndex:
         """
@@ -4125,9 +4061,7 @@ def removesuffix(self, suffix: str) -> SeriesOrIndex:
         ends_column = self.endswith(suffix)._column  # type: ignore[union-attr]
         removed_column = self.slice(0, -len(suffix), None)._column  # type: ignore[union-attr]
 
-        result = cudf._lib.copying.copy_if_else(
-            removed_column, self._column, ends_column
-        )
+        result = removed_column.copy_if_else(self._column, ends_column)
         return self._return_or_inplace(result)
 
     def removeprefix(self, prefix: str) -> SeriesOrIndex:
@@ -4165,9 +4099,7 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex:
             return self._return_or_inplace(self._column)
         starts_column = self.startswith(prefix)._column  # type: ignore[union-attr]
         removed_column = self.slice(len(prefix), None, None)._column  # type: ignore[union-attr]
-        result = cudf._lib.copying.copy_if_else(
-            removed_column, self._column, starts_column
-        )
+        result = removed_column.copy_if_else(self._column, starts_column)
         return self._return_or_inplace(result)
 
     def _find(
@@ -5826,26 +5758,38 @@ def __contains__(self, item: ScalarLike) -> bool:
         other = [item] if is_scalar(item) else item
         return self.contains(column.as_column(other, dtype=self.dtype)).any()
 
-    def as_numerical_column(
-        self, dtype: Dtype
-    ) -> "cudf.core.column.NumericalColumn":
+    def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
         out_dtype = cudf.api.types.dtype(dtype)
-        string_col = self
-        if out_dtype.kind in {"i", "u"}:
-            if not string_col.is_integer().all():
+        if out_dtype.kind == "b":
+            with acquire_spill_lock():
+                plc_column = plc.strings.attributes.count_characters(
+                    self.to_pylibcudf(mode="read")
+                )
+                result = Column.from_pylibcudf(plc_column)
+            return (result > cudf.Scalar(0, dtype="int8")).fillna(False)
+        elif out_dtype.kind in {"i", "u"}:
+            if not self.is_integer().all():
                 raise ValueError(
                     "Could not convert strings to integer "
                     "type due to presence of non-integer values."
                 )
+            cast_func = plc.strings.convert.convert_integers.to_integers
         elif out_dtype.kind == "f":
-            if not string_col.is_float().all():
+            if not self.is_float().all():
                 raise ValueError(
                     "Could not convert strings to float "
                     "type due to presence of non-floating values."
                 )
-
-        result_col = _str_to_numeric_typecast_functions[out_dtype](string_col)
-        return result_col
+            cast_func = plc.strings.convert.convert_floats.to_floats
+        else:
+            raise ValueError(
+                f"dtype must be a numerical type, not {out_dtype}"
+            )
+        plc_dtype = dtype_to_pylibcudf_type(out_dtype)
+        with acquire_spill_lock():
+            return type(self).from_pylibcudf(  # type: ignore[return-value]
+                cast_func(self.to_pylibcudf(mode="read"), plc_dtype)
+            )
 
     def strptime(
         self, dtype: Dtype, format: str
@@ -5855,7 +5799,7 @@ def strptime(
                 f"dtype must be datetime or timedelta type, not {dtype}"
             )
         elif self.null_count == len(self):
-            return column.column_empty(len(self), dtype=dtype, masked=True)  # type: ignore[return-value]
+            return column.column_empty(len(self), dtype=dtype)  # type: ignore[return-value]
         elif (self == "None").any():
             raise ValueError(
                 "Cannot convert `None` value to datetime or timedelta."
@@ -5880,23 +5824,27 @@ def strptime(
                 raise NotImplementedError(
                     "Cannot parse date-like strings with different formats"
                 )
-            valid_ts = str_cast.istimestamp(self, format)
+            valid_ts = self.is_timestamp(format)
             valid = valid_ts | is_nat
             if not valid.all():
                 raise ValueError(f"Column contains invalid data for {format=}")
 
-            casting_func = str_cast.timestamp2int
+            casting_func = plc.strings.convert.convert_datetime.to_timestamps
             add_back_nat = is_nat.any()
         elif dtype.kind == "m":  # type: ignore[union-attr]
-            casting_func = str_cast.timedelta2int
+            casting_func = plc.strings.convert.convert_durations.to_durations
             add_back_nat = False
 
-        result_col = casting_func(self, dtype, format)
+        with acquire_spill_lock():
+            plc_dtype = dtype_to_pylibcudf_type(dtype)
+            result_col = type(self).from_pylibcudf(
+                casting_func(self.to_pylibcudf(mode="read"), plc_dtype, format)
+            )
 
         if add_back_nat:
             result_col[is_nat] = None
 
-        return result_col
+        return result_col  # type: ignore[return-value]
 
     def as_datetime_column(
         self, dtype: Dtype
@@ -6398,15 +6346,15 @@ def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self:
             )
         )
 
+    @acquire_spill_lock()
     def _modify_characters(
         self, method: Callable[[plc.Column], plc.Column]
     ) -> Self:
         """
         Helper function for methods that modify characters e.g. to_lower
         """
-        with acquire_spill_lock():
-            plc_column = method(self.to_pylibcudf(mode="read"))
-            return cast(Self, Column.from_pylibcudf(plc_column))
+        plc_column = method(self.to_pylibcudf(mode="read"))
+        return cast(Self, Column.from_pylibcudf(plc_column))
 
     def to_lower(self) -> Self:
         return self._modify_characters(plc.strings.case.to_lower)
@@ -6435,6 +6383,46 @@ def replace_multiple(self, pattern: Self, replacements: Self) -> Self:
         )
         return cast(Self, Column.from_pylibcudf(plc_result))
 
+    @acquire_spill_lock()
+    def is_hex(self) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.strings.convert.convert_integers.is_hex(
+                self.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def hex_to_integers(self) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.strings.convert.convert_integers.hex_to_integers(
+                self.to_pylibcudf(mode="read"), plc.DataType(plc.TypeId.INT64)
+            )
+        )
+
+    @acquire_spill_lock()
+    def is_ipv4(self) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.strings.convert.convert_ipv4.is_ipv4(
+                self.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def ipv4_to_integers(self) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.strings.convert.convert_ipv4.ipv4_to_integers(
+                self.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def is_timestamp(self, format: str) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.strings.convert.convert_datetime.is_timestamp(
+                self.to_pylibcudf(mode="read"), format
+            )
+        )
+
     @acquire_spill_lock()
     def _split_record_re(
         self,
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index f3a7916aa35..417fa99dac0 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -10,9 +10,10 @@
 import pandas as pd
 import pyarrow as pa
 
+import pylibcudf as plc
+
 import cudf
 import cudf.core.column.column as column
-import cudf.core.column.string as string
 from cudf.api.types import is_scalar
 from cudf.core._internals import binaryop, unary
 from cudf.core.buffer import Buffer, acquire_spill_lock
@@ -294,12 +295,15 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn:
         if len(self) == 0:
             return cast(
                 cudf.core.column.StringColumn,
-                column.column_empty(0, dtype="object", masked=False),
+                column.column_empty(0, dtype="object"),
             )
         else:
-            return string._timedelta_to_str_typecast_functions[self.dtype](
-                self, format=format
-            )
+            with acquire_spill_lock():
+                return type(self).from_pylibcudf(  # type: ignore[return-value]
+                    plc.strings.convert.convert_durations.from_durations(
+                        self.to_pylibcudf(mode="read"), format
+                    )
+                )
 
     def as_string_column(self) -> cudf.core.column.StringColumn:
         return self.strftime("%D days %H:%M:%S")
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 8cdc45e12da..e66e4f41642 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -774,9 +774,7 @@ def __init__(
                 label_dtype = getattr(columns, "dtype", None)
                 self._data = ColumnAccessor(
                     {
-                        k: column.column_empty(
-                            len(self), dtype="object", masked=True
-                        )
+                        k: column_empty(len(self), dtype="object")
                         for k in columns
                     },
                     level_names=tuple(columns.names)
@@ -979,8 +977,8 @@ def _init_from_series_list(self, data, columns, index):
         if columns is not None:
             for col_name in columns:
                 if col_name not in self._data:
-                    self._data[col_name] = column.column_empty(
-                        row_count=len(self), dtype=None, masked=True
+                    self._data[col_name] = column_empty(
+                        row_count=len(self), dtype=None
                     )
             self._data._level_names = (
                 tuple(columns.names)
@@ -1031,11 +1029,7 @@ def _init_from_list_like(self, data, index=None, columns=None):
             data = list(itertools.zip_longest(*data))
 
             if columns is not None and len(data) == 0:
-                data = [
-                    cudf.core.column.column_empty(row_count=0, dtype=None)
-                    for _ in columns
-                ]
-
+                data = [column_empty(row_count=0, dtype=None) for _ in columns]
             for col_name, col in enumerate(data):
                 self._data[col_name] = column.as_column(col)
             self._data.rangeindex = True
@@ -1074,9 +1068,8 @@ def _init_from_dict_like(
                 # the provided index, so we need to return a masked
                 # array of nulls if an index is given.
                 empty_column = functools.partial(
-                    cudf.core.column.column_empty,
-                    row_count=(0 if index is None else len(index)),
-                    masked=index is not None,
+                    column_empty,
+                    row_count=0 if index is None else len(index),
                 )
 
             data = {
@@ -1421,7 +1414,7 @@ def __setitem__(self, arg, value):
                         new_columns = (
                             value
                             if key == arg
-                            else column.column_empty(
+                            else column_empty(
                                 row_count=length, dtype=col.dtype
                             )
                             for key, col in self._column_labels_and_values
@@ -1809,13 +1802,37 @@ def _concat(
                 )
                 for table in tables
             ]
-
-            concatted = libcudf.utils.data_from_pylibcudf_table(
-                plc.concatenate.concatenate(plc_tables),
-                column_names=column_names,
-                index_names=index_names,
-            )
-        out = cls._from_data(*concatted)
+            plc_result = plc.concatenate.concatenate(plc_tables)
+            if ignore:
+                index = None
+                data = {
+                    col_name: ColumnBase.from_pylibcudf(col)
+                    for col_name, col in zip(
+                        column_names, plc_result.columns(), strict=True
+                    )
+                }
+            else:
+                result_columns = [
+                    ColumnBase.from_pylibcudf(col)
+                    for col in plc_result.columns()
+                ]
+                index = _index_from_data(
+                    dict(
+                        zip(
+                            index_names,
+                            result_columns[: len(index_names)],
+                            strict=True,
+                        )
+                    )
+                )
+                data = dict(
+                    zip(
+                        column_names,
+                        result_columns[len(index_names) :],
+                        strict=True,
+                    )
+                )
+        out = cls._from_data(data=data, index=index)
 
         # If ignore_index is True, all input frames are empty, and at
         # least one input frame has an index, assign a new RangeIndex
@@ -3179,10 +3196,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None):
             )
 
             if cond_col := cond._data.get(name):
-                result = cudf._lib.copying.copy_if_else(
-                    source_col, other_col, cond_col
-                )
-
+                result = source_col.copy_if_else(other_col, cond_col)
                 out.append(result._with_type_metadata(col.dtype))
             else:
                 out_mask = as_buffer(
@@ -3373,7 +3387,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
                 if num_cols != 0:
                     ca = self._data._from_columns_like_self(
                         (
-                            column.column_empty(row_count=length, dtype=dtype)
+                            column_empty(row_count=length, dtype=dtype)
                             for _, dtype in self._dtypes
                         ),
                         verify=False,
@@ -3479,7 +3493,7 @@ def diff(self, periods=1, axis=0):
         if abs(periods) > len(self):
             df = cudf.DataFrame._from_data(
                 {
-                    name: column_empty(len(self), dtype=dtype, masked=True)
+                    name: column_empty(len(self), dtype=dtype)
                     for name, dtype in zip(self._column_names, self.dtypes)
                 }
             )
@@ -3859,9 +3873,7 @@ def agg(self, aggs, axis=None):
                 result = DataFrame(index=idxs, columns=cols)
                 for key in aggs.keys():
                     col = self[key]
-                    col_empty = column_empty(
-                        len(idxs), dtype=col.dtype, masked=True
-                    )
+                    col_empty = column_empty(len(idxs), dtype=col.dtype)
                     ans = cudf.Series._from_column(
                         col_empty, index=cudf.Index(idxs)
                     )
@@ -6177,9 +6189,7 @@ def quantile(
                         quant_index=False,
                     )._column
                     if len(res) == 0:
-                        res = column.column_empty(
-                            row_count=len(qs), dtype=ser.dtype
-                        )
+                        res = column_empty(row_count=len(qs), dtype=ser.dtype)
                     result[k] = res
             result = DataFrame._from_data(result)
 
@@ -7333,9 +7343,7 @@ def unnamed_group_generator():
             )
 
             all_nulls = functools.cache(
-                functools.partial(
-                    column_empty, self.shape[0], common_type, masked=True
-                )
+                functools.partial(column_empty, self.shape[0], common_type)
             )
 
             # homogenize the dtypes of the columns
@@ -7870,7 +7878,8 @@ def interleave_columns(self):
         return self._constructor_sliced._from_column(result_col)
 
     @acquire_spill_lock()
-    def _compute_columns(self, expr: str) -> ColumnBase:
+    def _compute_column(self, expr: str) -> ColumnBase:
+        """Helper function for eval"""
         plc_column = plc.transform.compute_column(
             plc.Table(
                 [col.to_pylibcudf(mode="read") for col in self._columns]
@@ -8006,7 +8015,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
                 raise ValueError(
                     "Cannot operate inplace if there is no assignment"
                 )
-            return Series._from_column(self._compute_columns(statements[0]))
+            return Series._from_column(self._compute_column(statements[0]))
 
         targets = []
         exprs = []
@@ -8024,7 +8033,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
 
         ret = self if inplace else self.copy(deep=False)
         for name, expr in zip(targets, exprs):
-            ret._data[name] = self._compute_columns(expr)
+            ret._data[name] = self._compute_column(expr)
         if not inplace:
             return ret
 
@@ -8582,7 +8591,7 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories):
             # If column not in this df, fill with an all-null column
             if idx >= len(cols) or cols[idx] is None:
                 n = len(next(x for x in cols if x is not None))
-                cols[idx] = column_empty(row_count=n, dtype=dtype, masked=True)
+                cols[idx] = column_empty(row_count=n, dtype=dtype)
             else:
                 # If column is categorical, rebase the codes with the
                 # combined categories, and cast the new codes to the
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 9bb29f1920a..971f0be77f8 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -189,9 +189,7 @@ def categories(self) -> cudf.Index:
         Index(['b', 'a'], dtype='object')
         """
         if self._categories is None:
-            col = cudf.core.column.column_empty(
-                0, dtype="object", masked=False
-            )
+            col = cudf.core.column.column_empty(0, dtype="object")
         else:
             col = self._categories
         return cudf.Index._from_column(col)
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 00199cca828..9aadbf8f47a 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -22,6 +22,7 @@
 from cudf import _lib as libcudf
 from cudf.api.types import is_dtype_equal, is_scalar
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals import copying, sorting
 from cudf.core._internals.search import search_sorted
 from cudf.core.abc import Serializable
 from cudf.core.buffer import acquire_spill_lock
@@ -945,16 +946,17 @@ def from_arrow(cls, data: pa.Table) -> Self:
         if len(dict_indices):
             dict_indices_table = pa.table(dict_indices)
             data = data.drop(dict_indices_table.column_names)
-            indices_columns = libcudf.interop.from_arrow(dict_indices_table)
+            plc_indices = plc.interop.from_arrow(dict_indices_table)
             # as dictionary size can vary, it can't be a single table
             cudf_dictionaries_columns = {
                 name: ColumnBase.from_arrow(dict_dictionaries[name])
                 for name in dict_dictionaries.keys()
             }
 
-            for name, codes in zip(
-                dict_indices_table.column_names, indices_columns
+            for name, plc_codes in zip(
+                dict_indices_table.column_names, plc_indices.columns()
             ):
+                codes = libcudf.column.Column.from_pylibcudf(plc_codes)
                 categories = cudf_dictionaries_columns[name]
                 codes = as_unsigned_codes(len(categories), codes)
                 cudf_category_frame[name] = CategoricalColumn(
@@ -970,9 +972,9 @@ def from_arrow(cls, data: pa.Table) -> Self:
 
         # Handle non-dict arrays
         cudf_non_category_frame = {
-            name: col
-            for name, col in zip(
-                data.column_names, libcudf.interop.from_arrow(data)
+            name: libcudf.column.Column.from_pylibcudf(plc_col)
+            for name, plc_col in zip(
+                data.column_names, plc.interop.from_arrow(data).columns()
             )
         }
 
@@ -1031,7 +1033,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
         return cls._from_data({name: result[name] for name in column_names})
 
     @_performance_tracking
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Table:
         """
         Convert to arrow Table
 
@@ -1057,19 +1059,6 @@ def to_arrow(self):
             }
         )
 
-    @_performance_tracking
-    def _positions_from_column_names(self, column_names) -> list[int]:
-        """Map each column name into their positions in the frame.
-
-        The order of indices returned corresponds to the column order in this
-        Frame.
-        """
-        return [
-            i
-            for i, name in enumerate(self._column_names)
-            if name in set(column_names)
-        ]
-
     @_performance_tracking
     def _copy_type_metadata(self: Self, other: Self) -> Self:
         """
@@ -1476,7 +1465,7 @@ def _get_sorted_inds(
         else:
             ascending_lst = list(ascending)
 
-        return libcudf.sort.order_by(
+        return sorting.order_by(
             list(to_sort),
             ascending_lst,
             na_position,
@@ -1484,18 +1473,13 @@ def _get_sorted_inds(
         )
 
     @_performance_tracking
-    def _split(self, splits):
+    def _split(self, splits: list[int]) -> list[Self]:
         """Split a frame with split points in ``splits``. Returns a list of
         Frames of length `len(splits) + 1`.
         """
         return [
-            self._from_columns_like_self(
-                libcudf.copying.columns_split(list(self._columns), splits)[
-                    split_idx
-                ],
-                self._column_names,
-            )
-            for split_idx in range(len(splits) + 1)
+            self._from_columns_like_self(split, self._column_names)
+            for split in copying.columns_split(self._columns, splits)
         ]
 
     @_performance_tracking
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index d4f3394833a..6cd8e11695f 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -16,13 +16,14 @@
 import pylibcudf as plc
 
 import cudf
+import cudf.core._internals
 from cudf import _lib as libcudf
 from cudf._lib import groupby as libgroupby
-from cudf._lib.sort import segmented_sort_by_key
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
 from cudf.api.types import is_list_like, is_numeric_dtype
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals import sorting
 from cudf.core.abc import Serializable
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column.column import ColumnBase, StructDtype, as_column
@@ -430,7 +431,9 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]:
             ]
         )
 
-        group_keys = libcudf.stream_compaction.drop_duplicates(group_keys)
+        group_keys = cudf.core._internals.stream_compaction.drop_duplicates(
+            group_keys
+        )
         if len(group_keys) > 1:
             index = cudf.MultiIndex.from_arrays(group_keys)
         else:
@@ -493,9 +496,7 @@ def size(self):
         """
         Return the size of each group.
         """
-        col = cudf.core.column.column_empty(
-            len(self.obj), "int8", masked=False
-        )
+        col = cudf.core.column.column_empty(len(self.obj), "int8")
         result = (
             cudf.Series._from_column(col, name=getattr(self.obj, "name", None))
             .groupby(self.grouping, sort=self._sort, dropna=self._dropna)
@@ -523,7 +524,8 @@ def cumcount(self, ascending: bool = True):
         return (
             cudf.Series._from_column(
                 cudf.core.column.column_empty(
-                    len(self.obj), "int8", masked=False
+                    len(self.obj),
+                    "int8",
                 ),
                 index=self.obj.index,
             )
@@ -793,7 +795,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
                 # want, and right order is a matching gather map for
                 # the result table. Get the correct order by sorting
                 # the right gather map.
-                (right_order,) = libcudf.sort.sort_by_key(
+                (right_order,) = sorting.sort_by_key(
                     [right_order],
                     [left_order],
                     [True],
@@ -1249,15 +1251,20 @@ def sample(
                 for off, size in zip(group_offsets, size_per_group):
                     rs.shuffle(indices[off : off + size])
             else:
-                rng = cp.random.default_rng(seed=random_state)
-                (indices,) = segmented_sort_by_key(
-                    [as_column(indices)],
-                    [as_column(rng.random(size=nrows))],
-                    as_column(group_offsets),
-                    [],
-                    [],
-                    stable=True,
+                keys = cp.random.default_rng(seed=random_state).random(
+                    size=nrows
                 )
+                with acquire_spill_lock():
+                    plc_table = plc.sorting.stable_segmented_sort_by_key(
+                        plc.Table(
+                            [as_column(indices).to_pylibcudf(mode="read")]
+                        ),
+                        plc.Table([as_column(keys).to_pylibcudf(mode="read")]),
+                        as_column(group_offsets).to_pylibcudf(mode="read"),
+                        [plc.types.Order.ASCENDING],
+                        [plc.types.NullOrder.AFTER],
+                    )
+                    indices = ColumnBase.from_pylibcudf(plc_table.columns()[0])
                 indices = cp.asarray(indices.data_array_view(mode="read"))
             # Which indices are we going to want?
             want = np.arange(samples_per_group.sum(), dtype=size_type_dtype)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index eeb6e3bd547..eac5b9d71ae 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -31,6 +31,7 @@
 )
 from cudf.core._base_index import BaseIndex, _return_get_indexer_result
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals import copying
 from cudf.core._internals.search import search_sorted
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
@@ -336,7 +337,7 @@ def _values(self) -> ColumnBase:
         if len(self) > 0:
             return column.as_column(self._range, dtype=self.dtype)
         else:
-            return column.column_empty(0, masked=False, dtype=self.dtype)
+            return column.column_empty(0, dtype=self.dtype)
 
     def _clean_nulls_from_index(self) -> Self:
         return self
@@ -1371,7 +1372,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             )
             scatter_map = libcudf.column.Column.from_pylibcudf(left_plc)
             indices = libcudf.column.Column.from_pylibcudf(right_plc)
-        result = libcudf.copying.scatter([indices], scatter_map, [result])[0]
+        result = copying.scatter([indices], scatter_map, [result])[0]
         result_series = cudf.Series._from_column(result)
 
         if method in {"ffill", "bfill", "pad", "backfill"}:
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 0e6a5e03ea6..72bb85821fa 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -27,6 +27,7 @@
 import cudf
 import cudf._lib as libcudf
 import cudf.core
+import cudf.core._internals
 import cudf.core.algorithms
 from cudf.api.extensions import no_default
 from cudf.api.types import (
@@ -37,6 +38,7 @@
 )
 from cudf.core._base_index import BaseIndex
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals import copying
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import ColumnBase, NumericalColumn, as_column
 from cudf.core.column_accessor import ColumnAccessor
@@ -2952,10 +2954,10 @@ def _gather(
         if not gather_map.nullify and len(self) != gather_map.nrows:
             raise IndexError("Gather map is out of bounds")
         return self._from_columns_like_self(
-            libcudf.copying.gather(
-                list(self.index._columns + self._columns)
+            copying.gather(
+                itertools.chain(self.index._columns, self._columns)
                 if keep_index
-                else list(self._columns),
+                else self._columns,
                 gather_map.column,
                 nullify=gather_map.nullify,
             ),
@@ -3035,16 +3037,24 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
                 keep_index=keep_index,
             )
 
-        columns_to_slice = [
-            *(
-                self.index._columns
-                if keep_index and not has_range_index
-                else []
-            ),
-            *self._columns,
-        ]
+        columns_to_slice = (
+            itertools.chain(self.index._columns, self._columns)
+            if keep_index and not has_range_index
+            else self._columns
+        )
+        with acquire_spill_lock():
+            plc_tables = plc.copying.slice(
+                plc.Table(
+                    [col.to_pylibcudf(mode="read") for col in columns_to_slice]
+                ),
+                [start, stop],
+            )
+            sliced = [
+                libcudf.column.Column.from_pylibcudf(col)
+                for col in plc_tables[0].columns()
+            ]
         result = self._from_columns_like_self(
-            libcudf.copying.columns_slice(columns_to_slice, [start, stop])[0],
+            sliced,
             self._column_names,
             None if has_range_index or not keep_index else self.index.names,
         )
@@ -3054,21 +3064,21 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
         return result
 
     def _positions_from_column_names(
-        self, column_names, offset_by_index_columns=False
-    ):
+        self,
+        column_names: set[abc.Hashable],
+        offset_by_index_columns: bool = True,
+    ) -> list[int]:
         """Map each column name into their positions in the frame.
 
         Return positions of the provided column names, offset by the number of
         index columns if `offset_by_index_columns` is True. The order of
         indices returned corresponds to the column order in this Frame.
         """
-        num_index_columns = (
-            len(self.index._data) if offset_by_index_columns else 0
-        )
+        start = self.index.nlevels if offset_by_index_columns else 0
         return [
-            i + num_index_columns
-            for i, name in enumerate(self._column_names)
-            if name in set(column_names)
+            i
+            for i, name in enumerate(self._column_names, start=start)
+            if name in column_names
         ]
 
     def drop_duplicates(
@@ -3105,7 +3115,7 @@ def drop_duplicates(
             subset, offset_by_index_columns=not ignore_index
         )
         return self._from_columns_like_self(
-            libcudf.stream_compaction.drop_duplicates(
+            cudf.core._internals.stream_compaction.drop_duplicates(
                 list(self._columns)
                 if ignore_index
                 else list(self.index._columns + self._columns),
@@ -3118,7 +3128,9 @@ def drop_duplicates(
         )
 
     @_performance_tracking
-    def duplicated(self, subset=None, keep="first"):
+    def duplicated(
+        self, subset=None, keep: Literal["first", "last", False] = "first"
+    ) -> cudf.Series:
         """
         Return boolean Series denoting duplicate rows.
 
@@ -3218,10 +3230,25 @@ def duplicated(self, subset=None, keep="first"):
             name = self.name
         else:
             columns = [self._data[n] for n in subset]
-        distinct = libcudf.stream_compaction.distinct_indices(
-            columns, keep=keep
-        )
-        result = libcudf.copying.scatter(
+
+        _keep_options = {
+            "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
+            "last": plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
+            False: plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
+        }
+
+        if (keep_option := _keep_options.get(keep)) is None:
+            raise ValueError('keep must be either "first", "last" or False')
+
+        with acquire_spill_lock():
+            plc_column = plc.stream_compaction.distinct_indices(
+                plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+                keep_option,
+                plc.types.NullEquality.EQUAL,
+                plc.types.NanEquality.ALL_EQUAL,
+            )
+            distinct = libcudf.column.Column.from_pylibcudf(plc_column)
+        result = copying.scatter(
             [cudf.Scalar(False, dtype=bool)],
             distinct,
             [as_column(True, length=len(self), dtype=bool)],
@@ -3230,14 +3257,26 @@ def duplicated(self, subset=None, keep="first"):
         return cudf.Series._from_column(result, index=self.index, name=name)
 
     @_performance_tracking
-    def _empty_like(self, keep_index=True) -> Self:
+    def _empty_like(self, keep_index: bool = True) -> Self:
+        with acquire_spill_lock():
+            plc_table = plc.copying.empty_like(
+                plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read")
+                        for col in (
+                            itertools.chain(self.index._columns, self._columns)
+                            if keep_index
+                            else self._columns
+                        )
+                    ]
+                )
+            )
+            columns = [
+                libcudf.column.Column.from_pylibcudf(col)
+                for col in plc_table.columns()
+            ]
         result = self._from_columns_like_self(
-            libcudf.copying.columns_empty_like(
-                [
-                    *(self.index._columns if keep_index else ()),
-                    *self._columns,
-                ]
-            ),
+            columns,
             self._column_names,
             self.index.names if keep_index else None,
         )
@@ -3245,25 +3284,24 @@ def _empty_like(self, keep_index=True) -> Self:
         result._data.rangeindex = self._data.rangeindex
         return result
 
-    def _split(self, splits, keep_index=True):
+    def _split(self, splits, keep_index: bool = True) -> list[Self]:
         if self._num_rows == 0:
             return []
 
-        columns_split = libcudf.copying.columns_split(
-            [
-                *(self.index._columns if keep_index else []),
-                *self._columns,
-            ],
+        columns_split = copying.columns_split(
+            itertools.chain(self.index._columns, self._columns)
+            if keep_index
+            else self._columns,
             splits,
         )
 
         return [
             self._from_columns_like_self(
-                columns_split[i],
+                split,
                 self._column_names,
                 self.index.names if keep_index else None,
             )
-            for i in range(len(splits) + 1)
+            for split in columns_split
         ]
 
     @_performance_tracking
@@ -3851,7 +3889,6 @@ def _reindex(
                 if name in df._data
                 else cudf.core.column.column.column_empty(
                     dtype=dtypes.get(name, np.float64),
-                    masked=True,
                     row_count=len(index),
                 )
             )
@@ -4334,12 +4371,10 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None):
         data_columns = [col.nans_to_nulls() for col in self._columns]
 
         return self._from_columns_like_self(
-            libcudf.stream_compaction.drop_nulls(
+            cudf.core._internals.stream_compaction.drop_nulls(
                 [*self.index._columns, *data_columns],
                 how=how,
-                keys=self._positions_from_column_names(
-                    subset, offset_by_index_columns=True
-                ),
+                keys=self._positions_from_column_names(subset),
                 thresh=thresh,
             ),
             self._column_names,
@@ -4359,7 +4394,7 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True):
                 f"{len(boolean_mask.column)} not {len(self)}"
             )
         return self._from_columns_like_self(
-            libcudf.stream_compaction.apply_boolean_mask(
+            cudf.core._internals.stream_compaction.apply_boolean_mask(
                 list(self.index._columns + self._columns)
                 if keep_index
                 else list(self._columns),
@@ -6270,17 +6305,16 @@ def ge(self, other, axis="columns", level=None, fill_value=None):
             other=other, op="__ge__", fill_value=fill_value, can_reindex=True
         )
 
-    def _preprocess_subset(self, subset):
+    def _preprocess_subset(self, subset) -> set[abc.Hashable]:
         if subset is None:
             subset = self._column_names
         elif (
-            not np.iterable(subset)
-            or isinstance(subset, str)
+            is_scalar(subset)
             or isinstance(subset, tuple)
             and subset in self._column_names
         ):
             subset = (subset,)
-        diff = set(subset) - set(self._data)
+        diff = set(subset) - set(self._column_names)
         if len(diff) != 0:
             raise KeyError(f"columns {diff} do not exist")
         return subset
@@ -6368,10 +6402,50 @@ def rank(
             elif source._num_columns != num_cols:
                 dropped_cols = True
 
-        result_columns = libcudf.sort.rank_columns(
-            [*source._columns], method_enum, na_option, ascending, pct
+        column_order = (
+            plc.types.Order.ASCENDING
+            if ascending
+            else plc.types.Order.DESCENDING
+        )
+        # ascending
+        #    #top    = na_is_smallest
+        #    #bottom = na_is_largest
+        #    #keep   = na_is_largest
+        # descending
+        #    #top    = na_is_largest
+        #    #bottom = na_is_smallest
+        #    #keep   = na_is_smallest
+        if ascending:
+            if na_option == "top":
+                null_precedence = plc.types.NullOrder.BEFORE
+            else:
+                null_precedence = plc.types.NullOrder.AFTER
+        else:
+            if na_option == "top":
+                null_precedence = plc.types.NullOrder.AFTER
+            else:
+                null_precedence = plc.types.NullOrder.BEFORE
+        c_null_handling = (
+            plc.types.NullPolicy.EXCLUDE
+            if na_option == "keep"
+            else plc.types.NullPolicy.INCLUDE
         )
 
+        with acquire_spill_lock():
+            result_columns = [
+                libcudf.column.Column.from_pylibcudf(
+                    plc.sorting.rank(
+                        col.to_pylibcudf(mode="read"),
+                        method_enum,
+                        column_order,
+                        c_null_handling,
+                        null_precedence,
+                        pct,
+                    )
+                )
+                for col in source._columns
+            ]
+
         if dropped_cols:
             result = type(source)._from_data(
                 ColumnAccessor(
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index 5c224176730..6e965ceca66 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from __future__ import annotations
 
-import itertools
 from typing import Any
 
 import pylibcudf as plc
@@ -9,6 +8,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.types import size_type_dtype
+from cudf.core._internals import sorting
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.copy_types import GatherMap
 from cudf.core.join._join_helpers import (
@@ -242,21 +242,13 @@ def _gather_maps(self, left_cols, right_cols):
         # To reorder maps so that they are in order of the input
         # tables, we gather from iota on both right and left, and then
         # sort the gather maps with those two columns as key.
-        key_order = list(
-            itertools.chain.from_iterable(
-                libcudf.copying.gather(
-                    [
-                        cudf.core.column.as_column(
-                            range(n), dtype=size_type_dtype
-                        )
-                    ],
-                    map_,
-                    nullify=null,
-                )
-                for map_, n, null in zip(maps, lengths, nullify)
+        key_order = [
+            cudf.core.column.as_column(range(n), dtype=size_type_dtype).take(
+                map_, nullify=null, check_bounds=False
             )
-        )
-        return libcudf.sort.sort_by_key(
+            for map_, n, null in zip(maps, lengths, nullify)
+        ]
+        return sorting.sort_by_key(
             list(maps),
             # If how is right, right map is primary sort key.
             key_order[:: -1 if self.how == "right" else 1],
@@ -426,7 +418,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame:
             else:
                 to_sort = [*result._columns]
                 index_names = None
-            result_columns = libcudf.sort.sort_by_key(
+            result_columns = sorting.sort_by_key(
                 to_sort,
                 by,
                 [True] * len(by),
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index f5ee36f851c..d2afe643dc4 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -22,6 +22,7 @@
 from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar
 from cudf.core import column
 from cudf.core._base_index import _return_get_indexer_result
+from cudf.core._internals import copying, sorting
 from cudf.core.algorithms import factorize
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column_accessor import ColumnAccessor
@@ -199,10 +200,8 @@ def __init__(
                 if lo == -1:
                     # Now we can gather and insert null automatically
                     code[code == -1] = np.iinfo(size_type_dtype).min
-            result_col = libcudf.copying.gather(
-                [level._column], code, nullify=True
-            )
-            source_data[i] = result_col[0]._with_type_metadata(level.dtype)
+            result_col = level._column.take(code, nullify=True)
+            source_data[i] = result_col._with_type_metadata(level.dtype)
 
         super().__init__(ColumnAccessor(source_data))
         self._levels = new_levels
@@ -1677,7 +1676,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool:
                 f"Expected a list-like or None for `null_position`, got "
                 f"{type(null_position)}"
             )
-        return libcudf.sort.is_sorted(
+        return sorting.is_sorted(
             [*self._columns], ascending=ascending, null_position=null_position
         )
 
@@ -1933,7 +1932,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             )
             scatter_map = libcudf.column.Column.from_pylibcudf(left_plc)
             indices = libcudf.column.Column.from_pylibcudf(right_plc)
-        result = libcudf.copying.scatter([indices], scatter_map, [result])[0]
+        result = copying.scatter([indices], scatter_map, [result])[0]
         result_series = cudf.Series._from_column(result)
 
         if method in {"ffill", "bfill", "pad", "backfill"}:
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 59a3e9dbf3b..3ab6ed306b6 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -1029,7 +1029,8 @@ def as_tuple(x):
                 {
                     name: idx._column
                     for name, idx in zip(
-                        names, target._split(range(nrows, new_size, nrows))
+                        names,
+                        target._split(list(range(nrows, new_size, nrows))),
                     )
                 }
             )
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 647e20fc16b..961e5e11bc0 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3410,7 +3410,7 @@ def describe(
         )
 
     @_performance_tracking
-    def digitize(self, bins, right=False):
+    def digitize(self, bins: np.ndarray, right: bool = False) -> Self:
         """Return the indices of the bins to which each value belongs.
 
         Notes
@@ -3441,9 +3441,8 @@ def digitize(self, bins, right=False):
         3    2
         dtype: int32
         """
-        return Series._from_column(
-            cudf.core.column.numerical.digitize(self._column, bins, right),
-            name=self.name,
+        return type(self)._from_column(
+            self._column.digitize(bins, right), name=self.name
         )
 
     @_performance_tracking
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index f6d0664758f..9c8da020ddc 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -380,7 +380,7 @@ def where(self, cond, other=None, inplace=False):
             source_col=self._column, other=other, inplace=inplace
         )
 
-        result = cudf._lib.copying.copy_if_else(input_col, other, cond)
+        result = input_col.copy_if_else(other, cond)
         return result._with_type_metadata(self.dtype)
 
     @_performance_tracking
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index 40348461f8c..6d3dc2dc7d9 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -8,7 +8,6 @@
 import pandas as pd
 
 import cudf
-from cudf import _lib as libcudf
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype
 from cudf.core._internals import unary
 from cudf.core.column import as_column
@@ -251,9 +250,9 @@ def _convert_str_col(
             return converted_col.astype(dtype=cudf.dtype("float64"))  # type: ignore[return-value]
     else:
         if errors == "coerce":
-            converted_col = libcudf.string_casting.stod(converted_col)
             non_numerics = is_float.unary_operator("not")
             converted_col[non_numerics] = None
+            converted_col = converted_col.astype(np.dtype(np.float64))  # type: ignore[assignment]
             return converted_col  # type: ignore[return-value]
         else:
             raise ValueError("Unable to convert some strings to numerics.")
diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py
index 11730e98c95..4966cdb86e1 100644
--- a/python/cudf/cudf/io/avro.py
+++ b/python/cudf/cudf/io/avro.py
@@ -33,11 +33,18 @@ def read_avro(
     if not isinstance(skip_rows, int) or skip_rows < 0:
         raise TypeError("skip_rows must be an int >= 0")
 
-    plc_result = plc.io.avro.read_avro(
-        plc.io.types.SourceInfo([filepath_or_buffer]),
-        columns,
-        skip_rows,
-        num_rows,
+    options = (
+        plc.io.avro.AvroReaderOptions.builder(
+            plc.io.types.SourceInfo([filepath_or_buffer])
+        )
+        .skip_rows(skip_rows)
+        .num_rows(num_rows)
+        .build()
     )
 
+    if columns is not None and len(columns) > 0:
+        options.set_columns(columns)
+
+    plc_result = plc.io.avro.read_avro(options)
+
     return cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result))
diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py
index fe8e446f9c0..3b3fd5f7c56 100644
--- a/python/cudf/cudf/io/dlpack.py
+++ b/python/cudf/cudf/io/dlpack.py
@@ -1,13 +1,14 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
+import pylibcudf as plc
 
 import cudf
-from cudf._lib import interop as libdlpack
 from cudf.core.column import ColumnBase
 from cudf.utils import ioutils
 
 
-def from_dlpack(pycapsule_obj):
+def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame:
     """Converts from a DLPack tensor to a cuDF object.
 
     DLPack is an open-source memory tensor structure:
@@ -33,18 +34,21 @@ def from_dlpack(pycapsule_obj):
     cuDF from_dlpack() assumes column-major (Fortran order) input. If the input
     tensor is row-major, transpose it before passing it to this function.
     """
+    plc_table = plc.interop.from_dlpack(pycapsule_obj)
+    data = dict(
+        enumerate(
+            (ColumnBase.from_pylibcudf(col) for col in plc_table.columns())
+        )
+    )
 
-    columns = libdlpack.from_dlpack(pycapsule_obj)
-    data = dict(enumerate(columns))
-
-    if len(columns) == 1:
+    if len(data) == 1:
         return cudf.Series._from_data(data)
     else:
         return cudf.DataFrame._from_data(data)
 
 
 @ioutils.doc_to_dlpack()
-def to_dlpack(cudf_obj):
+def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex):
     """Converts a cuDF object to a DLPack tensor.
 
     DLPack is an open-source memory tensor structure:
@@ -80,13 +84,14 @@ def to_dlpack(cudf_obj):
 
     if any(
         not cudf.api.types._is_non_decimal_numeric_dtype(dtype)
-        for _, dtype in gdf._dtypes
+        for _, dtype in gdf._dtypes  # type: ignore[union-attr]
     ):
         raise TypeError("non-numeric data not yet supported")
 
     dtype = cudf.utils.dtypes.find_common_type(
-        [dtype for _, dtype in gdf._dtypes]
+        [dtype for _, dtype in gdf._dtypes]  # type: ignore[union-attr]
     )
     gdf = gdf.astype(dtype)
-
-    return libdlpack.to_dlpack([*gdf._columns])
+    return plc.interop.to_dlpack(
+        plc.Table([col.to_pylibcudf(mode="read") for col in gdf._columns])
+    )
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 89af00c713d..39a85465deb 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -91,11 +91,6 @@ def read_json(
         if dtype is None:
             dtype = True
 
-        if kwargs:
-            raise ValueError(
-                "cudf engine doesn't support the "
-                f"following keyword arguments: {list(kwargs.keys())}"
-            )
         if args:
             raise ValueError(
                 "cudf engine doesn't support the "
@@ -166,13 +161,15 @@ def read_json(
         if cudf.get_option("io.json.low_memory") and lines:
             res_cols, res_col_names, res_child_names = (
                 plc.io.json.chunked_read_json(
-                    plc.io.SourceInfo(filepaths_or_buffers),
-                    processed_dtypes,
-                    c_compression,
-                    keep_quotes=keep_quotes,
-                    mixed_types_as_string=mixed_types_as_string,
-                    prune_columns=prune_columns,
-                    recovery_mode=c_on_bad_lines,
+                    plc.io.json._setup_json_reader_options(
+                        plc.io.SourceInfo(filepaths_or_buffers),
+                        processed_dtypes,
+                        c_compression,
+                        keep_quotes=keep_quotes,
+                        mixed_types_as_string=mixed_types_as_string,
+                        prune_columns=prune_columns,
+                        recovery_mode=c_on_bad_lines,
+                    )
                 )
             )
             df = cudf.DataFrame._from_data(
@@ -186,18 +183,23 @@ def read_json(
             return df
         else:
             table_w_meta = plc.io.json.read_json(
-                plc.io.SourceInfo(filepaths_or_buffers),
-                processed_dtypes,
-                c_compression,
-                lines,
-                byte_range_offset=byte_range[0]
-                if byte_range is not None
-                else 0,
-                byte_range_size=byte_range[1] if byte_range is not None else 0,
-                keep_quotes=keep_quotes,
-                mixed_types_as_string=mixed_types_as_string,
-                prune_columns=prune_columns,
-                recovery_mode=c_on_bad_lines,
+                plc.io.json._setup_json_reader_options(
+                    plc.io.SourceInfo(filepaths_or_buffers),
+                    processed_dtypes,
+                    c_compression,
+                    lines,
+                    byte_range_offset=byte_range[0]
+                    if byte_range is not None
+                    else 0,
+                    byte_range_size=byte_range[1]
+                    if byte_range is not None
+                    else 0,
+                    keep_quotes=keep_quotes,
+                    mixed_types_as_string=mixed_types_as_string,
+                    prune_columns=prune_columns,
+                    recovery_mode=c_on_bad_lines,
+                    extra_parameters=kwargs,
+                )
             )
 
             df = cudf.DataFrame._from_data(
@@ -291,21 +293,25 @@ def _plc_write_json(
     rows_per_chunk: int = 1024 * 64,  # 64K rows
 ) -> None:
     try:
-        plc.io.json.write_json(
-            plc.io.SinkInfo([path_or_buf]),
-            plc.io.TableWithMetadata(
-                plc.Table(
-                    [col.to_pylibcudf(mode="read") for col in table._columns]
-                ),
-                colnames,
+        tbl_w_meta = plc.io.TableWithMetadata(
+            plc.Table(
+                [col.to_pylibcudf(mode="read") for col in table._columns]
             ),
-            na_rep,
-            include_nulls,
-            lines,
-            rows_per_chunk,
-            true_value="true",
-            false_value="false",
+            colnames,
+        )
+        options = (
+            plc.io.json.JsonWriterOptions.builder(
+                plc.io.SinkInfo([path_or_buf]), tbl_w_meta.tbl
+            )
+            .metadata(tbl_w_meta)
+            .na_rep(na_rep)
+            .include_nulls(include_nulls)
+            .lines(lines)
+            .build()
         )
+        if rows_per_chunk != np.iinfo(np.int32).max:
+            options.set_rows_per_chunk(rows_per_chunk)
+        plc.io.json.write_json(options)
     except OverflowError as err:
         raise OverflowError(
             f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 5616413b7e4..5103137bc77 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -240,15 +240,27 @@ def read_orc(
         elif not isinstance(num_rows, int) or num_rows < -1:
             raise TypeError("num_rows must be an int >= -1")
 
-        tbl_w_meta = plc.io.orc.read_orc(
-            plc.io.SourceInfo(filepaths_or_buffers),
-            columns,
-            stripes,
-            skiprows,
-            num_rows,
-            use_index,
-            dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)),
+        options = (
+            plc.io.orc.OrcReaderOptions.builder(
+                plc.io.types.SourceInfo(filepaths_or_buffers)
+            )
+            .use_index(use_index)
+            .build()
         )
+        if num_rows >= 0:
+            options.set_num_rows(num_rows)
+        if skiprows >= 0:
+            options.set_skip_rows(skiprows)
+        if stripes is not None and len(stripes) > 0:
+            options.set_stripes(stripes)
+        if timestamp_type is not None:
+            options.set_timestamp_type(
+                dtype_to_pylibcudf_type(cudf.dtype(timestamp_type))
+            )
+        if columns is not None and len(columns) > 0:
+            options.set_columns(columns)
+
+        tbl_w_meta = plc.io.orc.read_orc(options)
 
         if isinstance(columns, list) and len(columns) == 0:
             # When `columns=[]`, index needs to be
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 66095d4a155..c13489630a3 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -25,9 +25,7 @@
 from cudf._lib.column import Column
 from cudf._lib.utils import (
     _data_from_columns,
-    _index_level_name,
     data_from_pylibcudf_io,
-    generate_pandas_metadata,
 )
 from cudf.api.types import is_list_like
 from cudf.core.buffer import acquire_spill_lock
@@ -128,7 +126,7 @@ def _plc_write_parquet(
         tbl_meta = plc.io.types.TableInputMetadata(plc_table)
         for level, idx_name in enumerate(table.index.names):
             tbl_meta.column_metadata[level].set_name(
-                _index_level_name(idx_name, level, table._column_names)
+                ioutils._index_level_name(idx_name, level, table._column_names)
             )
         num_index_cols_meta = len(table.index.names)
     else:
@@ -162,7 +160,7 @@ def _plc_write_parquet(
     if partitions_info is not None:
         user_data = [
             {
-                "pandas": generate_pandas_metadata(
+                "pandas": ioutils.generate_pandas_metadata(
                     table.iloc[start_row : start_row + num_row].copy(
                         deep=False
                     ),
@@ -172,7 +170,9 @@ def _plc_write_parquet(
             for start_row, num_row in partitions_info
         ]
     else:
-        user_data = [{"pandas": generate_pandas_metadata(table, index)}]
+        user_data = [
+            {"pandas": ioutils.generate_pandas_metadata(table, index)}
+        ]
 
     if header_version not in ("1.0", "2.0"):
         raise ValueError(
@@ -1139,7 +1139,6 @@ def _parquet_to_frame(
                     dfs[-1][name] = column_empty(
                         row_count=_len,
                         dtype=_dtype,
-                        masked=True,
                     )
                 else:
                     dfs[-1][name] = as_column(
@@ -1738,7 +1737,7 @@ def _initialize_chunked_state(
             False if isinstance(table.index, cudf.RangeIndex) else self.index
         )
         user_data = [
-            {"pandas": generate_pandas_metadata(table, index)}
+            {"pandas": ioutils.generate_pandas_metadata(table, index)}
         ] * num_partitions
         comp_type = _get_comp_type(self.compression)
         stat_freq = _get_stat_freq(self.statistics)
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index db24fdd2a29..8e1dba858c3 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -950,3 +950,13 @@ def test_index_set_categories(ordered):
     expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered)
     result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered)
     assert_eq(result, expected)
+
+
+def test_categorical_interval_pandas_roundtrip():
+    expected = cudf.Series(cudf.interval_range(0, 5)).astype("category")
+    result = cudf.Series.from_pandas(expected.to_pandas())
+    assert_eq(result, expected)
+
+    expected = pd.Series(pd.interval_range(0, 5)).astype("category")
+    result = cudf.Series.from_pandas(expected).to_pandas()
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 260b481b933..da0aa5be6f5 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -10,7 +10,6 @@
 
 import cudf
 from cudf import NA
-from cudf._lib.copying import get_element
 from cudf.api.types import is_scalar
 from cudf.core.column.column import column_empty
 from cudf.testing import assert_eq
@@ -715,9 +714,8 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level):
     ],
 )
 def test_list_scalar_device_construction(data):
-    col = cudf.Series([data])._column
-    slr = get_element(col, 0)
-    assert slr.value == data
+    res = cudf.Series([data])._column.element_indexing(0)
+    assert res == data
 
 
 @pytest.mark.parametrize("nesting_level", [1, 2, 3])
@@ -729,10 +727,8 @@ def test_list_scalar_device_construction_null(nesting_level):
     arrow_type = pa.infer_type(data)
     arrow_arr = pa.array([None], type=arrow_type)
 
-    col = cudf.Series(arrow_arr)._column
-    slr = get_element(col, 0)
-
-    assert slr.value is cudf.NA
+    res = cudf.Series(arrow_arr)._column.element_indexing(0)
+    assert res is cudf.NA
 
 
 @pytest.mark.parametrize("input_obj", [[[1, NA, 3]], [[1, NA, 3], [4, 5, NA]]])
diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py
deleted file mode 100644
index b474bbe9bd8..00000000000
--- a/python/cudf/cudf/tests/test_pack.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pickle
-import sys
-
-import numpy as np
-import pandas as pd
-
-from cudf import DataFrame, Index, Series
-from cudf._lib.copying import pack, unpack
-from cudf.testing import assert_eq
-
-
-def test_sizeof_packed_dataframe():
-    rng = np.random.default_rng(seed=0)
-    df = DataFrame()
-    nelem = 1000
-    df["keys"] = hkeys = np.arange(nelem, dtype=np.float64)
-    df["vals"] = hvals = rng.random(nelem)
-    packed = pack(df)
-
-    nbytes = hkeys.nbytes + hvals.nbytes
-    sizeof = sys.getsizeof(packed)
-    assert sizeof < nbytes
-
-    serialized_nbytes = len(
-        pickle.dumps(packed, protocol=pickle.HIGHEST_PROTOCOL)
-    )
-
-    # assert at least sizeof bytes were serialized
-    assert serialized_nbytes >= sizeof
-
-
-def check_packed_equality(df):
-    # basic
-    assert_packed_frame_equality(df)
-    # sliced
-    assert_packed_frame_equality(df[:-1])
-    assert_packed_frame_equality(df[1:])
-    assert_packed_frame_equality(df[2:-2])
-    # sorted
-    sortvaldf = df.sort_values("vals")
-    assert isinstance(sortvaldf.index, Index)
-    assert_packed_frame_equality(sortvaldf)
-
-
-def assert_packed_frame_equality(df):
-    pdf = df.to_pandas()
-
-    packed = pack(df)
-    del df
-    unpacked = unpack(packed)
-
-    assert_eq(unpacked, pdf)
-
-
-def test_packed_dataframe_equality_numeric():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    nelem = 10
-    df["keys"] = np.arange(nelem, dtype=np.float64)
-    df["vals"] = rng.random(nelem)
-
-    check_packed_equality(df)
-
-
-def test_packed_dataframe_equality_categorical():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = pd.Categorical(
-        ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_equality(df)
-
-
-def test_packed_dataframe_equality_list():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10)))
-    df["vals"] = rng.random(len(df))
-
-    check_packed_equality(df)
-
-
-def test_packed_dataframe_equality_struct():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(
-        list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_equality(df)
-
-
-def check_packed_unique_pointers(df):
-    # basic
-    assert_packed_frame_unique_pointers(df)
-    # sliced
-    assert_packed_frame_unique_pointers(df[:-1])
-    assert_packed_frame_unique_pointers(df[1:])
-    assert_packed_frame_unique_pointers(df[2:-2])
-    # sorted
-    sortvaldf = df.sort_values("vals")
-    assert isinstance(sortvaldf.index, Index)
-    assert_packed_frame_unique_pointers(sortvaldf)
-
-
-def assert_packed_frame_unique_pointers(df):
-    unpacked = unpack(pack(df))
-
-    for col in df:
-        if df._data[col].data:
-            assert df._data[col].data.get_ptr(mode="read") != unpacked._data[
-                col
-            ].data.get_ptr(mode="read")
-
-
-def test_packed_dataframe_unique_pointers_numeric():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    nelem = 10
-    df["keys"] = np.arange(nelem, dtype=np.float64)
-    df["vals"] = rng.random(nelem)
-
-    check_packed_unique_pointers(df)
-
-
-def test_packed_dataframe_unique_pointers_categorical():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = pd.Categorical(
-        ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_unique_pointers(df)
-
-
-def test_packed_dataframe_unique_pointers_list():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10)))
-    df["vals"] = rng.random(len(df))
-
-    check_packed_unique_pointers(df)
-
-
-def test_packed_dataframe_unique_pointers_struct():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(
-        list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_unique_pointers(df)
-
-
-def check_packed_pickled_equality(df):
-    # basic
-    assert_packed_frame_picklable(df)
-    # sliced
-    assert_packed_frame_picklable(df[:-1])
-    assert_packed_frame_picklable(df[1:])
-    assert_packed_frame_picklable(df[2:-2])
-    # sorted
-    sortvaldf = df.sort_values("vals")
-    assert isinstance(sortvaldf.index, Index)
-    assert_packed_frame_picklable(sortvaldf)
-    # out-of-band
-    buffers = []
-    serialbytes = pickle.dumps(
-        pack(df), protocol=5, buffer_callback=buffers.append
-    )
-    for b in buffers:
-        assert isinstance(b, pickle.PickleBuffer)
-    loaded = unpack(pickle.loads(serialbytes, buffers=buffers))
-    assert_eq(loaded, df)
-
-
-def assert_packed_frame_picklable(df):
-    serialbytes = pickle.dumps(pack(df))
-    loaded = unpack(pickle.loads(serialbytes))
-    assert_eq(loaded, df)
-
-
-def test_pickle_packed_dataframe_numeric():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    nelem = 10
-    df["keys"] = np.arange(nelem, dtype=np.float64)
-    df["vals"] = rng.random(nelem)
-
-    check_packed_pickled_equality(df)
-
-
-def test_pickle_packed_dataframe_categorical():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = pd.Categorical(
-        ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_pickled_equality(df)
-
-
-def test_pickle_packed_dataframe_list():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10)))
-    df["vals"] = rng.random(len(df))
-
-    check_packed_pickled_equality(df)
-
-
-def test_pickle_packed_dataframe_struct():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(
-        list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_pickled_equality(df)
-
-
-def check_packed_serialized_equality(df):
-    # basic
-    assert_packed_frame_serializable(df)
-    # sliced
-    assert_packed_frame_serializable(df[:-1])
-    assert_packed_frame_serializable(df[1:])
-    assert_packed_frame_serializable(df[2:-2])
-    # sorted
-    sortvaldf = df.sort_values("vals")
-    assert isinstance(sortvaldf.index, Index)
-    assert_packed_frame_serializable(sortvaldf)
-
-
-def assert_packed_frame_serializable(df):
-    packed = pack(df)
-    header, frames = packed.serialize()
-    loaded = unpack(packed.deserialize(header, frames))
-    assert_eq(loaded, df)
-
-
-def test_serialize_packed_dataframe_numeric():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    nelem = 10
-    df["keys"] = np.arange(nelem, dtype=np.float64)
-    df["vals"] = rng.random(nelem)
-
-    check_packed_serialized_equality(df)
-
-
-def test_serialize_packed_dataframe_categorical():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = pd.Categorical(
-        ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_serialized_equality(df)
-
-
-def test_serialize_packed_dataframe_list():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10)))
-    df["vals"] = rng.random(len(df))
-
-    check_packed_serialized_equality(df)
-
-
-def test_serialize_packed_dataframe_struct():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(
-        list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_serialized_equality(df)
diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py
index 9a2816f5444..84de2ac38e7 100644
--- a/python/cudf/cudf/tests/test_quantiles.py
+++ b/python/cudf/cudf/tests/test_quantiles.py
@@ -91,3 +91,19 @@ def test_quantile_type_int_float(interpolation):
 
     assert expected == actual
     assert type(expected) is type(actual)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [float("nan"), float("nan"), 0.9],
+        [float("nan"), float("nan"), float("nan")],
+    ],
+)
+def test_ignore_nans(data):
+    psr = pd.Series(data)
+    gsr = cudf.Series(data, nan_as_null=False)
+
+    expected = gsr.quantile(0.9)
+    result = psr.quantile(0.9)
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index fcd98831686..c14fab4040b 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -10,10 +10,11 @@
 import pytest
 from packaging import version
 
+import pylibcudf as plc
 import rmm
 
 import cudf
-from cudf._lib.copying import get_element
+from cudf.core.buffer import acquire_spill_lock
 from cudf.testing._utils import (
     ALL_TYPES,
     DATETIME_TYPES,
@@ -143,8 +144,14 @@ def test_scalar_host_initialization(value):
 @pytest.mark.parametrize("value", SCALAR_VALUES)
 def test_scalar_device_initialization(value):
     column = cudf.Series([value], nan_as_null=False)._column
-    dev_slr = get_element(column, 0)
-
+    with acquire_spill_lock():
+        dev_slr = cudf._lib.scalar.DeviceScalar.from_pylibcudf(
+            plc.copying.get_element(
+                column.to_pylibcudf(mode="read"),
+                0,
+            ),
+            dtype=column.dtype,
+        )
     s = cudf.Scalar.from_device_scalar(dev_slr)
 
     assert s._is_device_value_current
@@ -164,8 +171,14 @@ def test_scalar_device_initialization(value):
 def test_scalar_device_initialization_decimal(value, decimal_type):
     dtype = decimal_type._from_decimal(value)
     column = cudf.Series([str(value)]).astype(dtype)._column
-    dev_slr = get_element(column, 0)
-
+    with acquire_spill_lock():
+        dev_slr = cudf._lib.scalar.DeviceScalar.from_pylibcudf(
+            plc.copying.get_element(
+                column.to_pylibcudf(mode="read"),
+                0,
+            ),
+            dtype=column.dtype,
+        )
     s = cudf.Scalar.from_device_scalar(dev_slr)
 
     assert s._is_device_value_current
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
index 624a9bd87ea..2ba483c7b2d 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
@@ -40,6 +40,7 @@ def __init__(
         self.dtype = dtype
         self.name = name
         self.options = options
+        self.is_pointwise = False
         self.children = children
         if name not in Agg._SUPPORTED:
             raise NotImplementedError(
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py
index 4c7ae007070..8ba3f9f407c 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py
@@ -36,9 +36,11 @@ class ExecutionContext(IntEnum):
 class Expr(Node["Expr"]):
     """An abstract expression object."""
 
-    __slots__ = ("dtype",)
+    __slots__ = ("dtype", "is_pointwise")
     dtype: plc.DataType
     """Data type of the expression."""
+    is_pointwise: bool
+    """Whether this expression acts pointwise on its inputs."""
     # This annotation is needed because of https://github.com/python/mypy/issues/17981
     _non_child: ClassVar[tuple[str, ...]] = ("dtype",)
     """Names of non-child data (not Exprs) for reconstruction."""
@@ -164,6 +166,7 @@ def __init__(self, dtype: plc.DataType, error: str) -> None:
         self.dtype = dtype
         self.error = error
         self.children = ()
+        self.is_pointwise = True
 
 
 class NamedExpr:
@@ -243,6 +246,7 @@ class Col(Expr):
     def __init__(self, dtype: plc.DataType, name: str) -> None:
         self.dtype = dtype
         self.name = name
+        self.is_pointwise = True
         self.children = ()
 
     def do_evaluate(
@@ -280,6 +284,7 @@ def __init__(
         self.dtype = dtype
         self.index = index
         self.table_ref = table_ref
+        self.is_pointwise = True
         self.children = (column,)
 
     def do_evaluate(
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
index 245bdbefe88..556847b4738 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
@@ -42,6 +42,7 @@ def __init__(
             op = BinOp._BOOL_KLEENE_MAPPING.get(op, op)
         self.op = op
         self.children = (left, right)
+        self.is_pointwise = True
         if not plc.binaryop.is_supported_operation(
             self.dtype, left.dtype, right.dtype, op
         ):
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
index 5aa35ead127..d5ca22dd8d5 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
@@ -81,6 +81,14 @@ def __init__(
         self.options = options
         self.name = name
         self.children = children
+        self.is_pointwise = self.name not in (
+            BooleanFunction.Name.All,
+            BooleanFunction.Name.Any,
+            BooleanFunction.Name.IsDuplicated,
+            BooleanFunction.Name.IsFirstDistinct,
+            BooleanFunction.Name.IsLastDistinct,
+            BooleanFunction.Name.IsUnique,
+        )
         if self.name is BooleanFunction.Name.IsIn and not all(
             c.dtype == self.children[0].dtype for c in self.children
         ):
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
index c2dddfd9940..0c3159c73d6 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -114,6 +114,7 @@ def __init__(
         self.options = options
         self.name = name
         self.children = children
+        self.is_pointwise = True
         if self.name not in self._COMPONENT_MAP:
             raise NotImplementedError(f"Temporal function {self.name}")
 
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
index 7eba0c110ab..8528e66c69c 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
@@ -38,6 +38,7 @@ def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None:
         assert value.type == plc.interop.to_arrow(dtype)
         self.value = value
         self.children = ()
+        self.is_pointwise = True
 
     def do_evaluate(
         self,
@@ -65,6 +66,7 @@ def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
         data = value.to_arrow()
         self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
         self.children = ()
+        self.is_pointwise = True
 
     def get_hashable(self) -> Hashable:
         """Compute a hash of the column."""
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
index 48c37d101f4..d4616d5d00a 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
@@ -24,6 +24,7 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None:
         self.dtype = dtype
         self.options = options
         self.children = (agg,)
+        self.is_pointwise = False
         raise NotImplementedError("Rolling window not implemented")
 
 
@@ -35,4 +36,5 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> N
         self.dtype = dtype
         self.options = options
         self.children = (agg, *by)
+        self.is_pointwise = False
         raise NotImplementedError("Grouped rolling window not implemented")
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
index 12326740f74..93ecd026eaf 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
@@ -30,6 +30,7 @@ class Gather(Expr):
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None:
         self.dtype = dtype
         self.children = (values, indices)
+        self.is_pointwise = False
 
     def do_evaluate(
         self,
@@ -71,6 +72,7 @@ class Filter(Expr):
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
         self.dtype = dtype
         self.children = (values, indices)
+        self.is_pointwise = True
 
     def do_evaluate(
         self,
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py
index 99512e2ef52..189f109e1a2 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py
@@ -32,6 +32,7 @@ def __init__(
         self.dtype = dtype
         self.options = options
         self.children = (column,)
+        self.is_pointwise = False
 
     def do_evaluate(
         self,
@@ -71,6 +72,7 @@ def __init__(
         self.dtype = dtype
         self.options = options
         self.children = (column, *by)
+        self.is_pointwise = False
 
     def do_evaluate(
         self,
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py
index 124a6e8d71c..256840c1f3d 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py
@@ -106,6 +106,7 @@ def __init__(
         self.options = options
         self.name = name
         self.children = children
+        self.is_pointwise = True
         self._validate_input()
 
     def _validate_input(self):
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py
index d2b5d6bae29..120ca8edce0 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py
@@ -34,6 +34,7 @@ def __init__(
     ) -> None:
         self.dtype = dtype
         self.children = (when, then, otherwise)
+        self.is_pointwise = True
 
     def do_evaluate(
         self,
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
index 10caaff6811..3336c901e7f 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
@@ -33,6 +33,7 @@ class Cast(Expr):
     def __init__(self, dtype: plc.DataType, value: Expr) -> None:
         self.dtype = dtype
         self.children = (value,)
+        self.is_pointwise = True
         if not dtypes.can_cast(value.dtype, self.dtype):
             raise NotImplementedError(
                 f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}"
@@ -63,6 +64,7 @@ class Len(Expr):
     def __init__(self, dtype: plc.DataType) -> None:
         self.dtype = dtype
         self.children = ()
+        self.is_pointwise = False
 
     def do_evaluate(
         self,
@@ -147,6 +149,14 @@ def __init__(
         self.name = name
         self.options = options
         self.children = children
+        self.is_pointwise = self.name not in (
+            "cum_min",
+            "cum_max",
+            "cum_prod",
+            "cum_sum",
+            "drop_nulls",
+            "unique",
+        )
 
         if self.name not in UnaryFunction._supported_fns:
             raise NotImplementedError(f"Unary function {name=}")
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index b5af3bb80bf..1c1d4860eec 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -604,10 +604,12 @@ def slice_skip(tbl: plc.Table):
                 (name, typ, []) for name, typ in schema.items()
             ]
             plc_tbl_w_meta = plc.io.json.read_json(
-                plc.io.SourceInfo(paths),
-                lines=True,
-                dtypes=json_schema,
-                prune_columns=True,
+                plc.io.json._setup_json_reader_options(
+                    plc.io.SourceInfo(paths),
+                    lines=True,
+                    dtypes=json_schema,
+                    prune_columns=True,
+                )
             )
             # TODO: I don't think cudf-polars supports nested types in general right now
             # (but when it does, we should pass child column names from nested columns in)
diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py
index b3248dae93c..9c45a68812a 100644
--- a/python/cudf_polars/cudf_polars/dsl/traversal.py
+++ b/python/cudf_polars/cudf_polars/dsl/traversal.py
@@ -10,7 +10,7 @@
 from cudf_polars.typing import U_contra, V_co
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, Generator, Mapping, MutableMapping
+    from collections.abc import Callable, Generator, Mapping, MutableMapping, Sequence
 
     from cudf_polars.typing import GenericTransformer, NodeT
 
@@ -23,22 +23,22 @@
 ]
 
 
-def traversal(node: NodeT) -> Generator[NodeT, None, None]:
+def traversal(nodes: Sequence[NodeT]) -> Generator[NodeT, None, None]:
     """
     Pre-order traversal of nodes in an expression.
 
     Parameters
     ----------
-    node
-        Root of expression to traverse.
+    nodes
+        Roots of expressions to traverse.
 
     Yields
     ------
-    Unique nodes in the expression, parent before child, children
+    Unique nodes in the expressions, parent before child, children
     in-order from left to right.
     """
-    seen = {node}
-    lifo = [node]
+    seen = set(nodes)
+    lifo = list(nodes)
 
     while lifo:
         node = lifo.pop()
diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py
index e5884f1c574..6843ed9ee2e 100644
--- a/python/cudf_polars/cudf_polars/experimental/parallel.py
+++ b/python/cudf_polars/cudf_polars/experimental/parallel.py
@@ -9,8 +9,9 @@
 from functools import reduce
 from typing import TYPE_CHECKING, Any
 
-import cudf_polars.experimental.io  # noqa: F401
-from cudf_polars.dsl.ir import IR, Cache, Projection, Union
+import cudf_polars.experimental.io
+import cudf_polars.experimental.select  # noqa: F401
+from cudf_polars.dsl.ir import IR, Cache, Filter, HStack, Projection, Select, Union
 from cudf_polars.dsl.traversal import CachingVisitor, traversal
 from cudf_polars.experimental.base import PartitionInfo, _concat, get_key_name
 from cudf_polars.experimental.dispatch import (
@@ -112,7 +113,7 @@ def task_graph(
     """
     graph = reduce(
         operator.or_,
-        (generate_ir_tasks(node, partition_info) for node in traversal(ir)),
+        (generate_ir_tasks(node, partition_info) for node in traversal([ir])),
     )
 
     key_name = get_key_name(ir)
@@ -226,6 +227,8 @@ def _lower_ir_pwise(
 
 lower_ir_node.register(Projection, _lower_ir_pwise)
 lower_ir_node.register(Cache, _lower_ir_pwise)
+lower_ir_node.register(Filter, _lower_ir_pwise)
+lower_ir_node.register(HStack, _lower_ir_pwise)
 
 
 def _generate_ir_tasks_pwise(
@@ -245,3 +248,6 @@ def _generate_ir_tasks_pwise(
 
 generate_ir_tasks.register(Projection, _generate_ir_tasks_pwise)
 generate_ir_tasks.register(Cache, _generate_ir_tasks_pwise)
+generate_ir_tasks.register(Filter, _generate_ir_tasks_pwise)
+generate_ir_tasks.register(HStack, _generate_ir_tasks_pwise)
+generate_ir_tasks.register(Select, _generate_ir_tasks_pwise)
diff --git a/python/cudf_polars/cudf_polars/experimental/select.py b/python/cudf_polars/cudf_polars/experimental/select.py
new file mode 100644
index 00000000000..5f79384b569
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/experimental/select.py
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Parallel Select Logic."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from cudf_polars.dsl.ir import Select
+from cudf_polars.dsl.traversal import traversal
+from cudf_polars.experimental.dispatch import lower_ir_node
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.base import PartitionInfo
+    from cudf_polars.experimental.parallel import LowerIRTransformer
+
+
+@lower_ir_node.register(Select)
+def _(
+    ir: Select, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    child, partition_info = rec(ir.children[0])
+    pi = partition_info[child]
+    if pi.count > 1 and not all(
+        expr.is_pointwise for expr in traversal([e.value for e in ir.exprs])
+    ):
+        # TODO: Handle non-pointwise expressions.
+        raise NotImplementedError(
+            f"Selection {ir} does not support multiple partitions."
+        )
+    new_node = ir.reconstruct([child])
+    partition_info[new_node] = pi
+    return new_node, partition_info
diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py
index 9755994c419..9fcca2e290e 100644
--- a/python/cudf_polars/tests/dsl/test_traversal.py
+++ b/python/cudf_polars/tests/dsl/test_traversal.py
@@ -32,21 +32,21 @@ def test_traversal_unique():
     dt = plc.DataType(plc.TypeId.INT8)
 
     e1 = make_expr(dt, "a", "a")
-    unique_exprs = list(traversal(e1))
+    unique_exprs = list(traversal([e1]))
 
     assert len(unique_exprs) == 2
     assert set(unique_exprs) == {expr.Col(dt, "a"), e1}
     assert unique_exprs == [e1, expr.Col(dt, "a")]
 
     e2 = make_expr(dt, "a", "b")
-    unique_exprs = list(traversal(e2))
+    unique_exprs = list(traversal([e2]))
 
     assert len(unique_exprs) == 3
     assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e2}
     assert unique_exprs == [e2, expr.Col(dt, "a"), expr.Col(dt, "b")]
 
     e3 = make_expr(dt, "b", "a")
-    unique_exprs = list(traversal(e3))
+    unique_exprs = list(traversal([e3]))
 
     assert len(unique_exprs) == 3
     assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e3}
diff --git a/python/cudf_polars/tests/experimental/test_select.py b/python/cudf_polars/tests/experimental/test_select.py
new file mode 100644
index 00000000000..7dfe6ead148
--- /dev/null
+++ b/python/cudf_polars/tests/experimental/test_select.py
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(scope="module")
+def engine():
+    return pl.GPUEngine(
+        raise_on_fail=True,
+        executor="dask-experimental",
+        executor_options={"max_rows_per_partition": 3},
+    )
+
+
+@pytest.fixture(scope="module")
+def df():
+    return pl.LazyFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    )
+
+
+def test_select(df, engine):
+    query = df.select(
+        pl.col("a") + pl.col("b"), (pl.col("a") * 2 + pl.col("b")).alias("d")
+    )
+    assert_gpu_result_equal(query, engine=engine)
+
+
+def test_select_reduce_raises(df, engine):
+    query = df.select(
+        (pl.col("a") + pl.col("b")).max(),
+        (pl.col("a") * 2 + pl.col("b")).alias("d").mean(),
+    )
+    with pytest.raises(
+        pl.exceptions.ComputeError,
+        match="NotImplementedError",
+    ):
+        assert_gpu_result_equal(query, engine=engine)
+
+
+def test_select_with_cse_no_agg(df, engine):
+    expr = pl.col("a") + pl.col("a")
+    query = df.select(expr, (expr * 2).alias("b"), ((expr * 2) + 10).alias("c"))
+    assert_gpu_result_equal(query, engine=engine)
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 962a229a839..fceaaf185e8 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -714,21 +714,35 @@ def read_csv(
         storage_options=None,
         **kwargs,
     ):
-        import dask_expr as dx
-        from fsspec.utils import stringify_path
+        try:
+            # TODO: Remove when cudf is pinned to dask>2024.12.0
+            import dask_expr as dx
+            from dask_expr.io.csv import ReadCSV
+            from fsspec.utils import stringify_path
+
+            if not isinstance(path, str):
+                path = stringify_path(path)
+            return dx.new_collection(
+                ReadCSV(
+                    path,
+                    dtype_backend=dtype_backend,
+                    storage_options=storage_options,
+                    kwargs=kwargs,
+                    header=header,
+                    dataframe_backend="cudf",
+                )
+            )
+        except ImportError:
+            # Requires dask>2024.12.0
+            from dask_cudf.io.csv import read_csv
 
-        if not isinstance(path, str):
-            path = stringify_path(path)
-        return dx.new_collection(
-            dx.io.csv.ReadCSV(
+            return read_csv(
                 path,
-                dtype_backend=dtype_backend,
-                storage_options=storage_options,
-                kwargs=kwargs,
+                *args,
                 header=header,
-                dataframe_backend="cudf",
+                storage_options=storage_options,
+                **kwargs,
             )
-        )
 
     @staticmethod
     def read_json(*args, **kwargs):
diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py
index b22b31a591f..29f98b14511 100644
--- a/python/dask_cudf/dask_cudf/io/csv.py
+++ b/python/dask_cudf/dask_cudf/io/csv.py
@@ -1,8 +1,193 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from dask_cudf import _deprecated_api
+import os
+from glob import glob
+from warnings import warn
 
-read_csv = _deprecated_api(
-    "dask_cudf.io.csv.read_csv",
-    new_api="dask_cudf.read_csv",
-)
+from fsspec.utils import infer_compression
+
+from dask import dataframe as dd
+from dask.dataframe.io.csv import make_reader
+from dask.utils import parse_bytes
+
+import cudf
+
+
+def read_csv(path, blocksize="default", **kwargs):
+    """
+    Read CSV files into a :class:`.DataFrame`.
+
+    This API parallelizes the :func:`cudf:cudf.read_csv` function in
+    the following ways:
+
+    It supports loading many files at once using globstrings:
+
+    >>> import dask_cudf
+    >>> df = dask_cudf.read_csv("myfiles.*.csv")
+
+    In some cases it can break up large files:
+
+    >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB")
+
+    It can read CSV files from external resources (e.g. S3, HTTP, FTP)
+
+    >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv")
+    >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv")
+
+    Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and
+    supports many of the same keyword arguments with the same
+    performance guarantees. See the docstring for
+    :func:`cudf:cudf.read_csv` for more information on available
+    keyword arguments.
+
+    Parameters
+    ----------
+    path : str, path object, or file-like object
+        Either a path to a file (a str, :py:class:`pathlib.Path`, or
+        py._path.local.LocalPath), URL (including http, ftp, and S3
+        locations), or any object with a read() method (such as
+        builtin :py:func:`open` file handler function or
+        :py:class:`~io.StringIO`).
+    blocksize : int or str, default "256 MiB"
+        The target task partition size. If ``None``, a single block
+        is used for each file.
+    **kwargs : dict
+        Passthrough key-word arguments that are sent to
+        :func:`cudf:cudf.read_csv`.
+
+    Notes
+    -----
+    If any of `skipfooter`/`skiprows`/`nrows` are passed,
+    `blocksize` will default to None.
+
+    Examples
+    --------
+    >>> import dask_cudf
+    >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"])
+    >>> ddf.compute()
+       a      b
+    0  1     hi
+    1  2  hello
+    2  3     ai
+
+    """
+    # Set default `blocksize`
+    if blocksize == "default":
+        if (
+            kwargs.get("skipfooter", 0) != 0
+            or kwargs.get("skiprows", 0) != 0
+            or kwargs.get("nrows", None) is not None
+        ):
+            # Cannot read in blocks if skipfooter,
+            # skiprows or nrows is passed.
+            blocksize = None
+        else:
+            blocksize = "256 MiB"
+
+    if "://" in str(path):
+        func = make_reader(cudf.read_csv, "read_csv", "CSV")
+        return func(path, blocksize=blocksize, **kwargs)
+    else:
+        return _internal_read_csv(path=path, blocksize=blocksize, **kwargs)
+
+
+def _internal_read_csv(path, blocksize="256 MiB", **kwargs):
+    if isinstance(blocksize, str):
+        blocksize = parse_bytes(blocksize)
+
+    if isinstance(path, list):
+        filenames = path
+    elif isinstance(path, str):
+        filenames = sorted(glob(path))
+    elif hasattr(path, "__fspath__"):
+        filenames = sorted(glob(path.__fspath__()))
+    else:
+        raise TypeError(f"Path type not understood:{type(path)}")
+
+    if not filenames:
+        msg = f"A file in: {filenames} does not exist."
+        raise FileNotFoundError(msg)
+
+    compression = kwargs.get("compression", "infer")
+
+    if compression == "infer":
+        # Infer compression from first path by default
+        compression = infer_compression(filenames[0])
+
+    if compression and blocksize:
+        # compressed CSVs reading must read the entire file
+        kwargs.pop("byte_range", None)
+        warn(
+            "Warning %s compression does not support breaking apart files\n"
+            "Please ensure that each individual file can fit in memory and\n"
+            "use the keyword ``blocksize=None to remove this message``\n"
+            "Setting ``blocksize=(size of file)``" % compression
+        )
+        blocksize = None
+
+    if blocksize is None:
+        return read_csv_without_blocksize(path, **kwargs)
+
+    # Let dask.dataframe generate meta
+    dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
+    kwargs1 = kwargs.copy()
+    usecols = kwargs1.pop("usecols", None)
+    dtype = kwargs1.pop("dtype", None)
+    meta = dask_reader(filenames[0], **kwargs1)._meta
+    names = meta.columns
+    if usecols or dtype:
+        # Regenerate meta with original kwargs if
+        # `usecols` or `dtype` was specified
+        meta = dask_reader(filenames[0], **kwargs)._meta
+
+    i = 0
+    path_list = []
+    kwargs_list = []
+    for fn in filenames:
+        size = os.path.getsize(fn)
+        for start in range(0, size, blocksize):
+            kwargs2 = kwargs.copy()
+            kwargs2["byte_range"] = (
+                start,
+                blocksize,
+            )  # specify which chunk of the file we care about
+            if start != 0:
+                kwargs2["names"] = names  # no header in the middle of the file
+                kwargs2["header"] = None
+            path_list.append(fn)
+            kwargs_list.append(kwargs2)
+            i += 1
+
+    return dd.from_map(_read_csv, path_list, kwargs_list, meta=meta)
+
+
+def _read_csv(fn, kwargs):
+    return cudf.read_csv(fn, **kwargs)
+
+
+def read_csv_without_blocksize(path, **kwargs):
+    """Read entire CSV with optional compression (gzip/zip)
+
+    Parameters
+    ----------
+    path : str
+        path to files (support for glob)
+    """
+    if isinstance(path, list):
+        filenames = path
+    elif isinstance(path, str):
+        filenames = sorted(glob(path))
+    elif hasattr(path, "__fspath__"):
+        filenames = sorted(glob(path.__fspath__()))
+    else:
+        raise TypeError(f"Path type not understood:{type(path)}")
+
+    meta_kwargs = kwargs.copy()
+    if "skipfooter" in meta_kwargs:
+        meta_kwargs.pop("skipfooter")
+    if "nrows" in meta_kwargs:
+        meta_kwargs.pop("nrows")
+    # Read "head" of first file (first 5 rows).
+    # Convert to empty df for metadata.
+    meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0]
+    return dd.from_map(cudf.read_csv, filenames, meta=meta, **kwargs)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py
index a0acb86f5a9..ddfd1c1adac 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py
@@ -185,11 +185,6 @@ def test_read_csv_blocksize_none(tmp_path, compression, size):
     df2 = dask_cudf.read_csv(path, blocksize=None, dtype=typ)
     dd.assert_eq(df, df2)
 
-    # Test chunksize deprecation
-    with pytest.warns(FutureWarning, match="deprecated"):
-        df3 = dask_cudf.read_csv(path, chunksize=None, dtype=typ)
-    dd.assert_eq(df, df3)
-
 
 @pytest.mark.parametrize("dtype", [{"b": str, "c": int}, None])
 def test_csv_reader_usecols(tmp_path, dtype):
@@ -275,7 +270,3 @@ def test_deprecated_api_paths(tmp_path):
     with pytest.warns(match="dask_cudf.io.read_csv is now deprecated"):
         df2 = dask_cudf.io.read_csv(csv_path)
     dd.assert_eq(df, df2, check_divisions=False)
-
-    with pytest.warns(match="dask_cudf.io.csv.read_csv is now deprecated"):
-        df2 = dask_cudf.io.csv.read_csv(csv_path)
-    dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/pylibcudf/pylibcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/io/avro.pxd
index 8696fcb3c15..a0fca95d459 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pxd
+++ b/python/pylibcudf/pylibcudf/io/avro.pxd
@@ -1,12 +1,23 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
-from pylibcudf.libcudf.io.avro cimport avro_reader_options
+from pylibcudf.libcudf.io.avro cimport avro_reader_options, avro_reader_options_builder
 from pylibcudf.libcudf.types cimport size_type
 
 
-cpdef TableWithMetadata read_avro(
-    SourceInfo source_info,
-    list columns = *,
-    size_type skip_rows = *,
-    size_type num_rows = *
-)
+from pylibcudf.libcudf.types cimport size_type
+
+cdef class AvroReaderOptions:
+    cdef avro_reader_options c_obj
+    cdef SourceInfo source
+    cpdef void set_columns(self, list col_names)
+
+
+cdef class AvroReaderOptionsBuilder:
+    cdef avro_reader_options_builder c_obj
+    cdef SourceInfo source
+    cpdef AvroReaderOptionsBuilder columns(self, list col_names)
+    cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows)
+    cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows)
+    cpdef AvroReaderOptions build(self)
+
+cpdef TableWithMetadata read_avro(AvroReaderOptions options)
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi
index 49c2f083702..8cafc9a6573 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pyi
+++ b/python/pylibcudf/pylibcudf/io/avro.pyi
@@ -1,11 +1,16 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from pylibcudf.io.types import SourceInfo, TableWithMetadata
 
-__all__ = ["read_avro"]
-
-def read_avro(
-    source_info: SourceInfo,
-    columns: list[str] | None = None,
-    skip_rows: int = 0,
-    num_rows: int = -1,
-) -> TableWithMetadata: ...
+__all__ = ["AvroReaderOptions", "AvroReaderOptionsBuilder", "read_avro"]
+
+class AvroReaderOptions:
+    @staticmethod
+    def builder(source: SourceInfo) -> AvroReaderOptionsBuilder: ...
+
+class AvroReaderOptionsBuilder:
+    def columns(col_names: list[str]) -> AvroReaderOptionsBuilder: ...
+    def skip_rows(skip_rows: int) -> AvroReaderOptionsBuilder: ...
+    def num_rows(num_rows: int) -> AvroReaderOptionsBuilder: ...
+    def build(self) -> AvroReaderOptions: ...
+
+def read_avro(options: AvroReaderOptions) -> TableWithMetadata: ...
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx
index 4271333511a..c378fca0415 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pyx
+++ b/python/pylibcudf/pylibcudf/io/avro.pyx
@@ -10,52 +10,138 @@ from pylibcudf.libcudf.io.avro cimport (
 )
 from pylibcudf.libcudf.types cimport size_type
 
-__all__ = ["read_avro"]
+__all__ = ["read_avro", "AvroReaderOptions", "AvroReaderOptionsBuilder"]
+
+
+cdef class AvroReaderOptions:
+    """
+    The settings to use for ``read_avro``
+    For details, see :cpp:class:`cudf::io::avro_reader_options`
+    """
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create a AvroWriterOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::avro_reader_options::builder`
+
+        Parameters
+        ----------
+        sink : SourceInfo
+            The source to read the Avro file from.
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+            Builder to build AvroReaderOptions
+        """
+        cdef AvroReaderOptionsBuilder avro_builder = AvroReaderOptionsBuilder.__new__(
+            AvroReaderOptionsBuilder
+        )
+        avro_builder.c_obj = avro_reader_options.builder(source.c_obj)
+        avro_builder.source = source
+        return avro_builder
+
+    cpdef void set_columns(self, list col_names):
+        """
+        Set names of the column to be read.
+
+        Parameters
+        ----------
+        col_names : list[str]
+            List of column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec
+        vec.reserve(len(col_names))
+        for name in col_names:
+            vec.push_back(str(name).encode())
+        self.c_obj.set_columns(vec)
+
+
+cdef class AvroReaderOptionsBuilder:
+    cpdef AvroReaderOptionsBuilder columns(self, list col_names):
+        """
+        Set names of the column to be read.
+
+        Parameters
+        ----------
+        col_names : list
+            List of column names
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+        """
+        cdef vector[string] vec
+        vec.reserve(len(col_names))
+        for name in col_names:
+            vec.push_back(str(name).encode())
+        self.c_obj.columns(vec)
+        return self
+
+    cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows):
+        """
+        Sets number of rows to skip.
+
+        Parameters
+        ----------
+        skip_rows : size_type
+            Number of rows to skip from start
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+        """
+        self.c_obj.skip_rows(skip_rows)
+        return self
+
+    cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows):
+        """
+        Sets number of rows to read.
+
+        Parameters
+        ----------
+        num_rows : size_type
+            Number of rows to read after skip
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+        """
+        self.c_obj.num_rows(num_rows)
+        return self
+
+    cpdef AvroReaderOptions build(self):
+        """Create a AvroReaderOptions object"""
+        cdef AvroReaderOptions avro_options = AvroReaderOptions.__new__(
+            AvroReaderOptions
+        )
+        avro_options.c_obj = move(self.c_obj.build())
+        avro_options.source = self.source
+        return avro_options
 
 
 cpdef TableWithMetadata read_avro(
-    SourceInfo source_info,
-    list columns = None,
-    size_type skip_rows = 0,
-    size_type num_rows = -1
+    AvroReaderOptions options
 ):
     """
-    Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
+    Read from Avro format.
+
+    The source to read from and options are encapsulated
+    by the `options` object.
 
     For details, see :cpp:func:`read_avro`.
 
     Parameters
     ----------
-    source_info: SourceInfo
-        The SourceInfo object to read the avro dataset from.
-    columns: list, default None
-        Optional columns to read, if not provided, reads all columns in the file.
-    skip_rows: size_type, default 0
-        The number of rows to skip.
-    num_rows: size_type, default -1
-        The number of rows to read, after skipping rows.
-        If -1 is passed, all rows will be read.
-
-    Returns
-    -------
-    TableWithMetadata
-        The Table and its corresponding metadata (column names) that were read in.
+    options: AvroReaderOptions
+        Settings for controlling reading behavior
     """
-    cdef vector[string] c_columns
-    if columns is not None and len(columns) > 0:
-        c_columns.reserve(len(columns))
-        for col in columns:
-            c_columns.push_back(str(col).encode())
-
-    cdef avro_reader_options avro_opts = (
-        avro_reader_options.builder(source_info.c_obj)
-        .columns(c_columns)
-        .skip_rows(skip_rows)
-        .num_rows(num_rows)
-        .build()
-    )
-
     with nogil:
-        c_result = move(cpp_read_avro(avro_opts))
+        c_result = move(cpp_read_avro(options.c_obj))
 
     return TableWithMetadata.from_libcudf(c_result)
diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd
index f65c1034598..7e446298ba9 100644
--- a/python/pylibcudf/pylibcudf/io/json.pxd
+++ b/python/pylibcudf/pylibcudf/io/json.pxd
@@ -6,42 +6,76 @@ from pylibcudf.io.types cimport (
     TableWithMetadata,
     compression_type,
 )
-from pylibcudf.libcudf.io.json cimport json_recovery_mode_t
+from pylibcudf.libcudf.io.json cimport (
+    json_recovery_mode_t,
+    json_reader_options,
+    json_reader_options_builder,
+    json_writer_options,
+    json_writer_options_builder,
+)
 from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.table cimport Table
 
 
-cpdef TableWithMetadata read_json(
-    SourceInfo source_info,
-    list dtypes = *,
-    compression_type compression = *,
-    bool lines = *,
-    size_t byte_range_offset = *,
-    size_t byte_range_size = *,
-    bool keep_quotes = *,
-    bool mixed_types_as_string = *,
-    bool prune_columns = *,
-    json_recovery_mode_t recovery_mode = *,
-)
+cdef class JsonReaderOptions:
+    cdef json_reader_options c_obj
+    cdef SourceInfo source
+    cpdef void set_dtypes(self, list types)
+    cpdef void enable_keep_quotes(self, bool keep_quotes)
+    cpdef void enable_mixed_types_as_string(self, bool mixed_types_as_string)
+    cpdef void enable_prune_columns(self, bool prune_columns)
+    cpdef void set_byte_range_offset(self, size_t offset)
+    cpdef void set_byte_range_size(self, size_t size)
+    cpdef void enable_lines(self, bool val)
+    # These hidden options are subjected to change without deprecation cycle.
+    # These are used to test libcudf JSON reader features, not used in cuDF.
+    cpdef void set_delimiter(self, str val)
+    cpdef void enable_dayfirst(self, bool val)
+    cpdef void enable_experimental(self, bool val)
+    cpdef void enable_normalize_single_quotes(self, bool val)
+    cpdef void enable_normalize_whitespace(self, bool val)
+    cpdef void set_strict_validation(self, bool val)
+    cpdef void allow_unquoted_control_chars(self, bool val)
+    cpdef void allow_numeric_leading_zeros(self, bool val)
+    cpdef void allow_nonnumeric_numbers(self, bool val)
+    cpdef void set_na_values(self, list vals)
 
+cdef class JsonReaderOptionsBuilder:
+    cdef json_reader_options_builder c_obj
+    cdef SourceInfo source
+    cpdef JsonReaderOptionsBuilder compression(self, compression_type compression)
+    cpdef JsonReaderOptionsBuilder lines(self, bool val)
+    cpdef JsonReaderOptionsBuilder keep_quotes(self, bool val)
+    cpdef JsonReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset)
+    cpdef JsonReaderOptionsBuilder byte_range_size(self, size_t byte_range_size)
+    cpdef JsonReaderOptionsBuilder recovery_mode(
+        self, json_recovery_mode_t recovery_mode
+    )
+    cpdef build(self)
 
-cpdef void write_json(
-    SinkInfo sink_info,
-    TableWithMetadata tbl,
-    str na_rep = *,
-    bool include_nulls = *,
-    bool lines = *,
-    size_type rows_per_chunk = *,
-    str true_value = *,
-    str false_value = *
-)
+cpdef TableWithMetadata read_json(JsonReaderOptions options)
+
+cdef class JsonWriterOptions:
+    cdef json_writer_options c_obj
+    cdef SinkInfo sink
+    cdef Table table
+    cpdef void set_rows_per_chunk(self, size_type val)
+    cpdef void set_true_value(self, str val)
+    cpdef void set_false_value(self, str val)
+
+cdef class JsonWriterOptionsBuilder:
+    cdef json_writer_options_builder c_obj
+    cdef SinkInfo sink
+    cdef Table table
+    cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta)
+    cpdef JsonWriterOptionsBuilder na_rep(self, str val)
+    cpdef JsonWriterOptionsBuilder include_nulls(self, bool val)
+    cpdef JsonWriterOptionsBuilder lines(self, bool val)
+    cpdef JsonWriterOptions build(self)
+
+cpdef void write_json(JsonWriterOptions options)
 
 cpdef tuple chunked_read_json(
-    SourceInfo source_info,
-    list dtypes = *,
-    compression_type compression = *,
-    bool keep_quotes = *,
-    bool mixed_types_as_string = *,
-    bool prune_columns = *,
-    json_recovery_mode_t recovery_mode = *,
+    JsonReaderOptions options,
     int chunk_size= *,
 )
diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi
index b2bc6a43700..b84b437a3a2 100644
--- a/python/pylibcudf/pylibcudf/io/json.pyi
+++ b/python/pylibcudf/pylibcudf/io/json.pyi
@@ -2,6 +2,8 @@
 from collections.abc import Mapping
 from typing import TypeAlias
 
+from typing_extensions import Self
+
 from pylibcudf.column import Column
 from pylibcudf.io.types import (
     CompressionType,
@@ -10,41 +12,64 @@ from pylibcudf.io.types import (
     SourceInfo,
     TableWithMetadata,
 )
+from pylibcudf.table import Table
 from pylibcudf.types import DataType
 
 ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap]
 
 NameAndType: TypeAlias = tuple[str, DataType, list[NameAndType]]
 
-def read_json(
-    source_info: SourceInfo,
-    dtypes: list[NameAndType] | None = None,
-    compression: CompressionType = CompressionType.AUTO,
-    lines: bool = False,
-    byte_range_offset: int = 0,
-    byte_range_size: int = 0,
-    keep_quotes: bool = False,
-    mixed_types_as_string: bool = False,
-    prune_columns: bool = False,
-    recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL,
-) -> TableWithMetadata: ...
-def write_json(
-    sink_info: SinkInfo,
-    table_w_meta: TableWithMetadata,
-    na_rep: str = "",
-    include_nulls: bool = False,
-    lines: bool = False,
-    rows_per_chunk: int = 2**32 - 1,
-    true_value: str = "true",
-    false_value: str = "false",
-) -> None: ...
+class JsonReaderOptions:
+    def set_dtypes(
+        self, types: list[DataType] | list[NameAndType]
+    ) -> None: ...
+    def enable_keep_quotes(self, keep_quotes: bool) -> None: ...
+    def enable_mixed_types_as_string(
+        self, mixed_types_as_string: bool
+    ) -> None: ...
+    def enable_prune_columns(self, prune_columns: bool) -> None: ...
+    def set_byte_range_offset(self, offset: int) -> None: ...
+    def set_byte_range_size(self, size: int) -> None: ...
+    def enable_lines(self, val: bool) -> None: ...
+    def set_delimiter(self, val: str) -> None: ...
+    def enable_dayfirst(self, val: bool) -> None: ...
+    def enable_experimental(self, val: bool) -> None: ...
+    def enable_normalize_single_quotes(self, val: bool) -> None: ...
+    def enable_normalize_whitespace(self, val: bool) -> None: ...
+    def set_strict_validation(self, val: bool) -> None: ...
+    def allow_unquoted_control_chars(self, val: bool) -> None: ...
+    def allow_numeric_leading_zeros(self, val: bool) -> None: ...
+    def allow_nonnumeric_numbers(self, val: bool) -> None: ...
+    def set_na_values(self, vals: list[str]) -> None: ...
+    @staticmethod
+    def builder(source: SourceInfo) -> JsonReaderOptionsBuilder: ...
+
+class JsonReaderOptionsBuilder:
+    def compression(self, compression: CompressionType) -> Self: ...
+    def lines(self, lines: bool) -> Self: ...
+    def byte_range_offset(self, byte_range_offset: int) -> Self: ...
+    def byte_range_size(self, byte_range_size: int) -> Self: ...
+    def recovery_mode(self, recovery_mode: JSONRecoveryMode) -> Self: ...
+    def build(self) -> JsonReaderOptions: ...
+
+def read_json(options: JsonReaderOptions) -> TableWithMetadata: ...
+
+class JsonWriterOptions:
+    @staticmethod
+    def builder(sink: SinkInfo, table: Table) -> JsonWriterOptionsBuilder: ...
+    def set_rows_per_chunk(self, val: int) -> None: ...
+    def set_true_value(self, val: str) -> None: ...
+    def set_false_value(self, val: str) -> None: ...
+
+class JsonWriterOptionsBuilder:
+    def metadata(self, tbl_w_meta: TableWithMetadata) -> Self: ...
+    def na_rep(self, val: str) -> Self: ...
+    def include_nulls(self, val: bool) -> Self: ...
+    def lines(self, val: bool) -> Self: ...
+    def build(self) -> JsonWriterOptions: ...
+
+def write_json(options: JsonWriterOptions) -> None: ...
 def chunked_read_json(
-    source_info: SourceInfo,
-    dtypes: list[NameAndType] | None = None,
-    compression: CompressionType = CompressionType.AUTO,
-    keep_quotes: bool = False,
-    mixed_types_as_string: bool = False,
-    prune_columns: bool = False,
-    recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL,
+    options: JsonReaderOptions,
     chunk_size: int = 100_000_000,
 ) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ...
diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx
index ad2989925c9..1d8a559afad 100644
--- a/python/pylibcudf/pylibcudf/io/json.pyx
+++ b/python/pylibcudf/pylibcudf/io/json.pyx
@@ -1,6 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp cimport bool
-from libcpp.limits cimport numeric_limits
 from libcpp.map cimport map
 from libcpp.string cimport string
 from libcpp.utility cimport move
@@ -17,13 +16,20 @@ from pylibcudf.libcudf.io.json cimport (
 )
 from pylibcudf.libcudf.io.types cimport (
     compression_type,
-    table_metadata,
     table_with_metadata,
 )
 from pylibcudf.libcudf.types cimport data_type, size_type
 from pylibcudf.types cimport DataType
 
-__all__ = ["chunked_read_json", "read_json", "write_json"]
+__all__ = [
+    "chunked_read_json",
+    "read_json",
+    "write_json",
+    "JsonReaderOptions",
+    "JsonReaderOptionsBuilder",
+    "JsonWriterOptions",
+    "JsonWriterOptionsBuilder"
+]
 
 cdef map[string, schema_element] _generate_schema_map(list dtypes):
     cdef map[string, schema_element] schema_map
@@ -47,21 +53,21 @@ cdef map[string, schema_element] _generate_schema_map(list dtypes):
     return schema_map
 
 
-cdef json_reader_options _setup_json_reader_options(
+cpdef JsonReaderOptions _setup_json_reader_options(
         SourceInfo source_info,
         list dtypes,
-        compression_type compression,
-        bool lines,
-        size_t byte_range_offset,
-        size_t byte_range_size,
-        bool keep_quotes,
-        bool mixed_types_as_string,
-        bool prune_columns,
-        json_recovery_mode_t recovery_mode):
-
-    cdef vector[data_type] types_vec
-    cdef json_reader_options opts = (
-        json_reader_options.builder(source_info.c_obj)
+        compression_type compression = compression_type.AUTO,
+        bool lines = False,
+        size_t byte_range_offset = 0,
+        size_t byte_range_size = 0,
+        bool keep_quotes = False,
+        bool mixed_types_as_string = False,
+        bool prune_columns = False,
+        json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+        dict extra_parameters=None,
+):
+    options = (
+        JsonReaderOptions.builder(source_info)
         .compression(compression)
         .lines(lines)
         .byte_range_offset(byte_range_offset)
@@ -71,55 +77,359 @@ cdef json_reader_options _setup_json_reader_options(
     )
 
     if dtypes is not None:
-        if isinstance(dtypes[0], tuple):
-            opts.set_dtypes(move(_generate_schema_map(dtypes)))
+        options.set_dtypes(dtypes)
+
+    options.enable_keep_quotes(keep_quotes)
+    options.enable_mixed_types_as_string(mixed_types_as_string)
+    options.enable_prune_columns(prune_columns)
+
+    # These hidden options are subjected to change without deprecation cycle.
+    # These are used to test libcudf JSON reader features, not used in cuDF.
+    if extra_parameters is not None:
+        for key, value in extra_parameters.items():
+            if key == 'delimiter':
+                options.set_delimiter(value)
+            elif key == 'dayfirst':
+                options.enable_dayfirst(value)
+            elif key == 'experimental':
+                options.enable_experimental(value)
+            elif key == 'normalize_single_quotes':
+                options.enable_normalize_single_quotes(value)
+            elif key == 'normalize_whitespace':
+                options.enable_normalize_whitespace(value)
+            elif key == 'strict_validation':
+                options.set_strict_validation(value)
+            elif key == 'allow_unquoted_control_chars':
+                options.allow_unquoted_control_chars(value)
+            elif key == 'allow_numeric_leading_zeros':
+                options.allow_numeric_leading_zeros(value)
+            elif key == 'allow_nonnumeric_numbers':
+                options.allow_nonnumeric_numbers(value)
+            elif key == 'na_values':
+                options.set_na_values(value)
+            else:
+                raise ValueError(
+                    "cudf engine doesn't support the "
+                    f"'{key}' keyword argument for read_json"
+                )
+    return options
+
+
+cdef class JsonReaderOptions:
+    """
+    The settings to use for ``read_json``
+
+    For details, see `:cpp:class:`cudf::io::json_reader_options`
+    """
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create a JsonReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::json_reader_options::builder`
+
+        Parameters
+        ----------
+        sink : SourceInfo
+            The source to read the JSON file from.
+
+        Returns
+        -------
+        JsonReaderOptionsBuilder
+            Builder to build JsonReaderOptions
+        """
+        cdef JsonReaderOptionsBuilder json_builder = (
+            JsonReaderOptionsBuilder.__new__(JsonReaderOptionsBuilder)
+        )
+        json_builder.c_obj = json_reader_options.builder(source.c_obj)
+        json_builder.source = source
+        return json_builder
+
+    cpdef void set_dtypes(self, list types):
+        """
+        Set data types for columns to be read.
+
+        Parameters
+        ----------
+        types : list
+            List of dtypes or a list of tuples of
+            column names, dtypes, and list of tuples
+            (to support nested column hierarchy)
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[data_type] types_vec
+        if isinstance(types[0], tuple):
+            self.c_obj.set_dtypes(_generate_schema_map(types))
         else:
-            for dtype in dtypes:
+            types_vec.reserve(len(types))
+            for dtype in types:
                 types_vec.push_back((<DataType>dtype).c_obj)
-            opts.set_dtypes(types_vec)
-
-    opts.enable_keep_quotes(keep_quotes)
-    opts.enable_mixed_types_as_string(mixed_types_as_string)
-    opts.enable_prune_columns(prune_columns)
-    return opts
+            self.c_obj.set_dtypes(types_vec)
+
+    cpdef void enable_keep_quotes(self, bool keep_quotes):
+        """
+        Set whether the reader should keep quotes of string values.
+
+        Parameters
+        ----------
+        keep_quotes : bool
+           Boolean value to indicate whether the reader should
+           keep quotes of string values
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_keep_quotes(keep_quotes)
+
+    cpdef void enable_mixed_types_as_string(self, bool mixed_types_as_string):
+        """
+        Set whether to parse mixed types as a string column.
+        Also enables forcing to read a struct as string column using schema.
+
+        Parameters
+        ----------
+        mixed_types_as_string : bool
+           Boolean value to enable/disable parsing mixed types
+           as a string column
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_mixed_types_as_string(mixed_types_as_string)
+
+    cpdef void enable_prune_columns(self, bool prune_columns):
+        """
+        Set whether to prune columns on read, selected
+        based on the ``set_dtypes`` option.
+
+        Parameters
+        ----------
+        prune_columns : bool
+           When set as true, if the reader options include
+           ``set_dtypes``, then the reader will only return those
+           columns which are mentioned in ``set_dtypes``. If false,
+           then all columns are returned, independent of the
+           ``set_dtypes`` setting.
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_prune_columns(prune_columns)
+
+    cpdef void set_byte_range_offset(self, size_t offset):
+        """
+        Set number of bytes to skip from source start.
+
+        Parameters
+        ----------
+        offset : size_t
+            Number of bytes of offset
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_byte_range_offset(offset)
+
+    cpdef void set_byte_range_size(self, size_t size):
+        """
+        Set number of bytes to read.
+
+        Parameters
+        ----------
+        size : size_t
+            Number of bytes to read
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_byte_range_size(size)
+
+    cpdef void enable_lines(self, bool val):
+        """
+        Set whether to read the file as a json object per line.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable the option
+            to read each line as a json object
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_lines(val)
+
+    # These hidden options are subjected to change without deprecation cycle.
+    # These are used to test libcudf JSON reader features, not used in cuDF.
+
+    cpdef void set_delimiter(self, str val):
+        self.c_obj.set_delimiter(val.encode())
+
+    cpdef void enable_dayfirst(self, bool val):
+        self.c_obj.enable_dayfirst(val)
+
+    cpdef void enable_experimental(self, bool val):
+        self.c_obj.enable_experimental(val)
+
+    cpdef void enable_normalize_single_quotes(self, bool val):
+        self.c_obj.enable_normalize_single_quotes(val)
+
+    cpdef void enable_normalize_whitespace(self, bool val):
+        self.c_obj.enable_normalize_whitespace(val)
+
+    cpdef void set_strict_validation(self, bool val):
+        self.c_obj.set_strict_validation(val)
+
+    cpdef void allow_unquoted_control_chars(self, bool val):
+        self.c_obj.allow_unquoted_control_chars(val)
+
+    cpdef void allow_numeric_leading_zeros(self, bool val):
+        self.c_obj.allow_numeric_leading_zeros(val)
+
+    cpdef void allow_nonnumeric_numbers(self, bool val):
+        self.c_obj.allow_nonnumeric_numbers(val)
+
+    cpdef void set_na_values(self, list vals):
+        cdef vector[string] vec
+        for val in vals:
+            if isinstance(val, str):
+                vec.push_back(val.encode())
+        self.c_obj.set_na_values(vec)
+
+
+cdef class JsonReaderOptionsBuilder:
+    cpdef JsonReaderOptionsBuilder compression(self, compression_type compression):
+        """
+        Sets compression type.
+
+        Parameters
+        ----------
+        compression : CompressionType
+            The compression type to use
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.compression(compression)
+        return self
+
+    cpdef JsonReaderOptionsBuilder lines(self, bool val):
+        """
+        Set whether to read the file as a json object per line.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable the option
+            to read each line as a json object
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.lines(val)
+        return self
+
+    cpdef JsonReaderOptionsBuilder keep_quotes(self, bool val):
+        """
+        Set whether the reader should keep quotes of string values.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to indicate whether the
+            reader should keep quotes of string values
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.keep_quotes(val)
+        return self
+
+    cpdef JsonReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset):
+        """
+        Set number of bytes to skip from source start.
+
+        Parameters
+        ----------
+        byte_range_offset : size_t
+            Number of bytes of offset
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.byte_range_offset(byte_range_offset)
+        return self
+
+    cpdef JsonReaderOptionsBuilder byte_range_size(self, size_t byte_range_size):
+        """
+        Set number of bytes to read.
+
+        Parameters
+        ----------
+        byte_range_size : size_t
+            Number of bytes to read
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.byte_range_size(byte_range_size)
+        return self
+
+    cpdef JsonReaderOptionsBuilder recovery_mode(
+        self,
+        json_recovery_mode_t recovery_mode
+    ):
+        """
+        Specifies the JSON reader's behavior on invalid JSON lines.
+
+        Parameters
+        ----------
+        recovery_mode : json_recovery_mode_t
+            An enum value to indicate the JSON reader's
+            behavior on invalid JSON lines.
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.recovery_mode(recovery_mode)
+        return self
+
+    cpdef build(self):
+        """Create a JsonReaderOptions object"""
+        cdef JsonReaderOptions json_options = JsonReaderOptions.__new__(
+            JsonReaderOptions
+        )
+        json_options.c_obj = move(self.c_obj.build())
+        json_options.source = self.source
+        return json_options
 
 
 cpdef tuple chunked_read_json(
-    SourceInfo source_info,
-    list dtypes = None,
-    compression_type compression = compression_type.AUTO,
-    bool keep_quotes = False,
-    bool mixed_types_as_string = False,
-    bool prune_columns = False,
-    json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+    JsonReaderOptions options,
     int chunk_size=100_000_000,
 ):
-    """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`.
+    """
+    Reads chunks of a JSON file into a :py:class:`~.types.TableWithMetadata`.
 
     Parameters
     ----------
-    source_info : SourceInfo
-        The SourceInfo object to read the JSON file from.
-    dtypes : list, default None
-        Set data types for the columns in the JSON file.
-
-        Each element of the list has the format
-        (column_name, column_dtype, list of child dtypes), where
-        the list of child dtypes is an empty list if the child is not
-        a nested type (list or struct dtype), and is of format
-        (column_child_name, column_child_type, list of grandchild dtypes).
-    compression: CompressionType, default CompressionType.AUTO
-        The compression format of the JSON source.
-    keep_quotes : bool, default False
-        Whether the reader should keep quotes of string values.
-    mixed_types_as_string : bool, default False
-        If True, mixed type columns are returned as string columns.
-        If `False` parsing mixed type columns will thrown an error.
-    prune_columns : bool, default False
-        Whether to only read columns specified in dtypes.
-    recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
-        Whether to raise an error or set corresponding values to null
-        when encountering an invalid JSON line.
+    options : JsonReaderOptions
+        Settings for controlling reading behavior
     chunk_size : int, default 100_000_000 bytes.
         The number of bytes to be read in chunks.
         The chunk_size should be set to at least row_size.
@@ -132,20 +442,6 @@ cpdef tuple chunked_read_json(
     cdef size_type c_range_size = (
         chunk_size if chunk_size is not None else 0
     )
-    cdef json_reader_options opts = _setup_json_reader_options(
-        source_info=source_info,
-        dtypes=dtypes,
-        compression=compression,
-        lines=True,
-        byte_range_offset=0,
-        byte_range_size=0,
-        keep_quotes=keep_quotes,
-        mixed_types_as_string=mixed_types_as_string,
-        prune_columns=prune_columns,
-        recovery_mode=recovery_mode,
-    )
-
-    # Read JSON
     cdef table_with_metadata c_result
 
     final_columns = []
@@ -153,12 +449,13 @@ cpdef tuple chunked_read_json(
     child_names = None
     i = 0
     while True:
-        opts.set_byte_range_offset(c_range_size * i)
-        opts.set_byte_range_size(c_range_size)
+        options.enable_lines(True)
+        options.set_byte_range_offset(c_range_size * i)
+        options.set_byte_range_size(c_range_size)
 
         try:
             with nogil:
-                c_result = move(cpp_read_json(opts))
+                c_result = move(cpp_read_json(options.c_obj))
         except (ValueError, OverflowError):
             break
         if meta_names is None:
@@ -186,125 +483,199 @@ cpdef tuple chunked_read_json(
 
 
 cpdef TableWithMetadata read_json(
-    SourceInfo source_info,
-    list dtypes = None,
-    compression_type compression = compression_type.AUTO,
-    bool lines = False,
-    size_t byte_range_offset = 0,
-    size_t byte_range_size = 0,
-    bool keep_quotes = False,
-    bool mixed_types_as_string = False,
-    bool prune_columns = False,
-    json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+    JsonReaderOptions options
 ):
-    """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`.
+    """
+    Read from JSON format.
+
+    The source to read from and options are encapsulated
+    by the `options` object.
+
+    For details, see :cpp:func:`read_json`.
 
     Parameters
     ----------
-    source_info : SourceInfo
-        The SourceInfo object to read the JSON file from.
-    dtypes : list, default None
-        Set data types for the columns in the JSON file.
-
-        Each element of the list has the format
-        (column_name, column_dtype, list of child dtypes), where
-        the list of child dtypes is an empty list if the child is not
-        a nested type (list or struct dtype), and is of format
-        (column_child_name, column_child_type, list of grandchild dtypes).
-    compression: CompressionType, default CompressionType.AUTO
-        The compression format of the JSON source.
-    byte_range_offset : size_t, default 0
-        Number of bytes to skip from source start.
-    byte_range_size : size_t, default 0
-        Number of bytes to read. By default, will read all bytes.
-    keep_quotes : bool, default False
-        Whether the reader should keep quotes of string values.
-    mixed_types_as_string : bool, default False
-        If True, mixed type columns are returned as string columns.
-        If `False` parsing mixed type columns will thrown an error.
-    prune_columns : bool, default False
-        Whether to only read columns specified in dtypes.
-    recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
-        Whether to raise an error or set corresponding values to null
-        when encountering an invalid JSON line.
+    options: JsonReaderOptions
+        Settings for controlling reading behavior
 
     Returns
     -------
     TableWithMetadata
         The Table and its corresponding metadata (column names) that were read in.
     """
-    cdef json_reader_options opts = _setup_json_reader_options(
-        source_info=source_info,
-        dtypes=dtypes,
-        compression=compression,
-        lines=lines,
-        byte_range_offset=byte_range_offset,
-        byte_range_size=byte_range_size,
-        keep_quotes=keep_quotes,
-        mixed_types_as_string=mixed_types_as_string,
-        prune_columns=prune_columns,
-        recovery_mode=recovery_mode,
-    )
-
-    # Read JSON
     cdef table_with_metadata c_result
 
     with nogil:
-        c_result = move(cpp_read_json(opts))
+        c_result = move(cpp_read_json(options.c_obj))
 
     return TableWithMetadata.from_libcudf(c_result)
 
 
-cpdef void write_json(
-    SinkInfo sink_info,
-    TableWithMetadata table_w_meta,
-    str na_rep = "",
-    bool include_nulls = False,
-    bool lines = False,
-    size_type rows_per_chunk = numeric_limits[size_type].max(),
-    str true_value = "true",
-    str false_value = "false"
-):
+cdef class JsonWriterOptions:
     """
-    Writes a :py:class:`~pylibcudf.table.Table` to JSON format.
+    The settings to use for ``write_json``
 
-    Parameters
-    ----------
-    sink_info: SinkInfo
-        The SinkInfo object to write the JSON to.
-    table_w_meta: TableWithMetadata
-        The TableWithMetadata object containing the Table to write
-    na_rep: str, default ""
-        The string representation for null values.
-    include_nulls: bool, default False
+    For details, see :cpp:class:`cudf::io::json_writer_options`
+    """
+    @staticmethod
+    def builder(SinkInfo sink, Table table):
+        """
+        Create a JsonWriterOptionsBuilder object
+
+        Parameters
+        ----------
+        sink : SinkInfo
+            The sink used for writer output
+        table : Table
+            Table to be written to output
+
+        Returns
+        -------
+        JsonWriterOptionsBuilder
+            Builder to build JsonWriterOptions
+        """
+        cdef JsonWriterOptionsBuilder json_builder = (
+            JsonWriterOptionsBuilder.__new__(JsonWriterOptionsBuilder)
+        )
+        json_builder.c_obj = json_writer_options.builder(sink.c_obj, table.view())
+        json_builder.sink = sink
+        json_builder.table = table
+        return json_builder
+
+    cpdef void set_rows_per_chunk(self, size_type val):
+        """
+        Sets string to used for null entries.
+
+        Parameters
+        ----------
+        val : size_type
+            String to represent null value
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_rows_per_chunk(val)
+
+    cpdef void set_true_value(self, str val):
+        """
+        Sets string used for values != 0
+
+        Parameters
+        ----------
+        val : str
+            String to represent values != 0
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_true_value(val.encode())
+
+    cpdef void set_false_value(self, str val):
+        """
+        Sets string used for values == 0
+
+        Parameters
+        ----------
+        val : str
+            String to represent values == 0
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_false_value(val.encode())
+
+
+cdef class JsonWriterOptionsBuilder:
+    cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta):
+        """
+        Sets optional metadata (with column names).
+
+        Parameters
+        ----------
+        tbl_w_meta : TableWithMetadata
+            Associated metadata
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.metadata(tbl_w_meta.metadata)
+        return self
+
+    cpdef JsonWriterOptionsBuilder na_rep(self, str val):
+        """
+        Sets string to used for null entries.
+
+        Parameters
+        ----------
+        val : str
+            String to represent null value
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.na_rep(val.encode())
+        return self
+
+    cpdef JsonWriterOptionsBuilder include_nulls(self, bool val):
+        """
         Enables/Disables output of nulls as 'null'.
-    lines: bool, default False
-        If `True`, write output in the JSON lines format.
-    rows_per_chunk: size_type, defaults to length of the input table
-        The maximum number of rows to write at a time.
-    true_value: str, default "true"
-        The string representation for values != 0 in INT8 types.
-    false_value: str, default "false"
-        The string representation for values == 0 in INT8 types.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.include_nulls(val)
+        return self
+
+    cpdef JsonWriterOptionsBuilder lines(self, bool val):
+        """
+        Enables/Disables JSON lines for records format.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.lines(val)
+        return self
+
+    cpdef JsonWriterOptions build(self):
+        """Create a JsonWriterOptions object"""
+        cdef JsonWriterOptions json_options = JsonWriterOptions.__new__(
+            JsonWriterOptions
+        )
+        json_options.c_obj = move(self.c_obj.build())
+        json_options.sink = self.sink
+        json_options.table = self.table
+        return json_options
+
+
+cpdef void write_json(JsonWriterOptions options):
     """
-    cdef table_metadata tbl_meta = table_w_meta.metadata
-    cdef string na_rep_c = na_rep.encode()
-
-    cdef json_writer_options options = (
-        json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view())
-        .metadata(tbl_meta)
-        .na_rep(na_rep_c)
-        .include_nulls(include_nulls)
-        .lines(lines)
-        .build()
-    )
+    Writes a set of columns to JSON format.
 
-    if rows_per_chunk != numeric_limits[size_type].max():
-        options.set_rows_per_chunk(rows_per_chunk)
-    if true_value != "true":
-        options.set_true_value(<string>true_value.encode())
-    if false_value != "false":
-        options.set_false_value(<string>false_value.encode())
+    Parameters
+    ----------
+    options : JsonWriterOptions
+        Settings for controlling writing behavior
 
+    Returns
+    -------
+    None
+    """
     with nogil:
-        cpp_write_json(options)
+        cpp_write_json(options.c_obj)
diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd
index 671f0692444..7531608519c 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pxd
+++ b/python/pylibcudf/pylibcudf/io/orc.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint64_t, int64_t
 from libcpp cimport bool
 from libcpp.optional cimport optional
 from libcpp.string cimport string
@@ -19,6 +19,8 @@ from pylibcudf.libcudf.io.orc_metadata cimport (
 )
 from pylibcudf.libcudf.io.orc cimport (
     orc_chunked_writer,
+    orc_reader_options,
+    orc_reader_options_builder,
     orc_writer_options,
     orc_writer_options_builder,
     chunked_orc_writer_options,
@@ -32,17 +34,23 @@ from pylibcudf.libcudf.io.types cimport (
     statistics_freq,
 )
 
-cpdef TableWithMetadata read_orc(
-    SourceInfo source_info,
-    list columns = *,
-    list stripes = *,
-    size_type skip_rows = *,
-    size_type nrows = *,
-    bool use_index = *,
-    bool use_np_dtypes = *,
-    DataType timestamp_type = *,
-    list decimal128_columns = *
-)
+cdef class OrcReaderOptions:
+    cdef orc_reader_options c_obj
+    cdef SourceInfo source
+    cpdef void set_num_rows(self, int64_t nrows)
+    cpdef void set_skip_rows(self, int64_t skip_rows)
+    cpdef void set_stripes(self, list stripes)
+    cpdef void set_decimal128_columns(self, list val)
+    cpdef void set_timestamp_type(self, DataType type_)
+    cpdef void set_columns(self, list col_names)
+
+cdef class OrcReaderOptionsBuilder:
+    cdef orc_reader_options_builder c_obj
+    cdef SourceInfo source
+    cpdef OrcReaderOptionsBuilder use_index(self, bool use)
+    cpdef OrcReaderOptions build(self)
+
+cpdef TableWithMetadata read_orc(OrcReaderOptions options)
 
 cdef class OrcColumnStatistics:
     cdef optional[uint64_t] number_of_values_c
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi
index 516f97981e9..c496b7a2152 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pyi
+++ b/python/pylibcudf/pylibcudf/io/orc.pyi
@@ -1,6 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from typing import Any, Self
+from typing import Any
+
+from typing_extensions import Self
 
 from pylibcudf.io.types import (
     CompressionType,
@@ -11,19 +13,21 @@ from pylibcudf.io.types import (
     TableWithMetadata,
 )
 from pylibcudf.table import Table
-from pylibcudf.types import DataType
 
-def read_orc(
-    source_info: SourceInfo,
-    columns: list[str] | None = None,
-    stripes: list[list[int]] | None = None,
-    skip_rows: int = 0,
-    nrows: int = -1,
-    use_index: bool = True,
-    use_np_dtypes: bool = True,
-    timestamp_type: DataType | None = None,
-    decimal128_columns: list[str] | None = None,
-) -> TableWithMetadata: ...
+class OrcReaderOptions:
+    def set_num_rows(self, nrows: int) -> None: ...
+    def set_skip_rows(self, skip_rows: int) -> None: ...
+    def set_stripes(self, stripes: list[list[int]]) -> None: ...
+    def set_decimal128_columns(self, val: list[str]) -> None: ...
+    def set_columns(self, col_names: list[str]) -> None: ...
+    @staticmethod
+    def builder(source: SourceInfo) -> OrcReaderOptionsBuilder: ...
+
+class OrcReaderOptionsBuilder:
+    def use_index(self, use: bool) -> Self: ...
+    def build(self) -> OrcReaderOptions: ...
+
+def read_orc(options: OrcReaderOptions) -> TableWithMetadata: ...
 
 class OrcColumnStatistics:
     def __init__(self): ...
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx
index 63eab4a9634..c125d7e76fa 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pyx
+++ b/python/pylibcudf/pylibcudf/io/orc.pyx
@@ -46,6 +46,8 @@ __all__ = [
     "read_orc",
     "read_parsed_orc_statistics",
     "write_orc",
+    "OrcReaderOptions",
+    "OrcReaderOptionsBuilder",
     "OrcWriterOptions",
     "OrcWriterOptionsBuilder",
     "OrcChunkedWriter",
@@ -237,84 +239,190 @@ cdef class ParsedOrcStatistics:
         return out
 
 
-cpdef TableWithMetadata read_orc(
-    SourceInfo source_info,
-    list columns = None,
-    list stripes = None,
-    size_type skip_rows = 0,
-    size_type nrows = -1,
-    bool use_index = True,
-    bool use_np_dtypes = True,
-    DataType timestamp_type = None,
-    list decimal128_columns = None,
-):
-    """Reads an ORC file into a :py:class:`~.types.TableWithMetadata`.
-
-    Parameters
-    ----------
-    source_info : SourceInfo
-        The SourceInfo object to read the Parquet file from.
-    columns : list, default None
-        The string names of the columns to be read.
-    stripes : list[list[size_type]], default None
-        List of stripes to be read.
-    skip_rows : int64_t, default 0
-        The number of rows to skip from the start of the file.
-    nrows : size_type, default -1
-        The number of rows to read. By default, read the entire file.
-    use_index : bool, default True
-        Whether to use the row index to speed up reading.
-    use_np_dtypes : bool, default True
-        Whether to use numpy compatible dtypes.
-    timestamp_type : DataType, default None
-        The timestamp type to use for the timestamp columns.
-    decimal128_columns : list, default None
-        List of column names to be read as 128-bit decimals.
+cdef class OrcReaderOptions:
+    """
+    The settings to use for ``read_orc``
 
-    Returns
-    -------
-    TableWithMetadata
-        The Table and its corresponding metadata (column names) that were read in.
+    For details, see :cpp:class:`cudf::io::orc_reader_options`
     """
-    cdef orc_reader_options opts
-    cdef vector[vector[size_type]] c_stripes
-    opts = (
-        orc_reader_options.builder(source_info.c_obj)
-        .use_index(use_index)
-        .build()
-    )
-    if nrows >= 0:
-        opts.set_num_rows(nrows)
-    if skip_rows >= 0:
-        opts.set_skip_rows(skip_rows)
-    if stripes is not None:
-        c_stripes = stripes
-        opts.set_stripes(c_stripes)
-    if timestamp_type is not None:
-        opts.set_timestamp_type(timestamp_type.c_obj)
-
-    cdef vector[string] c_decimal128_columns
-    if decimal128_columns is not None and len(decimal128_columns) > 0:
-        c_decimal128_columns.reserve(len(decimal128_columns))
-        for col in decimal128_columns:
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create a OrcReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::orc_reader_options::builder`
+
+        Parameters
+        ----------
+        sink : SourceInfo
+            The source to read the ORC file from.
+
+        Returns
+        -------
+        OrcReaderOptionsBuilder
+            Builder to build OrcReaderOptions
+        """
+        cdef OrcReaderOptionsBuilder orc_builder = (
+            OrcReaderOptionsBuilder.__new__(OrcReaderOptionsBuilder)
+        )
+        orc_builder.c_obj = orc_reader_options.builder(source.c_obj)
+        orc_builder.source = source
+        return orc_builder
+
+    cpdef void set_num_rows(self, int64_t nrows):
+        """
+        Sets number of row to read.
+
+        Parameters
+        ----------
+        nrows: int64_t
+            Number of rows
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_num_rows(nrows)
+
+    cpdef void set_skip_rows(self, int64_t skip_rows):
+        """
+        Sets number of rows to skip from the start.
+
+        Parameters
+        ----------
+        skip_rows: int64_t
+            Number of rows
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_skip_rows(skip_rows)
+
+    cpdef void set_stripes(self, list stripes):
+        """
+        Sets list of stripes to read for each input source.
+
+        Parameters
+        ----------
+        stripes: list[list[size_type]]
+            List of lists, mapping stripes to read to input sources
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[vector[size_type]] c_stripes
+        cdef vector[size_type] vec
+        for sub_list in stripes:
+            for x in sub_list:
+                vec.push_back(x)
+            c_stripes.push_back(vec)
+            vec.clear()
+        self.c_obj.set_stripes(c_stripes)
+
+    cpdef void set_decimal128_columns(self, list val):
+        """
+        Set columns that should be read as 128-bit Decimal.
+
+        Parameters
+        ----------
+        val: list[str]
+            List of fully qualified column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] c_decimal128_columns
+        c_decimal128_columns.reserve(len(val))
+        for col in val:
             if not isinstance(col, str):
                 raise TypeError("Decimal 128 column names must be strings!")
             c_decimal128_columns.push_back(col.encode())
-        opts.set_decimal128_columns(c_decimal128_columns)
+        self.c_obj.set_decimal128_columns(c_decimal128_columns)
+
+    cpdef void set_timestamp_type(self, DataType type_):
+        """
+        Sets timestamp type to which timestamp column will be cast.
+
+        Parameters
+        ----------
+        type_: DataType
+            Type of timestamp
 
-    cdef vector[string] c_column_names
-    if columns is not None and len(columns) > 0:
-        c_column_names.reserve(len(columns))
-        for col in columns:
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_timestamp_type(type_.c_obj)
+
+    cpdef void set_columns(self, list col_names):
+        """
+        Sets names of the column to read.
+
+        Parameters
+        ----------
+        col_names: list[str]
+            List of column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] c_column_names
+        c_column_names.reserve(len(col_names))
+        for col in col_names:
             if not isinstance(col, str):
                 raise TypeError("Column names must be strings!")
             c_column_names.push_back(col.encode())
-        opts.set_columns(c_column_names)
+        self.c_obj.set_columns(c_column_names)
+
+cdef class OrcReaderOptionsBuilder:
+    cpdef OrcReaderOptionsBuilder use_index(self, bool use):
+        """
+        Enable/Disable use of row index to speed-up reading.
+
+        Parameters
+        ----------
+        use : bool
+            Boolean value to enable/disable row index use
 
+        Returns
+        -------
+        OrcReaderOptionsBuilder
+        """
+        self.c_obj.use_index(use)
+        return self
+
+    cpdef OrcReaderOptions build(self):
+        """Create a OrcReaderOptions object"""
+        cdef OrcReaderOptions orc_options = OrcReaderOptions.__new__(
+            OrcReaderOptions
+        )
+        orc_options.c_obj = move(self.c_obj.build())
+        orc_options.source = self.source
+        return orc_options
+
+
+cpdef TableWithMetadata read_orc(OrcReaderOptions options):
+    """
+    Read from ORC format.
+
+    The source to read from and options are encapsulated
+    by the `options` object.
+
+    For details, see :cpp:func:`read_orc`.
+
+    Parameters
+    ----------
+    options: OrcReaderOptions
+        Settings for controlling reading behavior
+    """
     cdef table_with_metadata c_result
 
     with nogil:
-        c_result = move(cpp_read_orc(opts))
+        c_result = move(cpp_read_orc(options.c_obj))
 
     return TableWithMetadata.from_libcudf(c_result)
 
@@ -503,7 +611,7 @@ cpdef void write_orc(OrcWriterOptions options):
     The table to write, output paths, and options are encapsulated
     by the `options` object.
 
-    For details, see :cpp:func:`write_csv`.
+    For details, see :cpp:func:`write_orc`.
 
     Parameters
     ----------
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
index a7ca6978621..c241c478f25 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
@@ -5,6 +5,7 @@ from libc.stdint cimport int32_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
+from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 from pylibcudf.exception_handler cimport libcudf_exception_handler
@@ -17,6 +18,7 @@ cdef extern from "cudf/io/json.hpp" \
     cdef struct schema_element:
         data_type type
         map[string, schema_element] child_types
+        optional[vector[string]] column_order
 
     cpdef enum class json_recovery_mode_t(int32_t):
         FAIL
@@ -30,30 +32,51 @@ cdef extern from "cudf/io/json.hpp" \
             except +libcudf_exception_handler
         size_t get_byte_range_offset() except +libcudf_exception_handler
         size_t get_byte_range_size() except +libcudf_exception_handler
+        size_t get_byte_range_size_with_padding() except +libcudf_exception_handler
+        size_t get_byte_range_padding() except +libcudf_exception_handler
+        char get_delimiter() except +libcudf_exception_handler
         bool is_enabled_lines() except +libcudf_exception_handler
         bool is_enabled_mixed_types_as_string() except +libcudf_exception_handler
         bool is_enabled_prune_columns() except +libcudf_exception_handler
-        bool is_enabled_dayfirst() except +libcudf_exception_handler
         bool is_enabled_experimental() except +libcudf_exception_handler
+        bool is_enabled_dayfirst() except +libcudf_exception_handler
+        bool is_enabled_keep_quotes() except +libcudf_exception_handler
+        bool is_enabled_normalize_single_quotes() except +libcudf_exception_handler
+        bool is_enabled_normalize_whitespace() except +libcudf_exception_handler
+        json_recovery_mode_t recovery_mode() except +libcudf_exception_handler
+        bool is_strict_validation() except +libcudf_exception_handler
+        bool is_allowed_numeric_leading_zeros() except +libcudf_exception_handler
+        bool is_allowed_nonnumeric_numbers() except +libcudf_exception_handler
+        bool is_allowed_unquoted_control_chars() except +libcudf_exception_handler
+        vector[string] get_na_values() except +libcudf_exception_handler
 
         # setter
-        void set_dtypes(
-            vector[data_type] types
-        ) except +libcudf_exception_handler
-        void set_dtypes(
-            map[string, schema_element] types
-        ) except +libcudf_exception_handler
-        void set_compression(
-            cudf_io_types.compression_type compression
-        ) except +libcudf_exception_handler
+        void set_dtypes(vector[data_type] types) except +libcudf_exception_handler
+        void set_dtypes(map[string, data_type] types) except +libcudf_exception_handler
+        void set_dtypes(map[string, schema_element] types)\
+            except +libcudf_exception_handler
+        void set_dtypes(schema_element types) except +libcudf_exception_handler
+        void set_compression(cudf_io_types.compression_type comp_type)\
+            except +libcudf_exception_handler
         void set_byte_range_offset(size_t offset) except +libcudf_exception_handler
         void set_byte_range_size(size_t size) except +libcudf_exception_handler
+        void set_delimiter(char delimiter) except +libcudf_exception_handler
         void enable_lines(bool val) except +libcudf_exception_handler
         void enable_mixed_types_as_string(bool val) except +libcudf_exception_handler
         void enable_prune_columns(bool val) except +libcudf_exception_handler
-        void enable_dayfirst(bool val) except +libcudf_exception_handler
         void enable_experimental(bool val) except +libcudf_exception_handler
+        void enable_dayfirst(bool val) except +libcudf_exception_handler
         void enable_keep_quotes(bool val) except +libcudf_exception_handler
+        void enable_normalize_single_quotes(bool val) except +libcudf_exception_handler
+
+        void enable_normalize_whitespace(bool val) except +libcudf_exception_handler
+        void set_recovery_mode(json_recovery_mode_t val)\
+            except +libcudf_exception_handler
+        void set_strict_validation(bool val) except +libcudf_exception_handler
+        void allow_numeric_leading_zeros(bool val) except +libcudf_exception_handler
+        void allow_nonnumeric_numbers(bool val) except +libcudf_exception_handler
+        void allow_unquoted_control_chars(bool val) except +libcudf_exception_handler
+        void set_na_values(vector[string] vals) except +libcudf_exception_handler
 
         @staticmethod
         json_reader_options_builder builder(
@@ -74,6 +97,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& dtypes(
             map[string, schema_element] types
         ) except +libcudf_exception_handler
+        json_reader_options_builder& dtypes(
+            schema_element types
+        ) except +libcudf_exception_handler
         json_reader_options_builder& compression(
             cudf_io_types.compression_type compression
         ) except +libcudf_exception_handler
@@ -83,6 +109,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& byte_range_size(
             size_t size
         ) except +libcudf_exception_handler
+        json_reader_options_builder& delimiter(
+            char delimiter
+        ) except +libcudf_exception_handler
         json_reader_options_builder& lines(
             bool val
         ) except +libcudf_exception_handler
@@ -92,16 +121,36 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& prune_columns(
             bool val
         ) except +libcudf_exception_handler
+        json_reader_options_builder& experimental(
+            bool val
+        ) except +libcudf_exception_handler
         json_reader_options_builder& dayfirst(
             bool val
         ) except +libcudf_exception_handler
         json_reader_options_builder& keep_quotes(
             bool val
         ) except +libcudf_exception_handler
+        json_reader_options_builder& normalize_single_quotes(
+            bool val
+        ) except +libcudf_exception_handler
+        json_reader_options_builder& normalize_whitespace(
+            bool val
+        ) except +libcudf_exception_handler
         json_reader_options_builder& recovery_mode(
             json_recovery_mode_t val
         ) except +libcudf_exception_handler
 
+        json_reader_options_builder& strict_validation(bool val)\
+            except +libcudf_exception_handler
+        json_reader_options_builder& numeric_leading_zeros(bool val)\
+            except +libcudf_exception_handler
+        json_reader_options_builder& nonnumeric_numbers(bool val)\
+            except +libcudf_exception_handler
+        json_reader_options_builder& unquoted_control_chars(bool val)\
+            except +libcudf_exception_handler
+        json_reader_options_builder& na_values(vector[string] vals)\
+            except +libcudf_exception_handler
+
         json_reader_options build() except +libcudf_exception_handler
 
     cdef cudf_io_types.table_with_metadata read_json(
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
index 3d9d99ffa61..bda8921b62a 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
@@ -98,10 +98,15 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable):
     buffer.seek(0)
 
     res = plc.io.avro.read_avro(
-        plc.io.types.SourceInfo([buffer]),
-        columns=columns,
-        skip_rows=skip_rows,
-        num_rows=num_rows,
+        (
+            plc.io.avro.AvroReaderOptions.builder(
+                plc.io.types.SourceInfo([buffer])
+            )
+            .columns(columns)
+            .skip_rows(skip_rows)
+            .num_rows(num_rows)
+            .build()
+        )
     )
 
     expected = pa.Table.from_arrays(
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py
index 453e5ce32a8..747bbfa1370 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_json.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py
@@ -24,13 +24,19 @@ def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk):
     plc_table_w_meta, pa_table = table_data
     sink = source_or_sink
 
-    plc.io.json.write_json(
-        plc.io.SinkInfo([sink]),
-        plc_table_w_meta,
-        lines=lines,
-        rows_per_chunk=rows_per_chunk,
+    options = (
+        plc.io.json.JsonWriterOptions.builder(
+            plc.io.SinkInfo([sink]), plc_table_w_meta.tbl
+        )
+        .metadata(plc_table_w_meta)
+        .lines(lines)
+        .build()
     )
 
+    options.set_rows_per_chunk(rows_per_chunk)
+
+    plc.io.json.write_json(options)
+
     exp = pa_table.to_pandas()
 
     # Convert everything to string to make
@@ -57,13 +63,18 @@ def test_write_json_nulls(na_rep, include_nulls):
 
     sink = io.StringIO()
 
-    plc.io.json.write_json(
-        plc.io.SinkInfo([sink]),
-        plc_tbl_w_meta,
-        na_rep=na_rep,
-        include_nulls=include_nulls,
+    options = (
+        plc.io.json.JsonWriterOptions.builder(
+            plc.io.SinkInfo([sink]), plc_tbl_w_meta.tbl
+        )
+        .metadata(plc_tbl_w_meta)
+        .na_rep(na_rep)
+        .include_nulls(include_nulls)
+        .build()
     )
 
+    plc.io.json.write_json(options)
+
     exp = pa_tbl.to_pandas()
 
     # Convert everything to string to make
@@ -100,15 +111,21 @@ def test_write_json_bool_opts(true_value, false_value):
 
     sink = io.StringIO()
 
-    plc.io.json.write_json(
-        plc.io.SinkInfo([sink]),
-        plc_tbl_w_meta,
-        include_nulls=True,
-        na_rep="null",
-        true_value=true_value,
-        false_value=false_value,
+    options = (
+        plc.io.json.JsonWriterOptions.builder(
+            plc.io.SinkInfo([sink]), plc_tbl_w_meta.tbl
+        )
+        .metadata(plc_tbl_w_meta)
+        .na_rep("null")
+        .include_nulls(True)
+        .build()
     )
 
+    options.set_true_value(true_value)
+    options.set_false_value(false_value)
+
+    plc.io.json.write_json(options)
+
     exp = pa_tbl.to_pandas()
 
     # Convert everything to string to make
@@ -150,9 +167,12 @@ def test_read_json_basic(
         source.seek(0)
 
     res = plc.io.json.read_json(
-        plc.io.SourceInfo([source]),
-        compression=compression_type,
-        lines=lines,
+        (
+            plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source]))
+            .compression(compression_type)
+            .lines(lines)
+            .build()
+        )
     )
 
     # Adjustments to correct for the fact orient=records is lossy
@@ -226,9 +246,14 @@ def get_child_types(typ):
 
     new_schema = pa.schema(new_fields)
 
-    res = plc.io.json.read_json(
-        plc.io.SourceInfo([source]), dtypes=dtypes, lines=True
+    options = (
+        plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source]))
+        .lines(True)
+        .build()
     )
+    options.set_dtypes(dtypes)
+
+    res = plc.io.json.read_json(options)
     new_table = pa_table.cast(new_schema)
 
     # orient=records is lossy
@@ -252,10 +277,15 @@ def test_read_json_lines_byte_range(source_or_sink, chunk_size):
     for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size):
         tbls_w_meta.append(
             plc.io.json.read_json(
-                plc.io.SourceInfo([source]),
-                lines=True,
-                byte_range_offset=chunk_start,
-                byte_range_size=chunk_start + chunk_size,
+                (
+                    plc.io.json.JsonReaderOptions.builder(
+                        plc.io.SourceInfo([source])
+                    )
+                    .lines(True)
+                    .byte_range_offset(chunk_start)
+                    .byte_range_size(chunk_start + chunk_size)
+                    .build()
+                )
             )
         )
 
@@ -285,7 +315,12 @@ def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink):
     write_source_str(source, json_bytes)
 
     tbl_w_meta = plc.io.json.read_json(
-        plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes
+        (
+            plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source]))
+            .lines(True)
+            .keep_quotes(keep_quotes)
+            .build()
+        )
     )
 
     template = "{0}"
@@ -313,20 +348,19 @@ def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink):
     json_str = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
     write_source_str(source, json_str)
 
+    options = (
+        plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source]))
+        .lines(True)
+        .recovery_mode(recovery_mode)
+        .build()
+    )
+
     if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL:
         with pytest.raises(RuntimeError):
-            plc.io.json.read_json(
-                plc.io.SourceInfo([source]),
-                lines=True,
-                recovery_mode=recovery_mode,
-            )
+            plc.io.json.read_json(options)
     else:
         # Recover case (bad values replaced with nulls)
-        tbl_w_meta = plc.io.json.read_json(
-            plc.io.SourceInfo([source]),
-            lines=True,
-            recovery_mode=recovery_mode,
-        )
+        tbl_w_meta = plc.io.json.read_json(options)
         exp = pa.Table.from_arrays(
             [[1, 2, None, 3], [10, 11, None, 12]], names=["a", "b"]
         )
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
index 2557e40c935..fe35255505c 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
@@ -37,12 +37,17 @@ def test_read_orc_basic(
         binary_source_or_sink, pa_table, **_COMMON_ORC_SOURCE_KWARGS
     )
 
-    res = plc.io.orc.read_orc(
-        plc.io.SourceInfo([source]),
-        nrows=nrows,
-        skip_rows=skiprows,
-        columns=columns,
-    )
+    options = plc.io.orc.OrcReaderOptions.builder(
+        plc.io.types.SourceInfo([source])
+    ).build()
+    if nrows >= 0:
+        options.set_num_rows(nrows)
+    if skiprows >= 0:
+        options.set_skip_rows(skiprows)
+    if columns is not None and len(columns) > 0:
+        options.set_columns(columns)
+
+    res = plc.io.orc.read_orc(options)
 
     if columns is not None:
         pa_table = pa_table.select(columns)